# Setting

In [1]:
import os
from typing import List, Dict, Union

import polars as pl

from scripts.metrics import map_at_k

In [2]:
INPUT_DIR = "../../input/raw/"
OUTPUT_DIR = "./candidates/"

In [3]:
def explode_and_add_seq_no(df:pl.DataFrame) -> pl.DataFrame:
    df = df.explode(["prev_items"])
    df = df.with_columns(
        df.select(pl.col("session_id").cumcount().over("session_id").alias("seq_no").cast(pl.Int64))
    )
    return df

In [4]:
def generate_co_visit_matrix(df:pl.DataFrame) -> pl.DataFrame:
    # 2-hop先までの共起ペアの作成
    one_hop = df.join(df, on="session_id")
    one_hop = one_hop.filter(pl.col("yad_no") != pl.col("yad_no_right"))[["yad_no", "yad_no_right"]]
    two_hop = one_hop.join(one_hop, left_on="yad_no_right", right_on="yad_no")
    two_hop = two_hop.filter(pl.col("yad_no") != pl.col("yad_no_right_right"))

    # yad_noのペアごとに共起回数を計算
    df = two_hop.group_by(["yad_no", "yad_no_right_right"]).count()

    # rankを計算
    df = df.with_columns(
        pl.col("count").rank(descending=True).over("yad_no").alias("trend_two_hop_co_visit_weight_rank")
    ).filter(
        pl.col("trend_two_hop_co_visit_weight_rank") <= 100
    )

    # 整形
    df = df.rename({"yad_no_right_right":"candidate_yad_no"})[["yad_no", "candidate_yad_no", "trend_two_hop_co_visit_weight_rank"]]

    return df

# For local train/eval

In [5]:
train_log = pl.read_csv(os.path.join(INPUT_DIR, "train_log.csv"))

In [6]:
co_visit_matrix = generate_co_visit_matrix(train_log)

In [7]:
co_visit_matrix.write_parquet(os.path.join(OUTPUT_DIR, "trend_two_hop_co_visit_matrix_for_train_or_eval.parquet"))

In [8]:
co_visit_matrix.head()

yad_no,candidate_yad_no,trend_two_hop_co_visit_weight_rank
i64,i64,f64
10915,96,12.0
11380,898,5.0
4072,898,6.0
12491,898,4.0
7014,143,2.0


# MAP@k=10

In [9]:
train_log = pl.read_csv(os.path.join(INPUT_DIR, "train_log.csv"))
train_label = pl.read_csv(os.path.join(INPUT_DIR, "train_label.csv")).rename({"yad_no":"label_yad_no"})

In [10]:
last_items = train_log.group_by("session_id").last()

In [11]:
co_visit_matrix = pl.read_parquet(os.path.join(OUTPUT_DIR, "trend_two_hop_co_visit_matrix_for_train_or_eval.parquet"))

In [12]:
prediction = last_items \
    .join(co_visit_matrix, on="yad_no", how="left") \
    .join(train_label, on="session_id", how="left") \
    .sort(["session_id", "trend_two_hop_co_visit_weight_rank"], descending=[False, False]) \
    .with_columns((pl.col("candidate_yad_no") == pl.col("label_yad_no")).cast(pl.Int8).alias("user_relevance")) \
    .fill_null(0)

In [13]:
user_relevances = prediction.group_by("session_id", maintain_order=True).all()["user_relevance"].to_list()

In [14]:
map_at_k(user_relevances, 10)

0.1596275126630377

# For test

In [15]:
test_log = pl.read_csv(os.path.join(INPUT_DIR, "test_log.csv"))

In [16]:
co_visit_matrix = generate_co_visit_matrix(test_log)

In [17]:
co_visit_matrix.write_parquet(os.path.join(OUTPUT_DIR, "trend_two_hop_co_visit_matrix_for_test.parquet"))

In [18]:
co_visit_matrix.head()

yad_no,candidate_yad_no,trend_two_hop_co_visit_weight_rank
i64,i64,f64
6563,3560,10.0
11561,3560,8.0
10233,1959,7.0
4180,13610,7.0
277,13610,12.0
