# Setting

In [1]:
import os
from typing import List, Dict, Union

import polars as pl

from scripts.metrics import map_at_k

In [2]:
INPUT_DIR = "../../input/raw/"
OUTPUT_DIR = "./candidates/"

WINDOW_N = 10
WEIGHTS = {-10:0.01, -9:0.01, -8:0.01, -7:0.01, -6:0.01, -5:0.05, -4:0.05, -3:0.1, -2:0.25, -1:1, 1:3, \
2:0.5, 3:0.25, 4:0.1, 5:0.05, 6:0.01, 7:0.01, 8:0.01, 9:0.01, 10:0.01, }

In [3]:
def explode_and_add_seq_no(df:pl.DataFrame) -> pl.DataFrame:
    df = df.explode(["prev_items"])
    df = df.with_columns(
        df.select(pl.col("session_id").cumcount().over("session_id").alias("seq_no").cast(pl.Int64))
    )
    return df

In [4]:
def generate_co_visit_matrix(df:pl.DataFrame, window_n:int, weights:Dict[int, float]) -> pl.DataFrame:
    # 共起ペアの作成
    df = df.join(df, on="session_id")

    # 共起した間隔を計算し、絞り込み
    df = df.with_columns(
        (pl.col("seq_no_right").cast(pl.Int64) - pl.col("seq_no").abs().cast(pl.Int64)).alias("diff_sequence_num")
    )
    df = df.filter(pl.col("diff_sequence_num") <= window_n)
    df = df.filter(pl.col("yad_no") != pl.col("yad_no_right"))

    # weightを計算し、共起ペアごとに和を計算
    df = df.with_columns(
        pl.col("diff_sequence_num").map_dict(weights).alias("co_visit_weight")
    )
    df = df.group_by(["yad_no", "yad_no_right"]).sum()
    df = df.with_columns(pl.col("co_visit_weight").rank(descending=True).over("yad_no").alias("co_visit_weight_rank"))
    
    df = df.rename({"yad_no_right":"candidate_yad_no"})[["yad_no", "candidate_yad_no", "co_visit_weight_rank"]]

    return df

# For local train/eval

In [5]:
train_log = pl.read_csv(os.path.join(INPUT_DIR, "train_log.csv"))
test_log = pl.read_csv(os.path.join(INPUT_DIR, "test_log.csv"))

In [6]:
log = pl.concat([train_log, test_log], how="vertical")

In [7]:
co_visit_matrix = generate_co_visit_matrix(log, window_n=WINDOW_N, weights=WEIGHTS)

In [8]:
co_visit_matrix.write_parquet(os.path.join(OUTPUT_DIR, "co_visit_matrix_for_train_or_eval.parquet"))

In [9]:
co_visit_matrix.head()

yad_no,candidate_yad_no,co_visit_weight_rank
i64,i64,f64
10095,12425,95.5
6514,7890,1.0
10856,11146,24.5
13198,3653,13.0
7787,12750,12.0


# MAP@k=10

In [10]:
train_log = pl.read_csv(os.path.join(INPUT_DIR, "train_log.csv"))
train_label = pl.read_csv(os.path.join(INPUT_DIR, "train_label.csv")).rename({"yad_no":"label_yad_no"})

In [11]:
last_items = train_log.group_by("session_id").last()

In [12]:
co_visit_matrix = pl.read_parquet(os.path.join(OUTPUT_DIR, "co_visit_matrix_for_train_or_eval.parquet"))

In [13]:
prediction = last_items \
    .join(co_visit_matrix, on="yad_no", how="left") \
    .join(train_label, on="session_id", how="left") \
    .sort(["session_id", "co_visit_weight_rank"], descending=[False, False]) \
    .with_columns((pl.col("candidate_yad_no") == pl.col("label_yad_no")).cast(pl.Int8).alias("user_relevance")) \
    .fill_null(0)

In [14]:
user_relevances = prediction.group_by("session_id", maintain_order=True).all()["user_relevance"].to_list()

In [15]:
map_at_k(user_relevances, 10)

0.2160277568573609

# For test

In [16]:
train_log = pl.read_csv(os.path.join(INPUT_DIR, "train_log.csv"))
train_label = pl.read_csv(os.path.join(INPUT_DIR, "train_label.csv"))
test_log = pl.read_csv(os.path.join(INPUT_DIR, "test_log.csv"))

In [17]:
# trainのlabelをlogにappendする

prev_items_list = (
    train_log
    .sort(["session_id", "seq_no"])
    .group_by("session_id", maintain_order=True)
    .agg(pl.col("yad_no"))
)["yad_no"].to_list()

next_item_list = (
    train_label
    .sort("session_id")
)["yad_no"].to_list()

prev_items_list_updated = []
for prev_items, next_item in zip(prev_items_list, next_item_list):
    prev_items.append(next_item)
    prev_items_list_updated.append(prev_items)

train_log = train_label.with_columns(
    pl.Series(name="prev_items", values=prev_items_list_updated)
)

train_log = explode_and_add_seq_no(train_log) \
    .drop("yad_no") \
    .rename({"prev_items" : "yad_no"}) \
    [["session_id", "seq_no", "yad_no"]] # カラム並び替え

In [18]:
log = pl.concat([train_log, test_log], how="vertical")

In [19]:
co_visit_matrix = generate_co_visit_matrix(log, window_n=WINDOW_N, weights=WEIGHTS)

In [20]:
co_visit_matrix.write_parquet(os.path.join(OUTPUT_DIR, "co_visit_matrix_for_test.parquet"))

In [21]:
co_visit_matrix.head()

yad_no,candidate_yad_no,co_visit_weight_rank
i64,i64,f64
6868,4823,1.5
13106,10362,4.0
8833,8322,24.5
3079,7920,64.0
13120,7725,2.0
