# Setup

In [1]:
import os
from typing import List, Dict, Union

import polars as pl

from scripts.metrics import map_at_k

In [2]:
# Directory the raw CSV logs/labels are read from.
INPUT_DIR = "../../input/raw/"
# Directory the generated candidate parquet files are written to.
OUTPUT_DIR = "./candidates/"

# Make sure the output directory exists so the write_parquet cells below do not
# fail on a fresh checkout; this is a no-op when the directory is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
def generate_already_clicked_candidates(df:pl.DataFrame) -> pl.DataFrame:
    """Build candidate rows from the items a session has already viewed.

    Every log row is numbered within its session counting backwards from the
    latest action (0 = row with the largest ``seq_no``). The latest action of
    each session is dropped and all earlier rows are returned as candidates.
    No de-duplication is performed, so a ``yad_no`` that occurs several times
    in a session yields several candidate rows.

    Parameters
    ----------
    df : pl.DataFrame
        Log with at least the columns ``session_id``, ``seq_no`` and ``yad_no``.
        NOTE(review): ``seq_no`` is assumed to be non-null; rows with a null
        ``seq_no`` would get a null rank and be filtered out.

    Returns
    -------
    pl.DataFrame
        Columns ``session_id``, ``candidate_yad_no`` (the original ``yad_no``)
        and ``seq_no_inverse`` (UInt32, always >= 1), ordered by ``session_id``
        ascending and ``seq_no`` descending.
    """
    # How many steps back from the session's last action is this row?
    # A 0-based ordinal rank is used instead of `cumcount()`: that method is
    # deprecated/removed in newer polars, and its successor `cum_count()` starts
    # at 1 there, which would make the `!= 0` filter below keep the last action.
    seq_no_inverse = (
        pl.col("seq_no").rank(method="ordinal", descending=True).over("session_id") - 1
    ).cast(pl.UInt32).alias("seq_no_inverse")

    ranked = (
        df
        .sort(["session_id", "seq_no"], descending=[False, True])
        .with_columns(seq_no_inverse)
    )

    # Drop the most recent action of every session.
    earlier_actions = ranked.filter(pl.col("seq_no_inverse") != 0)

    # Keep only the candidate columns and give the item column its candidate name.
    candidates = earlier_actions[["session_id", "yad_no", "seq_no_inverse"]]
    return candidates.rename({"yad_no":"candidate_yad_no"})

# For local train/eval

In [4]:
# Load the training log from the raw input directory.
train_log_path = os.path.join(INPUT_DIR, "train_log.csv")
train_log = pl.read_csv(train_log_path)

In [5]:
# Candidates for train/eval: every logged row except each session's latest one.
already_clicked = generate_already_clicked_candidates(df=train_log)

In [6]:
# Persist the train/eval candidates as parquet in the output directory.
train_candidates_path = os.path.join(OUTPUT_DIR, "already_clicked_for_train_or_eval.parquet")
already_clicked.write_parquet(train_candidates_path)

In [7]:
# Preview the first five candidate rows.
already_clicked.head(5)

session_id,candidate_yad_no,seq_no_inverse
str,i64,u32
"""000104bdffaaad…",96,1
"""00026fd325b5d6…",756,1
"""0003439cbd15fa…",143,1
"""0003948318658b…",569,1
"""00044db9da5da4…",1383,1


# MAP@k=10

In [8]:
# Reload the training log and read the labels; the label's item column is
# renamed so it cannot be confused with the candidate item column after joining.
train_log_path = os.path.join(INPUT_DIR, "train_log.csv")
train_label_path = os.path.join(INPUT_DIR, "train_label.csv")
train_log = pl.read_csv(train_log_path)
train_label = pl.read_csv(train_label_path).rename({"yad_no":"label_yad_no"})

In [9]:
# Read the train/eval candidates back from the parquet file written earlier.
train_candidates_path = os.path.join(OUTPUT_DIR, "already_clicked_for_train_or_eval.parquet")
already_clicked = pl.read_parquet(train_candidates_path)

In [10]:
# Attach each labelled session's candidates (left join keeps sessions that have
# no candidate rows), order them by seq_no_inverse ascending within a session,
# and flag the candidate rows whose item equals the label.
# NOTE(review): fill_null(0) is applied to the whole frame, so for sessions
# without candidates the null candidate columns are filled with 0 as well,
# not only user_relevance.
prediction = (
    train_label
    .join(already_clicked, on="session_id", how="left")
    .sort(["session_id", "seq_no_inverse"], descending=False)
    .with_columns(
        (pl.col("candidate_yad_no") == pl.col("label_yad_no"))
        .cast(pl.Int8)
        .alias("user_relevance")
    )
    .fill_null(0)
)

In [11]:
# Collect the per-row relevance flags into one list per session, keeping the
# row order established when `prediction` was built.
relevance_by_session = prediction.group_by("session_id", maintain_order=True).all()
user_relevances = relevance_by_session["user_relevance"].to_list()

In [12]:
# Score the per-session relevance lists with map_at_k using k = 10.
TOP_K = 10
map_at_k(user_relevances, TOP_K)

0.29503056030319813

# For test

In [13]:
# Load the test log from the raw input directory.
test_log_path = os.path.join(INPUT_DIR, "test_log.csv")
test_log = pl.read_csv(test_log_path)

In [14]:
# Candidates for the test sessions, built the same way as for the training log.
already_clicked = generate_already_clicked_candidates(df=test_log)

In [15]:
# Persist the test candidates as parquet in the output directory.
test_candidates_path = os.path.join(OUTPUT_DIR, "already_clicked_for_test.parquet")
already_clicked.write_parquet(test_candidates_path)