# Setup

In [1]:
import os

import polars as pl
from tqdm import tqdm

from scripts.metrics import map_at_k

In [2]:
# Input CSV files are read from INPUT_DIR; generated candidate parquet files
# are written to OUTPUT_DIR.
INPUT_DIR = "../../input/raw/"
OUTPUT_DIR = "./candidates/"

# Create the output folder up front so the parquet writes further down do not
# depend on it already existing (e.g. on a fresh checkout).
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
def explode_and_add_seq_no(df:pl.DataFrame) -> pl.DataFrame:
    """Explode ``prev_items`` and number the resulting rows within each session.

    Args:
        df: Frame with a ``session_id`` column and an explodable
            ``prev_items`` column.

    Returns:
        The exploded frame (one row per ``prev_items`` element) with an
        additional ``Int64`` column ``seq_no`` holding the cumulative count
        computed per ``session_id``.
    """
    exploded = df.explode(["prev_items"])

    # Running counter evaluated separately inside every session_id group.
    seq_no = (
        pl.col("session_id")
        .cumcount()
        .over("session_id")
        .alias("seq_no")
        .cast(pl.Int64)
    )
    return exploded.with_columns(seq_no)

In [4]:
def generate_popular_yados_per_lrg_cd(logs: "pl.DataFrame", yados: "pl.DataFrame", top_k: int = 10) -> "pl.DataFrame":
    """Build, for every yado, the most popular other yados of the same ``lrg_cd``.

    Popularity is the number of rows in ``logs`` that reference a ``yad_no``.
    For each ``lrg_cd`` the ``top_k`` most popular yados are kept and ranked,
    and that list is attached to every yado sharing the same ``lrg_cd``.

    Args:
        logs: Log frame with at least a ``yad_no`` column.
        yados: Frame with at least ``yad_no`` and ``lrg_cd`` columns.
        top_k: How many yados to keep per ``lrg_cd`` before the self-match is
            removed. Defaults to 10.

    Returns:
        Frame with columns ``yad_no``, ``lrg_cd``, ``candidate_yad_no`` and
        ``trend_popularity_rank_lrg``.

    Notes:
        * A yado is never its own candidate, so a yado that is itself in the
          top ``top_k`` of its ``lrg_cd`` ends up with ``top_k - 1`` rows.
        * Yados that never appear in ``logs`` get popularity 0 and can still
          be selected when their ``lrg_cd`` has fewer than ``top_k`` logged
          yados.
    """
    # Popularity only needs yad_no, so logs are counted directly; joining
    # lrg_cd onto the logs first would be wasted work.
    view_counts = logs.group_by("yad_no").count().rename({"count": "popularity"})

    top_per_area = (
        yados.join(view_counts, on="yad_no", how="left")
        .fill_null(0)  # yados absent from the logs get popularity 0
        # Sorting before the order-preserving group_by makes head() pick the
        # most popular rows of every lrg_cd.
        .sort(["lrg_cd", "popularity"], descending=[False, True])
        .group_by("lrg_cd", maintain_order=True)
        .head(top_k)
        # rank() is left on its default tie handling, so tied popularities can
        # share a (float) rank.
        .with_columns(
            pl.col("popularity")
            .rank(descending=True)
            .over("lrg_cd")
            .alias("trend_popularity_rank_lrg")
        )
    )

    # Attach the per-area top list to every yado of that area; the join
    # suffixes the right-hand yad_no, which becomes the candidate column.
    candidates = (
        yados.select(["yad_no", "lrg_cd"])
        .join(
            top_per_area.select(["yad_no", "lrg_cd", "trend_popularity_rank_lrg"]),
            on="lrg_cd",
            how="left",
        )
        .rename({"yad_no_right": "candidate_yad_no"})
        .filter(pl.col("yad_no") != pl.col("candidate_yad_no"))
    )
    return candidates

# For local train/eval

In [5]:
# Log that the train/eval popularity counts are computed from.
train_log_path = os.path.join(INPUT_DIR, "train_log.csv")
train_log = pl.read_csv(train_log_path)

In [6]:
# Yado table; supplies the yad_no / lrg_cd columns used when building candidates.
yado_path = os.path.join(INPUT_DIR, "yado.csv")
yados = pl.read_csv(yado_path)

In [7]:
# Candidates for training/evaluation: popularity is counted on the train log.
popular_yados_per_lrg_cd = generate_popular_yados_per_lrg_cd(
    logs=train_log,
    yados=yados,
)

In [8]:
# Persist the train/eval candidates; the evaluation section reads this file back.
train_eval_candidates_path = os.path.join(
    OUTPUT_DIR, "trend_popular_yados_per_lrg_cd_for_train_or_eval.parquet"
)
popular_yados_per_lrg_cd.write_parquet(train_eval_candidates_path)

In [9]:
# Preview the first five candidate rows.
popular_yados_per_lrg_cd.head(5)

yad_no,lrg_cd,candidate_yad_no,trend_popularity_rank_lrg
i64,str,i64,f64
1,"""449c52ef581d5f…",1818,1.0
1,"""449c52ef581d5f…",13017,2.0
1,"""449c52ef581d5f…",693,3.0
1,"""449c52ef581d5f…",9971,4.0
1,"""449c52ef581d5f…",4646,5.0


# Evaluation: MAP@10

In [10]:
train_log = pl.read_csv(os.path.join(INPUT_DIR, "train_log.csv"))

# The label's yad_no is renamed so it stays distinguishable from the log's
# yad_no once both frames are joined on session_id.
train_label_raw = pl.read_csv(os.path.join(INPUT_DIR, "train_label.csv"))
train_label = train_label_raw.rename({"yad_no": "label_yad_no"})

In [11]:
# One row per session: the last value of every column in log order.
# NOTE(review): presumably the log rows are ordered chronologically within a
# session, making this the last viewed yado — confirm against the data.
last_items = train_log.group_by("session_id").agg(pl.all().last())

In [12]:
# Reload the train/eval candidates that were written to OUTPUT_DIR earlier.
train_eval_candidates_path = os.path.join(
    OUTPUT_DIR, "trend_popular_yados_per_lrg_cd_for_train_or_eval.parquet"
)
popular_yados_per_lrg_cd = pl.read_parquet(train_eval_candidates_path)

In [13]:
# For every session: candidates of its last item, ordered by popularity rank,
# with a 0/1 flag marking the candidate that equals the session's label.
is_label_hit = (
    (pl.col("candidate_yad_no") == pl.col("label_yad_no"))
    .cast(pl.Int8)
    .alias("user_relevance")
)
prediction = (
    last_items
    .join(popular_yados_per_lrg_cd, on="yad_no", how="left")
    .join(train_label, on="session_id", how="left")
    .sort(["session_id", "trend_popularity_rank_lrg"], descending=False)
    .with_columns(is_label_hit)
    # Sessions without any candidate produce nulls; they count as misses.
    .fill_null(0)
)

In [14]:
# Collect each session's relevance flags into one list, keeping the
# rank-sorted row order, then score the lists with MAP@10.
relevance_per_session = prediction.group_by("session_id", maintain_order=True).agg(
    pl.col("user_relevance")
)
user_relevances = relevance_per_session["user_relevance"].to_list()
map_at_k(user_relevances, 10)

0.13150770806028203

# For Test

In [15]:
# Log that the test-time popularity counts are computed from.
test_log_path = os.path.join(INPUT_DIR, "test_log.csv")
test_log = pl.read_csv(test_log_path)

In [16]:
# Candidates for the test set: popularity is counted on the test log.
popular_yados_per_lrg_cd = generate_popular_yados_per_lrg_cd(
    logs=test_log,
    yados=yados,
)

In [17]:
# Persist the test candidates next to the train/eval ones.
test_candidates_path = os.path.join(
    OUTPUT_DIR, "trend_popular_yados_per_lrg_cd_for_test.parquet"
)
popular_yados_per_lrg_cd.write_parquet(test_candidates_path)

In [18]:
# Preview the first five test candidate rows.
popular_yados_per_lrg_cd.head(5)

yad_no,lrg_cd,candidate_yad_no,trend_popularity_rank_lrg
i64,str,i64,f64
1,"""449c52ef581d5f…",693,1.0
1,"""449c52ef581d5f…",4228,2.0
1,"""449c52ef581d5f…",1818,3.0
1,"""449c52ef581d5f…",3441,4.0
1,"""449c52ef581d5f…",3988,5.0
