# Setting

In [1]:
import os
from typing import List, Dict, Union

import polars as pl

from scripts.metrics import map_at_k

In [2]:
# Directory the input CSV files (train/test logs, train labels) are read from.
INPUT_DIR = "../../input/raw/"
# Directory the generated feature parquet files are written to.
OUTPUT_DIR = "./features/"

In [3]:
def explode_and_add_seq_no(df: pl.DataFrame) -> pl.DataFrame:
    """Explode ``prev_items`` into one row per item and number rows per session.

    Args:
        df: Frame with a ``session_id`` column and a ``prev_items`` column
            to explode.

    Returns:
        The exploded frame with an added Int64 ``seq_no`` column holding the
        running row count within each ``session_id``.
    """
    exploded = df.explode(["prev_items"])
    # Running counter evaluated as a window over session_id, so numbering
    # restarts for every session.
    seq_no = (
        pl.col("session_id")
        .cumcount()
        .over("session_id")
        .cast(pl.Int64)
        .alias("seq_no")
    )
    return exploded.with_columns(seq_no)

In [4]:
def generate_lift(df: pl.DataFrame) -> pl.DataFrame:
    """Compute the lift of every ordered pair of distinct items sharing a session.

    All counts are row-based: an item's occurrence is the number of log rows
    carrying that ``yad_no``, and a pair's co-occurrence is the number of row
    pairs coming from the same session, so an item repeated inside a session
    is counted once per repetition. Because the self-join is symmetric, both
    ``(a, b)`` and ``(b, a)`` are emitted.

    The lift is ``(co / N) / ((occ_a / N) * (occ_b / N))`` where ``N`` is the
    number of distinct ``session_id`` values in ``df``.

    Args:
        df: Log with at least ``session_id`` and ``yad_no`` columns.

    Returns:
        Frame with the columns ``yad_no``, ``yad_no_right`` and ``lift``.
    """
    session_count = df["session_id"].n_unique()
    item_count = df.group_by("yad_no").count()

    # Build the co-occurrence pairs with a self-join on session_id. Only the
    # two columns the computation needs are kept beforehand, so the pair table
    # (which grows quadratically with session length) does not carry unused
    # columns such as seq_no.
    pairs = df.select(["session_id", "yad_no"])
    pairs = pairs.join(pairs, on="session_id")

    # Drop self-pairs, then count how often each ordered pair co-occurs.
    pairs = pairs.filter(pl.col("yad_no") != pl.col("yad_no_right"))[["yad_no", "yad_no_right"]]
    pairs = pairs.group_by(["yad_no", "yad_no_right"]).count().rename({"count": "co_occurrence"})

    # Attach the occurrence count of each side of the pair.
    pairs = pairs.join(
        item_count,
        on="yad_no",
        how="left"
    ).rename(
        {"count": "yad_no_occurrence"}
    ).join(
        item_count,
        left_on="yad_no_right",
        right_on="yad_no",
        how="left"
    ).rename(
        {"count": "yad_no_right_occurrence"}
    )

    # Lift = P(a, b) / (P(a) * P(b)), each probability taken relative to the
    # number of sessions.
    pairs = pairs.with_columns(
        ((pl.col("co_occurrence") / session_count) / ((pl.col("yad_no_occurrence") / session_count) * (pl.col("yad_no_right_occurrence") / session_count))).alias("lift")
    )

    # Keep only the pair keys and the lift value.
    return pairs[["yad_no", "yad_no_right", "lift"]]

# For local train/eval

In [5]:
# Load the train and test session logs from INPUT_DIR.
train_log = pl.read_csv(os.path.join(INPUT_DIR, "train_log.csv"))
test_log = pl.read_csv(os.path.join(INPUT_DIR, "test_log.csv"))

In [6]:
# Stack the train and test logs row-wise into a single frame.
log = pl.concat([train_log, test_log], how="vertical")

In [7]:
# Lift computed from the train log only; renamed to "trend_lift" so it can
# sit next to the combined-log "lift" column after the join.
trend_lift = generate_lift(train_log)
trend_lift = trend_lift.rename({"lift":"trend_lift"})

In [8]:
# Lift computed from the combined train + test log.
lift = generate_lift(log)

In [9]:
# Left-join trend_lift onto every pair of the combined lift; nulls (e.g. pairs
# without a trend_lift match) are replaced by -1.
cf_features = lift.join(trend_lift, on=["yad_no", "yad_no_right"], how="left").fill_null(-1)

In [10]:
# Create the output directory first so the write does not depend on it
# already existing.
os.makedirs(OUTPUT_DIR, exist_ok=True)
cf_features.write_parquet(os.path.join(OUTPUT_DIR, "cf_feat_for_train_or_eval.parquet"))

In [11]:
# Preview the first rows of the feature table.
cf_features.head()

yad_no,yad_no_right,lift,trend_lift
i64,i64,f64,f64
10095,12425,0.681333,0.822581
6514,7890,671.591304,855.401481
10856,11146,53.466944,65.464399
13198,3653,39.244411,34.108932
7787,12750,77.053209,278.935266


# For test

In [12]:
# Reload the logs from INPUT_DIR and additionally load the train labels;
# train_log is rebuilt further down with each session's label appended.
train_log = pl.read_csv(os.path.join(INPUT_DIR, "train_log.csv"))
train_label = pl.read_csv(os.path.join(INPUT_DIR, "train_label.csv"))
test_log = pl.read_csv(os.path.join(INPUT_DIR, "test_log.csv"))

In [13]:
# Append each train session's label to its log as the final item.

# Items per session in seq_no order; sessions come out ordered by session_id.
session_items = (
    train_log
    .sort(["session_id", "seq_no"])
    .group_by("session_id", maintain_order=True)
    .agg(pl.col("yad_no"))
)

# Sort the labels once and reuse that same frame both as the source of the
# label values and as the frame the item lists are attached to. Attaching the
# lists to the unsorted train_label would put them on the wrong session_id
# whenever train_label is not already sorted.
sorted_label = train_label.sort("session_id")

# The pairing below is positional, so both frames must list exactly the same
# sessions in the same order; fail loudly instead of silently misaligning.
if session_items["session_id"].to_list() != sorted_label["session_id"].to_list():
    raise ValueError("train_log and train_label do not cover the same session_ids")

prev_items_list = session_items["yad_no"].to_list()
next_item_list = sorted_label["yad_no"].to_list()

prev_items_list_updated = [
    prev_items + [next_item]
    for prev_items, next_item in zip(prev_items_list, next_item_list)
]

train_log = sorted_label.with_columns(
    pl.Series(name="prev_items", values=prev_items_list_updated)
)

# Back to one row per item, with the same column layout as the raw logs.
train_log = explode_and_add_seq_no(train_log) \
    .drop("yad_no") \
    .rename({"prev_items" : "yad_no"}) \
    [["session_id", "seq_no", "yad_no"]]  # reorder columns

In [14]:
# Stack the label-extended train log and the test log row-wise.
log = pl.concat([train_log, test_log], how="vertical")

In [15]:
# Here the trend lift is computed from the test log only; renamed to
# "trend_lift" so it can sit next to the combined-log "lift" column.
trend_lift = generate_lift(test_log)
trend_lift = trend_lift.rename({"lift":"trend_lift"})

In [16]:
# Lift computed from the combined (label-extended train + test) log.
lift = generate_lift(log)

In [17]:
# Left-join trend_lift onto every pair of the combined lift; nulls (e.g. pairs
# without a trend_lift match) are replaced by -1.
cf_features = lift.join(trend_lift, on=["yad_no", "yad_no_right"], how="left").fill_null(-1)

In [18]:
# Create the output directory first so the write does not depend on it
# already existing.
os.makedirs(OUTPUT_DIR, exist_ok=True)
cf_features.write_parquet(os.path.join(OUTPUT_DIR, "cf_feat_for_test.parquet"))

In [19]:
# Preview the first rows of the feature table.
cf_features.head()

yad_no,yad_no_right,lift,trend_lift
i64,i64,f64,f64
6868,4823,2797.171026,-1.0
13106,10362,137.388161,150.344234
8833,8322,29.813936,-1.0
3079,7920,11.493006,-1.0
13120,7725,3777.701087,-1.0
