In [1]:
# Mount Google Drive at /gdrive; every data path used below lives under /gdrive/MyDrive.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [2]:
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from collections import defaultdict, Counter
from typing import List, Dict, Union

from tqdm import tqdm
import polars as pl

In [4]:
# Maximum |position offset| between two items of a session for the windowed features.
WINDOW_N = 10

# Weight per position offset (position of item_y minus position of item_x).
# Offset 0 is intentionally absent: a row paired with itself is filtered out.
WEIGHTS = {
    # item_y appears earlier in the session than item_x
    -10: 0.01, -9: 0.01, -8: 0.01, -7: 0.01, -6: 0.01,
    -5: 0.05, -4: 0.05, -3: 0.1, -2: 0.25, -1: 0.5,
    # item_y appears later in the session than item_x
    1: 3, 2: 0.5, 3: 0.25, 4: 0.1, 5: 0.05,
    6: 0.01, 7: 0.01, 8: 0.01, 9: 0.01, 10: 0.01,
}

# Project root on the mounted drive, locales to emit features for, and output version tag.
DIR = "/gdrive/MyDrive/amazon_kdd_2023/"
LOCALES = ["IT", "FR", "ES"]
VER = "02"

In [5]:
def preprocess(df:pl.DataFrame) -> pl.DataFrame:
    """Explode sessions to one row per viewed item and number the rows per session.

    Args:
        df: frame with a ``session_id`` column and a ``prev_items`` column that
            can be exploded (presumably a list of item ids per session).

    Returns:
        The exploded frame with an extra ``sequence_num`` column holding the
        running row count within each ``session_id``.
    """
    df = df.explode(["prev_items"])
    # Pass the window expression directly rather than a pre-computed one-column DataFrame.
    df = df.with_columns(
        pl.col("session_id").cumcount().over("session_id").alias("sequence_num")
    )
    return df

In [6]:
def generate_cf_score_1(df:pl.DataFrame) -> pl.DataFrame:
    """Sum the WEIGHTS-based co-occurrence weight for every ordered item pair.

    Args:
        df: exploded session frame with ``session_id``, ``prev_items`` and
            ``sequence_num`` columns.

    Returns:
        Frame with columns ``item_x``, ``item_y``, ``cf_score_1``: one row per
        ordered pair of distinct items seen in the same session at most
        WINDOW_N positions apart.
    """
    # Build co-occurrence pairs: the self-join pairs every row with every row of the same session.
    events = df[["session_id", "prev_items", "sequence_num"]]
    df = events.join(events, on="session_id")

    # Signed position offset (right item minus left item), then restrict to the window.
    df = df.with_columns(
        (pl.col("sequence_num_right").cast(pl.Int64) - pl.col("sequence_num").cast(pl.Int64)).alias("diff_sequence_num")
    )
    df = df.filter(pl.col("diff_sequence_num").abs() <= WINDOW_N)
    # Drop pairs of an item with itself (this also removes each row's pairing with itself).
    df = df.filter(pl.col("prev_items") != pl.col("prev_items_right"))

    # Look up the weight of each offset and sum it per pair.
    df = df.with_columns(
        pl.col("diff_sequence_num").map_dict(WEIGHTS).alias("cf_score_1")
    )
    # Aggregate only the score; a blanket .sum() would also "sum" the session/sequence columns.
    df = df.groupby(["prev_items", "prev_items_right"]).agg(pl.col("cf_score_1").sum())
    df = df.rename({"prev_items":"item_x", "prev_items_right":"item_y"})[["item_x", "item_y", "cf_score_1"]]

    return df

def generate_cf_score_2(df:pl.DataFrame) -> pl.DataFrame:
    """Sum a log-distance co-occurrence weight for every ordered item pair.

    Each co-occurrence contributes ``1 / (log(|offset|) + 1)`` where ``offset``
    is the position difference inside the session and ``log`` is polars'
    ``Expr.log()`` with its default base. Unlike ``generate_cf_score_1`` no
    window is applied: all pairs of a session are used.

    Args:
        df: exploded session frame with ``session_id``, ``prev_items`` and
            ``sequence_num`` columns.

    Returns:
        Frame with columns ``item_x``, ``item_y``, ``cf_score_2``.
    """
    # Build co-occurrence pairs: the self-join pairs every row with every row of the same session.
    events = df[["session_id", "prev_items", "sequence_num"]]
    df = events.join(events, on="session_id")

    # Signed position offset (right item minus left item).
    df = df.with_columns(
        (pl.col("sequence_num_right").cast(pl.Int64) - pl.col("sequence_num").cast(pl.Int64)).alias("diff_sequence_num")
    )
    # Drop pairs of an item with itself; this also removes each row's pairing with
    # itself (offset 0), which must happen before the log below.
    df = df.filter(pl.col("prev_items") != pl.col("prev_items_right"))

    # Weight each co-occurrence by its distance and sum per pair.
    df = df.with_columns(
        (1 / (pl.col("diff_sequence_num").abs().log() + 1)).alias("cf_score_2")
    )
    # Aggregate only the score; a blanket .sum() would also "sum" the session/sequence columns.
    df = df.groupby(["prev_items", "prev_items_right"]).agg(pl.col("cf_score_2").sum())
    df = df.rename({"prev_items":"item_x", "prev_items_right":"item_y"})[["item_x", "item_y", "cf_score_2"]]

    return df

def generate_lift_1(df:pl.DataFrame) -> pl.DataFrame:
    """Windowed co-visit weight normalised by the popularity of both items.

    ``lift_1 = co_visit_weight / sqrt(count(item_x)) / sqrt(count(item_y))``,
    where ``co_visit_weight`` is the WEIGHTS-based sum over co-occurrences at
    most WINDOW_N positions apart and ``count`` is the number of rows of the
    item in ``df``.

    Args:
        df: exploded session frame with ``session_id``, ``prev_items`` and
            ``sequence_num`` columns.

    Returns:
        Frame with columns ``item_x``, ``item_y``, ``lift_1``.
    """
    # Number of occurrences of each item.
    item_count = df.groupby("prev_items").count()

    # Build co-occurrence pairs: the self-join pairs every row with every row of the same session.
    events = df[["session_id", "prev_items", "sequence_num"]]
    df = events.join(events, on="session_id")

    # Signed position offset (right item minus left item), then restrict to the window.
    df = df.with_columns(
        (pl.col("sequence_num_right").cast(pl.Int64) - pl.col("sequence_num").cast(pl.Int64)).alias("diff_sequence_num")
    )
    df = df.filter(pl.col("diff_sequence_num").abs() <= WINDOW_N)
    # Drop pairs of an item with itself (this also removes each row's pairing with itself).
    df = df.filter(pl.col("prev_items") != pl.col("prev_items_right"))

    # Look up the weight of each offset and sum it per pair.
    df = df.with_columns(
        pl.col("diff_sequence_num").map_dict(WEIGHTS).alias("co_visit_weight")
    )
    # Aggregate only the weight; a blanket .sum() would also "sum" the session/sequence columns.
    df = df.groupby(["prev_items", "prev_items_right"]).agg(pl.col("co_visit_weight").sum())
    df = df.rename({"prev_items":"item_x", "prev_items_right":"item_y"})[["item_x", "item_y", "co_visit_weight"]]

    # Attach the occurrence count of both items and compute the lift value.
    df = df.join(item_count, left_on="item_x", right_on="prev_items", how="left").rename({"count":"item_count"})
    df = df.join(item_count, left_on="item_y", right_on="prev_items", how="left").rename({"count":"candidate_item_count"})
    df = df.with_columns((pl.col("co_visit_weight") / pl.col("item_count").sqrt() / pl.col("candidate_item_count").sqrt()).alias("lift_1"))

    return df[["item_x", "item_y", "lift_1"]]

def generate_lift_2(df:pl.DataFrame) -> pl.DataFrame:
    """Log-distance co-visit weight normalised by the popularity of both items.

    ``lift_2 = co_visit_weight / sqrt(count(item_x)) / sqrt(count(item_y))``,
    where each co-occurrence contributes ``1 / (log(|offset|) + 1)`` to
    ``co_visit_weight`` (no window is applied) and ``count`` is the number of
    rows of the item in ``df``.

    Args:
        df: exploded session frame with ``session_id``, ``prev_items`` and
            ``sequence_num`` columns.

    Returns:
        Frame with columns ``item_x``, ``item_y``, ``lift_2``.
    """
    # Number of occurrences of each item.
    item_count = df.groupby("prev_items").count()

    # Build co-occurrence pairs: the self-join pairs every row with every row of the same session.
    events = df[["session_id", "prev_items", "sequence_num"]]
    df = events.join(events, on="session_id")

    # Signed position offset (right item minus left item).
    df = df.with_columns(
        (pl.col("sequence_num_right").cast(pl.Int64) - pl.col("sequence_num").cast(pl.Int64)).alias("diff_sequence_num")
    )
    # Drop pairs of an item with itself; this also removes each row's pairing with
    # itself (offset 0), which must happen before the log below.
    df = df.filter(pl.col("prev_items") != pl.col("prev_items_right"))

    # Weight each co-occurrence by its distance and sum per pair.
    df = df.with_columns(
        (1 / (pl.col("diff_sequence_num").abs().log() + 1)).alias("co_visit_weight")
    )
    # Aggregate only the weight; a blanket .sum() would also "sum" the session/sequence columns.
    df = df.groupby(["prev_items", "prev_items_right"]).agg(pl.col("co_visit_weight").sum())
    df = df.rename({"prev_items":"item_x", "prev_items_right":"item_y"})[["item_x", "item_y", "co_visit_weight"]]

    # Attach the occurrence count of both items and compute the lift value.
    df = df.join(item_count, left_on="item_x", right_on="prev_items", how="left").rename({"count":"item_count"})
    df = df.join(item_count, left_on="item_y", right_on="prev_items", how="left").rename({"count":"candidate_item_count"})
    df = df.with_columns((pl.col("co_visit_weight") / pl.col("item_count").sqrt() / pl.col("candidate_item_count").sqrt()).alias("lift_2"))

    return df[["item_x", "item_y", "lift_2"]]

def generate_consective_n(df:pl.DataFrame, n:int) -> pl.DataFrame:
    """Count how often item_y follows item_x within ``n`` positions in a session.

    Args:
        df: exploded session frame with ``session_id``, ``prev_items`` and
            ``sequence_num`` columns.
        n: maximum forward distance; only offsets ``1..n`` are counted.

    Returns:
        Frame with columns ``item_x``, ``item_y`` and ``consective_{n}_score``
        (the number of qualifying co-occurrences of the ordered pair).
    """
    score_col = f"consective_{n}_score"

    # Build co-occurrence pairs: the self-join pairs every row with every row of the same session.
    events = df[["session_id", "prev_items", "sequence_num"]]
    df = events.join(events, on="session_id")

    # Signed position offset (right item minus left item); keep only forward pairs within n steps.
    df = df.with_columns(
        (pl.col("sequence_num_right").cast(pl.Int64) - pl.col("sequence_num").cast(pl.Int64)).alias("diff_sequence_num")
    )
    df = df.filter((pl.col("diff_sequence_num") > 0)&(pl.col("diff_sequence_num") <= n))
    df = df.filter(pl.col("prev_items") != pl.col("prev_items_right"))

    # Every remaining co-occurrence counts as 1; sum per pair.
    df = df.with_columns(
        pl.lit(1).alias(score_col)
    )
    # Aggregate only the score; a blanket .sum() would also "sum" the session/sequence columns.
    df = df.groupby(["prev_items", "prev_items_right"]).agg(pl.col(score_col).sum())
    df = df.rename({"prev_items":"item_x", "prev_items_right":"item_y"})[["item_x", "item_y", score_col]]

    return df

In [7]:
def filter_by_locale_availability(feat:pl.DataFrame, product:pl.DataFrame, feat_name:str):
    """Replicate pair features per locale, keeping pairs available in that locale.

    For every locale in LOCALES, a pair is kept when the locale is contained in
    the ``available_locales`` value of both ``item_x`` and ``item_y``
    (presumably a list of locale codes per product -- confirm against the
    product table). Kept rows get a ``locale`` column and the per-locale
    results are concatenated.

    Args:
        feat: pair features with ``item_x``, ``item_y`` and ``feat_name`` columns.
        product: product table with ``id`` and ``available_locales`` columns.
        feat_name: name of the feature column to carry over.

    Returns:
        Frame with columns ``item_x``, ``item_y``, ``feat_name``, ``locale``.
    """
    # One row per product id, reduced to the two columns needed for the lookup.
    locales_by_id = product.unique(subset=["id"]).select(["id", "available_locales"])

    def attach_locales(frame, key, new_name):
        # Left-join the product locales onto `key` and give the joined column its own name.
        joined = frame.join(locales_by_id, left_on=key, right_on="id", how="left")
        return joined.rename({"available_locales": new_name})

    pairs = attach_locales(feat, "item_x", "item_locales")
    pairs = attach_locales(pairs, "item_y", "candidate_item_locales")

    output_columns = ["item_x", "item_y", feat_name, "locale"]
    per_locale = [
        pairs
        .filter(
            pl.lit(code).is_in(pl.col("item_locales"))
            & pl.lit(code).is_in(pl.col("candidate_item_locales"))
        )
        .with_columns(pl.lit(code).alias("locale"))
        .select(output_columns)
        for code in LOCALES
    ]

    return pl.concat(per_locale)

In [8]:
def generate_features(df:pl.DataFrame, product:pl.DataFrame) -> pl.DataFrame:
    """Build every item-pair feature, filter each by locale and merge them.

    Args:
        df: exploded session frame with ``session_id``, ``prev_items`` and
            ``sequence_num`` columns.
        product: product table with ``id`` and ``available_locales`` columns.

    Returns:
        Frame keyed by ``item_x``, ``item_y``, ``locale`` with the columns
        ``cf_score_1``, ``cf_score_2``, ``lift_1``, ``lift_2`` and
        ``consective_{1,2,3,4,5,10}_score``. Pairs missing from a feature
        get 0 for that feature.
    """
    join_keys = ["item_x", "item_y", "locale"]

    # (progress label, feature column, builder). The order here fixes both the
    # order of the progress messages and the column order of the result.
    recipes = [
        ("cf score 1", "cf_score_1", generate_cf_score_1),
        ("cf score 2", "cf_score_2", generate_cf_score_2),
        ("lift 1", "lift_1", generate_lift_1),
        ("lift 2", "lift_2", generate_lift_2),
    ]
    for n in (1, 2, 3, 4, 5, 10):
        # Bind n as a default argument so each builder keeps its own window size.
        recipes.append((
            f"consective {n}",
            f"consective_{n}_score",
            lambda frame, n=n: generate_consective_n(frame, n),
        ))

    tables = []
    for label, column, build in recipes:
        print(f"generating {label}...")
        tables.append(filter_by_locale_availability(build(df), product, column))

    print("merge features...")
    features = tables[0]
    for table in tables[1:]:
        features = features.join(table, on=join_keys, how="outer")
    # A pair absent from one feature table is null after the outer join.
    features = features.fill_null(0)

    return features

# For local train/eval

In [9]:
# Expressions that append the source task to session_id for the task1 and task3 files.
# NOTE(review): presumably this keeps ids from different tasks apart after the concat.
# Task2 ids are left untouched and train/test task1 files share one suffix --
# confirm session ids are unique across all concatenated files.
task1_session_id = (pl.col("session_id") + "_from_task1").alias("session_id")
task3_session_id = (pl.col("session_id") + "_from_task3").alias("session_id")

# Training sessions of task1 and task2.
train1 = pl.read_parquet(DIR + "data/preprocessed/task1/train_task1.parquet").with_columns(task1_session_id)
train2 = pl.read_parquet(DIR + "data/preprocessed/task2/train_task2.parquet")
train = pl.concat([train1, train2])

# Test sessions of all three tasks, both phases.
test1_1 = pl.read_parquet(DIR + "data/preprocessed/task1/test_task1_phase1.parquet").with_columns(task1_session_id)
test1_2 = pl.read_parquet(DIR + "data/preprocessed/task1/test_task1_phase2.parquet").with_columns(task1_session_id)
test2_1 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase1.parquet")
test2_2 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase2.parquet")
test3_1 = pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase1.parquet").with_columns(task3_session_id)
test3_2 = pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase2.parquet").with_columns(task3_session_id)
test = pl.concat([test1_1, test1_2, test2_1, test2_2, test3_1, test3_2])

In [10]:
# Explode both splits to one row per viewed item, then stack the columns the
# feature generators need into a single session frame.
session_columns = ["prev_items", "locale", "session_id", "sequence_num"]
train = preprocess(train)
test = preprocess(test)
session_df = pl.concat([split.select(session_columns) for split in (train, test)])

In [11]:
# Product table; build the path from DIR like the other reads instead of repeating the absolute path.
product = pl.read_parquet(DIR + "data/preprocessed/common/product_03.parquet")

In [12]:
# Pair features for local train/eval: built from prev_items of train and test only
# (next_item has not been appended in this section).
features = generate_features(session_df, product)

generating cf score 1...
generating cf score 1...
generating lift 1...
generating lift 2...
generating consective 1...
generating consective 2...
generating consective 3...
generating consective 4...
generating consective 5...
generating consective 10...
merge features...


In [13]:
# Persist the train/eval feature table; the path is built from DIR so the project root is defined once.
file_name = f"cf_feature_{VER}_for_train_or_eval.parquet"
features.write_parquet(DIR + "data/interim/features/task2/" + file_name)

In [14]:
# Preview the first rows of the train/eval feature table.
features.head()

item_x,item_y,locale,cf_score_1,cf_score_2,lift_1,lift_2,consective_1_score,consective_2_score,consective_3_score,consective_4_score,consective_5_score,consective_10_score
str,str,str,f64,f64,f64,f64,i32,i32,i32,i32,i32,i32
"""B0079G4ZH2""","""B088JYNSRD""","""IT""",0.35,1.067121,0.0175,0.053356,0,0,0,0,0,0
"""B07ZH7HYDX""","""B00MPHNFTE""","""IT""",0.35,1.067121,0.00959,0.029239,0,0,0,0,0,0
"""B007B9NXAC""","""B079KHS5S9""","""IT""",0.1,0.802284,0.000387,0.003108,0,0,0,0,0,0
"""B09DD32T3Q""","""B0132XX152""","""IT""",0.1,0.476505,0.002113,0.010068,0,0,0,0,0,0
"""B09W176DLC""","""B0B9K3WFYK""","""IT""",0.05,0.41906,0.007372,0.061787,0,0,0,0,0,0


In [15]:
# Convert the feature table to pandas and display it (head/tail with row index).
# NOTE(review): this materialises a full pandas copy of the table only for display;
# consider displaying `features` itself or a `.head()`/`.tail()` slice instead.
features.to_pandas()

Unnamed: 0,item_x,item_y,locale,cf_score_1,cf_score_2,lift_1,lift_2,consective_1_score,consective_2_score,consective_3_score,consective_4_score,consective_5_score,consective_10_score
0,B0079G4ZH2,B088JYNSRD,IT,0.35,1.067121,0.017500,0.053356,0,0,0,0,0,0
1,B07ZH7HYDX,B00MPHNFTE,IT,0.35,1.067121,0.009590,0.029239,0,0,0,0,0,0
2,B007B9NXAC,B079KHS5S9,IT,0.10,0.802284,0.000387,0.003108,0,0,0,0,0,0
3,B09DD32T3Q,B0132XX152,IT,0.10,0.476505,0.002113,0.010068,0,0,0,0,0,0
4,B09W176DLC,B0B9K3WFYK,IT,0.05,0.419060,0.007372,0.061787,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3358029,B09R4DMQ6C,B082WB4Q4S,ES,0.10,0.476505,0.000461,0.002199,0,0,0,0,0,0
3358030,B0B4WNVXG2,B0BDJZLSDH,ES,0.10,0.419060,0.000571,0.002392,0,0,0,1,1,1
3358031,B07Z662186,B09R1QFSNL,ES,2.57,5.767640,0.014777,0.033163,0,3,4,6,7,9
3358032,B09ND17G4W,B01JRXTIOE,ES,3.85,3.067121,0.065049,0.051822,1,1,1,1,1,1


# For test inference

In [16]:
# Expressions that append the source task to session_id for the task1 and task3 files.
# NOTE(review): presumably this keeps ids from different tasks apart after the concat.
# Task2 ids are left untouched and train/test task1 files share one suffix --
# confirm session ids are unique across all concatenated files.
task1_session_id = (pl.col("session_id") + "_from_task1").alias("session_id")
task3_session_id = (pl.col("session_id") + "_from_task3").alias("session_id")

# Training sessions of task1 and task2 (reloaded so this section starts from unexploded data).
train1 = pl.read_parquet(DIR + "data/preprocessed/task1/train_task1.parquet").with_columns(task1_session_id)
train2 = pl.read_parquet(DIR + "data/preprocessed/task2/train_task2.parquet")
train = pl.concat([train1, train2])

# Test sessions of all three tasks, both phases.
test1_1 = pl.read_parquet(DIR + "data/preprocessed/task1/test_task1_phase1.parquet").with_columns(task1_session_id)
test1_2 = pl.read_parquet(DIR + "data/preprocessed/task1/test_task1_phase2.parquet").with_columns(task1_session_id)
test2_1 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase1.parquet")
test2_2 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase2.parquet")
test3_1 = pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase1.parquet").with_columns(task3_session_id)
test3_2 = pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase2.parquet").with_columns(task3_session_id)
test = pl.concat([test1_1, test1_2, test2_1, test2_2, test3_1, test3_2])

In [17]:
# trainのnext_itemをprev_itemsにappendする
prev_items_list = train["prev_items"].to_list()
next_item_list = train["next_item"].to_list()
prev_items_list_updated = []
for prev_items, next_item in zip(prev_items_list, next_item_list):
    prev_items.append(next_item)
    prev_items_list_updated.append(prev_items)

train = train.with_columns(
    pl.Series(name="prev_items", values=prev_items_list_updated)
)

In [18]:
# Explode both splits to one row per viewed item, then stack the columns the
# feature generators need into a single session frame.
session_columns = ["prev_items", "locale", "session_id", "sequence_num"]
train = preprocess(train)
test = preprocess(test)
session_df = pl.concat([split.select(session_columns) for split in (train, test)])

In [19]:
# Product table; build the path from DIR like the other reads instead of repeating the absolute path.
product = pl.read_parquet(DIR + "data/preprocessed/common/product_03.parquet")

In [20]:
# Pair features for test inference: the train sessions in session_df now end with their next_item.
features = generate_features(session_df, product)

generating cf score 1...
generating cf score 1...
generating lift 1...
generating lift 2...
generating consective 1...
generating consective 2...
generating consective 3...
generating consective 4...
generating consective 5...
generating consective 10...
merge features...


In [21]:
# Persist the inference feature table; the path is built from DIR so the project root is defined once.
file_name = f"cf_feature_{VER}_for_inference.parquet"
features.write_parquet(DIR + "data/interim/features/task2/" + file_name)

In [22]:
# Preview the first rows of the inference feature table.
features.head()

item_x,item_y,locale,cf_score_1,cf_score_2,lift_1,lift_2,consective_1_score,consective_2_score,consective_3_score,consective_4_score,consective_5_score,consective_10_score
str,str,str,f64,f64,f64,f64,i32,i32,i32,i32,i32,i32
"""B00FS5FV60""","""B07J4YZ5DZ""","""IT""",4.32,4.679081,0.084158,0.091153,1,1,3,3,3,4
"""B09RGMM3R6""","""B08WSVRJ59""","""IT""",3.5,1.590616,0.020261,0.009208,1,2,2,2,2,2
"""B00A7Q5XA4""","""B000FCMSRQ""","""IT""",3.25,1.590616,0.077712,0.038034,1,1,1,1,1,1
"""B08GLZ1VRG""","""B07JFC3QVY""","""IT""",0.5,1.0,0.041667,0.083333,0,0,0,0,0,0
"""B092K5R5WH""","""B07MDK7FZR""","""IT""",0.02,0.625543,0.000514,0.016087,0,0,0,0,0,0
