In [1]:
# Mount Google Drive at /gdrive (Colab-only) so the paths used below resolve.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [2]:
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from collections import defaultdict, Counter
from typing import List, Dict, Union

from tqdm import tqdm
import polars as pl

In [4]:
# Root folder of the project on the mounted Google Drive; data paths are built from it.
DIR = "/gdrive/MyDrive/amazon_kdd_2023/"

# Locales for which filter_by_locale_availability emits co-visitation pairs.
LOCALES = ["IT", "FR", "ES"]
# Feature version tag; used in the score column name (cf_score_{VER}) and in output file names.
VER = "01"

In [5]:
def preprocess(df:pl.DataFrame) -> pl.DataFrame:
    """Flatten sessions to one row per item and number the rows inside each session.

    Args:
        df: Frame with a ``session_id`` column and a ``prev_items`` column that
            can be exploded (presumably a list of item ids per session; verify
            against the parquet schema).

    Returns:
        The exploded frame with an extra ``sequence_num`` column holding the
        running row count within each ``session_id``, in row order.
    """
    df = df.explode(["prev_items"])
    # Pass the window expression straight to with_columns; there is no need to
    # materialise a one-column frame with select() first.
    return df.with_columns(
        pl.col("session_id").cumcount().over("session_id").alias("sequence_num")
    )

In [6]:
def generate_co_visit_matrix(df:pl.DataFrame) -> pl.DataFrame:
    """Score every ordered pair of distinct items that share a session.

    For a pair (item_x, item_y) found in one session, let
    ``d = sequence_num(item_y) - sequence_num(item_x)``. The pair contributes
    ``(1 + 0.3 * sign(d)) / (log(|d|) + 1)``: closer items weigh more, and
    pairs where item_y comes after item_x get factor 1.3 versus 0.7 for the
    reverse direction. Contributions are summed over all occurrences.

    Args:
        df: Frame with ``session_id``, ``prev_items`` (one item per row) and
            ``sequence_num`` columns, as produced by ``preprocess``.

    Returns:
        Frame with columns ``item_x``, ``item_y`` and ``cf_score_{VER}``,
        one row per ordered item pair.
    """
    score_col = f"cf_score_{VER}"

    # Build co-occurrence pairs: self-join the events on session_id.
    events = df.select(["session_id", "prev_items", "sequence_num"])
    pairs = events.join(events, on="session_id")

    # Signed distance between the two positions; cast first so the subtraction
    # is done on signed integers and can go negative.
    pairs = pairs.with_columns(
        (pl.col("sequence_num_right").cast(pl.Int64) - pl.col("sequence_num").cast(pl.Int64)).alias("diff_sequence_num")
    )
    # Drop pairs of an item with itself. This also removes every zero-distance
    # row, so log(|d|) below is never taken at 0.
    pairs = pairs.filter(pl.col("prev_items") != pl.col("prev_items_right"))

    # Weight each pair: distance decay times a direction-dependent factor.
    distance_decay = 1 / (pl.col("diff_sequence_num").abs().log() + 1)
    direction_factor = 1 + 0.3 * pl.col("diff_sequence_num").sign()
    pairs = pairs.with_columns((direction_factor * distance_decay).alias(score_col))

    # Sum only the score per pair. A bare groupby(...).sum() would also
    # aggregate session_id and the sequence columns just to throw them away.
    pairs = pairs.groupby(["prev_items", "prev_items_right"]).agg(pl.col(score_col).sum())
    pairs = pairs.rename({"prev_items":"item_x", "prev_items_right":"item_y"})

    return pairs.select(["item_x", "item_y", score_col])

In [7]:
def filter_by_locale_availability(co_visit_matrix:pl.DataFrame, product:pl.DataFrame):
    """Expand the co-visitation matrix per locale, keeping pairs available there.

    Each item of a pair is looked up in ``product`` (deduplicated on ``id``)
    to fetch its ``available_locales``. For every locale in ``LOCALES`` the
    pairs where the locale is contained in both items' ``available_locales``
    are kept and tagged with that locale; the per-locale results are then
    concatenated.

    Args:
        co_visit_matrix: Frame with ``item_x``, ``item_y`` and ``cf_score_{VER}``.
        product: Frame with ``id`` and ``available_locales`` (presumably a list
            of locale codes per product; verify against the product file).

    Returns:
        Frame with columns ``item_x``, ``item_y``, ``cf_score_{VER}``, ``locale``.
    """
    output_columns = ["item_x", "item_y", f"cf_score_{VER}", "locale"]
    locale_lookup = product.unique(subset=["id"])[["id", "available_locales"]]

    # Attach the locale list of item_x, then of item_y (left joins keep all pairs).
    annotated = co_visit_matrix.join(locale_lookup, left_on="item_x", right_on="id", how="left")
    annotated = annotated.rename({"available_locales":"item_locales"})
    annotated = annotated.join(locale_lookup, left_on="item_y", right_on="id", how="left")
    annotated = annotated.rename({"available_locales":"candidate_item_locales"})

    def _pairs_for(locale):
        # Pairs whose two items are both listed for `locale`, tagged with it.
        both_available = (
            pl.lit(locale).is_in(pl.col("item_locales"))
            & pl.lit(locale).is_in(pl.col("candidate_item_locales"))
        )
        tagged = annotated.filter(both_available).with_columns(pl.lit(locale).alias("locale"))
        return tagged[output_columns]

    return pl.concat([_pairs_for(locale) for locale in LOCALES])

# For local train/eval

In [8]:
# Expressions that tag session ids with the task they came from. Task 2 ids are left as-is.
# NOTE(review): train and test files of the same task receive the same suffix, and
# task 2 files receive none - presumably session ids are already unique across
# files; verify, since sessions are later joined on session_id.
task1_session_id = (pl.col("session_id") + "_from_task1").alias("session_id")
task3_session_id = (pl.col("session_id") + "_from_task3").alias("session_id")

# Training sessions from task 1 and task 2.
train1 = pl.read_parquet(DIR + "data/preprocessed/task1/train_task1.parquet").with_columns(task1_session_id)
train2 = pl.read_parquet(DIR + "data/preprocessed/task2/train_task2.parquet")
train = pl.concat([train1, train2])

# Test sessions from all three tasks, both phases.
test1_1 = pl.read_parquet(DIR + "data/preprocessed/task1/test_task1_phase1.parquet").with_columns(task1_session_id)
test1_2 = pl.read_parquet(DIR + "data/preprocessed/task1/test_task1_phase2.parquet").with_columns(task1_session_id)
test2_1 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase1.parquet")
test2_2 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase2.parquet")
test3_1 = pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase1.parquet").with_columns(task3_session_id)
test3_2 = pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase2.parquet").with_columns(task3_session_id)
test = pl.concat([test1_1, test1_2, test2_1, test2_2, test3_1, test3_2])

In [9]:
# Flatten both splits to one row per (session, item) and stack the shared columns.
# For local train/eval only prev_items are used; next_item is not added to the sessions.
train = preprocess(train)
test = preprocess(test)

session_columns = ["prev_items", "locale", "session_id", "sequence_num"]
session_df = pl.concat([split.select(session_columns) for split in (train, test)])

In [10]:
# Product table; built from DIR like the other reads instead of repeating the absolute path.
product = pl.read_parquet(DIR + "data/preprocessed/common/product_03.parquet")

In [11]:
# Score item pairs from the stacked train + test sessions.
co_visit_matrix = generate_co_visit_matrix(session_df)

In [12]:
# Keep, per locale, only pairs whose two items are both available there.
# NOTE: this overwrites co_visit_matrix; re-run the previous cell before re-running this one.
co_visit_matrix = filter_by_locale_availability(co_visit_matrix, product)

In [13]:
# Persist the train/eval feature table; the path is built from DIR so the
# project root is configured in one place only.
file_name = f"cf_feature_{VER}_for_train_or_eval.parquet"
co_visit_matrix.write_parquet(DIR + "data/interim/features/task2/" + file_name)

In [14]:
# Preview the first rows of the feature table that was just written.
co_visit_matrix.head()

item_x,item_y,cf_score_01,locale
str,str,f64,str
"""B094JX8KNG""","""B09WGXMRBZ""",1.3,"""IT"""
"""B09NS737DT""","""B07FKRWG6D""",5.030646,"""IT"""
"""B09FTMYLQ2""","""B081F2MNDH""",2.367335,"""IT"""
"""B017TIHCNI""","""B07HCH9WVF""",0.413431,"""IT"""
"""B076VNTDBY""","""B09BZ64R7S""",0.7,"""IT"""


# For test inference

In [15]:
# Reload the raw splits so this section starts from un-exploded sessions.
# Expressions that tag session ids with the task they came from. Task 2 ids are left as-is.
# NOTE(review): train and test files of the same task receive the same suffix, and
# task 2 files receive none - presumably session ids are already unique across
# files; verify, since sessions are later joined on session_id.
task1_session_id = (pl.col("session_id") + "_from_task1").alias("session_id")
task3_session_id = (pl.col("session_id") + "_from_task3").alias("session_id")

# Training sessions from task 1 and task 2.
train1 = pl.read_parquet(DIR + "data/preprocessed/task1/train_task1.parquet").with_columns(task1_session_id)
train2 = pl.read_parquet(DIR + "data/preprocessed/task2/train_task2.parquet")
train = pl.concat([train1, train2])

# Test sessions from all three tasks, both phases.
test1_1 = pl.read_parquet(DIR + "data/preprocessed/task1/test_task1_phase1.parquet").with_columns(task1_session_id)
test1_2 = pl.read_parquet(DIR + "data/preprocessed/task1/test_task1_phase2.parquet").with_columns(task1_session_id)
test2_1 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase1.parquet")
test2_2 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase2.parquet")
test3_1 = pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase1.parquet").with_columns(task3_session_id)
test3_2 = pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase2.parquet").with_columns(task3_session_id)
test = pl.concat([test1_1, test1_2, test2_1, test2_2, test3_1, test3_2])

In [16]:
# Append each training session's next_item to the end of its prev_items.
# NOTE: re-running this cell without reloading `train` appends next_item a second time.
prev_items_list = train["prev_items"].to_list()
next_item_list = train["next_item"].to_list()
prev_items_list_updated = [
    viewed + [following]
    for viewed, following in zip(prev_items_list, next_item_list)
]

extended_prev_items = pl.Series(name="prev_items", values=prev_items_list_updated)
train = train.with_columns(extended_prev_items)

In [17]:
# Flatten both splits to one row per (session, item) and stack the shared columns.
# Here the training sessions already carry next_item as their last prev_items entry.
train = preprocess(train)
test = preprocess(test)

session_columns = ["prev_items", "locale", "session_id", "sequence_num"]
session_df = pl.concat([split.select(session_columns) for split in (train, test)])

In [18]:
# Product table; built from DIR like the other reads instead of repeating the absolute path.
product = pl.read_parquet(DIR + "data/preprocessed/common/product_03.parquet")

In [19]:
# Score item pairs from the stacked sessions (training sessions include next_item here).
co_visit_matrix = generate_co_visit_matrix(session_df)

In [20]:
# Keep, per locale, only pairs whose two items are both available there.
# NOTE: this overwrites co_visit_matrix; re-run the previous cell before re-running this one.
co_visit_matrix = filter_by_locale_availability(co_visit_matrix, product)

In [21]:
# Persist the inference feature table; the path is built from DIR so the
# project root is configured in one place only.
file_name = f"cf_feature_{VER}_for_inference.parquet"
co_visit_matrix.write_parquet(DIR + "data/interim/features/task2/" + file_name)

In [22]:
# Preview the first rows of the feature table that was just written.
co_visit_matrix.head()

item_x,item_y,cf_score_01,locale
str,str,f64,str
"""B01B1FW4Q0""","""B09NLZRKZC""",0.333554,"""IT"""
"""B0711LLR83""","""B09BXDZ9BD""",1.033554,"""IT"""
"""B0999NQPKR""","""B077GPZ7DC""",0.767801,"""IT"""
"""B087XLYH41""","""B07B8NWMMN""",0.422154,"""IT"""
"""B06ZZ68SBZ""","""B004XISZ8E""",0.544778,"""IT"""
