In [9]:
# Mount Google Drive at /gdrive; every data path in this notebook lives under
# /gdrive/MyDrive (see DIR in the config cell).
# NOTE(review): this import sits outside the main import cell because it is
# Colab-specific.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [10]:
# NOTE(review): unpinned install. The code below calls `groupby` and `cumcount`,
# which later polars releases renamed (`group_by`, `cum_count`); pin the polars
# version this notebook was developed against so Restart & Run All keeps working.
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
from collections import defaultdict, Counter
from typing import List, Dict, Union

from tqdm import tqdm
import polars as pl

In [12]:
# Maximum number of candidate items kept per (item, locale) in the co-visit matrix.
TOP_N = 100
# Locales for which a separate candidate list is produced.
LOCALES = ["IT", "FR", "ES"]
# Version tag embedded in the output parquet file names.
VER = "38"
# Base directory of the project on the mounted Google Drive.
DIR = "/gdrive/MyDrive/amazon_kdd_2023/"

In [13]:
def preprocess(df:pl.DataFrame) -> pl.DataFrame:
    """Flatten sessions to one row per viewed item and number the rows.

    The list column ``prev_items`` is exploded so each element gets its own
    row, then a ``sequence_num`` column is added holding the running count of
    rows within each ``session_id``.
    """
    exploded = df.explode(["prev_items"])
    position_in_session = (
        pl.col("session_id").cumcount().over("session_id").alias("sequence_num")
    )
    return exploded.with_columns(position_in_session)

In [14]:
def generate_co_visit_matrix(df:pl.DataFrame) -> pl.DataFrame:
    """Count how often one item is immediately followed by another item.

    Parameters
    ----------
    df : pl.DataFrame
        Exploded session frame with at least the columns ``session_id``,
        ``prev_items`` (one item per row) and ``sequence_num`` (position of
        the row inside its session), as produced by ``preprocess``.

    Returns
    -------
    pl.DataFrame
        Columns ``item``, ``candidate_item`` and ``consective_1_weight``: the
        number of times ``candidate_item`` appeared exactly one step after
        ``item`` within the same ``session_id``. Pairs where both items are
        identical are excluded.
    """
    steps = df.select(["session_id", "prev_items", "sequence_num"])

    # Build co-occurrence pairs. Each row is matched only with the row sitting
    # one position later in the same session (equi-join on the successor
    # position). This yields the same pairs as joining on session_id alone and
    # filtering on a position gap of 1, without materialising every k x k
    # combination per session first.
    # sequence_num is cast to a signed integer so the +1 key arithmetic cannot
    # hit unsigned-integer edge cases.
    current = steps.with_columns(
        (pl.col("sequence_num").cast(pl.Int64) + 1).alias("next_sequence_num")
    )
    following = steps.select([
        pl.col("session_id"),
        pl.col("prev_items").alias("prev_items_right"),
        pl.col("sequence_num").cast(pl.Int64).alias("next_sequence_num"),
    ])
    pairs = current.join(following, on=["session_id", "next_sequence_num"], how="inner")

    # Drop self-pairs (the same item viewed twice in a row).
    pairs = pairs.filter(pl.col("prev_items") != pl.col("prev_items_right"))

    # Give every occurrence a weight of 1 and sum it per (item, candidate) pair.
    # Only the weight column is aggregated; the other columns are not needed.
    pairs = pairs.with_columns(
        pl.lit(1).alias("consective_1_weight")
    )
    counts = pairs.groupby(["prev_items", "prev_items_right"]).agg(
        pl.col("consective_1_weight").sum()
    )

    return (
        counts
        .rename({"prev_items":"item", "prev_items_right":"candidate_item"})
        .select(["item", "candidate_item", "consective_1_weight"])
    )

In [15]:
def filter_by_locale_availability(co_visit_matrix:pl.DataFrame, product:pl.DataFrame):
    """Build per-locale top-N candidate lists from the co-visit matrix.

    For every locale in ``LOCALES`` only the pairs whose ``item`` and
    ``candidate_item`` both list that locale in ``available_locales`` are kept.
    Within each locale the candidates of an item are sorted by descending
    ``consective_1_weight``, truncated to ``TOP_N`` rows and given a
    ``consective_1_rank`` (rank of the weight within the item, ties share the
    minimum rank). The per-locale frames are concatenated and returned.
    """
    locale_lookup = product.unique(subset=["id"])[["id", "available_locales"]]

    # Attach the available locales of both sides of each pair.
    annotated = (
        co_visit_matrix
        .join(locale_lookup, left_on="item", right_on="id", how="left")
        .rename({"available_locales":"item_locales"})
        .join(locale_lookup, left_on="candidate_item", right_on="id", how="left")
        .rename({"available_locales":"candidate_item_locales"})
    )

    kept_columns = ["item", "candidate_item", "consective_1_weight", "locale"]
    weight_rank = (
        pl.col("consective_1_weight")
        .rank(descending=True, method="min")
        .over("item")
        .alias("consective_1_rank")
    )

    def _candidates_for(locale):
        # Both the source item and the candidate must be available in this locale.
        available_on_both_sides = (
            pl.lit(locale).is_in(pl.col("item_locales"))
            & pl.lit(locale).is_in(pl.col("candidate_item_locales"))
        )
        return (
            annotated
            .filter(available_on_both_sides)
            .with_columns(pl.lit(locale).alias("locale"))
            .select(kept_columns)
            .sort(["item", "consective_1_weight"], descending=[False, True])
            .groupby("item", maintain_order=True)
            .head(TOP_N)
            .with_columns(weight_rank)
        )

    return pl.concat([_candidates_for(locale) for locale in LOCALES])

# For local train/eval

In [16]:
# Task-1 session ids get a "_from_task1" suffix before being concatenated with
# the task-2 frames (presumably to keep ids from the two tasks distinct).
tag_task1 = (pl.col("session_id") + "_from_task1").alias("session_id")

# Training sessions: task 1 plus the augmented task-2 set.
train1 = pl.read_parquet(DIR + "data/preprocessed/task1/train_task1.parquet").with_columns(tag_task1)
train2 = pl.read_parquet(DIR + "data/preprocessed/task2/train_task2_augmented.parquet")
train = pl.concat([train1, train2])

# Test sessions: both task-1 phases plus the task-2 leftover set.
test1_1 = pl.read_parquet(DIR + "data/preprocessed/task1/test_task1_phase1.parquet").with_columns(tag_task1)
test1_2 = pl.read_parquet(DIR + "data/preprocessed/task1/test_task1_phase2.parquet").with_columns(tag_task1)
test = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_leftover.parquet")
test = pl.concat([test1_1, test1_2, test])

In [17]:
# Explode both splits to one row per viewed item, then stack the columns the
# co-visit computation needs.
# NOTE(review): `train` / `test` are overwritten in place, so this cell is not
# safe to re-run without re-running the load cell first.
session_columns = ["prev_items", "locale", "session_id", "sequence_num"]
train = preprocess(train)
test = preprocess(test)
session_df = pl.concat([frame.select(session_columns) for frame in (train, test)])

In [18]:
# Product table; its `id` and `available_locales` columns are used for locale filtering.
# The path is built from DIR so it follows the configured project directory.
product = pl.read_parquet(DIR + "data/preprocessed/common/product_03.parquet")

In [19]:
# Count consecutive item pairs over all train + test sessions.
co_visit_matrix = generate_co_visit_matrix(session_df)

In [20]:
# Keep, per locale, the top-N candidates whose item and candidate are both available there.
co_visit_matrix = filter_by_locale_availability(co_visit_matrix, product)

In [21]:
# Persist the train/eval co-visit matrix. The output path is built from DIR so
# it stays consistent with the input paths if the project directory changes.
file_name = f"co_visit_matrix_{VER}_for_train_or_eval.parquet"
co_visit_matrix.write_parquet(DIR + "data/interim/candidates/task2/" + file_name)

In [22]:
# Preview the first rows of the per-locale candidate table.
co_visit_matrix.head()

item,candidate_item,consective_1_weight,locale,consective_1_rank
str,str,i32,str,u32
"""0007477155""","""0008402787""",11,"""IT""",1
"""0007477155""","""B08X3MMG2M""",1,"""IT""",2
"""0007477155""","""8804633301""",1,"""IT""",2
"""0007477155""","""880475172X""",1,"""IT""",2
"""0007477155""","""B00007KQF8""",1,"""IT""",2


## MRR@100

In [23]:
# Task-2 training sessions used as the evaluation set for MRR.
# The path is built from DIR so it follows the configured project directory.
train = pl.read_parquet(DIR + "data/preprocessed/task2/train_task2.parquet")

In [24]:
# Extract last_item: the final element of each session's prev_items list.
last_item_list = [items[-1] for items in train["prev_items"].to_list()]
train = train.with_columns(pl.Series(name="last_item", values=last_item_list))

In [25]:
# Keep only the columns needed to join candidates and score them.
train = train.select(["session_id", "locale", "last_item", "next_item"])

In [26]:
# Reload the co-visit matrix written under `file_name`.
# The path is built from DIR so it matches the location used for writing.
co_visit_matrix = pl.read_parquet(DIR + "data/interim/candidates/task2/" + file_name)

In [27]:
# Join the candidates onto each session and attach the label
# (1 when the candidate equals the session's next_item).
is_next_item = (pl.col("candidate_item") == pl.col("next_item")).cast(pl.Int8).alias("label")
df = (
    train
    .join(co_visit_matrix, left_on=["locale", "last_item"], right_on=["locale", "item"], how="left")
    # Within each session, strongest candidates first.
    .sort(["session_id", "consective_1_weight"], descending=[False, True])
    .with_columns(is_next_item)
)
# One list of labels per session, in the sorted candidate order.
label_lists = df.groupby("session_id", maintain_order=True).all()["label"].to_list()

In [28]:
# Compute MRR: average over sessions of 1 / (position of the first hit within
# the first 100 candidates); sessions without a hit contribute 0.
rr = 0
for labels in label_lists:
    first_hit = next(
        (position for position, label in enumerate(labels[:100], start=1) if label == 1),
        None,
    )
    if first_hit is not None:
        rr += 1 / first_hit
mrr = rr / len(label_lists)
print("MRR:", round(mrr, 5))

MRR: 0.27481


# For test inference

In [29]:
# Task-1 session ids get a "_from_task1" suffix before being concatenated with
# the task-2 frames (presumably to keep ids from the two tasks distinct).
tag_task1 = (pl.col("session_id") + "_from_task1").alias("session_id")

# Training sessions: task 1 plus the augmented task-2 set.
train1 = pl.read_parquet(DIR + "data/preprocessed/task1/train_task1.parquet").with_columns(tag_task1)
train2 = pl.read_parquet(DIR + "data/preprocessed/task2/train_task2_augmented.parquet")
train = pl.concat([train1, train2])

# Test sessions: both task-1 phases plus the task-2 leftover set.
test1_1 = pl.read_parquet(DIR + "data/preprocessed/task1/test_task1_phase1.parquet").with_columns(tag_task1)
test1_2 = pl.read_parquet(DIR + "data/preprocessed/task1/test_task1_phase2.parquet").with_columns(tag_task1)
test = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_leftover.parquet")
test = pl.concat([test1_1, test1_2, test])

In [30]:
# Append each training session's next_item to the end of its prev_items, so the
# final transition of the session is also counted in the co-visit matrix.
# NOTE(review): `train` is overwritten, so re-running this cell alone appends
# next_item a second time; re-run the load cell first.
prev_items_list_updated = [
    prev_items + [next_item]
    for prev_items, next_item in zip(train["prev_items"].to_list(), train["next_item"].to_list())
]

train = train.with_columns(
    pl.Series(name="prev_items", values=prev_items_list_updated)
)

In [31]:
# Explode both splits to one row per viewed item, then stack the columns the
# co-visit computation needs.
# NOTE(review): `train` / `test` are overwritten in place, so this cell is not
# safe to re-run without re-running the preceding load cells first.
session_columns = ["prev_items", "locale", "session_id", "sequence_num"]
train = preprocess(train)
test = preprocess(test)
session_df = pl.concat([frame.select(session_columns) for frame in (train, test)])

In [32]:
# Product table; its `id` and `available_locales` columns are used for locale filtering.
# The path is built from DIR so it follows the configured project directory.
product = pl.read_parquet(DIR + "data/preprocessed/common/product_03.parquet")

In [33]:
# Count consecutive item pairs over all train (including next_item) + test sessions.
co_visit_matrix = generate_co_visit_matrix(session_df)

In [34]:
# Keep, per locale, the top-N candidates whose item and candidate are both available there.
co_visit_matrix = filter_by_locale_availability(co_visit_matrix, product)

In [35]:
# Persist the inference co-visit matrix. The output path is built from DIR so
# it stays consistent with the input paths if the project directory changes.
file_name = f"co_visit_matrix_{VER}_for_inference.parquet"
co_visit_matrix.write_parquet(DIR + "data/interim/candidates/task2/" + file_name)

In [36]:
# Preview the first rows of the per-locale candidate table built for inference.
co_visit_matrix.head()

item,candidate_item,consective_1_weight,locale,consective_1_rank
str,str,i32,str,u32
"""0007477155""","""0008402787""",15,"""IT""",1
"""0007477155""","""B08X3MMG2M""",1,"""IT""",2
"""0007477155""","""B00007KQF8""",1,"""IT""",2
"""0007477155""","""8804633301""",1,"""IT""",2
"""0007477155""","""880475172X""",1,"""IT""",2
