In [3]:
# Mount Google Drive at /gdrive; the project paths used below live under /gdrive/MyDrive.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [4]:
# Install polars. No version is pinned, so the installed release depends on the environment.
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
# Request gensim 4.0.1.
# NOTE(review): the recorded output of this cell ends with "Failed to build gensim", so the
# later `from gensim.models import Word2Vec` presumably uses a gensim that was already
# present in the runtime rather than 4.0.1 — confirm which version actually ran.
!pip install gensim==4.0.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim==4.0.1
  Downloading gensim-4.0.1.tar.gz (23.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.1/23.1 MB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gensim
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for gensim (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for gensim[0m[31m
[0m[?25h  Running setup.py clean for gensim
Failed to build gensim
[31mERROR: Could not build wheels for gensim, which is required to install pyproject.toml-based projects[0m[31m

In [6]:
from collections import defaultdict, Counter
from typing import List, Dict, Union
import pickle

from tqdm import tqdm
import numpy as np
import polars as pl
from gensim.models import Word2Vec

In [7]:
# Project root on the mounted drive; data and model paths are built from it.
DIR = "/gdrive/MyDrive/amazon_kdd_2023/"
# Number of nearest neighbours requested per item (passed as `k` to make_nns_matrix).
TOP_N = 50
# Locales handled here: task-3 test data is filtered to them and one model is trained per locale.
LOCALES = ["IT", "FR", "ES"]
# Version tag embedded in the saved model and parquet file names.
VER = "18"
# Seed handed to Word2Vec.
SEED = 42

In [8]:
def preprocess(df:pl.DataFrame) -> pl.DataFrame:
    """Flatten sessions to one row per item and number the rows per session.

    Args:
        df: Frame with at least a list-like ``prev_items`` column and a
            ``session_id`` column.

    Returns:
        The frame with ``prev_items`` exploded (one element per row) and a new
        ``sequence_num`` column holding the per-session running counter
        produced by ``cumcount`` over ``session_id``.
    """
    exploded = df.explode(["prev_items"])
    # Running counter computed independently inside every session_id window.
    position_in_session = (
        pl.col("session_id").cumcount().over("session_id").alias("sequence_num")
    )
    return exploded.with_columns(position_in_session)

In [9]:
def train_word2vec(df:pl.DataFrame, epochs:int = 50, workers:int = 8) -> Word2Vec:
    """Train a Word2Vec (item2vec) model on per-session item sequences.

    Args:
        df: Exploded session frame with ``session_id`` and ``prev_items``
            columns; rows of one session are gathered back into a single
            sequence in their current row order.
        epochs: Number of training epochs (defaults to the previously
            hard-coded 50).
        workers: Number of training threads (defaults to the previously
            hard-coded 8).

    Returns:
        The trained ``Word2Vec`` model.

    Note:
        ``seed=SEED`` alone does not make training bit-for-bit reproducible.
        The gensim documentation states that a fully deterministic run also
        needs ``workers=1`` (and a fixed ``PYTHONHASHSEED``). Pass
        ``workers=1`` when exact reproducibility matters.
    """
    # Build one item sequence per session; to_list() already returns a fresh
    # Python list, so no extra list() copy is needed.
    aid_sequences = (
        df.groupby("session_id", maintain_order=True).all()["prev_items"].to_list()
    )

    # Train Word2Vec; min_count=1 keeps every item, including ones seen once.
    model = Word2Vec(
        sentences=aid_sequences,
        epochs=epochs,
        workers=workers,
        min_count=1,
        seed=SEED,
    )

    return model

In [10]:
def make_nns_matrix(w2v, k):
    """Build a long-format table of each item's nearest neighbours.

    Args:
        w2v: Trained Word2Vec model; every key in ``w2v.wv.index_to_key`` is
            queried.
        k: Maximum number of neighbours requested per item (``topn``).

    Returns:
        A polars DataFrame with one row per (item, neighbour) pair and the
        columns ``item``, ``candidate_item``, ``nns_similality`` and
        ``nns_rank`` (1-based position in the ``most_similar`` result). The
        column name ``nns_similality`` is kept as-is because other cells sort
        on it.
    """
    aid_xs = []
    aid_ys = []
    sims = []
    ranks = []
    for aid in tqdm(w2v.wv.index_to_key):
        nns = w2v.wv.most_similar(aid, topn=k)
        # most_similar may return fewer than k neighbours (e.g. a vocabulary
        # smaller than k + 1). Repeat the query item once per neighbour that
        # was actually returned so all four columns stay the same length.
        n_found = len(nns)
        aid_xs.extend([aid] * n_found)
        aid_ys.extend(neighbour for neighbour, _ in nns)
        sims.extend(similarity for _, similarity in nns)
        ranks.extend(range(1, n_found + 1))

    return pl.DataFrame({"item": aid_xs, "candidate_item": aid_ys, 'nns_similality': sims, 'nns_rank': ranks})

# For local train/eval

In [11]:
# Task-2 data: training sessions plus both test phases.
train = pl.read_parquet(DIR + "data/preprocessed/task2/train_task2.parquet")
test2_1 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase1.parquet")
test2_2 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase2.parquet")

# Task-3 test sessions: keep only the locales in LOCALES and suffix their
# session ids with "_from_task3" (presumably to keep them distinct from task-2 ids).
in_target_locales = pl.col("locale").is_in(LOCALES)
tagged_session_id = (pl.col("session_id") + "_from_task3").alias("session_id")
test3_1 = (
    pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase1.parquet")
    .filter(in_target_locales)
    .with_columns(tagged_session_id)
)
test3_2 = (
    pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase2.parquet")
    .filter(in_target_locales)
    .with_columns(tagged_session_id)
)

# Stack every test split into one frame.
test = pl.concat([test2_1, test2_2, test3_1, test3_2])

In [12]:
# Explode both splits to one row per item, then stack the columns needed for
# Word2Vec training into a single frame.
session_columns = ["prev_items", "locale", "session_id", "sequence_num"]
train = preprocess(train)
test = preprocess(test)
session_df = pl.concat([train.select(session_columns), test.select(session_columns)])

In [13]:
# Train one item2vec model per locale and save it for the local train/eval run.
for locale in LOCALES:
    model_path = DIR + f"models/task2/item2vec_{locale}_{VER}_for_train_or_eval.model"

    # Only the sessions of the current locale are used for this model.
    df = session_df.filter(pl.col("locale") == locale)

    model = train_word2vec(df)
    model.save(model_path)

## MRR@100

In [14]:
# Reload the training sessions from disk: `train` was overwritten with its
# exploded form above, and the evaluation needs the original per-session rows.
# The path is built from DIR, like the other data-loading cells.
train = pl.read_parquet(DIR + "data/preprocessed/task2/train_task2.parquet")

In [15]:
# Extract the last item of every session's prev_items into a new column.
prev_items_list = train["prev_items"].to_list()
last_item_list = [prev_items[-1] for prev_items in prev_items_list]
train = train.with_columns(pl.Series(name="last_item", values=last_item_list))

In [16]:
# Keep only the columns the MRR evaluation needs.
train = train.select(["session_id", "locale", "last_item", "next_item"])

In [17]:
# For each locale, load its train/eval model, compute the TOP_N nearest
# neighbours of every item and tag the rows with the locale.
nns_matrices = []
for locale in LOCALES:
    model_path = DIR + f"models/task2/item2vec_{locale}_{VER}_for_train_or_eval.model"
    w2v = Word2Vec.load(model_path)
    nns_matrix = make_nns_matrix(w2v, TOP_N).with_columns(
        pl.lit(locale).alias("locale")
    )
    nns_matrices.append(nns_matrix)

# Combine the per-locale tables into one frame.
nns_matrix = pl.concat(nns_matrices)

100%|██████████| 49357/49357 [01:35<00:00, 517.77it/s]
100%|██████████| 43752/43752 [01:12<00:00, 603.89it/s]
100%|██████████| 40718/40718 [01:04<00:00, 631.27it/s]


In [18]:
# Persist the train/eval nearest-neighbour table; the path is built from DIR
# so it stays consistent with the other cells.
file_name = f"nns_matrix_{VER}_for_train_or_eval.parquet"
nns_matrix.write_parquet(DIR + "data/interim/candidates/task2/" + file_name)

In [19]:
# Attach the neighbours of each session's last item, order them from most to
# least similar within a session, and flag the candidate equal to next_item.
train = (
    train
    .join(nns_matrix, left_on=["locale", "last_item"], right_on=["locale", "item"], how="left")
    .sort(["session_id", "nns_similality"], descending=[False, True])
    .with_columns(
        (pl.col("candidate_item") == pl.col("next_item")).cast(pl.Int8).alias("label")
    )
)

# One list of labels per session, in the sorted candidate order.
label_lists = train.groupby("session_id", maintain_order=True).all()["label"].to_list()

In [20]:
# Compute MRR: for every session, add the reciprocal of the 1-based position
# of the first hit among its first 100 candidates (sessions with no hit add nothing).
rr = 0
for labels in label_lists:
    first_hit_position = next(
        (pos for pos, label in enumerate(labels[:100], start=1) if label == 1),
        None,
    )
    if first_hit_position is not None:
        rr += 1 / first_hit_position
mrr = rr / len(label_lists)
print("MRR:", round(mrr, 5))

MRR: 0.23495


# For test inference

In [21]:
# Task-2 data: training sessions plus both test phases.
train = pl.read_parquet(DIR + "data/preprocessed/task2/train_task2.parquet")
test2_1 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase1.parquet")
test2_2 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase2.parquet")

# Task-3 test sessions: restrict to the locales in LOCALES and suffix their
# session ids with "_from_task3" (presumably to keep them distinct from task-2 ids).
locale_filter = pl.col("locale").is_in(LOCALES)
session_id_with_suffix = (pl.col("session_id") + "_from_task3").alias("session_id")
test3_1 = (
    pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase1.parquet")
    .filter(locale_filter)
    .with_columns(session_id_with_suffix)
)
test3_2 = (
    pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase2.parquet")
    .filter(locale_filter)
    .with_columns(session_id_with_suffix)
)

# Stack every test split into one frame.
test = pl.concat([test2_1, test2_2, test3_1, test3_2])

In [22]:
# Append each training session's next_item to its prev_items, so the
# inference models also see the final transition of every session.
prev_items_list = train["prev_items"].to_list()
next_item_list = train["next_item"].to_list()
for session_items, target_item in zip(prev_items_list, next_item_list):
    session_items.append(target_item)

# New outer list holding the same (now extended) per-session lists.
prev_items_list_updated = list(prev_items_list)

train = train.with_columns(
    pl.Series(name="prev_items", values=prev_items_list_updated)
)

In [23]:
# Explode both splits to one row per item, then stack the columns needed for
# Word2Vec training into a single frame.
session_columns = ["prev_items", "locale", "session_id", "sequence_num"]
train = preprocess(train)
test = preprocess(test)
session_df = pl.concat([train.select(session_columns), test.select(session_columns)])

In [24]:
# Train one item2vec model per locale and save it for test inference.
for locale in LOCALES:
    model_path = DIR + f"models/task2/item2vec_{locale}_{VER}_for_inference.model"

    # Only the sessions of the current locale are used for this model.
    df = session_df.filter(pl.col("locale") == locale)

    model = train_word2vec(df)
    model.save(model_path)

In [25]:
# For each locale, load its inference model, compute the TOP_N nearest
# neighbours of every item and tag the rows with the locale.
nns_matrices = []
for locale in LOCALES:
    model_path = DIR + f"models/task2/item2vec_{locale}_{VER}_for_inference.model"
    w2v = Word2Vec.load(model_path)
    nns_matrix = make_nns_matrix(w2v, TOP_N).with_columns(
        pl.lit(locale).alias("locale")
    )
    nns_matrices.append(nns_matrix)

# Combine the per-locale tables into one frame.
nns_matrix = pl.concat(nns_matrices)

100%|██████████| 50461/50461 [01:34<00:00, 532.80it/s]
100%|██████████| 44577/44577 [01:18<00:00, 571.10it/s]
100%|██████████| 42503/42503 [01:12<00:00, 585.78it/s]


In [26]:
# Persist the inference nearest-neighbour table; the path is built from DIR
# so it stays consistent with the other cells.
file_name = f"nns_matrix_{VER}_for_inference.parquet"
nns_matrix.write_parquet(DIR + "data/interim/candidates/task2/" + file_name)

In [27]:
# Display the combined inference nearest-neighbour table as a quick visual check.
nns_matrix

item,candidate_item,nns_similality,nns_rank,locale
str,str,f64,i64,str
"""B01LQQQWG2""","""B01L01ES3M""",0.876205,1,"""IT"""
"""B01LQQQWG2""","""B01N0SJCRQ""",0.871894,2,"""IT"""
"""B01LQQQWG2""","""B07JBWLQTH""",0.853216,3,"""IT"""
"""B01LQQQWG2""","""B06W54NMJY""",0.847767,4,"""IT"""
"""B01LQQQWG2""","""8822755707""",0.814391,5,"""IT"""
"""B01LQQQWG2""","""B01L2B30WA""",0.80623,6,"""IT"""
"""B01LQQQWG2""","""B096M6LD9P""",0.805644,7,"""IT"""
"""B01LQQQWG2""","""B07FWGHT2D""",0.791575,8,"""IT"""
"""B01LQQQWG2""","""B07CY424RP""",0.790759,9,"""IT"""
"""B01LQQQWG2""","""B07CMFYHGJ""",0.788006,10,"""IT"""
