In [1]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
!pip install implicit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit
  Downloading implicit-0.6.2-cp310-cp310-manylinux2014_x86_64.whl (18.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.6/18.6 MB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: implicit
Successfully installed implicit-0.6.2


In [3]:
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import random
from collections import defaultdict, Counter
from typing import List, Dict, Union
import pickle

import numpy as np
import implicit
from scipy.sparse import lil_matrix
from tqdm import tqdm
import pandas as pd
import polars as pl



In [5]:
TOP_N = 50
LOCALES = ["IT", "FR", "ES"]
VER = "58"
DIR = "/gdrive/MyDrive/amazon_kdd_2023/"

# parameters for IMF
FACTORS = 50
LAMBDA = 0.01
ITERATIONS = 500
SEED = 42

In [6]:
def preprocess(df:pl.DataFrame) -> pl.DataFrame:
    df = df.explode(["prev_items"])
    df = df.with_columns(
        df.select(pl.col("session_id").cumcount().over("session_id").alias("sequence_num"))
    )
    return df

In [7]:
def train_imf_and_generate_candidates(df:pd.DataFrame):
    # generate sparse matrix
    unique_user_ids = sorted(df["session_id"].unique())
    unique_item_ids = sorted(df["prev_items"].unique())
    user_id2index = dict(zip(unique_user_ids, range(len(unique_user_ids))))
    item_id2index = dict(zip(unique_item_ids, range(len(unique_item_ids))))

    matrix = lil_matrix((len(unique_user_ids), len(unique_item_ids)))
    for _, row in tqdm(df.iterrows()):
        user_index = user_id2index[row["session_id"]]
        item_index = item_id2index[row["prev_items"]]
        matrix[user_index, item_index] = 1.0

    # convert LIL to CSR
    matrix = matrix.tocsr()

    # model initialization
    model = implicit.bpr.BayesianPersonalizedRanking(
        factors = FACTORS,
        regularization = LAMBDA,
        iterations = ITERATIONS,
        random_state = SEED,
    )

    # train model
    model.fit(matrix)

    # generate candidate
    users = []
    candidates = []
    imf_scores = []

    for user_id, user_index in tqdm(user_id2index.items()):
        item_indexes, scores = model.recommend(user_index, matrix[user_index], N=TOP_N, filter_already_liked_items=True)
        for item_index, score in zip(item_indexes, scores):
            users.append(user_id)
            candidates.append(unique_item_ids[item_index])
            imf_scores.append(score)

    candidates = pd.DataFrame({
        "session_id": users,
        "candidate_item": candidates,
        "bpr_score": imf_scores,
    })

    candidates = pl.from_pandas(candidates)
    candidates = candidates \
    .sort(["session_id", "bpr_score"], descending=[False, True]) \
    .with_columns(pl.col("bpr_score").rank(descending=True, method="min").over("session_id").alias("bpr_rank"))
    candidates = candidates.to_pandas()

    return model, user_id2index, item_id2index, candidates

## For local train/eval

In [8]:
train = pl.read_parquet(DIR + "data/preprocessed/task2/train_task2.parquet")
test2_1 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase1.parquet")
test2_2 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase2.parquet")
test3_1 = pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase1.parquet").filter(pl.col("locale").is_in(LOCALES))
test3_2 = pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase2.parquet").filter(pl.col("locale").is_in(LOCALES))
test3_1 = test3_1.with_columns(
    (pl.col("session_id") + "_from_task3").alias("session_id")
)
test3_2 = test3_2.with_columns(
    (pl.col("session_id") + "_from_task3").alias("session_id")
)
test = pl.concat([test2_1, test2_2, test3_1, test3_2])

In [9]:
train = preprocess(train)
test = preprocess(test)
session_df = pl.concat([
    train["prev_items", "locale", "session_id", "sequence_num"],
    test["prev_items", "locale", "session_id", "sequence_num"],
])
session_df = session_df.to_pandas()

In [10]:
%%time
model, user_id2index, item_id2index, candidates = train_imf_and_generate_candidates(session_df)

1644829it [01:28, 18529.67it/s]


  0%|          | 0/500 [00:00<?, ?it/s]

100%|██████████| 455754/455754 [11:21<00:00, 669.05it/s]


CPU times: user 47min 41s, sys: 48min 53s, total: 1h 36min 35s
Wall time: 14min 19s


In [11]:
# save models
model_name = f"bpr_{VER}_model_for_train_or_eval.npz"
user_ids_name = f"bpr_{VER}_user_id2index_for_train_or_eval.pickle"
item_ids_name = f"bpr_{VER}_item_id2index_for_train_or_eval.pickle"
model.save(DIR + "models/task2/" + model_name)
with open(DIR + "models/task2/" + user_ids_name, "wb") as f:
    pickle.dump(user_id2index, f)
with open(DIR + "models/task2/" + item_ids_name, "wb") as f:
    pickle.dump(item_id2index, f)

# save candidates
file_name = f"bpr_{VER}_for_train_or_eval.parquet"
candidates.to_parquet(DIR + "data/interim/candidates/task2/" + file_name)

In [12]:
candidates.head()

Unnamed: 0,session_id,candidate_item,bpr_score,bpr_rank
0,test_phase1_0,B0BJQ6H9JX,3.978669,1
1,test_phase1_0,B07G3GMRYF,3.956034,2
2,test_phase1_0,B07H9DVLBB,3.950323,3
3,test_phase1_0,B089M5KV4Y,3.931445,4
4,test_phase1_0,B09FFD6R2B,3.906698,5


## MRR@100

In [13]:
train = pl.read_parquet(DIR + "data/preprocessed/task2/train_task2.parquet")
file_name = f"bpr_{VER}_for_train_or_eval.parquet"
candidates = pl.read_parquet(DIR + "data/interim/candidates/task2/" + file_name)

In [14]:
# candidateの結合とlabelの付与
df = train.join(candidates, on="session_id", how="left")
df = df.sort(["session_id", "bpr_score"], descending=[False, True])
df = df.with_columns((pl.col("candidate_item") == pl.col("next_item")).cast(pl.Int8).alias("label"))
label_lists = df.groupby("session_id", maintain_order=True).all()["label"].to_list()

In [15]:
# MRRの計算
rr = 0
for labels in label_lists:
    labels = labels[:100]
    for i, label in enumerate(labels):
        if label == 1:
            rr += 1 / (i+1)
            break
mrr = rr / len(label_lists)
print("MRR:", round(mrr, 5))

MRR: 0.17325


# For test inference

In [16]:
train = pl.read_parquet(DIR + "data/preprocessed/task2/train_task2.parquet")
test2_1 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase1.parquet")
test2_2 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase2.parquet")
test3_1 = pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase1.parquet").filter(pl.col("locale").is_in(LOCALES))
test3_2 = pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase2.parquet").filter(pl.col("locale").is_in(LOCALES))
test3_1 = test3_1.with_columns(
    (pl.col("session_id") + "_from_task3").alias("session_id")
)
test3_2 = test3_2.with_columns(
    (pl.col("session_id") + "_from_task3").alias("session_id")
)
test = pl.concat([test2_1, test2_2, test3_1, test3_2])

In [17]:
# trainのnext_itemをprev_itemsにappendする
prev_items_list = train["prev_items"].to_list()
next_item_list = train["next_item"].to_list()
prev_items_list_updated = []
for prev_items, next_item in zip(prev_items_list, next_item_list):
    prev_items.append(next_item)
    prev_items_list_updated.append(prev_items)

train = train.with_columns(
    pl.Series(name="prev_items", values=prev_items_list_updated)
)

In [18]:
train = preprocess(train)
test = preprocess(test)
session_df = pl.concat([
    train["prev_items", "locale", "session_id", "sequence_num"],
    test["prev_items", "locale", "session_id", "sequence_num"],
])
session_df = session_df.to_pandas()

In [19]:
%%time
model, user_id2index, item_id2index, candidates = train_imf_and_generate_candidates(session_df)

1978362it [01:45, 18763.33it/s]


  0%|          | 0/500 [00:00<?, ?it/s]

100%|██████████| 455754/455754 [11:38<00:00, 652.34it/s]


CPU times: user 51min 16s, sys: 49min 49s, total: 1h 41min 6s
Wall time: 15min 12s


In [20]:
# save models
model_name = f"bpr_{VER}_model_for_inference.npz"
user_ids_name = f"bpr_{VER}_user_id2index_for_inference.pickle"
item_ids_name = f"bpr_{VER}_item_id2index_for_inference.pickle"
model.save(DIR + "models/task2/" + model_name)
with open(DIR + "models/task2/" + user_ids_name, "wb") as f:
    pickle.dump(user_id2index, f)
with open(DIR + "models/task2/" + item_ids_name, "wb") as f:
    pickle.dump(item_id2index, f)

# save candidates
file_name = f"bpr_{VER}_for_inference.parquet"
candidates.to_parquet(DIR + "data/interim/candidates/task2/" + file_name)

In [21]:
candidates.head()

Unnamed: 0,session_id,candidate_item,bpr_score,bpr_rank
0,test_phase1_0,B0BJQ6H9JX,4.152624,1
1,test_phase1_0,B08GY8NHF2,4.036832,2
2,test_phase1_0,B08HCK44DK,4.027184,3
3,test_phase1_0,B0BG283KR4,4.024742,4
4,test_phase1_0,B07T9JJV6V,3.957455,5
