In [1]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
!pip install implicit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit
  Downloading implicit-0.6.2-cp310-cp310-manylinux2014_x86_64.whl (18.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.6/18.6 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: implicit
Successfully installed implicit-0.6.2


In [3]:
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import random
from collections import defaultdict, Counter
from typing import List, Dict, Union
import pickle

import numpy as np
import implicit
from scipy.sparse import lil_matrix
from tqdm import tqdm
import pandas as pd
import polars as pl



In [5]:
TOP_N = 50
LOCALE = "JP"
VER = "01"
DIR = "/gdrive/MyDrive/amazon_kdd_2023/"

# parameters for IMF
FACTORS = 50
LAMBDA = 0.01
ITERATIONS = 500
SEED = 42

In [6]:
def preprocess(df:pl.DataFrame) -> pl.DataFrame:
    df = df.explode(["prev_items"])
    df = df.with_columns(
        df.select(pl.col("session_id").cumcount().over("session_id").alias("sequence_num"))
    )
    return df

In [7]:
def train_imf_and_generate_candidates(df:pd.DataFrame):
    # generate sparse matrix
    unique_user_ids = sorted(df["session_id"].unique())
    unique_item_ids = sorted(df["prev_items"].unique())
    user_id2index = dict(zip(unique_user_ids, range(len(unique_user_ids))))
    item_id2index = dict(zip(unique_item_ids, range(len(unique_item_ids))))

    matrix = lil_matrix((len(unique_user_ids), len(unique_item_ids)))
    for _, row in tqdm(df.iterrows()):
        user_index = user_id2index[row["session_id"]]
        item_index = item_id2index[row["prev_items"]]
        matrix[user_index, item_index] = 1.0

    # convert LIL to CSR
    matrix = matrix.tocsr()

    # model initialization
    model = implicit.bpr.BayesianPersonalizedRanking(
        factors = FACTORS,
        regularization = LAMBDA,
        iterations = ITERATIONS,
        random_state = SEED,
    )

    # train model
    model.fit(matrix)

    # generate candidate
    users = []
    candidates = []
    imf_scores = []

    for user_id, user_index in tqdm(user_id2index.items()):
        item_indexes, scores = model.recommend(user_index, matrix[user_index], N=TOP_N, filter_already_liked_items=True)
        for item_index, score in zip(item_indexes, scores):
            users.append(user_id)
            candidates.append(unique_item_ids[item_index])
            imf_scores.append(score)

    candidates = pd.DataFrame({
        "session_id": users,
        "candidate_item": candidates,
        "bpr_score": imf_scores,
    })

    candidates = pl.from_pandas(candidates)
    candidates = candidates \
    .sort(["session_id", "bpr_score"], descending=[False, True]) \
    .with_columns(pl.col("bpr_score").rank(descending=True, method="min").over("session_id").alias("bpr_rank"))
    candidates = candidates.to_pandas()

    return model, user_id2index, item_id2index, candidates

## For local train/eval

In [8]:
train = pl.read_parquet("/gdrive/MyDrive/amazon_kdd_2023/data/preprocessed/task1/train_task1.parquet")
test = pl.read_parquet("/gdrive/MyDrive/amazon_kdd_2023/data/preprocessed/task1/test_task1_phase2.parquet")

train = preprocess(train)
test = preprocess(test)
session_df = pl.concat([
    train["prev_items", "locale", "session_id", "sequence_num"],
    test["prev_items", "locale", "session_id", "sequence_num"],
])
session_df = session_df.to_pandas()
session_df = session_df[session_df["locale"] == LOCALE]

In [9]:
%%time
model, user_id2index, item_id2index, candidates = train_imf_and_generate_candidates(session_df)

4823065it [04:25, 18138.78it/s]


  0%|          | 0/500 [00:00<?, ?it/s]

100%|██████████| 1075586/1075586 [1:30:28<00:00, 198.12it/s]


CPU times: user 7h 8min 23s, sys: 4h 49min 56s, total: 11h 58min 20s
Wall time: 1h 39min 26s


In [10]:
# save models
model_name = f"bpr_{VER}_{LOCALE}_model_for_train_or_eval.npz"
user_ids_name = f"bpr_{VER}_{LOCALE}_user_id2index_for_train_or_eval.pickle"
item_ids_name = f"bpr_{VER}_{LOCALE}_item_id2index_for_train_or_eval.pickle"
model.save(DIR + "models/task1/" + model_name)
with open(DIR + "models/task1/" + user_ids_name, "wb") as f:
    pickle.dump(user_id2index, f)
with open(DIR + "models/task1/" + item_ids_name, "wb") as f:
    pickle.dump(item_id2index, f)

# save candidates
file_name = f"bpr_{VER}_{LOCALE}_for_train_or_eval.parquet"
candidates.to_parquet(DIR + "data/interim/candidates/task1/" + file_name)

In [11]:
candidates.head()

Unnamed: 0,session_id,candidate_item,bpr_score,bpr_rank
0,test_phase2_104568,B005JWG1FM,3.430395,1
1,test_phase2_104568,B08VWS6XLC,3.344284,2
2,test_phase2_104568,B005JWG0Z8,3.336448,3
3,test_phase2_104568,B00OTHCGK2,3.287585,4
4,test_phase2_104568,B00GXSDTBU,3.275993,5


# For test inference

In [12]:
train = pl.read_parquet("/gdrive/MyDrive/amazon_kdd_2023/data/preprocessed/task1/train_task1.parquet")
test = pl.read_parquet("/gdrive/MyDrive/amazon_kdd_2023/data/preprocessed/task1/test_task1_phase2.parquet")

In [13]:
# trainのnext_itemをprev_itemsにappendする
prev_items_list = train["prev_items"].to_list()
next_item_list = train["next_item"].to_list()
prev_items_list_updated = []
for prev_items, next_item in zip(prev_items_list, next_item_list):
    prev_items.append(next_item)
    prev_items_list_updated.append(prev_items)

train = train.with_columns(
    pl.Series(name="prev_items", values=prev_items_list_updated)
)

In [14]:
train = preprocess(train)
test = preprocess(test)
session_df = pl.concat([
    train["prev_items", "locale", "session_id", "sequence_num"],
    test["prev_items", "locale", "session_id", "sequence_num"],
])
session_df = session_df.to_pandas()
session_df = session_df[session_df["locale"] == LOCALE]

In [15]:
%%time
model, user_id2index, item_id2index, candidates = train_imf_and_generate_candidates(session_df)

5802184it [05:18, 18242.61it/s]


  0%|          | 0/500 [00:00<?, ?it/s]

100%|██████████| 1075586/1075586 [1:34:30<00:00, 189.68it/s]


CPU times: user 7h 40min 22s, sys: 4h 57min 41s, total: 12h 38min 4s
Wall time: 1h 45min 22s


In [16]:
# save models
model_name = f"bpr_{VER}_{LOCALE}_model_for_inference.npz"
user_ids_name = f"bpr_{VER}_{LOCALE}_user_id2index_for_inference.pickle"
item_ids_name = f"bpr_{VER}_{LOCALE}_item_id2index_for_inference.pickle"
model.save(DIR + "models/task1/" + model_name)
with open(DIR + "models/task1/" + user_ids_name, "wb") as f:
    pickle.dump(user_id2index, f)
with open(DIR + "models/task1/" + item_ids_name, "wb") as f:
    pickle.dump(item_id2index, f)

# save candidates
file_name = f"bpr_{VER}_{LOCALE}_for_inference.parquet"
candidates.to_parquet(DIR + "data/interim/candidates/task1/" + file_name)

In [17]:
candidates.head()

Unnamed: 0,session_id,candidate_item,bpr_score,bpr_rank
0,test_phase2_104568,B08VW3JRDD,3.530384,1
1,test_phase2_104568,B08VWS6XLC,3.47415,2
2,test_phase2_104568,B005JWG1FM,3.439897,3
3,test_phase2_104568,B005JWG0Z8,3.408632,4
4,test_phase2_104568,B00F36TMIM,3.353942,5
