In [1]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
!pip install implicit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit
  Downloading implicit-0.6.2-cp310-cp310-manylinux2014_x86_64.whl (18.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.6/18.6 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: implicit
Successfully installed implicit-0.6.2


In [3]:
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import random
from collections import defaultdict, Counter
from typing import List, Dict, Union
import pickle

import numpy as np
import implicit
from scipy.sparse import lil_matrix
from tqdm import tqdm
import pandas as pd
import polars as pl



In [5]:
TOP_N = 200
LOCALES = ["IT", "FR", "ES"]
VER = "35"
DIR = "/gdrive/MyDrive/amazon_kdd_2023/"

# parameters for IMF
ALPHA = 12500
LAMBDA = 0.01
FACTORS = 200
N_EPOCHS = 15
SEED = 42

In [6]:
def preprocess(df:pl.DataFrame) -> pl.DataFrame:
    df = df.explode(["prev_items"])
    df = df.with_columns(
        df.select(pl.col("session_id").cumcount().over("session_id").alias("sequence_num"))
    )
    return df

In [7]:
def train_imf_and_generate_candidates(df:pd.DataFrame):
    # generate sparse matrix
    unique_user_ids = sorted(df["session_id"].unique())
    unique_item_ids = sorted(df["prev_items"].unique())
    user_id2index = dict(zip(unique_user_ids, range(len(unique_user_ids))))
    item_id2index = dict(zip(unique_item_ids, range(len(unique_item_ids))))

    matrix = lil_matrix((len(unique_user_ids), len(unique_item_ids)))
    for _, row in tqdm(df.iterrows()):
        user_index = user_id2index[row["session_id"]]
        item_index = item_id2index[row["prev_items"]]
        matrix[user_index, item_index] = 1.0

    # convert LIL to CSR
    matrix = matrix.tocsr()

    # model initialization
    model = implicit.als.AlternatingLeastSquares(
        factors = FACTORS,
        alpha = ALPHA,
        iterations = N_EPOCHS,
        regularization = LAMBDA,
        calculate_training_loss = True,
        random_state = SEED,
    )

    # train model
    model.fit(matrix)


    # generate candidate
    users = []
    candidates = []
    imf_scores = []

    for user_id, user_index in tqdm(user_id2index.items()):
        item_indexes, scores = model.recommend(user_index, matrix[user_index], N=TOP_N, filter_already_liked_items=True)
        for item_index, score in zip(item_indexes, scores):
            users.append(user_id)
            candidates.append(unique_item_ids[item_index])
            imf_scores.append(score)

    candidates = pd.DataFrame({
        "session_id": users,
        "candidate_item": candidates,
        "imf_score": imf_scores,
    })

    candidates = pl.from_pandas(candidates)
    candidates = candidates \
    .sort(["session_id", "imf_score"], descending=[False, True]) \
    .with_columns(pl.col("imf_score").rank(descending=True, method="min").over("session_id").alias("imf_rank"))
    candidates = candidates.to_pandas()

    return model, user_id2index, item_id2index, candidates

## For local train/eval

In [8]:
train = pl.read_parquet(DIR + "data/preprocessed/task2/train_task2_augmented.parquet")
test = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_leftover.parquet")

In [9]:
train = preprocess(train)
test = preprocess(test)
session_df = pl.concat([
    train["prev_items", "locale", "session_id", "sequence_num"],
    test["prev_items", "locale", "session_id", "sequence_num"],
])
session_df = session_df.to_pandas()

In [10]:
%%time
model, user_id2index, item_id2index, candidates = train_imf_and_generate_candidates(session_df)

1595272it [01:29, 17805.64it/s]


  0%|          | 0/15 [00:00<?, ?it/s]

100%|██████████| 455754/455754 [26:10<00:00, 290.25it/s]


CPU times: user 2h 13min 38s, sys: 1h 33min 27s, total: 3h 47min 5s
Wall time: 32min 4s


In [11]:
# save models
model_name = f"imf_{VER}_model_for_train_or_eval.npz"
user_ids_name = f"imf_{VER}_user_id2index_for_train_or_eval.pickle"
item_ids_name = f"imf_{VER}_item_id2index_for_train_or_eval.pickle"
model.save(DIR + "models/task2/" + model_name)
with open(DIR + "models/task2/" + user_ids_name, "wb") as f:
    pickle.dump(user_id2index, f)
with open(DIR + "models/task2/" + item_ids_name, "wb") as f:
    pickle.dump(item_id2index, f)

# save candidates
file_name = f"imf_{VER}_for_train_or_eval.parquet"
candidates.to_parquet(DIR + "data/interim/candidates/task2/" + file_name)

In [12]:
candidates.head()

Unnamed: 0,session_id,candidate_item,imf_score,imf_rank
0,test_leftover_0,B06XGHV89F,0.959849,1
1,test_leftover_0,B07MCFD1MK,0.673939,2
2,test_leftover_0,B07Z3L1NN2,0.666873,3
3,test_leftover_0,B07HRGNG8K,0.603244,4
4,test_leftover_0,B01M3RT7MT,0.596283,5


## MRR@100

In [13]:
train = pl.read_parquet(DIR + "data/preprocessed/task2/train_task2.parquet")
file_name = f"imf_{VER}_for_train_or_eval.parquet"
candidates = pl.read_parquet(DIR + "data/interim/candidates/task2/" + file_name)

In [14]:
# candidateの結合とlabelの付与
df = train.join(candidates, on="session_id", how="left")
df = df.sort(["session_id", "imf_score"], descending=[False, True])
df = df.with_columns((pl.col("candidate_item") == pl.col("next_item")).cast(pl.Int8).alias("label"))
label_lists = df.groupby("session_id", maintain_order=True).all()["label"].to_list()

In [15]:
# MRRの計算
rr = 0
for labels in label_lists:
    labels = labels[:100]
    for i, label in enumerate(labels):
        if label == 1:
            rr += 1 / (i+1)
            break
mrr = rr / len(label_lists)
print("MRR:", round(mrr, 5))

MRR: 0.28422


# For test inference

In [16]:
train = pl.read_parquet(DIR + "data/preprocessed/task2/train_task2.parquet")
test2_1 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase1.parquet")
test2_2 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase2.parquet")
test3_1 = pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase1.parquet").filter(pl.col("locale").is_in(LOCALES))
test3_2 = pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase2.parquet").filter(pl.col("locale").is_in(LOCALES))
test3_1 = test3_1.with_columns(
    (pl.col("session_id") + "_from_task3").alias("session_id")
)
test3_2 = test3_2.with_columns(
    (pl.col("session_id") + "_from_task3").alias("session_id")
)
test = pl.concat([test2_1, test2_2, test3_1, test3_2])

In [17]:
# trainのnext_itemをprev_itemsにappendする
prev_items_list = train["prev_items"].to_list()
next_item_list = train["next_item"].to_list()
prev_items_list_updated = []
for prev_items, next_item in zip(prev_items_list, next_item_list):
    prev_items.append(next_item)
    prev_items_list_updated.append(prev_items)

train = train.with_columns(
    pl.Series(name="prev_items", values=prev_items_list_updated)
)

In [18]:
train = preprocess(train)
test = preprocess(test)
session_df = pl.concat([
    train["prev_items", "locale", "session_id", "sequence_num"],
    test["prev_items", "locale", "session_id", "sequence_num"],
])
session_df = session_df.to_pandas()

In [19]:
%%time
model, user_id2index, item_id2index, candidates = train_imf_and_generate_candidates(session_df)

1978362it [01:50, 17855.50it/s]


  0%|          | 0/15 [00:00<?, ?it/s]

100%|██████████| 455754/455754 [26:37<00:00, 285.24it/s]


CPU times: user 2h 15min 14s, sys: 1h 36min 15s, total: 3h 51min 30s
Wall time: 33min 19s


In [20]:
# save models
model_name = f"imf_{VER}_model_for_inference.npz"
user_ids_name = f"imf_{VER}_user_id2index_for_inference.pickle"
item_ids_name = f"imf_{VER}_item_id2index_for_inference.pickle"
model.save(DIR + "models/task2/" + model_name)
with open(DIR + "models/task2/" + user_ids_name, "wb") as f:
    pickle.dump(user_id2index, f)
with open(DIR + "models/task2/" + item_ids_name, "wb") as f:
    pickle.dump(item_id2index, f)

# save candidates
file_name = f"imf_{VER}_for_inference.parquet"
candidates.to_parquet(DIR + "data/interim/candidates/task2/" + file_name)

In [21]:
candidates.head()

Unnamed: 0,session_id,candidate_item,imf_score,imf_rank
0,test_phase1_0,B08GY8NHF2,0.682636,1
1,test_phase1_0,B08ZJMPY2T,0.610372,2
2,test_phase1_0,B08GYG5SVQ,0.584222,3
3,test_phase1_0,B08GYBBBBH,0.53317,4
4,test_phase1_0,B08GXXKY6P,0.464025,5
