# Setting

In [1]:
!pip install implicit

[0m

In [2]:
import os
from typing import List, Dict, Union
import pickle

import pandas as pd
import polars as pl
import implicit
from scipy.sparse import lil_matrix
from tqdm import tqdm

from scripts.metrics import map_at_k



In [3]:
INPUT_DIR = "../../input/raw/"
CANDIDATE_DIR = "./candidates/"
FEATURE_DIR = "./features/"

TOP_N = 10

# parameters for BPR
FACTORS = 200
LAMBDA = 0.075
ITERATIONS = 2000
SEED = 42

In [4]:
def explode_and_add_seq_no(df:pl.DataFrame) -> pl.DataFrame:
    df = df.explode(["prev_items"])
    df = df.with_columns(
        df.select(pl.col("session_id").cumcount().over("session_id").alias("seq_no").cast(pl.Int64))
    )
    return df

In [5]:
def train_imf_and_generate_candidates(df:pd.DataFrame):
    # generate sparse matrix
    unique_user_ids = sorted(df["session_id"].unique())
    unique_item_ids = sorted(df["yad_no"].unique())
    user_id2index = dict(zip(unique_user_ids, range(len(unique_user_ids))))
    item_id2index = dict(zip(unique_item_ids, range(len(unique_item_ids))))

    matrix = lil_matrix((len(unique_user_ids), len(unique_item_ids)))
    for _, row in tqdm(df.iterrows()):
        user_index = user_id2index[row["session_id"]]
        item_index = item_id2index[row["yad_no"]]
        matrix[user_index, item_index] = 1.0

    # convert LIL to CSR
    matrix = matrix.tocsr()

    # model initialization
    model = implicit.bpr.BayesianPersonalizedRanking(
        factors = FACTORS,
        regularization = LAMBDA,
        iterations = ITERATIONS,
        random_state = SEED,
    )

    # train model
    model.fit(matrix)

    # generate candidate
    users = []
    candidates = []
    imf_scores = []

    for user_id, user_index in tqdm(user_id2index.items()):
        item_indexes, scores = model.recommend(user_index, matrix[user_index], N=TOP_N, filter_already_liked_items=True)
        for item_index, score in zip(item_indexes, scores):
            users.append(user_id)
            candidates.append(unique_item_ids[item_index])
            imf_scores.append(score)

    candidates = pd.DataFrame({
        "session_id": users,
        "candidate_yad_no": candidates,
        "bpr_score": imf_scores,
    })

    candidates = pl.from_pandas(candidates)
    
    candidates = candidates \
    .sort(["session_id", "bpr_score"], descending=[False, True]) \
    .with_columns(pl.col("bpr_score").rank(descending=True, method="min").over("session_id").alias("bpr_rank"))
    candidates = candidates.drop("bpr_score")
    
    candidates = candidates.to_pandas()

    return model, user_id2index, item_id2index, candidates

# For local train/eval

In [6]:
train_log = pl.read_csv(os.path.join(INPUT_DIR, "train_log.csv"))
test_log = pl.read_csv(os.path.join(INPUT_DIR, "test_log.csv"))

In [7]:
log = pl.concat([train_log, test_log], how="vertical")

In [8]:
%%time
model, user_id2index, item_id2index, candidates = train_imf_and_generate_candidates(log.to_pandas())

669575it [00:25, 25754.92it/s]


  0%|          | 0/2000 [00:00<?, ?it/s]

100% 463398/463398 [04:17<00:00, 1800.21it/s]


CPU times: user 58min 17s, sys: 1h 2min 48s, total: 2h 1min 5s
Wall time: 7min 15s


In [9]:
# save models
model_name = "bpr_model_for_train_or_eval.npz"
user_ids_name = "bpr_user_id2index_for_train_or_eval.pickle"
item_ids_name = "bpr_item_id2index_for_train_or_eval.pickle"
model.save(os.path.join(FEATURE_DIR, model_name))
with open(os.path.join(FEATURE_DIR, user_ids_name), "wb") as f:
    pickle.dump(user_id2index, f)
with open(os.path.join(FEATURE_DIR, item_ids_name), "wb") as f:
    pickle.dump(item_id2index, f)

# save candidates
candidates.to_parquet(os.path.join(CANDIDATE_DIR, "bpr_for_train_or_eval.parquet"))

In [10]:
candidates.head(10)

Unnamed: 0,session_id,candidate_yad_no,bpr_rank
0,000007603d533d30453cc45d0f3d119f,11882,1
1,000007603d533d30453cc45d0f3d119f,2808,2
2,000007603d533d30453cc45d0f3d119f,8668,3
3,000007603d533d30453cc45d0f3d119f,3324,4
4,000007603d533d30453cc45d0f3d119f,5289,5
5,000007603d533d30453cc45d0f3d119f,4101,6
6,000007603d533d30453cc45d0f3d119f,11277,7
7,000007603d533d30453cc45d0f3d119f,6520,8
8,000007603d533d30453cc45d0f3d119f,9303,9
9,000007603d533d30453cc45d0f3d119f,12325,10


# MAP@k=10

In [11]:
train_log = pl.read_csv(os.path.join(INPUT_DIR, "train_log.csv"))
train_label = pl.read_csv(os.path.join(INPUT_DIR, "train_label.csv")).rename({"yad_no":"label_yad_no"})

In [12]:
bpr = pl.read_parquet(os.path.join(CANDIDATE_DIR, "bpr_for_train_or_eval.parquet"))

In [13]:
prediction = train_label \
    .join(bpr, on="session_id", how="left") \
    .sort(["session_id", "bpr_rank"], descending=[False, False]) \
    .with_columns((pl.col("candidate_yad_no") == pl.col("label_yad_no")).cast(pl.Int8).alias("user_relevance")) \
    .fill_null(0)

In [14]:
user_relevances = prediction.group_by("session_id", maintain_order=True).all()["user_relevance"].to_list()

In [15]:
map_at_k(user_relevances, 10)

0.10399960847755776

# For test

In [16]:
train_log = pl.read_csv(os.path.join(INPUT_DIR, "train_log.csv"))
train_label = pl.read_csv(os.path.join(INPUT_DIR, "train_label.csv"))
test_log = pl.read_csv(os.path.join(INPUT_DIR, "test_log.csv"))

In [17]:
# trainのlabelをlogにappendする

prev_items_list = (
    train_log
    .sort(["session_id", "seq_no"])
    .group_by("session_id", maintain_order=True)
    .agg(pl.col("yad_no"))
)["yad_no"].to_list()

next_item_list = (
    train_label
    .sort("session_id")
)["yad_no"].to_list()

prev_items_list_updated = []
for prev_items, next_item in zip(prev_items_list, next_item_list):
    prev_items.append(next_item)
    prev_items_list_updated.append(prev_items)

train_log = train_label.with_columns(
    pl.Series(name="prev_items", values=prev_items_list_updated)
)

train_log = explode_and_add_seq_no(train_log) \
    .drop("yad_no") \
    .rename({"prev_items" : "yad_no"}) \
    [["session_id", "seq_no", "yad_no"]] # カラム並び替え

In [18]:
log = pl.concat([train_log, test_log], how="vertical")

In [19]:
%%time
model, user_id2index, item_id2index, candidates = train_imf_and_generate_candidates(log.to_pandas())

958273it [00:35, 26743.29it/s]


  0%|          | 0/2000 [00:00<?, ?it/s]

100% 463398/463398 [03:42<00:00, 2081.33it/s]


CPU times: user 1h 9min 50s, sys: 54min 41s, total: 2h 4min 31s
Wall time: 7min 39s


In [20]:
# save models
model_name = "bpr_model_for_test.npz"
user_ids_name = "bpr_user_id2index_for_test.pickle"
item_ids_name = "bpr_item_id2index_for_test.pickle"
model.save(os.path.join(FEATURE_DIR, model_name))
with open(os.path.join(FEATURE_DIR, user_ids_name), "wb") as f:
    pickle.dump(user_id2index, f)
with open(os.path.join(FEATURE_DIR, item_ids_name), "wb") as f:
    pickle.dump(item_id2index, f)

# save candidates
candidates.to_parquet(os.path.join(CANDIDATE_DIR, "bpr_for_test.parquet"))

In [21]:
candidates.head()

Unnamed: 0,session_id,candidate_yad_no,bpr_rank
0,000007603d533d30453cc45d0f3d119f,2808,1
1,000007603d533d30453cc45d0f3d119f,3324,2
2,000007603d533d30453cc45d0f3d119f,5289,3
3,000007603d533d30453cc45d0f3d119f,11882,4
4,000007603d533d30453cc45d0f3d119f,12837,5
