In [1]:
# Mount Google Drive at /gdrive (Colab only) so the DIR-based paths used by the
# later cells resolve.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import concurrent.futures
import math

from tqdm import tqdm
import numpy as np
import polars as pl
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from scipy.spatial.distance import cosine

In [4]:
TOP_N = 50  # number of highest-scoring BPR candidates kept per session
VER = "01"  # version tag embedded in the BPR candidate file names
DIR = "/gdrive/MyDrive/amazon_kdd_2023/"  # project root on the mounted Google Drive

# Evaluation: MRR@100 on the training sessions

In [5]:
# Preprocessed task-1 training data; the evaluation cells read its
# `session_id` and `next_item` columns.
train = pl.read_parquet(DIR + "data/preprocessed/task1/train_task1.parquet")

In [6]:
# Build the train/eval candidate table. For every locale: keep only sessions
# whose id starts with "train", order each session's candidates by descending
# BPR score and keep the first TOP_N of them. The per-locale frames are stacked
# in DE, UK, JP order.
candidates = pl.concat([
    pl.read_parquet(DIR + f"data/interim/candidates/task1/bpr_{VER}_{locale}_for_train_or_eval.parquet")
    .filter(pl.col("session_id").str.starts_with("train"))
    # Ascending session id, descending score, so head(TOP_N) keeps the best ones.
    .sort(["session_id", "bpr_score"], descending=[False, True])
    .groupby("session_id", maintain_order=True)
    .head(TOP_N)
    for locale in ("DE", "UK", "JP")
])

In [7]:
# Keep only candidates whose BPR score is non-zero, then save the merged
# train/eval table in the same directory as the per-locale files.
nonzero_score = pl.col("bpr_score") != 0
merged_path = DIR + f"data/interim/candidates/task1/bpr_{VER}_for_train_or_eval.parquet"
candidates = candidates.filter(nonzero_score)
candidates.write_parquet(merged_path)

In [8]:
# Preview the first rows of the train/eval candidate table.
candidates.head()

session_id,candidate_item,bpr_score,bpr_rank
str,str,f32,u32
"""train_0""","""B07MSMBJCM""",2.195564,1
"""train_0""","""B07M7N9NC4""",2.100213,2
"""train_0""","""B08ZHZGJND""",2.058558,3
"""train_0""","""B09GKJKT7W""",2.049722,4
"""train_0""","""B09C8L7LNS""",2.021401,5


In [9]:
# Build, for every training session, the list of 0/1 labels of its candidates
# (1 where the candidate equals the session's `next_item`). `train` is processed
# in slices of `n_rows` rows so the joined frame stays small.
label_lists = []
n_rows = 400_000
# iter_slices() is a plain iterator, so tqdm needs the slice count to draw a progress bar.
n_slices = math.ceil(train.height / n_rows)
for df in tqdm(train.iter_slices(n_rows=n_rows), total=n_slices):
    # Left join keeps sessions that have no candidates; their label is null.
    df = df.join(candidates, on="session_id", how="left")
    df = df.with_columns((pl.col("candidate_item") == pl.col("next_item")).cast(pl.Int8).alias("label"))
    # Only `label` is needed, so aggregate just that column instead of
    # collecting every joined column into lists.
    per_session = df.groupby("session_id", maintain_order=True).agg(pl.col("label"))
    label_lists.extend(per_session["label"].to_list())

100%|██████████| 9/9 [02:07<00:00, 14.14s/it]


In [10]:
# Compute MRR: every session contributes 1 / (rank of its first correct
# candidate) when that rank is within the first 100 positions, otherwise 0.
rr = 0
for labels in label_lists:
    # 1-based rank of the first label equal to 1; a None label never matches.
    first_hit = next(
        (rank for rank, label in enumerate(labels[:100], start=1) if label == 1),
        None,
    )
    if first_hit is not None:
        rr += 1 / first_hit
# Report 0.0 for an empty evaluation set instead of raising ZeroDivisionError.
mrr = rr / len(label_lists) if label_lists else 0.0
print("MRR:", round(mrr, 5))

MRR: 0.11606


# Candidates for inference

In [11]:
# Build the inference candidate table. For every locale: keep only sessions
# whose id starts with "test", order each session's candidates by descending
# BPR score and keep the first TOP_N of them. The per-locale frames are stacked
# in DE, UK, JP order.
candidates = pl.concat([
    pl.read_parquet(DIR + f"data/interim/candidates/task1/bpr_{VER}_{locale}_for_inference.parquet")
    .filter(pl.col("session_id").str.starts_with("test"))
    # Ascending session id, descending score, so head(TOP_N) keeps the best ones.
    .sort(["session_id", "bpr_score"], descending=[False, True])
    .groupby("session_id", maintain_order=True)
    .head(TOP_N)
    for locale in ("DE", "UK", "JP")
])

In [12]:
# Keep only candidates whose BPR score is non-zero, then save the merged
# inference table in the same directory as the per-locale files.
nonzero_score = pl.col("bpr_score") != 0
merged_path = DIR + f"data/interim/candidates/task1/bpr_{VER}_for_inference.parquet"
candidates = candidates.filter(nonzero_score)
candidates.write_parquet(merged_path)

In [13]:
# Preview the first rows of the inference candidate table.
candidates.head()

session_id,candidate_item,bpr_score,bpr_rank
str,str,f32,u32
"""test_phase2_0""","""B0765C95PG""",4.043446,1
"""test_phase2_0""","""B0B5LHNGQ8""",4.040701,2
"""test_phase2_0""","""B0B3HMH1JP""",4.018159,3
"""test_phase2_0""","""B08CY14VGD""",4.012033,4
"""test_phase2_0""","""B09Y57N2S1""",4.008018,5
