In [None]:
# Mount Google Drive into the Colab runtime; the project directory used by this
# notebook (see DIR) lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install polars



In [None]:
import concurrent.futures
import math

from tqdm import tqdm
import numpy as np
import polars as pl
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from scipy.spatial.distance import cosine

In [None]:
# Maximum number of highest-scoring BPR candidates kept per session when the
# candidate tables below are built.
TOP_N = 50
# Version tag embedded in the BPR candidate file names (bpr_{VER}_...).
VER = "01"
# Project root on the mounted Google Drive; the data paths below are built from it.
DIR = "/content/drive/MyDrive/kddcup2023/"

# Evaluate the train candidates (MRR@100)

In [None]:
# Load the preprocessed task-1 training sessions. The evaluation below relies on
# its session_id and next_item columns.
train = pl.read_parquet(DIR + "data/preprocessed/task1/train_task1.parquet")

In [None]:
# Build the train/eval candidate table from the DE, UK and JP BPR candidate files.
# Every file goes through the same pipeline, so it is written once and applied
# per tag (concatenation order DE -> UK -> JP is preserved):
#   1. keep only sessions whose id starts with "train",
#   2. order rows by session_id, then by bpr_score with the highest score first,
#   3. keep at most TOP_N rows per session (maintain_order keeps the sorted order).
candidates = pl.concat([
    (
        pl.read_parquet(DIR + f"data/interim/candidates/task1/bpr_{VER}_{locale_tag}_for_train_or_eval.parquet")
        .filter(pl.col("session_id").str.starts_with("train"))
        .sort(["session_id", "bpr_score"], descending=[False, True])
        .groupby("session_id", maintain_order=True)
        .head(TOP_N)
    )
    for locale_tag in ("DE", "UK", "JP")
])

In [None]:
# Discard candidates whose BPR score is exactly zero, then persist the combined
# train/eval candidate table as a single parquet file.
has_nonzero_score = pl.col("bpr_score") != 0
candidates = candidates.filter(has_nonzero_score)

combined_path = DIR + f"data/interim/candidates/task1/bpr_{VER}_for_train_or_eval.parquet"
candidates.write_parquet(combined_path)

In [None]:
# Preview the first rows of the train/eval candidate table (columns and dtypes).
candidates.head()

session_id,candidate_item,bpr_score,bpr_rank
str,str,f32,u32
"""train_0""","""B002QZX9K0""",2.203246,1
"""train_0""","""B081LL9TV2""",2.090522,2
"""train_0""","""B07HYXKQXW""",2.090392,3
"""train_0""","""B07MSMBJCM""",2.077719,4
"""train_0""","""B09GKJKT7W""",2.077087,5


In [None]:
# Build, for every training session, the list of 0/1 labels of its candidates
# (1 where the candidate equals the session's next_item). `train` is processed
# in fixed-size row slices rather than joined in one go.
n_rows = 400_000
n_slices = math.ceil(train.height / n_rows)  # passed to tqdm so it can render a progress bar

# Label expression is the same for every slice, so define it once.
hit_label = (pl.col("candidate_item") == pl.col("next_item")).cast(pl.Int8).alias("label")

label_lists = []
for df in tqdm(train.iter_slices(n_rows=n_rows), total=n_slices):
    # Left join keeps sessions that have no candidates (their label is null).
    df = df.join(candidates, on="session_id", how="left").with_columns(hit_label)
    # Collapse to one row per session; "label" becomes a list in candidate order.
    per_session = df.groupby("session_id", maintain_order=True).all()
    label_lists.extend(per_session["label"].to_list())

100%|██████████| 9/9 [03:24<00:00, 22.68s/it]


In [None]:
# MRR: average over sessions of 1 / (1-based position of the first correct
# candidate within the first 100 labels); sessions without a hit contribute 0.
rr = 0
for labels in label_lists:
    first_hit_rank = next(
        (rank for rank, label in enumerate(labels[:100], start=1) if label == 1),
        None,
    )
    if first_hit_rank is not None:
        rr += 1 / first_hit_rank
mrr = rr / len(label_lists)
print("MRR:", round(mrr, 5))

MRR: 0.11601


# Build candidates for inference (test sessions)

In [None]:
# Build the inference candidate table from the DE, UK and JP BPR candidate files.
# Every file goes through the same pipeline, so it is written once and applied
# per tag (concatenation order DE -> UK -> JP is preserved):
#   1. keep only sessions whose id starts with "test",
#   2. order rows by session_id, then by bpr_score with the highest score first,
#   3. keep at most TOP_N rows per session (maintain_order keeps the sorted order).
candidates = pl.concat([
    (
        pl.read_parquet(DIR + f"data/interim/candidates/task1/bpr_{VER}_{locale_tag}_for_inference.parquet")
        .filter(pl.col("session_id").str.starts_with("test"))
        .sort(["session_id", "bpr_score"], descending=[False, True])
        .groupby("session_id", maintain_order=True)
        .head(TOP_N)
    )
    for locale_tag in ("DE", "UK", "JP")
])

In [None]:
# Discard candidates whose BPR score is exactly zero, then persist the combined
# inference candidate table as a single parquet file.
has_nonzero_score = pl.col("bpr_score") != 0
candidates = candidates.filter(has_nonzero_score)

combined_path = DIR + f"data/interim/candidates/task1/bpr_{VER}_for_inference.parquet"
candidates.write_parquet(combined_path)

In [None]:
# Preview the first rows of the inference candidate table (columns and dtypes).
candidates.head()

session_id,candidate_item,bpr_score,bpr_rank
str,str,f32,u32
"""test_phase2_0""","""B00ZQW91DE""",4.037469,1
"""test_phase2_0""","""B0B3HMH1JP""",4.017115,2
"""test_phase2_0""","""B07T4NH1W4""",3.995347,3
"""test_phase2_0""","""B0B87N98MM""",3.992003,4
"""test_phase2_0""","""B09Y57N2S1""",3.982798,5
