# Setup

In [None]:
# Colab-only: mount Google Drive at /content/drive so files under it can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /gdrive


In [None]:
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import gc
import math
import random
from glob import glob
from collections import defaultdict, Counter
from typing import List, Dict
import joblib
import pickle

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import polars as pl
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import GroupKFold

## constants

In [None]:
# --- experiment identity and storage ---
EXP_NAME: str = "exp117"  # used to build model and output file names
DIR: str = "/content/drive/MyDrive/kddcup2023/"  # project root; paths are built by concatenation, so keep the trailing slash

# --- run settings ---
K_FOLDS: int = 2  # number of per-fold models averaged at prediction time
SEED: int = 42
LOCALES: List[str] = ["UK", "JP", "DE"]

# Positions, counted from the end of a session, whose items anchor the candidates.
# For example, [1, 2] means candidates are generated from the last and the second-to-last item.
LAST_NS: List[int] = [1, 2, 3]

In [None]:
# Feature columns fed to the ranker. predict() selects exactly these columns,
# in this order, from each frame before calling model.predict, so names
# (including their existing spellings) and order must not be changed casually.
USE_FEATURES = [
    # === candidate features ===
    "co_visit_weight_last1", "consective_1_weight_last1", "consective_3_weight_last1", "consective_5_weight_last1", "similarity_score_last1", "bert_distance_last1", "lift_last1", "prone_distance_last1",
    "co_visit_weight_last2", "consective_1_weight_last2", "consective_3_weight_last2", "consective_5_weight_last2", "similarity_score_last2", "bert_distance_last2", "lift_last2", "prone_distance_last2",
    "co_visit_weight_last3", "consective_1_weight_last3", "consective_3_weight_last3", "consective_5_weight_last3", "similarity_score_last3", "bert_distance_last3", "lift_last3", "prone_distance_last3",
    "imf_score", "bpr_score",
    "co_visit_rank_last1", "consective_1_rank_last1", "consective_3_rank_last1", "consective_5_rank_last1", "similarity_rank_last1", "bert_rank_last1", "lift_rank_last1", "prone_rank_last1",
    "co_visit_rank_last2", "consective_1_rank_last2", "consective_3_rank_last2", "consective_5_rank_last2", "similarity_rank_last2", "bert_rank_last2", "lift_rank_last2", "prone_rank_last2",
    "co_visit_rank_last3", "consective_1_rank_last3", "consective_3_rank_last3", "consective_5_rank_last3", "similarity_rank_last3", "bert_rank_last3", "lift_rank_last3", "prone_rank_last3",
    "imf_rank", "bpr_rank",
    # === session features ===
    "S_session_length",
    "S_nunique_brand",
    "S_ratio_unique_brand",
    "S_nunique_item",
    "S_ratio_repurchase",
    "S_locale",
    "S_mean_price", "S_max_price", "S_min_price", "S_std_price", "S_total_amount",
    "S_color_not_null_count", "S_size_not_null_count", "S_model_not_null_count", "S_material_not_null_count", "S_author_not_null_count",
    "S_last_item_price",
    # === product features ===
    "P_price",
    "P_purchase_count", "P_purchase_count_global",
    "P_total_amount",
    "P_brand_purchase_count", "P_brand_purchase_count_global",
    "P_brand_mean_price", "P_brand_max_price", "P_brand_min_price", "P_brand_std_price", "P_total_brand_amount",
    "P_price_diff_to_avg_brand_price",
    "P_n_unique_locale",
    "P_is_color_null", "P_is_size_null", "P_is_model_null", "P_is_material_null", "P_is_author_null",
    "P_purchase_count_ratio_to_locale", "P_purchase_amount_ratio_to_locale", "P_purchase_count_ratio_to_brand", "P_purchase_amount_ratio_to_brand",
    # === session * product features ===
    "SP_price_diff_to_mean_price", "SP_price_diff_to_min_price", "SP_price_diff_to_max_price", "SP_price_diff_to_last_price",
    "SP_brand_price_diff_to_mean_price", "SP_brand_price_diff_to_min_price", "SP_brand_price_diff_to_max_price", "SP_brand_price_diff_to_last_price",
    "SP_same_brand_last1", "SP_same_brand_last2", "SP_same_brand_last3",
    "SP_same_color_last1", "SP_same_color_last2", "SP_same_color_last3",
    "SP_same_size_last1", "SP_same_size_last2", "SP_same_size_last3",
    "SP_same_model_last1", "SP_same_model_last2", "SP_same_model_last3",
    "SP_same_material_last1", "SP_same_material_last2", "SP_same_material_last3",
    "SP_same_author_last1", "SP_same_author_last2", "SP_same_author_last3",
    "SP_same_brand_sum", "SP_same_color_sum", "SP_same_size_sum", "SP_same_model_sum", "SP_same_material_sum", "SP_same_author_sum",
    # === similarity features ===
    "imf_similarity", "bpr_similarity",
    "graph_emb_similarity_last1", "graph_emb_similarity_last2", "graph_emb_similarity_last3",
    "i2v_similarity_last1", "i2v_similarity_last2", "i2v_similarity_last3",
]

## functions

In [None]:
# functions for model training, prediction and evaluation

def predict(df:pl.DataFrame, chunk_size:int=30_000_000) -> np.ndarray:
    """Score every row of ``df`` with the mean prediction of the K_FOLDS saved models.

    Args:
        df: Frame containing every column listed in USE_FEATURES.
        chunk_size: Maximum number of rows handed to a model in one call;
            bounds the size of the pandas copy created for each call.

    Returns:
        1-D array with one averaged score per row of ``df``, in row order.
        An empty frame yields an empty array without loading any model.
    """
    n_rows = df.shape[0]
    preds = np.zeros((n_rows,))
    if n_rows == 0:
        # An empty frame produces no slices, and np.concatenate([]) raises,
        # so return the (empty) result directly.
        return preds

    for fold in range(K_FOLDS):
        # load model
        # NOTE: pickle.load can execute arbitrary code -- only load model files you trust.
        model_path = f'{DIR}models/task1/{EXP_NAME}_{fold+1}.pkl'
        with open(model_path, "rb") as model_file:  # ensures the handle is closed
            model = pickle.load(model_file)
        # chunk data and predict to prevent OOM
        preds_by_one_model = [
            model.predict(frame[USE_FEATURES].to_pandas(), num_iteration=model.best_iteration)
            for frame in df.iter_slices(n_rows=chunk_size)
        ]
        preds += np.concatenate(preds_by_one_model)
    # average over the fold models
    preds /= K_FOLDS
    return preds


def rr_at_k(user_relevances:List[int], k:int=100) -> float:
    """Reciprocal rank of the first relevant entry within the top ``k``.

    ``user_relevances`` holds one label per ranked position, best first; a
    label equal to 1 marks a hit. Returns 1/rank (1-based) of the first hit
    among the first ``k`` labels, or 0 when none of them is a hit.
    """
    reciprocal_ranks = (
        1/rank
        for rank, label in enumerate(user_relevances[:k], start=1)
        if label == 1
    )
    # first hit wins; fall back to 0 when the generator is exhausted
    return next(reciprocal_ranks, 0)

def mrr_at_k(users_relevances:List[List[int]], k:int=100) -> float:
    """Mean reciprocal rank at ``k`` over a collection of ranked label lists.

    Args:
        users_relevances: One ranked label list per user/session, as accepted
            by ``rr_at_k``.
        k: Cut-off passed through to ``rr_at_k``.

    Returns:
        The average of ``rr_at_k`` over all lists, or 0.0 when the collection
        is empty (instead of raising ZeroDivisionError).
    """
    n_users = len(users_relevances)
    if n_users == 0:
        return 0.0
    total_rr = sum(rr_at_k(user_relevances, k) for user_relevances in users_relevances)
    return total_rr / n_users

## fix seed

In [None]:
def seed_everything(seed):
    """Seed Python's ``random`` and NumPy's global RNG, and export PYTHONHASHSEED.

    Note: PYTHONHASHSEED is read at interpreter start-up, so setting it here
    only affects Python processes launched afterwards, not the running kernel.
    """
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
# Apply the notebook-wide SEED to the random number generators.
seed_everything(SEED)

# Train and Inference

## evaluation

In [None]:
# NOTE: validation is disabled in this run. The commented-out code below scores
# the valid chunks with predict() and reports MRR@100 over all sessions.
# files = glob(DIR + f"data/interim/for_ranker/task1/valid_chunk_{EXP_NAME}_*.parquet")
# rr = 0
# session_num = 0
# for f in tqdm(files):
#     print(f)
#     df = pl.read_parquet(f)
#     session_num += df["session_id"].n_unique()

#     # prediction
#     df = df.with_columns(
#         pl.Series(name="pred", values=predict(df))
#     )
#     preds = df[["session_id", "label", "pred"]]
#     preds = preds.sort(["session_id", "pred"], descending=[False, True])

#     # calculate RR
#     label_lists = preds.groupby("session_id", maintain_order=True).all()["label"].to_list()
#     for label_list in label_lists:
#         rr += rr_at_k(label_list, 100)

# # calculate MRR
# print(" ")
# print("MRR@100:", round(rr/session_num, 5))

## inference

In [None]:
# Score every test chunk and keep the 100 best-scored candidates per session.
# NOTE(review): chunks are read from exp115 while predict() loads the EXP_NAME
# models -- confirm that reusing the exp115 candidates is intentional.
# sorted(): glob() returns paths in arbitrary order; sorting makes the run order reproducible.
files = sorted(glob(DIR + "data/interim/for_ranker/task1/test_chunk_exp115_*"))
if not files:
    # Fail here with a clear message instead of later inside pl.concat([]).
    raise FileNotFoundError(
        "no test chunks found matching " + DIR + "data/interim/for_ranker/task1/test_chunk_exp115_*"
    )
dfs = []
for f in tqdm(files):
    print(f)
    df = pl.read_parquet(f)

    # prediction
    df = df.with_columns(
        pl.Series(name="pred", values=predict(df))
    )
    # keep only the columns needed downstream
    df = df[["session_id", "locale", "candidate_item", "pred"]]
    # highest score first within each session, then truncate to the top 100
    df = df.sort(["session_id", "pred"], descending=[False, True])
    df = df.groupby("session_id", maintain_order=True).head(100)
    dfs.append(df)

test = pl.concat(dfs)

  0%|          | 0/22 [00:00<?, ?it/s]

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_0


  5%|▍         | 1/22 [13:02<4:34:02, 782.98s/it]

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_1


  9%|▉         | 2/22 [25:59<4:19:37, 778.89s/it]

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_2


 14%|█▎        | 3/22 [39:08<4:08:10, 783.71s/it]

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_3


 18%|█▊        | 4/22 [52:17<3:55:46, 785.92s/it]

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_4


 23%|██▎       | 5/22 [1:05:29<3:43:17, 788.08s/it]

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_5


 27%|██▋       | 6/22 [1:18:40<3:30:21, 788.85s/it]

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_6


 32%|███▏      | 7/22 [1:31:50<3:17:20, 789.34s/it]

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_7


 36%|███▋      | 8/22 [1:46:03<3:08:56, 809.75s/it]

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_8


 41%|████      | 9/22 [2:00:08<2:57:46, 820.52s/it]

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_9


 45%|████▌     | 10/22 [2:14:24<2:46:20, 831.68s/it]

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_10


 50%|█████     | 11/22 [2:28:32<2:33:21, 836.52s/it]

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_11


 55%|█████▍    | 12/22 [2:42:51<2:20:35, 843.52s/it]

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_12


 59%|█████▉    | 13/22 [2:56:58<2:06:41, 844.66s/it]

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_13


 64%|██████▎   | 14/22 [3:10:17<1:50:46, 830.85s/it]

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_14


 68%|██████▊   | 15/22 [3:23:15<1:35:04, 814.86s/it]

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_15


 73%|███████▎  | 16/22 [3:36:21<1:20:35, 805.95s/it]

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_16


 77%|███████▋  | 17/22 [3:49:17<1:06:25, 797.08s/it]

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_17


 82%|████████▏ | 18/22 [4:02:23<52:54, 793.64s/it]  

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_18


 86%|████████▋ | 19/22 [4:15:20<39:26, 788.91s/it]

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_19


 91%|█████████ | 20/22 [4:28:24<26:14, 787.35s/it]

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_20


 95%|█████████▌| 21/22 [4:41:14<13:02, 782.05s/it]

/gdrive/MyDrive/amazon_kdd_2023/data/interim/for_ranker/task1/test_chunk_exp115_21


100%|██████████| 22/22 [4:42:53<00:00, 771.51s/it]


In [None]:
# Replace the string session_id with an Int32 built from everything after its
# first 12 characters (assumes IDs look like "<12-char prefix><integer>" --
# TODO confirm against the candidate files).
# Guarded on dtype so that re-running this cell after the conversion is a no-op
# instead of failing on `.str` applied to an integer column.
if test.schema["session_id"] == pl.Utf8:
    test = test.with_columns(
        pl.col("session_id").str.slice(12).cast(pl.Int32).alias("session_id")
    )

In [None]:
# Rank candidates per session (highest score first) across all chunks and
# keep at most 100 rows for each session.
preds = (
    test
    .sort(["session_id", "pred"], descending=[False, True])
    .groupby("session_id", maintain_order=True)
    .head(100)
)

In [None]:
# Collapse to one row per session: its locale and its ranked candidate list.
# Both aggregations use maintain_order=True, so the two lists line up by session.
per_session = preds.groupby("session_id", maintain_order=True)
session_locales = per_session.first()["locale"].to_list()
ranked_items = per_session.all()["candidate_item"].to_list()
preds = pl.DataFrame({
    "locale": session_locales,
    "next_item_prediction": ranked_items,
})

In [None]:
# Preview the first rows of the submission frame (locale, next_item_prediction).
preds.head()

locale,next_item_prediction
str,list[str]
"""DE""","[""B07SDFLVKD"", ""B091CK241X"", … ""B08L6K3H3Y""]"
"""DE""","[""B084CB7GX9"", ""B0B7KCP6SL"", … ""B07Y8BQC67""]"
"""DE""","[""B09Z4PZQBF"", ""B09KBCTXF5"", … ""B099FB3T56""]"
"""DE""","[""B07Y1KLF25"", ""B07T5XY2CJ"", … ""B09N35PZCD""]"
"""DE""","[""B0B2JY9THB"", ""B08SXLWXH9"", … ""3518390457""]"


In [None]:
# Save the submission frame under the experiment name.
output_path = f"{DIR}data/output/task1/{EXP_NAME}.parquet"
preds.write_parquet(output_path, use_pyarrow=True)

In [None]:
# Sanity check, done after the file was written so the extra column is not saved:
# summarise how many predictions each session received.
pred_count_expr = pl.col("next_item_prediction").apply(len).alias("pred_count")
preds = preds.with_columns(pred_count_expr)
preds["pred_count"].describe()

statistic,value
str,f64
"""count""",316972.0
"""null_count""",0.0
"""mean""",100.0
"""std""",0.0
"""min""",100.0
"""max""",100.0
"""median""",100.0
"""25%""",100.0
"""75%""",100.0
