# Setup

In [None]:
# Mount Google Drive at /content/drive (Colab only); DIR in the constants cell points below this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install polars (imported below as `pl`).
# NOTE(review): the version is not pinned, so a fresh runtime may pull a release with a different API.
!pip install polars



In [None]:
# Install the gensim version used for the saved item2vec (Word2Vec) models loaded further down.
# NOTE(review): the recorded output of this cell shows the wheel build for this pin failing,
# so the later `from gensim.models import Word2Vec` presumably relies on a preinstalled gensim - confirm.
!pip install gensim==4.0.1

Collecting gensim==4.0.1
  Downloading gensim-4.0.1.tar.gz (23.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.1/23.1 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gensim
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for gensim (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for gensim[0m[31m
[0m[?25h  Running setup.py clean for gensim
Failed to build gensim
[31mERROR: Could not build wheels for gensim, which is required to install pyproject.toml-based projects[0m[31m
[0m

In [None]:
import os
import gc
import math
import random
from collections import defaultdict, Counter
from typing import List, Dict
import joblib
import pickle

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from numba import njit
import polars as pl
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from gensim.models import Word2Vec

## constants

In [None]:
EXP_NAME = "exp115"  # experiment tag embedded in the names of the written chunk files
DIR = "/content/drive/MyDrive/kddcup2023/"  # project root on the mounted Google Drive
K_FOLDS = 3  # NOTE(review): not referenced by any code visible in this notebook
SEED = 42  # seed passed to seed_everything() and used for negative sampling
LOCALES = ["UK", "JP", "DE"]  # locales for which per-locale models / embeddings are loaded
MAKE_TRAIN = False  # when True, load the train artifacts and write the train chunks
MAKE_TEST = True  # when True, load the inference artifacts and write the test chunks

# This parameter controls to which end item the candidate is tied.
# For example, if [1,2], candidates are generated from the last item and second last item in each session.
# NOTE(review): add_features() and fill_null_and_cast() reference the last1/last2/last3 columns by name,
# so values other than [1, 2, 3] would need changes there as well.
LAST_NS = [1, 2, 3]

In [None]:
# Feature columns kept (together with session_id, candidate_item and label) when the train chunks are written.
USE_FEATURES = [
    # === candidate features ===
    "co_visit_weight_last1", "consective_1_weight_last1", "consective_3_weight_last1", "consective_5_weight_last1", "similarity_score_last1", "bert_distance_last1", "lift_last1", "prone_distance_last1",
    "co_visit_weight_last2", "consective_1_weight_last2", "consective_3_weight_last2", "consective_5_weight_last2", "similarity_score_last2", "bert_distance_last2", "lift_last2", "prone_distance_last2",
    "co_visit_weight_last3", "consective_1_weight_last3", "consective_3_weight_last3", "consective_5_weight_last3", "similarity_score_last3", "bert_distance_last3", "lift_last3", "prone_distance_last3",
    "imf_score", "bpr_score",
    "co_visit_rank_last1", "consective_1_rank_last1", "consective_3_rank_last1", "consective_5_rank_last1", "similarity_rank_last1", "bert_rank_last1", "lift_rank_last1", "prone_rank_last1",
    "co_visit_rank_last2", "consective_1_rank_last2", "consective_3_rank_last2", "consective_5_rank_last2", "similarity_rank_last2", "bert_rank_last2", "lift_rank_last2", "prone_rank_last2",
    "co_visit_rank_last3", "consective_1_rank_last3", "consective_3_rank_last3", "consective_5_rank_last3", "similarity_rank_last3", "bert_rank_last3", "lift_rank_last3", "prone_rank_last3",
    "imf_rank", "bpr_rank",
    # === session features ===
    "S_session_length",
    "S_nunique_brand",
    "S_ratio_unique_brand",
    "S_nunique_item",
    "S_ratio_repurchase",
    "S_locale",
    "S_mean_price", "S_max_price", "S_min_price", "S_std_price", "S_total_amount",
    "S_color_not_null_count", "S_size_not_null_count", "S_model_not_null_count", "S_material_not_null_count", "S_author_not_null_count",
    "S_last_item_price",
    # === product features ===
    "P_price",
    "P_purchase_count", "P_purchase_count_global",
    "P_total_amount",
    "P_brand_purchase_count", "P_brand_purchase_count_global",
    "P_brand_mean_price", "P_brand_max_price", "P_brand_min_price", "P_brand_std_price", "P_total_brand_amount",
    "P_price_diff_to_avg_brand_price",
    "P_n_unique_locale",
    "P_is_color_null", "P_is_size_null", "P_is_model_null", "P_is_material_null", "P_is_author_null",
    "P_purchase_count_ratio_to_locale", "P_purchase_amount_ratio_to_locale", "P_purchase_count_ratio_to_brand", "P_purchase_amount_ratio_to_brand",
    # === session * product features ===
    "SP_price_diff_to_mean_price", "SP_price_diff_to_min_price", "SP_price_diff_to_max_price", "SP_price_diff_to_last_price",
    "SP_brand_price_diff_to_mean_price", "SP_brand_price_diff_to_min_price", "SP_brand_price_diff_to_max_price", "SP_brand_price_diff_to_last_price",
    "SP_same_brand_last1", "SP_same_brand_last2", "SP_same_brand_last3",
    "SP_same_color_last1", "SP_same_color_last2", "SP_same_color_last3",
    "SP_same_size_last1", "SP_same_size_last2", "SP_same_size_last3",
    "SP_same_model_last1", "SP_same_model_last2", "SP_same_model_last3",
    "SP_same_material_last1", "SP_same_material_last2", "SP_same_material_last3",
    "SP_same_author_last1", "SP_same_author_last2", "SP_same_author_last3",
    "SP_same_brand_sum", "SP_same_color_sum", "SP_same_size_sum", "SP_same_model_sum", "SP_same_material_sum", "SP_same_author_sum",
    # === similarity features ===
    "imf_similarity", "bpr_similarity",
    "graph_emb_similarity_last1", "graph_emb_similarity_last2", "graph_emb_similarity_last3",
    "i2v_similarity_last1", "i2v_similarity_last2", "i2v_similarity_last3",
]

## load data

In [None]:
class CandidateMatrix:
    """Bundle a candidate table with the metadata needed to join it.

    Attributes:
        matrix: the candidate table itself.
        feat_name: names of the feature columns carried by ``matrix``.
        join_key: ``"item"`` or ``"session"``; the consumers in this notebook
            branch on this value to decide how the table is joined.
    """

    def __init__(self, matrix: pl.DataFrame, feat_name: List[str], join_key: str):
        # Plain value holder: store the three arguments unchanged.
        self.matrix, self.feat_name, self.join_key = matrix, feat_name, join_key

In [None]:
# Preprocessed task-1 session tables; they are iterated in slices by the processing cells at the end.
train = pl.read_parquet(DIR + "data/preprocessed/task1/train_task1.parquet")
test = pl.read_parquet(DIR + "data/preprocessed/task1/test_task1_phase2.parquet")

In [None]:
# Precomputed feature tables: one session table shared by train and test, and one product table per split.
session_feat = pl.read_parquet(DIR + "data/interim/features/task1/session_feature_06.parquet")
product_feat_train = pl.read_parquet(DIR + "data/interim/features/task1/product_feature_train_08.parquet")
product_feat_test = pl.read_parquet(DIR + "data/interim/features/task1/product_feature_test_08.parquet")

In [None]:
# Candidate tables shared by the train and test candidate lists:
# similar_products1 carries similarity_score/similarity_rank, similar_products2 carries bert_distance/bert_rank.
similar_products1 = pl.read_parquet(DIR + "data/interim/candidates/task1/similar_products_13.parquet")
similar_products2 = pl.read_parquet(DIR + "data/interim/candidates/task1/similar_products_19.parquet")

In [None]:
if MAKE_TRAIN:
    # Candidate tables built for training / local evaluation.
    imf_candidates_train = pl.read_parquet(DIR + "data/interim/candidates/task1/imf_15_for_train_or_eval.parquet")
    bpr_candidates_train = pl.read_parquet(DIR + "data/interim/candidates/task1/bpr_01_for_train_or_eval.parquet")
    co_visit_matrix_train_1 = pl.read_parquet(DIR + "data/interim/candidates/task1/co_visit_matrix_25_for_train_or_eval.parquet")
    co_visit_matrix_train_2 = pl.read_parquet(DIR + "data/interim/candidates/task1/co_visit_matrix_30_for_train_or_eval.parquet")
    co_visit_matrix_train_3 = pl.read_parquet(DIR + "data/interim/candidates/task1/co_visit_matrix_29_for_train_or_eval.parquet")
    co_visit_matrix_train_4 = pl.read_parquet(DIR + "data/interim/candidates/task1/co_visit_matrix_27_for_train_or_eval.parquet")
    co_visit_matrix_train_5 = pl.read_parquet(DIR + "data/interim/candidates/task1/co_visit_matrix_28_for_train_or_eval.parquet")
    prone_matrix_train = pl.read_parquet(DIR + "data/interim/candidates/task1/prone_03_for_local_or_eval.parquet")

    # Each entry: (table, feature columns it carries, whether it joins on the item or on the session).
    candidate_matrices_train = [
        CandidateMatrix(co_visit_matrix_train_1, ["co_visit_weight", "co_visit_rank"], "item"),
        CandidateMatrix(co_visit_matrix_train_2, ["lift", "lift_rank"], "item"),
        CandidateMatrix(co_visit_matrix_train_3, ["consective_1_weight", "consective_1_rank"], "item"),
        CandidateMatrix(co_visit_matrix_train_4, ["consective_3_weight", "consective_3_rank"], "item"),
        CandidateMatrix(co_visit_matrix_train_5, ["consective_5_weight", "consective_5_rank"], "item"),
        CandidateMatrix(similar_products1, ["similarity_score", "similarity_rank"], "item"),
        CandidateMatrix(similar_products2, ["bert_distance", "bert_rank"], "item"),
        CandidateMatrix(bpr_candidates_train, ["bpr_score", "bpr_rank"], "session"),
        CandidateMatrix(imf_candidates_train, ["imf_score", "imf_rank"], "session"),
        CandidateMatrix(prone_matrix_train, ["prone_distance", "prone_rank"], "item"),
    ]

    # NOTE: pickle.load executes arbitrary code from the file; only load artifacts produced by this project.

    # item2vec model (one Word2Vec model per locale)
    i2v_models_train = {}
    for locale in LOCALES:
        i2v_models_train[locale] = Word2Vec.load(DIR + f"models/task1/item2vec_{locale}_05_for_train_or_eval.model")

    # imf: factor archive plus the id -> row-index maps for users (sessions) and items
    imf_model_train = {}
    user_id2index_train = {}
    item_id2index_train = {}
    for locale in LOCALES:
        imf_model_train[locale] = np.load(DIR + f"models/task1/imf_15_{locale}_model_for_train_or_eval.npz")
        with open(DIR + f"models/task1/imf_15_{locale}_user_id2index_for_train_or_eval.pickle", "rb") as f:
            user_id2index_train[locale] = pickle.load(f)
        with open(DIR + f"models/task1/imf_15_{locale}_item_id2index_for_train_or_eval.pickle", "rb") as f:
            item_id2index_train[locale] = pickle.load(f)

    # bpr: same layout as imf
    bpr_model_train = {}
    bpr_user_id2index_train = {}
    bpr_item_id2index_train = {}
    for locale in LOCALES:
        bpr_model_train[locale] = np.load(DIR + f"models/task1/bpr_01_{locale}_model_for_train_or_eval.npz")
        with open(DIR + f"models/task1/bpr_01_{locale}_user_id2index_for_train_or_eval.pickle", "rb") as f:
            bpr_user_id2index_train[locale] = pickle.load(f)
        with open(DIR + f"models/task1/bpr_01_{locale}_item_id2index_for_train_or_eval.pickle", "rb") as f:
            # Fix: this used to be stored in bpr_user_id2index_train, overwriting the user map
            # and leaving bpr_item_id2index_train empty (every bpr lookup then hit KeyError).
            bpr_item_id2index_train[locale] = pickle.load(f)

    # prone: graph embeddings plus the item id -> embedding row map
    graph_embs_train = {}
    item_id2indices_prone_train = {}
    for locale in LOCALES:
        graph_embs_train[locale] = np.load(DIR + f"models/task1/graph_embedding_03_{locale}_for_local_train_or_eval.npy")
        with open(DIR + "data/interim/graph/task1/graph_" + f"item_id2index_03_{locale}_for_train_or_eval.pickle", "rb") as f:
            item_id2indices_prone_train[locale] = pickle.load(f)

In [None]:

if MAKE_TEST:
    # Candidate tables built for inference.
    imf_candidates_test = pl.read_parquet(DIR + "data/interim/candidates/task1/imf_15_for_inference.parquet")
    bpr_candidates_test = pl.read_parquet(DIR + "data/interim/candidates/task1/bpr_01_for_inference.parquet")
    co_visit_matrix_test_1 = pl.read_parquet(DIR + "data/interim/candidates/task1/co_visit_matrix_25_for_inference.parquet")
    co_visit_matrix_test_2 = pl.read_parquet(DIR + "data/interim/candidates/task1/co_visit_matrix_30_for_inference.parquet")
    co_visit_matrix_test_3 = pl.read_parquet(DIR + "data/interim/candidates/task1/co_visit_matrix_29_for_inference.parquet")
    co_visit_matrix_test_4 = pl.read_parquet(DIR + "data/interim/candidates/task1/co_visit_matrix_27_for_inference.parquet")
    co_visit_matrix_test_5 = pl.read_parquet(DIR + "data/interim/candidates/task1/co_visit_matrix_28_for_inference.parquet")
    prone_matrix_test = pl.read_parquet(DIR + "data/interim/candidates/task1/prone_03_for_inference.parquet")

    # Each entry: (table, feature columns it carries, whether it joins on the item or on the session).
    candidate_matrices_test =[
        CandidateMatrix(co_visit_matrix_test_1, ["co_visit_weight", "co_visit_rank"], "item"),
        CandidateMatrix(co_visit_matrix_test_2, ["lift", "lift_rank"], "item"),
        CandidateMatrix(co_visit_matrix_test_3, ["consective_1_weight", "consective_1_rank"], "item"),
        CandidateMatrix(co_visit_matrix_test_4, ["consective_3_weight", "consective_3_rank"], "item"),
        CandidateMatrix(co_visit_matrix_test_5, ["consective_5_weight", "consective_5_rank"], "item"),
        CandidateMatrix(similar_products1, ["similarity_score", "similarity_rank"], "item"),
        CandidateMatrix(similar_products2, ["bert_distance", "bert_rank"], "item"),
        CandidateMatrix(imf_candidates_test, ["imf_score", "imf_rank"], "session"),
        CandidateMatrix(bpr_candidates_test, ["bpr_score", "bpr_rank"], "session"),
        CandidateMatrix(prone_matrix_test, ["prone_distance", "prone_rank"], "item"),
    ]

    # item2vec (one Word2Vec model per locale)
    i2v_models_test = {}
    for locale in LOCALES:
        i2v_models_test[locale] = Word2Vec.load(DIR + f"models/task1/item2vec_{locale}_05_for_inference.model")

    # imf model: factor archive plus the id -> index maps for users (sessions) and items
    imf_model_test = {}
    user_id2index_test = {}
    item_id2index_test = {}
    for locale in LOCALES:
        imf_model_test[locale] = np.load(DIR + f"models/task1/imf_15_{locale}_model_for_inference.npz")
        with open(DIR + f"models/task1/imf_15_{locale}_user_id2index_for_inference.pickle", "rb") as f:
            user_id2index_test[locale] = pickle.load(f)
        with open(DIR + f"models/task1/imf_15_{locale}_item_id2index_for_inference.pickle", "rb") as f:
            item_id2index_test[locale] = pickle.load(f)

    # bpr model: same layout as imf
    bpr_model_test = {}
    bpr_user_id2index_test = {}
    bpr_item_id2index_test = {}
    for locale in LOCALES:
        bpr_model_test[locale] = np.load(DIR + f"models/task1/bpr_01_{locale}_model_for_inference.npz")
        with open(DIR + f"models/task1/bpr_01_{locale}_user_id2index_for_inference.pickle", "rb") as f:
            bpr_user_id2index_test[locale] = pickle.load(f)
        with open(DIR + f"models/task1/bpr_01_{locale}_item_id2index_for_inference.pickle", "rb") as f:
            bpr_item_id2index_test[locale] = pickle.load(f)

    # prone: graph embeddings plus the item id -> embedding index map
    graph_embs_test = {}
    item_id2indices_prone_test = {}
    for locale in LOCALES:
        graph_embs_test[locale] = np.load(DIR + f"models/task1/graph_embedding_03_{locale}_for_inference.npy")
        with open(DIR + "data/interim/graph/task1/graph_" + f"item_id2index_03_{locale}_for_inference.pickle", "rb") as f:
            item_id2indices_prone_test[locale] = pickle.load(f)

## functions

In [None]:
# functions for data processing
def generate_candidates(df: pl.DataFrame, candidate_matrices:List[CandidateMatrix]) -> pl.DataFrame:
    """Expand each session row into one row per candidate item, with candidate features attached.

    Steps:
      1. Add a ``last_item_{n}`` column for every n in LAST_NS (n-th item from the end of
         ``prev_items``, or None when the session is shorter than n).
      2. For every "item" matrix and every n, left-join the matrix on
         (last_item_{n}, locale) and keep session_id, candidate_item and the matrix features
         renamed with a ``_last{n}`` suffix.
      3. For every "session" matrix, left-join it on session_id and keep its features as-is.
      4. Candidates already contained in ``prev_items`` are filtered out in steps 2 and 3.
      5. The de-duplicated union of (session_id, candidate_item) pairs is left-joined back onto
         ``df``, then every per-source feature table is left-joined on that pair.

    Args:
        df: session table with at least session_id, locale and prev_items columns.
        candidate_matrices: candidate sources to draw from.

    Returns:
        ``df`` with the last_item_* columns, a candidate_item column and the candidate feature
        columns; features a source does not provide for a pair are null.
    """

    def nth_from_end(items, n):
        # Sessions shorter than n have no n-th last item.
        try:
            return items[-n]
        except IndexError:
            return None

    # add last_item columns
    for last_n in LAST_NS:
        picked = [nth_from_end(items, last_n) for items in df["prev_items"].to_list()]
        df = df.with_columns(pl.Series(name=f"last_item_{last_n}", values=picked))

    pair_cols = ["session_id", "candidate_item"]
    not_seen_yet = ~pl.col("candidate_item").is_in(pl.col("prev_items"))  # remove already purchased items
    item_sources = [cm for cm in candidate_matrices if cm.join_key == "item"]
    session_sources = [cm for cm in candidate_matrices if cm.join_key == "session"]

    # per-source tables (pair + features), kept for the feature joins at the end
    candidates = []

    # candidates tied to items
    for last_n in LAST_NS:
        for source in item_sources:
            joined = df.join(
                source.matrix,
                left_on=[f"last_item_{last_n}", "locale"],
                right_on=["item", "locale"],
                how="left",
            ).filter(not_seen_yet)
            suffixed = {name: f"{name}_last{last_n}" for name in source.feat_name}
            candidates.append(joined.select(pair_cols + source.feat_name).rename(suffixed))

    # candidates tied to session
    for source in session_sources:
        joined = df.join(source.matrix, on="session_id", how="left").filter(not_seen_yet)
        candidates.append(joined.select(pair_cols + source.feat_name))

    # union of all (session, candidate) pairs without duplicates
    pairs = pl.concat([table.select(pair_cols) for table in candidates])
    pairs = pairs.unique(subset=pair_cols)

    # one row per (session, candidate)
    df = df.join(pairs, on=["session_id"], how="left")

    # add features derived from each candidate source
    for table in candidates:
        df = df.join(table, on=pair_cols, how="left")

    return df


def add_label(df: pl.DataFrame) -> pl.DataFrame:
    """Add an Int8 ``label`` column comparing ``candidate_item`` with ``next_item`` (1 when equal, 0 otherwise)."""
    is_next_item = pl.col("candidate_item") == pl.col("next_item")
    return df.with_columns(is_next_item.cast(pl.Int8).alias("label"))

def filter_null(df: pl.DataFrame, candidate_matrices:List[CandidateMatrix]) -> pl.DataFrame:
    """Drop rows in which every candidate feature column is null.

    The feature column names are derived the same way generate_candidates() names them:
    ``{feat}_last{n}`` for "item" matrices (n in LAST_NS) and the plain feature names for
    "session" matrices.

    The row-wise "all null" test is built as an explicit ``&`` chain of ``is_null()``
    expressions. The previous ``pl.all(<expr>)`` form is version-dependent in polars
    (a horizontal fold in older releases; a column selector / vertical aggregation in
    newer ones), and polars is installed unpinned in this notebook.

    Args:
        df: candidate table produced by generate_candidates().
        candidate_matrices: the candidate sources that were used to build ``df``.

    Returns:
        ``df`` without the rows whose candidate features are all null. When no feature
        column can be derived (empty ``candidate_matrices``), ``df`` is returned unchanged.
    """
    feat_names = []
    for candidate_matrix in candidate_matrices:
        if candidate_matrix.join_key == "item":
            feat_names.extend(
                f"{feat_name}_last{last_n}"
                for last_n in LAST_NS
                for feat_name in candidate_matrix.feat_name
            )
        elif candidate_matrix.join_key == "session":
            feat_names.extend(candidate_matrix.feat_name)

    if not feat_names:
        # Nothing to test against, so there is nothing to filter.
        return df

    # Row-wise AND over all feature columns; valid on every polars version.
    all_features_null = pl.col(feat_names[0]).is_null()
    for feat_name in feat_names[1:]:
        all_features_null = all_features_null & pl.col(feat_name).is_null()

    return df.filter(~all_features_null)

def negative_sample(df: pl.DataFrame, fraction: float = 0.1) -> pl.DataFrame:
    """Down-sample the negative rows while keeping every positive row.

    Args:
        df: labelled candidate table (needs a ``label`` column).
        fraction: share of the ``label == 0`` rows to keep; defaults to the
            previously hard-coded 0.1.

    Returns:
        All rows with ``label > 0`` followed by the sampled ``label == 0`` rows.
        Sampling uses the notebook-wide SEED, so the same input yields the same sample.
    """
    positives = df.filter(df["label"] > 0)
    negatives = df.filter(df["label"] == 0).sample(fraction=fraction, seed=SEED)
    return pl.concat([positives, negatives])

def filter_session_not_include_positive(df: pl.DataFrame) -> pl.DataFrame:
    """Keep only the rows of sessions that have at least one row with ``label == 1``."""
    sessions_with_positive = df.filter(pl.col("label") == 1)["session_id"].to_list()
    in_positive_session = df["session_id"].is_in(sessions_with_positive)
    return df.filter(in_positive_session)

def add_features(
    df: pl.DataFrame,
    session_feat_df:pl.DataFrame, product_feat_df:pl.DataFrame,
    i2v_models:Dict[str, Word2Vec],
    imf_model, user_id2index, item_id2index,
    bpr_model, bpr_user_id2index, bpr_item_id2index,
    graph_embs, item_id2indices_prone) -> pl.DataFrame:
    """Attach session, product, session*product and embedding-similarity features to candidate rows.

    Args:
        df: candidate table with session_id, locale, candidate_item and the last_item_{n} columns.
        session_feat_df: session features, joined on session_id.
        product_feat_df: product features, joined on (candidate_item, locale) = (id, locale).
        i2v_models: per-locale Word2Vec models; ``.wv.similarity`` is used.
        imf_model / bpr_model: per-locale mappings exposing "user_factors" and "item_factors".
        user_id2index, item_id2index, bpr_user_id2index, bpr_item_id2index: per-locale
            id -> factor-row-index mappings (sessions are looked up as users).
        graph_embs: per-locale graph embedding arrays.
        item_id2indices_prone: per-locale item id -> graph embedding index mappings.

    Returns:
        ``df`` with the added feature columns. Rows are re-assembled locale by locale,
        so the row order of the input is not preserved.
    """

    # NOTE(review): defined inside add_features, so a new jitted function object is created on
    # every call - presumably recompiled for each chunk; confirm whether hoisting it is worthwhile.
    @njit()
    def calc_cos_sim(v1, v2):
        return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

    # session features
    df = df.join(session_feat_df, on="session_id", how="left")

    # product features
    df = df.join(product_feat_df, left_on=["candidate_item", "locale"], right_on=["id", "locale"], how="left")

    # session * product features
    df = df.with_columns([
        (pl.col("P_price") - pl.col("S_mean_price")).alias("SP_price_diff_to_mean_price"),
        (pl.col("P_price") - pl.col("S_min_price")).alias("SP_price_diff_to_min_price"),
        (pl.col("P_price") - pl.col("S_max_price")).alias("SP_price_diff_to_max_price"),
        (pl.col("P_price") - pl.col("S_last_item_price")).alias("SP_price_diff_to_last_price"),
        (pl.col("P_brand_mean_price") - pl.col("S_mean_price")).alias("SP_brand_price_diff_to_mean_price"),
        (pl.col("P_brand_mean_price") - pl.col("S_min_price")).alias("SP_brand_price_diff_to_min_price"),
        (pl.col("P_brand_mean_price") - pl.col("S_max_price")).alias("SP_brand_price_diff_to_max_price"),
        (pl.col("P_brand_mean_price") - pl.col("S_last_item_price")).alias("SP_brand_price_diff_to_last_price"),
    ])

    # 0/1 flags: does the candidate share an attribute with the n-th last item of the session?
    for last_n in LAST_NS:
        df = df.with_columns([
            ((pl.col("P_brand") == pl.col(f"S_brand_last{last_n}"))&(pl.col(f"S_brand_last{last_n}").is_not_null())).cast(pl.UInt8).alias(f"SP_same_brand_last{last_n}"),
            ((pl.col("P_color") == pl.col(f"S_color_last{last_n}"))&(pl.col(f"S_color_last{last_n}").is_not_null())).cast(pl.UInt8).alias(f"SP_same_color_last{last_n}"),
            ((pl.col("P_size") == pl.col(f"S_size_last{last_n}"))&(pl.col(f"S_size_last{last_n}").is_not_null())).cast(pl.UInt8).alias(f"SP_same_size_last{last_n}"),
            ((pl.col("P_model") == pl.col(f"S_model_last{last_n}"))&(pl.col(f"S_model_last{last_n}").is_not_null())).cast(pl.UInt8).alias(f"SP_same_model_last{last_n}"),
            ((pl.col("P_material") == pl.col(f"S_material_last{last_n}"))&(pl.col(f"S_material_last{last_n}").is_not_null())).cast(pl.UInt8).alias(f"SP_same_material_last{last_n}"),
            ((pl.col("P_author") == pl.col(f"S_author_last{last_n}"))&(pl.col(f"S_author_last{last_n}").is_not_null())).cast(pl.UInt8).alias(f"SP_same_author_last{last_n}"),
        ])
    # NOTE(review): the sums below name last1..last3 explicitly, so they require LAST_NS to contain 1, 2 and 3.
    df = df.with_columns([
        (pl.col("SP_same_brand_last1") + pl.col("SP_same_brand_last2") + pl.col("SP_same_brand_last3")).cast(pl.UInt8).alias("SP_same_brand_sum"),
        (pl.col("SP_same_color_last1") + pl.col("SP_same_color_last2") + pl.col("SP_same_color_last3")).cast(pl.UInt8).alias("SP_same_color_sum"),
        (pl.col("SP_same_size_last1") + pl.col("SP_same_size_last2") + pl.col("SP_same_size_last3")).cast(pl.UInt8).alias("SP_same_size_sum"),
        (pl.col("SP_same_model_last1") + pl.col("SP_same_model_last2") + pl.col("SP_same_model_last3")).cast(pl.UInt8).alias("SP_same_model_sum"),
        (pl.col("SP_same_material_last1") + pl.col("SP_same_material_last2") + pl.col("SP_same_material_last3")).cast(pl.UInt8).alias("SP_same_material_sum"),
        (pl.col("SP_same_author_last1") + pl.col("SP_same_author_last2") + pl.col("SP_same_author_last3")).cast(pl.UInt8).alias("SP_same_author_sum"),
    ])

    # imf similarity between the session (user factor) and the candidate (item factor)
    dfs = []
    for locale in list(df["locale"].unique()):
        df_by_locale = df.filter(pl.col("locale") == locale)

        sessions = df_by_locale["session_id"].to_list()
        candidates = df_by_locale["candidate_item"].to_list()
        user_index2vector = dict(enumerate(imf_model[locale]["user_factors"]))
        item_index2vector = dict(enumerate(imf_model[locale]["item_factors"]))
        imf_similarities = []
        for session, candidate in zip(sessions, candidates):
            try:
                user_index, item_index = user_id2index[locale][session], item_id2index[locale][candidate]
                v1, v2 = user_index2vector[user_index], item_index2vector[item_index]
                sim = calc_cos_sim(v1, v2)
            except (KeyError, TypeError): # KeyError if the session or item is missing from the imf id maps. TypeError (per the original note) if there are no candidates in a session.
                sim = 0
            imf_similarities.append(np.float32(sim))
        df_by_locale = df_by_locale.with_columns(pl.Series(name="imf_similarity", values=imf_similarities).cast(pl.Float32))
        dfs.append(df_by_locale)
    df = pl.concat(dfs)


    # bpr similarity between the session (user factor) and the candidate (item factor)
    dfs = []
    for locale in list(df["locale"].unique()):
        df_by_locale = df.filter(pl.col("locale") == locale)

        sessions = df_by_locale["session_id"].to_list()
        candidates = df_by_locale["candidate_item"].to_list()
        user_index2vector = dict(enumerate(bpr_model[locale]["user_factors"]))
        item_index2vector = dict(enumerate(bpr_model[locale]["item_factors"]))
        bpr_similarities = []
        for session, candidate in zip(sessions, candidates):
            try:
                user_index, item_index = bpr_user_id2index[locale][session], bpr_item_id2index[locale][candidate]
                v1, v2 = user_index2vector[user_index], item_index2vector[item_index]
                sim = calc_cos_sim(v1, v2)
            except (KeyError, TypeError): # KeyError if the session or item is missing from the bpr id maps. TypeError (per the original note) if there are no candidates in a session.
                sim = 0
            bpr_similarities.append(np.float32(sim))
        df_by_locale = df_by_locale.with_columns(pl.Series(name="bpr_similarity", values=bpr_similarities).cast(pl.Float32))
        dfs.append(df_by_locale)
    df = pl.concat(dfs)

    # NOTE(review): the two loops below iterate LOCALES (not the locales present in df), so rows
    # whose locale is not in LOCALES are dropped when the per-locale frames are concatenated.
    for last_n in LAST_NS:
        # item2vec similarity between last items and candidates
        dfs = []
        for locale in LOCALES:
            df_by_locale = df.filter(pl.col("locale") == locale)

            last_items = df_by_locale[f"last_item_{last_n}"].to_list()
            cand_items = df_by_locale["candidate_item"].to_list()
            item_similalities = []
            for last_item, cand_item in zip(last_items, cand_items):
                try:
                    sim = i2v_models[locale].wv.similarity(last_item, cand_item)
                except (KeyError, TypeError): # KeyError if the item is not in the item2vec training data. TypeError if there are no candidates in a session.
                    sim = -1
                item_similalities.append(np.float32(sim))
            df_by_locale = df_by_locale.with_columns(pl.Series(name=f"i2v_similarity_last{last_n}", values=item_similalities).cast(pl.Float32))
            dfs.append(df_by_locale)
        df = pl.concat(dfs)

        # prone (graph embedding) similarity between last items and candidates
        dfs = []
        for locale in LOCALES:
            df_by_locale = df.filter(pl.col("locale") == locale)
            last_items = df_by_locale[f"last_item_{last_n}"].to_list()
            cand_items = df_by_locale["candidate_item"].to_list()
            item_similalities = []
            item_index2vector = dict(enumerate(graph_embs[locale]))
            for last_item, cand_item in zip(last_items, cand_items):
                try:
                    item_index1 = item_id2indices_prone[locale][last_item]
                    item_index2 = item_id2indices_prone[locale][cand_item]
                    v1, v2 = item_index2vector[item_index1], item_index2vector[item_index2]
                    sim = calc_cos_sim(v1, v2)
                except (KeyError, TypeError): # KeyError if the item is missing from the graph embedding id map. TypeError (per the original note) if there are no candidates in a session.
                    sim = -1
                item_similalities.append(np.float32(sim))
            df_by_locale = df_by_locale.with_columns(pl.Series(name=f"graph_emb_similarity_last{last_n}", values=item_similalities).cast(pl.Float32))
            dfs.append(df_by_locale)
        df = pl.concat(dfs)
    return df

def fill_null_and_cast(df: pl.DataFrame) -> pl.DataFrame:
    """Fill nulls with a per-column sentinel and cast feature columns to compact dtypes.

    Sentinels: 0 for weights, scores, counts, prices and flags; -1 for the prone/bert
    distances; 999 for every rank column. The column set is fixed (last1..last3) and
    every listed column must exist in ``df``.

    Args:
        df: candidate table after add_features().

    Returns:
        ``df`` with the listed columns replaced by their filled and cast versions.
    """
    last_suffixes = ("last1", "last2", "last3")
    nullable_attrs = ("color", "size", "model", "material", "author")
    matched_attrs = ("brand",) + nullable_attrs

    def per_last(stems):
        # Expand each stem into its _last1/_last2/_last3 columns.
        return [f"{stem}_{suffix}" for suffix in last_suffixes for stem in stems]

    # (fill value, target dtype, columns)
    plan = [
        # candidate weights / scores / distances
        (0, pl.Float32, per_last(["co_visit_weight", "lift", "similarity_score"]) + ["imf_score", "bpr_score"]),
        (0, pl.UInt16, per_last(["consective_1_weight", "consective_3_weight", "consective_5_weight"])),
        (-1, pl.Float32, per_last(["prone_distance", "bert_distance"])),
        # candidate ranks
        (
            999,
            pl.UInt16,
            per_last([
                "co_visit_rank", "consective_1_rank", "consective_3_rank", "consective_5_rank",
                "lift_rank", "prone_rank", "bert_rank", "similarity_rank",
            ]) + ["imf_rank", "bpr_rank"],
        ),
        # session features
        (0, pl.UInt8, ["S_locale"]),
        (
            0,
            pl.UInt16,
            ["S_session_length", "S_nunique_item", "S_nunique_brand"]
            + [f"S_{attr}_not_null_count" for attr in nullable_attrs],
        ),
        (
            0,
            pl.Float32,
            [
                "S_ratio_unique_brand", "S_ratio_repurchase",
                "S_mean_price", "S_max_price", "S_min_price", "S_std_price",
                "S_last_item_price", "S_total_amount",
            ],
        ),
        # product features
        (0, pl.UInt8, ["P_n_unique_locale"] + [f"P_is_{attr}_null" for attr in nullable_attrs]),
        (
            0,
            pl.UInt32,
            [
                "P_purchase_count", "P_purchase_count_global",
                "P_brand_purchase_count", "P_brand_purchase_count_global",
                "P_locale_purchase_count",
            ],
        ),
        (
            0,
            pl.Float32,
            [
                "P_price", "P_total_amount", "P_total_locale_amount",
                "P_purchase_count_ratio_to_locale", "P_purchase_amount_ratio_to_locale",
                "P_purchase_count_ratio_to_brand", "P_purchase_amount_ratio_to_brand",
                "P_brand_mean_price", "P_brand_max_price", "P_brand_min_price", "P_brand_std_price",
                "P_total_brand_amount", "P_price_diff_to_avg_brand_price",
            ],
        ),
        # session * product features
        (
            0,
            pl.Float32,
            [f"SP_price_diff_to_{ref}_price" for ref in ("mean", "min", "max", "last")]
            + [f"SP_brand_price_diff_to_{ref}_price" for ref in ("mean", "min", "max", "last")],
        ),
        (
            0,
            pl.UInt8,
            [f"SP_same_{attr}_{suffix}" for attr in matched_attrs for suffix in last_suffixes]
            + [f"SP_same_{attr}_sum" for attr in matched_attrs],
        ),
    ]

    expressions = [
        pl.col(name).fill_null(fill_value).cast(dtype)
        for fill_value, dtype, names in plan
        for name in names
    ]
    return df.with_columns(expressions)

## fix seed

In [None]:
def seed_everything(seed):
    """Seed Python's ``random`` and NumPy's global RNG, and export PYTHONHASHSEED.

    NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
    here presumably only affects processes started afterwards - confirm if hash
    determinism in the current kernel matters.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    random.seed(seed)
# Seed the global RNGs once, before any data is processed.
seed_everything(SEED)

# Process data

## train

In [None]:
# Build the ranker training set chunk by chunk and write one parquet file per chunk.
if MAKE_TRAIN:
    n_rows = 50_000  # rows of `train` processed per chunk
    for idx, df in tqdm(enumerate(train.iter_slices(n_rows=n_rows)), total=math.ceil(train.height/n_rows)): # specify "total" parameter to display tqdm progress bar
        df = generate_candidates(df, candidate_matrices_train)
        df = df.drop("prev_items")
        df = add_label(df)
        df = filter_null(df, candidate_matrices_train)  # drop rows without any candidate feature
        df = filter_session_not_include_positive(df)  # keep only sessions whose true next item was generated
        df = negative_sample(df)  # down-sample negatives before the expensive feature step
        df = add_features(df, session_feat, product_feat_train, i2v_models_train, imf_model_train, user_id2index_train, item_id2index_train, bpr_model_train, bpr_user_id2index_train, bpr_item_id2index_train, graph_embs_train, item_id2indices_prone_train)
        df = fill_null_and_cast(df)
        df = df[["session_id", "candidate_item", "label"] + USE_FEATURES]  # keep ids, label and the model features only
        df.write_parquet(DIR + f"data/interim/for_ranker/task1/train_chunk_{EXP_NAME}_{idx}.parquet")

In [None]:
# Build the inference set chunk by chunk: no labels, no null filtering, no negative sampling.
if MAKE_TEST:
    n_rows = 15_000  # rows of `test` processed per chunk
    for idx, df in tqdm(enumerate(test.iter_slices(n_rows=n_rows)), total=math.ceil(test.height/n_rows)): # specify "total" parameter to display tqdm progress bar
        # process data
        df = generate_candidates(df, candidate_matrices_test)
        df = df.drop("prev_items")
        df = add_features(df, session_feat, product_feat_test, i2v_models_test, imf_model_test, user_id2index_test, item_id2index_test, bpr_model_test, bpr_user_id2index_test, bpr_item_id2index_test, graph_embs_test, item_id2indices_prone_test)
        df = fill_null_and_cast(df)
        # Unlike the train cell, every column is written (no USE_FEATURES selection).
        df.write_parquet(DIR + f"data/interim/for_ranker/task1/test_chunk_{EXP_NAME}_{idx}.parquet")

  return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
100%|██████████| 22/22 [7:19:50<00:00, 1199.57s/it]
