In [1]:
from src.data_reader import DataReader
from src.constant import *
import matplotlib.pyplot as plt
import numpy as np
from src.datasets.dataset import Dataset
from src.models.itemknn.itemknn import ItemKNN
import pandas as pd
from src.evaluation import compute_mrr

In [2]:
# Project dataset handle: the cells below read the train/val/test split, the
# leaderboard/final session frames and the output folders from this object.
dataset = Dataset()

In [3]:
# Each split entry unpacks into a (sessions, labels) pair; pull all three out in one pass.
split_dict = dataset.get_split()
(train, train_label), (val, val_label), (test, test_label) = (
    split_dict[split_name] for split_name in (TRAIN, VAL, TEST)
)

# Validation and test are scored together further down, so stack their
# sessions and their labels in the same (val, test) order.
val_test = pd.concat([val, test], axis=0)
val_test_label = pd.concat([val_label, test_label], axis=0)

In [4]:
from datetime import datetime, timedelta

# Length of the training window, counted back from the most recent training
# interaction. Named so the cut-off can be tuned in one place.
RECENT_TRAIN_WINDOW_DAYS = 150

# Cut-off date: rows dated strictly after it count as "recent".
max_date = train[DATE].max()
train_limit_date = max_date - timedelta(days=RECENT_TRAIN_WINDOW_DAYS)

# Rows inside the window; only read to find which sessions qualify. Boolean
# indexing already returns a new frame, so no explicit .copy() is needed.
filtered_train = train[train[DATE] > train_limit_date]
id_filtered_train = filtered_train[SESS_ID].unique()

# Keep every row (and every label) of a session that has at least one recent
# row, so qualifying sessions are kept whole rather than truncated at the cut-off.
final_train_data = train[train[SESS_ID].isin(id_filtered_train)]
final_train_label = train_label[train_label[SESS_ID].isin(id_filtered_train)]

In [5]:
# Training sessions as returned by the dataset; not referenced again in the cells shown below.
full_data = dataset.get_train_sessions()
# NOTE(review): the original note here says "that" cannot be used for the final
# submission. Presumably it refers to the leaderboard sessions loaded on the
# next line, but it could also mean full_data above -- TODO confirm.
lead_data = dataset.get_test_leaderboard_sessions()
final_data = dataset.get_test_final_sessions()

In [6]:
# Stack the recent-window session rows and their labels into the single frame
# the similarity matrix is fitted on.
train_pur = pd.concat(
    [final_train_data, final_train_label],
    axis=0,
)

In [7]:
# ItemKNN used for the hold-out evaluation; the same hyperparameters are
# reused later when the model is refit for the submission files.
model = ItemKNN(
    dataset,
    topk=1000,
    shrink=100,
    time_weight=None,
)

In [8]:
# Fit the similarity matrix on the recent training window only (sessions + labels),
# leaving validation and test unseen for the evaluation below.
model.compute_similarity_matrix(train_pur)

Done: 100%|██████████| 23691/23691 [00:00<00:00, 31430.00it/s]


In [9]:
# Recommendations for the combined validation+test sessions (cutoff=100).
# NOTE(review): leaderboard=False here vs. True for the submission cells; the
# exact effect of the flag is defined in ItemKNN.recommend -- confirm there.
recs = model.recommend(interactions=val_test, remove_seen=True, cutoff=100, leaderboard=False)

Considering white list items...


In [10]:
# Hold-out MRR on validation+test. compute_mrr prints the score and also
# returns it, which is why the value shows up twice in the cell output.
compute_mrr(recs, val_test_label)

MRR: 0.14006456846058862


0.14006456846058862

In [11]:
# Persist the hold-out recommendations in the dataset's train-recs folder.
# The index is dropped before writing, so the file carries a default index.
train_recs_path = dataset.get_train_recs_df_folder() / "item_knn.feather"
recs.reset_index(drop=True).to_feather(train_recs_path)

In [12]:
# Refit data for the submission runs: the recent training window plus the
# validation and test splits, sessions and labels alike.
concat_full_data = pd.concat(
    [
        final_train_data, final_train_label,
        val, val_label,
        test, test_label,
    ],
    axis=0,
)

In [13]:
# Fresh ItemKNN with the same hyperparameters as the hold-out run; it replaces
# the evaluation model and is refit on train window + val + test below.
model = ItemKNN(
    dataset,
    topk=1000,
    shrink=100,
    time_weight=None,
)

In [14]:
# Refit the similarity matrix on the recent train window plus val and test,
# before producing the leaderboard and final recommendation files.
model.compute_similarity_matrix(concat_full_data)

Done: 100%|██████████| 23691/23691 [00:00<00:00, 30010.74it/s]


In [15]:
# Recommendations for the leaderboard test sessions from the refit model,
# written to the dataset's leaderboard-recs folder with the index dropped.
recs_lead = model.recommend(interactions=lead_data, remove_seen=True, cutoff=100, leaderboard=True)
lead_recs_path = dataset.get_leaderboard_recs_df_folder() / "item_knn.feather"
recs_lead.reset_index(drop=True).to_feather(lead_recs_path)

Considering white list items...


In [17]:
# Recommendations for the final test sessions from the refit model, written to
# the dataset's final-recs folder with the index dropped.
# NOTE(review): leaderboard=True is passed for the final sessions as well;
# confirm in ItemKNN.recommend that this is the intended setting.
recs_final = model.recommend(interactions=final_data, remove_seen=True, cutoff=100, leaderboard=True)
final_recs_path = dataset.get_final_recs_df_folder() / "item_knn.feather"
recs_final.reset_index(drop=True).to_feather(final_recs_path)

Considering white list items...
