In [1]:
from src.data_reader import DataReader
from src.constant import *
import matplotlib.pyplot as plt
import numpy as np
from src.datasets.dataset import Dataset
from src.models.itemknn.itemknn import ItemKNN
from src.models.ease.ease import EASE
import pandas as pd
from src.evaluation import compute_mrr
from src.utils.sparse_matrix import interactions_to_sparse_matrix

In [2]:
# Instantiate the project's Dataset wrapper. The cells below use it to fetch the
# splits and the leaderboard/final test sessions, to build the EASE model, and to
# create the submission.
dataset = Dataset()

In [3]:
# Fetch all splits once, then unpack each (interactions, labels) pair in a single step.
split_dict = dataset.get_split()
(train, train_label), (val, val_label), (test, test_label) = (
    split_dict[split_key] for split_key in (TRAIN, VAL, TEST)
)

In [16]:
from datetime import datetime, timedelta

# Length of the training window, counted back from the latest DATE value in `train`.
# NOTE(review): the submission created at the end of this notebook is named "Ease_3m",
# yet 120 days is closer to four months than three — confirm which one is intended.
train_window = timedelta(days=120)

max_date = train[DATE].max()
train_limit_date = max_date - train_window

In [17]:
# Display the cutoff timestamp; the next cell keeps only training rows dated after it.
train_limit_date

Timestamp('2020-12-31 23:41:03.992000')

In [18]:
# Keep only the training rows dated strictly after the cutoff. Rows are filtered
# individually, so a session that straddles the cutoff keeps only its later rows.
is_recent = train[DATE] > train_limit_date
filtered_train = train.loc[is_recent].copy()

# Keep a label only when its session still has at least one row inside the window.
id_filtered_train = filtered_train[SESS_ID].unique()
in_window_session = train_label[SESS_ID].isin(id_filtered_train)
filtered_train_label = train_label.loc[in_window_session].copy()


In [19]:
# Load the complete training sessions and the leaderboard / final test sessions.
# `full_data` is not referenced by any other cell shown in this notebook.
full_data = dataset.get_train_sessions()
# Original note: "we can not use that for the final submission" — presumably this
# refers to the leaderboard sessions loaded on the next line; TODO confirm.
lead_data = dataset.get_test_leaderboard_sessions()
final_data = dataset.get_test_final_sessions()

In [33]:
# Per-row weights: every interaction frame gets 1, every label frame gets 5.
interaction_weight = 1
label_weight = 5

# The column is added in place to each frame.
for _frame in (train, filtered_train, val, test, lead_data, final_data):
    _frame["sample_weight"] = interaction_weight

for _frame in (train_label, filtered_train_label, val_label, test_label):
    _frame["sample_weight"] = label_weight

In [21]:
# Inspect the windowed label frame, including its sample_weight column.
filtered_train_label

Unnamed: 0,session_id,item_id,date,sample_weight
6,31,7029,2021-04-20 19:46:42.594,5
8,42,9930,2021-03-01 15:17:04.264,5
20,140,19823,2021-04-28 13:45:31.202,5
22,153,17060,2021-01-16 19:40:36.012,5
29,184,344,2021-04-04 01:43:26.238,5
...,...,...,...,...
918360,4439927,14971,2021-03-29 16:25:42.912,5
918361,4439928,18299,2021-02-27 18:32:02.882,5
918364,4439936,6792,2021-04-16 14:52:33.804,5
918368,4439952,10819,2021-03-03 19:34:55.343,5


In [40]:
# Stack the windowed interactions and their labels row-wise into one training frame.
train_frames = [filtered_train, filtered_train_label]
train_pur = pd.concat(train_frames, axis=0)

In [41]:
# Inspect the combined training frame.
# NOTE(review): the saved output's last rows (May 2021 dates) do not match the tail of
# `filtered_train_label` displayed earlier, so this output looks stale relative to the
# current cell definitions — re-run from a fresh kernel to confirm.
train_pur

Unnamed: 0,session_id,item_id,date,sample_weight
36,31,21891,2021-04-20 19:38:03.816,1
37,31,13739,2021-04-20 19:39:17.092,1
38,31,1725,2021-04-20 19:40:21.952,1
39,31,23209,2021-04-20 19:43:43.813,1
40,31,22292,2021-04-20 19:41:29.179,1
...,...,...,...,...
40805,4439376,14295,2021-05-01 15:53:07.612,5
40806,4439488,11403,2021-05-01 16:59:11.558,5
40807,4439680,17813,2021-05-21 16:22:54.437,5
40808,4439898,20251,2021-05-25 23:07:08.889,5


In [42]:
# Hyperparameters of the EASE model evaluated on the validation split; the submission
# model further down is built with the same values.
ease_params = {"time_weight": 50, "l2": 1e-1}
model = EASE(dataset, **ease_params)

In [43]:
# Fit the model on the windowed training frame (interactions + labels, with sample weights).
model.compute_similarity_matrix(train_pur)

In [44]:
# Recommend for the validation sessions with a cutoff of 100.
# NOTE(review): leaderboard=True is passed even though this is the validation split; the
# log printed by this call mentions "white list items" — confirm that restriction is
# intended for offline evaluation.
recommend_options = {"remove_seen": True, "cutoff": 100, "leaderboard": True}
recs = model.recommend(interactions=val, **recommend_options)

Using Time Weight on Interaction matrix
Considering white list items...


In [45]:
# Score the validation recommendations against the validation labels; the call both
# prints the MRR and returns it (see the output below).
compute_mrr(recs, val_label)

MRR: 0.21932694224620133


0.21932694224620133

# Create Submission

In [28]:
# Training data for the submission model: the windowed training frames plus the full
# validation and test splits (interactions and labels).
# Alternative kept from the original cell: swap the first two entries for
# `train, train_label` to fit on the unfiltered training history instead.
submission_frames = [
    filtered_train, filtered_train_label,
    val, val_label,
    test, test_label,
]
concat_full_data = pd.concat(submission_frames, axis=0)

In [29]:
# Fresh EASE model for the submission, using the same hyperparameters as the validation run.
ease_params = {"time_weight": 50, "l2": 1e-1}
model = EASE(dataset, **ease_params)

In [30]:
# Fit the submission model on the combined train-window + validation + test frame.
model.compute_similarity_matrix(concat_full_data)

In [31]:
# Recommend for the leaderboard test sessions with a cutoff of 100.
recommend_options = {"remove_seen": True, "cutoff": 100, "leaderboard": True}
recs = model.recommend(interactions=lead_data, **recommend_options)

Using Time Weight on Interaction matrix
Considering white list items...


In [32]:
# Create the submission from the leaderboard recommendations.
# NOTE(review): the name says "3m", but the training window above is 120 days — confirm
# the name matches the data actually used.
dataset.create_submission(recs, sub_name="Ease_3m")

Submission with name: Ease_3m created succesfully!
