In [1]:
from src.data_reader import DataReader
from src.constant import *
import matplotlib.pyplot as plt
import numpy as np
from src.datasets.dataset import Dataset
from src.models.itemknn.itemknn import ItemKNN
from src.models.ease.ease import EASE
from src.models.content_ease.content_ease import CEASE
from src.models.cb_iknn.cb_iknn import CBItemKNN
from src.models.hybrid_item_sim.hybrid_item_sim import HybridItemSimilarity
import pandas as pd
from src.evaluation import compute_mrr

In [2]:
# Project `Dataset` object; later cells obtain the splits, the session tables
# and the submission file through it.
dataset = Dataset()

In [3]:
# Container of data splits; it is indexed with the TRAIN and VAL constants in the next cells.
split_dict = dataset.get_split()

In [4]:
# Training split, unpacked as a pair. `train_label` is concatenated below as the
# purchase rows; `train` itself is not used by any cell visible in this notebook.
train, train_label = split_dict[TRAIN]

In [5]:
# Validation split, unpacked as a pair: `val` is passed to recommend() and
# `val_label` to compute_mrr() further down.
val, val_label = split_dict[VAL]

In [6]:
# Session interactions returned by the dataset for the training set.
# NOTE(review): this is not the `train` split unpacked above — confirm that the
# validation sessions evaluated later are not leaking into the fitted data.
full_data = dataset.get_train_sessions()
# Stack the training-split purchase rows underneath the session rows.
train_pur = pd.concat(
    objs=[full_data, train_label],
    axis="index",
)

In [7]:
# Keep only the interactions that happen shortly before the end of their session.
#
# "last_buy" / "first_buy" hold the latest / earliest DATE of each SESS_ID group.
# The aggregations are passed by name ("max"/"min") rather than as the Python
# builtins: recent pandas versions emit a FutureWarning for builtin callables
# in groupby.transform.
train_pur["last_buy"] = train_pur.groupby(SESS_ID)[DATE].transform("max")
train_pur["first_buy"] = train_pur.groupby(SESS_ID)[DATE].transform("min")

# time_score = 1 / (hours between the row and the session's last event + 1):
# 1.0 for the last event, decaying towards 0 for older ones.
# `.dt.total_seconds()` is the vectorised equivalent of applying
# `Timedelta.total_seconds()` row by row (assumes DATE is a datetime64 column).
train_pur["time_score"] = 1 / (
    (train_pur["last_buy"] - train_pur[DATE]).dt.total_seconds() / 3600 + 1
)

# time_score >= 0.7  <=>  at most 3/7 h (about 26 minutes) before the last event.
# `.copy()` detaches the result from the unfiltered frame so that re-running this
# cell (which assigns columns again) does not operate on a slice.
train_pur = train_pur[train_pur["time_score"] >= 0.7].copy()

In [8]:
# The two base models that the hybrid in the next cell combines.
# Both are built on the same dataset with time_weight=None and l2=1e-1
# (presumably the L2 regularisation strength — confirm in the model classes).
cease = CEASE(dataset, time_weight=None, l2=1e-1)
ease = EASE(dataset, time_weight=None, l2=1e-1)


In [29]:
# Hybrid of the two models with weights listed in the same order as model_list
# (0.4 alongside `ease`, 0.6 alongside `cease`), no normalization, time_weight=50.
# NOTE(review): the stored output of this cell shows [0.3, 0.7], not [0.4, 0.6],
# and the execution counter jumps from 8 to 29 — the validation MRR recorded
# below may belong to a different weighting. Restart and re-run before trusting it.
hybrid_m = HybridItemSimilarity(dataset=dataset, model_list=[ease, cease], model_weight_list=[0.4, 0.6],
                                normalization=None, normalization_axis=1, time_weight=50)

[0.3, 0.7]


In [30]:
# Compute the hybrid similarity matrix from the time-filtered interactions built above.
hybrid_m.compute_similarity_matrix(train_pur)

(23691, 23691)
Computing inverse


In [31]:
# Recommendations for the validation sessions with cutoff=100 and remove_seen=True.
# leaderboard=False here; the submission section calls the same method with
# leaderboard=True.
recs = hybrid_m.recommend(
    interactions=val,
    remove_seen=True,
    cutoff=100,
    leaderboard=False
)

Using Time Weight on Interaction matrix


In [32]:
# Score the validation recommendations against `val_label`; the value is both
# printed by the function and returned (see the cell output).
compute_mrr(recs, val_label)

MRR: 0.17904181682927253


0.17904181682927253

## SUBMISSION

In [39]:
# Session tables for the leaderboard test set, the final test set and the
# training set, plus the training purchases.
lead_data = dataset.get_test_leaderboard_sessions()
final_data = dataset.get_test_final_sessions()
full_data = dataset.get_train_sessions()
full_label = dataset.get_train_purchases()
# Stack all four tables row-wise into the interaction table used for the submission model.
concat_full_data = pd.concat(
    objs=[full_data, full_label, lead_data, final_data],
    axis="index",
)

In [40]:
# Keep only the interactions that happen shortly before the end of their session.
#
# "last_buy" / "first_buy" hold the latest / earliest DATE of each SESS_ID group.
# The aggregations are passed by name ("max"/"min") rather than as the Python
# builtins: recent pandas versions emit a FutureWarning for builtin callables
# in groupby.transform.
concat_full_data["last_buy"] = concat_full_data.groupby(SESS_ID)[DATE].transform("max")
concat_full_data["first_buy"] = concat_full_data.groupby(SESS_ID)[DATE].transform("min")

# time_score = 1 / (hours between the row and the session's last event + 1):
# 1.0 for the last event, decaying towards 0 for older ones.
# `.dt.total_seconds()` is the vectorised equivalent of applying
# `Timedelta.total_seconds()` row by row (assumes DATE is a datetime64 column).
concat_full_data["time_score"] = 1 / (
    (concat_full_data["last_buy"] - concat_full_data[DATE]).dt.total_seconds() / 3600 + 1
)

# time_score >= 0.7  <=>  at most 3/7 h (about 26 minutes) before the last event.
# `.copy()` detaches the result from the unfiltered frame so that re-running this
# cell (which assigns columns again) does not operate on a slice.
concat_full_data = concat_full_data[concat_full_data["time_score"] >= 0.7].copy()

In [41]:
# Fresh base models for the submission run, built with the same arguments as in
# the validation section (time_weight=None, l2=1e-1).
cease = CEASE(dataset, time_weight=None, l2=1e-1)
ease = EASE(dataset, time_weight=None, l2=1e-1)


In [42]:
# Hybrid used for the submission: weights listed in the same order as model_list
# (0.4 alongside `ease`, 0.6 alongside `cease`), no normalization, time_weight=50.
# The stored output of this cell ([0.4, 0.6]) matches the weights passed here.
hybrid_m = HybridItemSimilarity(dataset=dataset, model_list=[ease, cease], model_weight_list=[0.4, 0.6],
                                normalization=None, normalization_axis=1, time_weight=50)

[0.4, 0.6]


In [43]:
# Compute the hybrid similarity matrix from the time-filtered train + test interactions.
hybrid_m.compute_similarity_matrix(concat_full_data)

(23691, 23691)
Computing inverse


In [44]:
# Recommendations for the leaderboard test sessions with cutoff=100 and remove_seen=True.
# With leaderboard=True the run additionally logs "Considering white list items..."
# (see the cell output). Only `lead_data` is scored here, not `final_data`.
recs = hybrid_m.recommend(
    interactions=lead_data,
    remove_seen=True,
    cutoff=100,
    leaderboard=True
)

Using Time Weight on Interaction matrix
Considering white list items...


In [45]:
# Write the leaderboard recommendations out as the submission named "0.7_CEASE_tw".
dataset.create_submission(recs, sub_name="0.7_CEASE_tw")

Submission with name: 0.7_CEASE_tw created succesfully!
