In [1]:
from src.data_reader import DataReader
from src.constant import *
import matplotlib.pyplot as plt
import numpy as np
from src.datasets.dataset import Dataset
from src.models.itemknn.itemknn import ItemKNN
from src.models.ease.ease import EASE
from src.models.content_ease.content_ease import CEASE
from src.models.hybrid_item_sim.hybrid_item_sim import HybridItemSimilarity
import pandas as pd
from src.evaluation import compute_mrr, map_at_k
from src.utils.sparse_matrix import interactions_to_sparse_matrix

In [2]:
dataset = Dataset()

In [3]:
split_dict = dataset.get_split()
train, train_label = split_dict[TRAIN]
val, val_label = split_dict[VAL]
test, test_label = split_dict[TEST]

val_test = pd.concat([val, test])
val_test_label = pd.concat([val_label, test_label])

In [4]:
from datetime import datetime, timedelta
max_date = train[DATE].max()
train_limit_date = max_date - timedelta(days=150)
filtered_train = train[train[DATE] > train_limit_date].copy()
id_filtered_train = filtered_train[SESS_ID].unique()

final_train_data = train[train[SESS_ID].isin(id_filtered_train)]
final_train_label = train_label[train_label[SESS_ID].isin(id_filtered_train)]

In [5]:
full_data = dataset.get_train_sessions()
# we can not use that for the final submission
lead_data = dataset.get_test_leaderboard_sessions()
final_data = dataset.get_test_final_sessions()

In [6]:
train["sample_weight"] = 1
train_label["sample_weight"] = 5

final_train_data["sample_weight"] = 1
final_train_label["sample_weight"] = 5

val["sample_weight"] = 1
val_label["sample_weight"] = 5

test["sample_weight"] = 1
test_label["sample_weight"] = 5

lead_data["sample_weight"] = 1
final_data["sample_weight"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_train_data["sample_weight"] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_train_label["sample_weight"] = 5


In [7]:
train_pur = pd.concat([final_train_data, final_train_label], axis=0)

In [8]:
cease = CEASE(dataset, time_weight=None, l2=1e-1)
ease = EASE(dataset, time_weight=None, l2=1e-1)


In [9]:
hybrid_m = HybridItemSimilarity(dataset=dataset, model_list=[ease, cease], model_weight_list=[0.3, 0.7],
                                normalization=None, normalization_axis=1, time_weight=50)

[0.3, 0.7]


In [10]:
hybrid_m.compute_similarity_matrix(train_pur)

(23691, 23691)
Computing inverse


In [11]:
recs = hybrid_m.recommend(
    interactions=val_test,
    remove_seen=True,
    cutoff=100,
    leaderboard=False
)

Using Time Weight on Interaction matrix
Considering white list items...


In [13]:
compute_mrr(recs, val_test_label)

MRR: 0.1856782616100282


0.1856782616100282

In [14]:
recs.reset_index(drop=True).to_feather(dataset.get_train_recs_df_folder() / "hybrid_ease_tw.feather")

## SUBMISSION

In [15]:
concat_full_data = pd.concat([final_train_data, final_train_label, val, val_label, test, test_label], axis=0)

In [16]:
cease = CEASE(dataset, time_weight=None, l2=1e-1)
ease = EASE(dataset, time_weight=None, l2=1e-1)


In [17]:
hybrid_m = HybridItemSimilarity(dataset=dataset, model_list=[ease, cease], model_weight_list=[0.3, 0.7],
                                normalization=None, normalization_axis=1, time_weight=50)

[0.3, 0.7]


In [18]:
hybrid_m.compute_similarity_matrix(concat_full_data)

(23691, 23691)
Computing inverse


In [19]:
recs_lead = hybrid_m.recommend(
    interactions=lead_data,
    remove_seen=True,
    cutoff=100,
    leaderboard=True
)
recs_lead.reset_index(drop=True).to_feather(dataset.get_leaderboard_recs_df_folder() / "hybrid_ease_tw.feather")

Using Time Weight on Interaction matrix
Considering white list items...


In [20]:
recs_final = hybrid_m.recommend(
    interactions=final_data,
    remove_seen=True,
    cutoff=100,
    leaderboard=True
)
recs_final.reset_index(drop=True).to_feather(dataset.get_final_recs_df_folder() / "hybrid_ease_tw.feather")

Using Time Weight on Interaction matrix
Considering white list items...


In [None]:
#dataset.create_submission(recs, sub_name="CEASE_tw50_0.3_0.7")