In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import requests
from tqdm.auto import tqdm
from scipy.stats import mode 
from pprint import pprint
from implicit.nearest_neighbours import CosineRecommender
import warnings
warnings.filterwarnings("ignore")

from rectools import Columns

# Notebook-wide pandas display settings: no limit on the number of columns
# shown and up to 200 characters per cell before truncation.
DISPLAY_OPTIONS = {
    'display.max_columns': None,
    'display.max_colwidth': 200,
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

In [3]:
# Load the raw interaction log and rename its columns to the names RecTools
# expects: the last-watch date becomes the datetime column and the total
# duration becomes the interaction weight.
# The rename is chained (no inplace mutation) so the cell is a single
# expression that can be re-run safely.
interactions = (
    pd.read_csv('../data/kion_train/interactions.csv')
    .rename(columns={'last_watch_dt': Columns.Datetime,
                     'total_dur': Columns.Weight})
)

# Parse the datetime column; it is addressed through Columns.Datetime, the
# same constant used in the rename above, instead of a hard-coded name.
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime])

In [6]:
from rectools.model_selection import TimeRangeSplitter
from rectools.dataset import Interactions

# Cross-validation layout: number of test folds and the length of each fold.
n_folds = 1
unit = "W"
n_units = 1
periods = n_folds + 1  # one more border than folds
freq = str(n_units) + unit

# Midnight of the most recent interaction day, and a point
# (n_folds * n_units + 1) units before it.
last_date = interactions[Columns.Datetime].max().normalize()
lookback = pd.Timedelta(n_folds * n_units + 1, unit=unit)
start_date = last_date - lookback
print(f"Start date and last date of the test fold: {start_date, last_date}")

# NOTE(review): in the recorded run the borders are 2021-08-08 and 2021-08-15
# while last_date is 2021-08-22, so the final week looks like it falls outside
# the test fold — confirm this is intended.
date_range = pd.date_range(
    start=start_date,
    periods=periods,
    freq=freq,
    tz=last_date.tz,
)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# Fold generator; the three filter flags are all switched on.
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_cold_users=True,
    filter_cold_items=True,
    filter_already_seen=True,
)
print(f"Real number of folds: {cv.get_n_splits(Interactions(interactions))}")

Start date and last date of the test fold: (Timestamp('2021-08-08 00:00:00'), Timestamp('2021-08-22 00:00:00'))
Test fold borders: ['2021-08-08' '2021-08-15']
Real number of folds: 1


In [8]:
# Take the first split produced by the splitter. The built-in next() is the
# idiomatic way to advance an iterator, rather than calling __next__ directly.
train_ids, test_ids, fold_info = next(
    cv.split(Interactions(interactions), collect_fold_stats=True)
)

In [9]:
# Preview the train-part row indices returned by the split (the array repr is abbreviated).
train_ids

array([      0,       1,       2, ..., 5476245, 5476247, 5476249])

In [10]:
# Preview the test-part row indices returned by the split (the array repr is abbreviated).
test_ids

array([      6,      33,      56, ..., 5476229, 5476230, 5476240])

In [11]:
# The splitter returns positional row numbers, so rows are selected by
# position (.iloc). Label-based .loc only gives the same rows while
# `interactions` keeps its default 0..n-1 index.
train = interactions.iloc[train_ids]
test = interactions.iloc[test_ids]

# Two-way lookups between the external ids seen in train and consecutive
# integers, numbered in order of first appearance.
users_inv_mapping = dict(enumerate(train['user_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}

items_inv_mapping = dict(enumerate(train['item_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}

print(f"users_mapping amount: {len(users_mapping)}")
print(f"items_mapping amount: {len(items_mapping)}")

users_mapping amount: 842129
items_mapping amount: 15404


In [12]:
from rectools.dataset import Dataset

# Wrap the training interactions in a RecTools Dataset; no user or item
# features are supplied.
dataset = Dataset.construct(interactions_df=train, user_features_df=None, item_features_df=None)

# ItemKNN CosineRecommender

In [13]:
from implicit.nearest_neighbours import CosineRecommender
from rectools.models.implicit_knn import ImplicitItemKNNWrapperModel

# Cosine-similarity recommender from implicit (K=30), wrapped for RecTools
# and fitted on the training dataset.
cosine_base_model = CosineRecommender(K=30)
item_knn = ImplicitItemKNNWrapperModel(model=cosine_base_model)
item_knn.fit(dataset);

In [14]:
# Ten recommendations for every user that appears in the test part.
# NOTE(review): filter_viewed=False presumably allows items a user already
# has in train to be recommended, while the splitter was created with
# filter_already_seen=True — confirm this combination is intended.
test_user_ids = test['user_id'].unique()
recs_itemknn = item_knn.recommend(test_user_ids, dataset=dataset, k=10, filter_viewed=False)

In [15]:
# Show the first rows of the cosine item-KNN recommendations.
recs_itemknn.head()

Unnamed: 0,user_id,item_id,score,rank
0,1016458,10440,20431.63115,1
1,1016458,734,8043.999962,2
2,1016458,12192,8033.59953,3
3,1016458,1986,7999.805731,4
4,1016458,4457,7763.204607,5


In [19]:
from rectools.metrics import Precision, Recall, MeanInvUserFreq, MAP, Serendipity, calc_metrics

# Items seen in train serve as the catalog passed to the metric calculation.
catalog = train['item_id'].unique()

# Classic accuracy metrics (precision, recall, MAP) plus "beyond accuracy"
# ones (novelty, serendipity), all at cut-off 10.
metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "MAP@10": MAP(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

# Score the cosine item-KNN recommendations against the test interactions;
# train is passed as the previous interactions.
metric_values_itemknn_cosine = calc_metrics(
    metrics,
    reco=recs_itemknn,
    interactions=test,
    prev_interactions=train,
    catalog=catalog,
)

metric_values_itemknn_cosine

{'prec@10': 0.017311708814214132,
 'recall@10': 0.09520897568691472,
 'MAP@10': 0.023145528903990274,
 'novelty': 8.05318572965277,
 'serendipity': 6.63288816067437e-05}

# ItemKNN TFIDFRecommender

In [22]:
from implicit.nearest_neighbours import TFIDFRecommender
from rectools.models.implicit_knn import ImplicitItemKNNWrapperModel

# TF-IDF recommender from implicit (K=30), wrapped for RecTools and fitted
# on the training dataset.
tfidf_base_model = TFIDFRecommender(K=30)
item_knn_tfidf = ImplicitItemKNNWrapperModel(model=tfidf_base_model)
item_knn_tfidf.fit(dataset);

In [24]:
# Ten TF-IDF item-KNN recommendations for every user in the test part.
# NOTE(review): filter_viewed=False presumably allows items a user already
# has in train to be recommended, while the splitter was created with
# filter_already_seen=True — confirm this combination is intended.
test_user_ids = test['user_id'].unique()
recs_itemknn_tfidf = item_knn_tfidf.recommend(test_user_ids, dataset=dataset, k=10, filter_viewed=False)

In [26]:
# Show the first rows of the TF-IDF item-KNN recommendations.
recs_itemknn_tfidf.head()

Unnamed: 0,user_id,item_id,score,rank
0,1016458,10440,21745.376927,1
1,1016458,4457,10234.863308,2
2,1016458,7102,8987.878129,3
3,1016458,12192,8957.109813,4
4,1016458,1986,8369.832448,5


In [33]:
from rectools.metrics import Precision, Recall, MeanInvUserFreq, MAP, Serendipity, calc_metrics

# Items seen in train serve as the catalog passed to the metric calculation.
catalog = train['item_id'].unique()

# Same metric set as for the other models: accuracy metrics plus novelty
# and serendipity, all at cut-off 10.
metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "MAP@10": MAP(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

# Score the TF-IDF item-KNN recommendations against the test interactions;
# train is passed as the previous interactions.
metric_values_itemknn_tfidf = calc_metrics(
    metrics,
    reco=recs_itemknn_tfidf,
    interactions=test,
    prev_interactions=train,
    catalog=catalog,
)

metric_values_itemknn_tfidf

{'prec@10': 0.023772589549238603,
 'recall@10': 0.12652382351172245,
 'MAP@10': 0.03005237337960426,
 'novelty': 6.699663403861505,
 'serendipity': 0.00010222896681730396}

# ItemKNN BM25Recommender

In [28]:
from implicit.nearest_neighbours import BM25Recommender
from rectools.models.implicit_knn import ImplicitItemKNNWrapperModel

# BM25 recommender from implicit (K=30), wrapped for RecTools and fitted on
# the training dataset.
bm25_base_model = BM25Recommender(K=30)
item_knn_bmp = ImplicitItemKNNWrapperModel(model=bm25_base_model)
item_knn_bmp.fit(dataset);

In [29]:
# Ten BM25 item-KNN recommendations for every user in the test part.
# NOTE(review): filter_viewed=False presumably allows items a user already
# has in train to be recommended, while the splitter was created with
# filter_already_seen=True — confirm this combination is intended.
test_user_ids = test['user_id'].unique()
recs_itemknn_bmp = item_knn_bmp.recommend(test_user_ids, dataset=dataset, k=10, filter_viewed=False)

# Show the first rows of the result.
recs_itemknn_bmp.head()

Unnamed: 0,user_id,item_id,score,rank
0,1016458,10440,685454700000.0,1
1,1016458,15297,232313800000.0,2
2,1016458,13865,172474000000.0,3
3,1016458,9728,138320800000.0,4
4,1016458,4151,114935800000.0,5


In [31]:
from rectools.metrics import Precision, Recall, MeanInvUserFreq, MAP, Serendipity, calc_metrics

# Items seen in train serve as the catalog passed to the metric calculation.
catalog = train['item_id'].unique()

# Same metric set as for the other models: accuracy metrics plus novelty
# and serendipity, all at cut-off 10.
metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "MAP@10": MAP(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

# Score the BM25 item-KNN recommendations against the test interactions;
# train is passed as the previous interactions.
metric_values_itemknn_bmp = calc_metrics(
    metrics,
    reco=recs_itemknn_bmp,
    interactions=test,
    prev_interactions=train,
    catalog=catalog,
)

metric_values_itemknn_bmp

{'prec@10': 0.03252208701450242,
 'recall@10': 0.1683399650610623,
 'MAP@10': 0.04827657497255996,
 'novelty': 3.9201705312554833,
 'serendipity': 2.616232292298612e-05}