In [1]:
import os
from copy import deepcopy
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
import pandas as pd
import numpy as np

from rectools.dataset import Dataset
import warnings
from metrics_and_visual import calculate_metrics,get_visualize_recs
from rectools.dataset import Interactions, Dataset
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, MAP,NDCG, MRR, MeanInvUserFreq,calc_metrics
from rectools.models import ImplicitItemKNNWrapperModel, RandomModel, PopularModel
from rectools.model_selection import TimeRangeSplitter
from tqdm import tqdm
from rectools import Columns
from userknn import UserKnn


pd.set_option("display.max_columns", None)
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%%capture
!wget https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip
!unzip -o data_original.zip

In [3]:
data_interactions = pd.read_csv('data_original/interactions.csv', parse_dates=["last_watch_dt"])
data_users = pd.read_csv('data_original/users.csv')
data_items = pd.read_csv('data_original/items.csv')

In [4]:
data_interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight
    },
    inplace=True)

interactions = Interactions(data_interactions)

In [5]:
metrics = {
    "MAP@10": MAP(k=10),
    "novelty@10": MeanInvUserFreq(k=10),
    "NDCG@10": NDCG(k=10),
}

In [6]:
cv = TimeRangeSplitter(
    test_size="7D",
    n_splits=3,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [7]:
K = 10

In [8]:
models_popular = {
    "Popular": PopularModel(),
}

In [10]:
%%time
result = calculate_metrics(models=models_popular, interactions=interactions, metrics=metrics, splitter=cv, K=K)
display(result)

  0%|          | 0/3 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-08-02 00:00:00', freq='7D'), 'end': Timestamp('2021-08-09 00:00:00', freq='7D'), 'train': 4266013, 'train_users': 797423, 'train_items': 15237, 'test': 263681, 'test_users': 98184, 'test_items': 6602}


 33%|███▎      | 1/3 [00:10<00:20, 10.20s/it]


{'i_split': 1, 'start': Timestamp('2021-08-09 00:00:00', freq='7D'), 'end': Timestamp('2021-08-16 00:00:00', freq='7D'), 'train': 4649162, 'train_users': 850489, 'train_items': 15415, 'test': 279422, 'test_users': 103511, 'test_items': 6698}


 67%|██████▋   | 2/3 [00:21<00:10, 10.86s/it]


{'i_split': 2, 'start': Timestamp('2021-08-16 00:00:00', freq='7D'), 'end': Timestamp('2021-08-23 00:00:00', freq='7D'), 'train': 5051815, 'train_users': 906071, 'train_items': 15577, 'test': 298878, 'test_users': 110076, 'test_items': 6679}


100%|██████████| 3/3 [00:34<00:00, 11.43s/it]


[{'fold': 0,
  'model': 'Popular',
  'NDCG@10': 0.04533072797925709,
  'MAP@10': 0.08960529558386034,
  'novelty@10': 3.7115844872497505},
 {'fold': 1,
  'model': 'Popular',
  'NDCG@10': 0.04231478828810054,
  'MAP@10': 0.0826070733030659,
  'novelty@10': 3.712928068341117},
 {'fold': 2,
  'model': 'Popular',
  'NDCG@10': 0.04160689159994713,
  'MAP@10': 0.08011397267730101,
  'novelty@10': 3.715658587276043}]

CPU times: total: 30.3 s
Wall time: 34.3 s


In [11]:
pivot_results = pd.DataFrame(result).drop(columns="fold").groupby(["model"], sort=False).agg(["mean", "std"])
mean_metric_subset = [(metric, agg) for metric, agg in pivot_results.columns if agg == 'mean']
(
    pivot_results.style
    .highlight_min(subset=mean_metric_subset, color='coral', axis=0)
    .highlight_max(subset=mean_metric_subset, color='green', axis=0)
)

Unnamed: 0_level_0,NDCG@10,NDCG@10,MAP@10,MAP@10,novelty@10,novelty@10
Unnamed: 0_level_1,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Popular,0.043084,0.001978,0.084109,0.004921,3.71339,0.002076


In [12]:
K = 10
user = [973171]
dataset_for_train = Dataset.construct(interactions.df)
model = deepcopy(models_popular["Popular"])
model.fit(dataset_for_train)

<rectools.models.popular.PopularModel at 0x21c0f9e5960>

In [15]:
def get_visualize_recs(model, interactions, users, K, item_data):
  dataset = Dataset.construct(interactions)
  recommendations = model.recommend(users=users, dataset=dataset, k=K, filter_viewed=True)

  item_data_relevant = item_data[['item_id', 'content_type', 'title', 'title_orig', 'release_year', 'genres']]
  item_data_relevant['num_of_views'] = interactions.groupby('item_id')['user_id'].count()

  user_viewed_items_all = []
  user_recommendations_all = []

  for user_id in users:
    user_viewed_items = interactions[interactions['user_id'] == user_id].merge(item_data_relevant, on="item_id")
    user_recommendations = recommendations[recommendations['user_id'] == user_id].merge(item_data_relevant, on="item_id")

    user_viewed_items_all.append(user_viewed_items)
    user_recommendations_all.append(user_recommendations)

  viewed_items_dataset = pd.concat(user_viewed_items_all, ignore_index=True)
  recommendations_dataset = pd.concat(user_recommendations_all, ignore_index=True)

  return viewed_items_dataset, recommendations_dataset

In [16]:
viewed,reco = get_visualize_recs(model, interactions.df, users=user, K=K, item_data=data_items)

In [17]:
viewed

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct,content_type,title,title_orig,release_year,genres,num_of_views
0,973171,12204,2021-04-25,7361.0,100.0,film,Город порока,Broken City,2012.0,"боевики, драмы, триллеры",1.0
1,973171,5471,2021-05-06,91345.0,16.0,series,Доктор Хаус,House,2004.0,"драмы, детективы",195.0
2,973171,218,2021-05-08,160564.0,89.0,series,База Куантико,Qantico,2015.0,"драмы, триллеры, криминал, детективы",22.0
3,973171,5695,2021-04-25,6520.0,100.0,film,Всегда верен,Semper Fi,2019.0,"боевики, драмы",64.0
4,973171,13865,2021-06-11,9.0,0.0,film,Девятаев,V2. Escape from Hell,2021.0,"драмы, военные, приключения",821.0


In [18]:
reco

Unnamed: 0,user_id,item_id,score,rank,content_type,title,title_orig,release_year,genres,num_of_views
0,973171,10440,202457.0,1,series,Хрустальный,Khrustal'nyy,2021.0,"триллеры, детективы",4.0
1,973171,15297,193123.0,2,series,Клиника счастья,Klinika schast'ya,2021.0,"драмы, мелодрамы",20.0
2,973171,9728,132865.0,3,film,Гнев человеческий,Wrath of Man,2021.0,"боевики, триллеры",143.0
3,973171,4151,91167.0,4,series,Секреты семейной жизни,,2021.0,комедии,10.0
4,973171,3734,74803.0,5,film,Прабабушка легкого поведения,Prababushka lyogkogo povedeniya,2021.0,комедии,
5,973171,2657,68581.0,6,series,Подслушано,Podslushano,2021.0,"драмы, триллеры",725.0
6,973171,4880,55043.0,7,series,Афера,Afera,2021.0,комедии,5.0
7,973171,142,45367.0,8,film,Маша,Masha,2020.0,"драмы, триллеры",15.0
8,973171,6809,40372.0,9,film,Дуров,,2021.0,документальное,14.0
9,973171,12192,38242.0,10,series,Фемида видит,Femida vidit,2019.0,"драмы, детективы, комедии",44.0


In [23]:
models_userknn = {
    "userknn_cos_30": UserKnn(model=CosineRecommender(), popular_model=deepcopy(models_popular["Popular"]), N_users=30),
    "userknn_bm25_30": UserKnn(model=BM25Recommender(), popular_model=deepcopy(models_popular["Popular"]), N_users=30),
    "userknn_cos_60": UserKnn(model=BM25Recommender(), popular_model=deepcopy(models_popular["Popular"]), N_users=60),
    "userknn_bm25_60": UserKnn(model=BM25Recommender(), popular_model=deepcopy(models_popular["Popular"]), N_users=60),
}

In [None]:
pivot_results = pd.DataFrame(result).drop(columns="fold").groupby(["model"], sort=False).agg(["mean", "std"])
mean_metric_subset = [(metric, agg) for metric, agg in pivot_results.columns if agg == 'mean']
(
    pivot_results.style
    .highlight_min(subset=mean_metric_subset, color='coral', axis=0)
    .highlight_max(subset=mean_metric_subset, color='green', axis=0)
)

In [None]:
%%time
result = calculate_metrics(models=models_userknn, interactions=interactions, metrics=metrics, splitter=cv, K=K)
display(result)

  0%|          | 0/3 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-08-02 00:00:00', freq='7D'), 'end': Timestamp('2021-08-09 00:00:00', freq='7D'), 'train': 4266013, 'train_users': 797423, 'train_items': 15237, 'test': 263681, 'test_users': 98184, 'test_items': 6602}



  0%|          | 0/797423 [00:00<?, ?it/s][A
  0%|          | 75/797423 [00:00<18:13, 728.92it/s][A
  0%|          | 77/797423 [00:00<18:07, 733.27it/s][A
  0%|          | 80/797423 [00:00<18:07, 733.27it/s][A
  0%|          | 85/797423 [00:00<18:07, 733.27it/s][A
  0%|          | 166/797423 [00:00<21:29, 618.38it/s][A
  0%|          | 166/797423 [00:00<21:29, 618.38it/s][A
  0%|          | 170/797423 [00:00<21:29, 618.38it/s][A
  0%|          | 180/797423 [00:00<21:29, 618.38it/s][A
  0%|          | 183/797423 [00:00<21:29, 618.38it/s][A
  0%|          | 186/797423 [00:00<21:29, 618.38it/s][A
  0%|          | 187/797423 [00:00<21:29, 618.38it/s][A
  0%|          | 191/797423 [00:00<21:29, 618.38it/s][A
  0%|          | 254/797423 [00:00<23:16, 570.91it/s][A
  0%|          | 257/797423 [00:00<24:10, 549.57it/s][A
  0%|          | 260/797423 [00:00<24:10, 549.57it/s][A
  0%|          | 260/797423 [00:00<24:10, 549.57it/s][A
  0%|          | 339/797423 [00:00<21:49, 608

In [26]:
pivot_results = pd.DataFrame(result).drop(columns="fold").groupby(["model"], sort=False).agg(["mean", "std"])
mean_metric_subset = [(metric, agg) for metric, agg in pivot_results.columns if agg == 'mean']
(
    pivot_results.style
    .highlight_min(subset=mean_metric_subset, color='coral', axis=0)
    .highlight_max(subset=mean_metric_subset, color='green', axis=0)
)

Unnamed: 0_level_0,NDCG@10,NDCG@10,MAP@10,MAP@10,novelty@10,novelty@10
Unnamed: 0_level_1,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
userknn_cos_30,0.020301,0.001182,0.031749,0.002105,15.119545,0.510257
userknn_bm25_30,0.018727,0.001299,0.030053,0.0022,15.151622,0.422586
userknn_cos_60,0.018727,0.001299,0.030053,0.0022,15.151622,0.422586
userknn_bm25_60,0.018727,0.001299,0.030053,0.0022,15.151622,0.422586


  3%|▎         | 28492/962179 [01:00<23:33, 660.63it/s]

In [27]:
model = deepcopy(models_userknn["userknn_cos_30"])
model.fit(dataset_for_train)


  0%|          | 0/962179 [00:00<?, ?it/s][A
  0%|          | 61/962179 [00:00<28:16, 566.96it/s][A
  0%|          | 62/962179 [00:00<28:16, 566.96it/s][A
  0%|          | 63/962179 [00:00<28:16, 566.96it/s][A
  0%|          | 66/962179 [00:00<28:16, 566.96it/s][A
  0%|          | 66/962179 [00:00<28:16, 566.96it/s][A
  0%|          | 128/962179 [00:00<29:45, 538.70it/s][A
  0%|          | 130/962179 [00:00<29:28, 544.13it/s][A
  0%|          | 131/962179 [00:00<29:28, 544.13it/s][A
  0%|          | 133/962179 [00:00<29:28, 544.13it/s][A
  0%|          | 135/962179 [00:00<29:28, 544.13it/s][A
  0%|          | 136/962179 [00:00<29:28, 544.13it/s][A
  0%|          | 191/962179 [00:00<29:25, 545.00it/s][A
  0%|          | 193/962179 [00:00<29:22, 545.85it/s][A
  0%|          | 194/962179 [00:00<29:22, 545.85it/s][A
  0%|          | 252/962179 [00:00<29:26, 544.40it/s][A
  0%|          | 254/962179 [00:00<29:29, 543.56it/s][A
  0%|          | 254/962179 [00:00<29:29, 543.

In [28]:
import pickle

In [30]:
model_name = "userknn_cos_30.pickle"
pickle.dump(model, open(model_name, "wb"))