In [21]:
from warnings import filterwarnings
filterwarnings('ignore')

In [26]:
%%capture
!wget https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip
!unzip -o data_original.zip

In [27]:
from pprint import pprint

import numpy as np
import pandas as pd
from copy import deepcopy
from tqdm.auto import tqdm

from rectools import Columns
from rectools.dataset import Interactions, Dataset
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, MAP, MRR, MeanInvUserFreq,calc_metrics
from rectools.models import ImplicitItemKNNWrapperModel, RandomModel, PopularModel
from rectools.model_selection import TimeRangeSplitter

In [28]:
data_interactions = pd.read_csv('data_original/interactions.csv', parse_dates=["last_watch_dt"])

data_interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight
    },
    inplace=True)

interactions = Interactions(data_interactions)
del data_interactions

In [29]:
interactions.df.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0


In [30]:
models = {
    'RandomModel': RandomModel(random_state=32),
    'PopularModel': PopularModel()
}

metrics = {
    "precision@1": Precision(k=1),
    "precision@5": Precision(k=5),
    "precision@10": Precision(k=10),
    "recall@1": Precision(k=1),
    "recall@5": Precision(k=5),
    "recall@10": Precision(k=10),
    "map@1": MAP(k=1),
    "map@5": MAP(k=5),
    "map@10": MAP(k=10),
    "MRR@1":MRR(k=1),
    "MRR@5":MRR(k=2),
    "MRR@10":MRR(k=1),
    "serendipity@1": Serendipity(k=1),
    "serendipity@5": Serendipity(k=5),
    "serendipity@10": Serendipity(k=10),
    'diversity@1': MeanInvUserFreq(k=1),
    'diversity@5': MeanInvUserFreq(k=5),
    'diversity@10': MeanInvUserFreq(k=10)
}

In [31]:
splitter = TimeRangeSplitter(
            test_size="7D",
            n_splits=3,
            filter_already_seen=True,
            filter_cold_items=True,
            filter_cold_users=True,
            )

In [32]:
splitter.get_test_fold_borders(interactions)

[(Timestamp('2021-08-02 00:00:00', freq='7D'),
  Timestamp('2021-08-09 00:00:00', freq='7D')),
 (Timestamp('2021-08-09 00:00:00', freq='7D'),
  Timestamp('2021-08-16 00:00:00', freq='7D')),
 (Timestamp('2021-08-16 00:00:00', freq='7D'),
  Timestamp('2021-08-23 00:00:00', freq='7D'))]

In [33]:
def calculate_metrics(models, metrics, splitter, K, interactions):
  results = []

  fold_iterator = splitter.split(interactions, collect_fold_stats=True)

  for train_ids, test_ids, fold_info in tqdm((fold_iterator), total=3):
      print(f"\n==================== Fold {fold_info['i_split']}")
      print(fold_info)

      df_train = interactions.df.iloc[train_ids]
      dataset = Dataset.construct(df_train)

      df_test = interactions.df.iloc[test_ids][Columns.UserItem]
      test_users = np.unique(df_test[Columns.User])

      catalog = df_train[Columns.Item].unique()

      for model_name, model in models.items():
          model = deepcopy(model)
          model.fit(dataset)
          recos = model.recommend(
              users=test_users,
              dataset=dataset,
              k=K,
              filter_viewed=True,
          )
          metric_values = calc_metrics(
              metrics,
              reco=recos,
              interactions=df_test,
              prev_interactions=df_train,
              catalog=catalog,
          )
          res = {"fold": fold_info["i_split"], "model": model_name}
          res.update(metric_values)
          results.append(res)
  return results

In [34]:
K=3
result=calculate_metrics(models, metrics, splitter, K,interactions)

  0%|          | 0/3 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-08-02 00:00:00', freq='7D'), 'end': Timestamp('2021-08-09 00:00:00', freq='7D'), 'train': 4266013, 'train_users': 797423, 'train_items': 15237, 'test': 263681, 'test_users': 98184, 'test_items': 6602}

{'i_split': 1, 'start': Timestamp('2021-08-09 00:00:00', freq='7D'), 'end': Timestamp('2021-08-16 00:00:00', freq='7D'), 'train': 4649162, 'train_users': 850489, 'train_items': 15415, 'test': 279422, 'test_users': 103511, 'test_items': 6698}

{'i_split': 2, 'start': Timestamp('2021-08-16 00:00:00', freq='7D'), 'end': Timestamp('2021-08-23 00:00:00', freq='7D'), 'train': 5051815, 'train_users': 906071, 'train_items': 15577, 'test': 298878, 'test_users': 110076, 'test_items': 6679}


In [35]:
pivot_results = pd.DataFrame(result).drop(columns="fold").groupby(["model"], sort=False).agg(["mean", "std"])
mean_metric_subset = [(metric, agg) for metric, agg in pivot_results.columns if agg == 'mean']
(
    pivot_results.style
    .highlight_min(subset=mean_metric_subset, color='coral', axis=0)
    .highlight_max(subset=mean_metric_subset, color='green', axis=0)
)

Unnamed: 0_level_0,precision@1,precision@1,recall@1,recall@1,precision@5,precision@5,recall@5,recall@5,precision@10,precision@10,recall@10,recall@10,MRR@1,MRR@1,MRR@5,MRR@5,MRR@10,MRR@10,map@1,map@1,map@5,map@5,map@10,map@10,diversity@1,diversity@1,diversity@5,diversity@5,diversity@10,diversity@10,serendipity@1,serendipity@1,serendipity@5,serendipity@5,serendipity@10,serendipity@10
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2
RandomModel,0.000156,3.8e-05,0.000156,3.8e-05,0.000109,6e-06,0.000109,6e-06,5.4e-05,3e-06,5.4e-05,3e-06,0.000156,3.8e-05,0.000249,5.4e-05,0.000156,3.8e-05,5.9e-05,3.1e-05,0.000127,2.7e-05,0.000127,2.7e-05,15.613141,0.017525,15.610132,0.015472,15.610132,0.015472,6e-06,3e-06,6e-06,2e-06,6e-06,2e-06
PopularModel,0.076432,0.006826,0.076432,0.006826,0.037624,0.001493,0.037624,0.001493,0.018812,0.000747,0.018812,0.000747,0.076432,0.006826,0.103618,0.006023,0.076432,0.006826,0.04272,0.004366,0.068675,0.003533,0.068675,0.003533,2.377055,0.023002,2.731585,0.016823,2.731585,0.016823,2e-06,0.0,3e-06,0.0,3e-06,0.0


In [36]:
def get_visualize_recs(model, interactions, users, K, item_data):
  dataset = Dataset.construct(interactions)
  recommendations = model.recommend(users=users, dataset=dataset, k=K, filter_viewed=True)

  item_data_relevant = item_data[['item_id', 'content_type', 'title', 'title_orig', 'release_year', 'genres']]
  item_data_relevant['num_of_views'] = interactions.groupby('item_id')['user_id'].count()

  user_viewed_items_all = []
  user_recommendations_all = []

  for user_id in users:
    user_viewed_items = interactions[interactions['user_id'] == user_id].merge(item_data_relevant, on="item_id")
    user_recommendations = recommendations[recommendations['user_id'] == user_id].merge(item_data_relevant, on="item_id")

    user_viewed_items_all.append(user_viewed_items)
    user_recommendations_all.append(user_recommendations)

  viewed_items_dataset = pd.concat(user_viewed_items_all, ignore_index=True)
  recommendations_dataset = pd.concat(user_recommendations_all, ignore_index=True)

  return viewed_items_dataset, recommendations_dataset

In [37]:
items = pd.read_csv(f'/content/data_original/items.csv')
items.head()

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."
3,7868,film,45 лет,45 Years,2015.0,"драмы, зарубежные, мелодрамы",Великобритания,,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю..."
4,16268,film,Все решает мгновение,,1978.0,"драмы, спорт, советские, мелодрамы",СССР,,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж..."


In [38]:
model = PopularModel()
dataset = Dataset.construct(interactions.df)
model.fit(dataset)

<rectools.models.popular.PopularModel at 0x79a11ab79cf0>

In [39]:
USERS = [666262, 672861, 955527]
K=3
viewed,recos=get_visualize_recs(model, interactions.df, users=USERS, K=K, item_data=items)

In [40]:
viewed

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct,content_type,title,title_orig,release_year,genres,num_of_views
0,666262,7957,2021-05-12,2052.0,32.0,film,Последний викинг,The Lost Viking,2018.0,"боевики, историческое, приключения",1.0
1,666262,4785,2021-05-12,1946.0,28.0,film,Робин Гуд: Начало,Robin Hood,2018.0,"боевики, триллеры, приключения",1.0
2,666262,12981,2021-05-14,10292.0,100.0,film,Томирис,Tomiris,2020.0,"боевики, драмы, историческое, военные",5.0
3,672861,6870,2021-04-27,10.0,0.0,film,Красавица и чудовище,Beauty and the Beast,2017.0,"драмы, фэнтези, музыкальные",2.0
4,672861,8662,2021-05-04,6354.0,100.0,film,Он – дракон,Drunk Parents,2015.0,фэнтези,4.0
5,955527,1183,2021-06-02,40.0,1.0,film,Стань легендой! Бигфут Младший,The Son of Bigfoot,2017.0,"мультфильм, фэнтези, приключения, комедии",1.0
6,955527,13371,2021-05-04,686.0,11.0,film,Пеле: Рождение легенды,Pele: Birth of a Legend(aka Pele),2016.0,"драмы, спорт, биография",2.0
7,955527,4725,2021-06-02,255.0,4.0,film,Лобановский навсегда,Lobanovskiy Forever,2016.0,"спорт, биография, документальное",7.0
8,955527,1238,2021-06-02,556.0,7.0,film,Диего Марадона,Diego Maradona,2019.0,"спорт, биография, документальное",642.0


In [41]:
recos

Unnamed: 0,user_id,item_id,score,rank,content_type,title,title_orig,release_year,genres,num_of_views
0,666262,10440,202457.0,1,series,Хрустальный,Khrustal'nyy,2021.0,"триллеры, детективы",4.0
1,666262,15297,193123.0,2,series,Клиника счастья,Klinika schast'ya,2021.0,"драмы, мелодрамы",20.0
2,666262,9728,132865.0,3,film,Гнев человеческий,Wrath of Man,2021.0,"боевики, триллеры",143.0
3,672861,10440,202457.0,1,series,Хрустальный,Khrustal'nyy,2021.0,"триллеры, детективы",4.0
4,672861,15297,193123.0,2,series,Клиника счастья,Klinika schast'ya,2021.0,"драмы, мелодрамы",20.0
5,672861,9728,132865.0,3,film,Гнев человеческий,Wrath of Man,2021.0,"боевики, триллеры",143.0
6,955527,10440,202457.0,1,series,Хрустальный,Khrustal'nyy,2021.0,"триллеры, детективы",4.0
7,955527,15297,193123.0,2,series,Клиника счастья,Klinika schast'ya,2021.0,"драмы, мелодрамы",20.0
8,955527,9728,132865.0,3,film,Гнев человеческий,Wrath of Man,2021.0,"боевики, триллеры",143.0
