# Установка библиотек

In [None]:
# Install RecTools, which provides every `rectools` import used below.
# NOTE(review): no version is pinned, so results may drift between runs; consider
# `%pip install rectools==<version>` so the install targets the kernel's environment.
!pip install rectools



In [None]:
import pandas as pd
import numpy as np
import zipfile as zf

import requests
from tqdm.auto import tqdm

from rectools import Columns
from rectools.models import RandomModel,PopularModel
from rectools.model_selection import TimeRangeSplitter
from rectools.metrics import Precision, Recall, MAP, MRR, Serendipity,MeanInvUserFreq,calc_metrics
from rectools.dataset import Interactions, Dataset

# Чтение данных

In [None]:
# Location of the zipped KION dataset; the path includes a fixed revision hash,
# so the same archive is fetched on every run.
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'


In [None]:
# Stream the archive to disk in 1 MiB chunks while showing a progress bar.
# The response is used as a context manager so the connection is released even
# if the download is interrupted.
with requests.get(url, stream=True) as req:
    # Fail early on an HTTP error instead of saving an error page as 'kion.zip',
    # which would only surface later as a confusing "bad zip file" failure.
    req.raise_for_status()

    total_size_in_bytes = int(req.headers.get('Content-Length', 0))

    # `total=None` makes tqdm show a bar of unknown length when the server does
    # not report Content-Length; the bar is closed automatically on exit.
    with open('kion.zip', 'wb') as fd, tqdm(
        desc='kion dataset download',
        total=total_size_in_bytes or None,
        unit='iB',
        unit_scale=True,
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)
            progress_bar.update(len(chunk))

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [None]:


# Unpack the downloaded archive into the current working directory.
# The context manager closes the archive even if extraction fails part-way.
with zf.ZipFile('kion.zip', 'r') as files:
    files.extractall()


In [None]:
# Read the raw interaction log (parsing the watch date as a datetime) and map
# its date and duration columns onto the RecTools datetime / weight column names.
interactions_df = (
    pd.read_csv('data_original/interactions.csv', parse_dates=["last_watch_dt"])
    .rename(columns={'last_watch_dt': Columns.Datetime, 'total_dur': Columns.Weight})
)

In [None]:
# Load the user and item tables stored next to interactions.csv in data_original/.
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

In [None]:
# Wrap the renamed frame in a RecTools Interactions object and preview its first rows.
interactions = Interactions(interactions_df)
interactions.df.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0


# Расчёт метрик



In [None]:
# Baseline recommenders to compare; the random one is seeded for reproducibility.
models = {
    "random": RandomModel(random_state=32),
    "popular": PopularModel(),
}

# One metric object per (metric, cutoff) pair, keyed as "<label>@<k>".
# Covers ranking/accuracy metrics (MAP, MRR, precision, recall) and
# "beyond accuracy" ones (MeanInvUserFreq, labelled novelty, and Serendipity).
metrics = {
    f"{label}@{k}": metric_cls(k=k)
    for label, metric_cls in (
        ("MAP", MAP),
        ("MRR", MRR),
        ("prec", Precision),
        ("recall", Recall),
        ("novelty", MeanInvUserFreq),
        ("serendipity", Serendipity),
    )
    for k in (1, 5, 10)
}

# Number of recommendations requested per user; matches the largest metric cutoff.
K_RECOS = 10

In [None]:
# Number of test folds; also handed to cross_validation_function, where it is
# used as the expected length of the progress bar.
n_splits = 3

# Time-based splitter whose test folds each span 7 days ("7D").
# NOTE(review): the filter_* flags presumably remove already-seen (user, item)
# pairs and cold users/items from the test folds — confirm against the RecTools docs.
cv = TimeRangeSplitter(
    test_size="7D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [None]:
# Display the (start, end) timestamps of each test fold as a sanity check.
cv.get_test_fold_borders(interactions)

[(Timestamp('2021-08-02 00:00:00', freq='7D'),
  Timestamp('2021-08-09 00:00:00', freq='7D')),
 (Timestamp('2021-08-09 00:00:00', freq='7D'),
  Timestamp('2021-08-16 00:00:00', freq='7D')),
 (Timestamp('2021-08-16 00:00:00', freq='7D'),
  Timestamp('2021-08-23 00:00:00', freq='7D'))]

In [None]:
def cross_validation_function(interactions, models, metrics, K_RECOS, splitter, n_splits):
    """Fit and score every model on every fold produced by ``splitter``.

    For each fold the training rows are turned into a ``Dataset``, each model is
    fitted on it, recommendations are generated for the users present in the
    test rows, and the metrics are computed against those test rows.

    Args:
        interactions: Object exposing the interaction table as ``.df``; it is
            also passed to ``splitter.split``.
        models: Mapping of model name -> model object. ``fit`` is called on the
            same objects for every fold, so they are left fitted on the last fold.
        metrics: Metrics argument forwarded unchanged to ``calc_metrics``.
        K_RECOS: Number of recommendations requested per user.
        splitter: Object whose ``split(interactions, collect_fold_stats=True)``
            yields ``(train_ids, test_ids, fold_info)`` triples, where
            ``fold_info`` contains the key ``'i_split'``.
        n_splits: Expected number of folds; only used as the progress-bar total.

    Returns:
        A list with one dict per (fold, model) pair holding ``"fold"``,
        ``"model"`` and every entry returned by ``calc_metrics``.
    """
    collected = []
    folds = splitter.split(interactions, collect_fold_stats=True)

    for train_ids, test_ids, fold_info in tqdm(folds, total=n_splits):
        print(f"\n==================== Fold {fold_info['i_split']}")
        print(fold_info)

        # Positional row selections for this fold; the test part keeps only
        # the user/item columns.
        train_part = interactions.df.iloc[train_ids]
        test_part = interactions.df.iloc[test_ids][Columns.UserItem]

        fold_dataset = Dataset.construct(train_part)
        users_to_score = np.unique(test_part[Columns.User])
        # Items seen in training; passed to calc_metrics as the catalog.
        train_items = train_part[Columns.Item].unique()

        for model_name, model in models.items():
            model.fit(fold_dataset)
            fold_recos = model.recommend(
                users=users_to_score,
                dataset=fold_dataset,
                k=K_RECOS,
                filter_viewed=True,
            )
            scores = calc_metrics(
                metrics,
                reco=fold_recos,
                interactions=test_part,
                prev_interactions=train_part,
                catalog=train_items,
            )
            collected.append({"fold": fold_info["i_split"], "model": model_name, **scores})

    return collected

In [None]:
%%time

# Evaluate all models on all folds; the cell is timed because this is the slow step.
cross_val_results = cross_validation_function(interactions, models, metrics, K_RECOS, cv, n_splits)

  0%|          | 0/3 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-08-02 00:00:00', freq='7D'), 'end': Timestamp('2021-08-09 00:00:00', freq='7D'), 'train': 4266013, 'train_users': 797423, 'train_items': 15237, 'test': 263681, 'test_users': 98184, 'test_items': 6602}

{'i_split': 1, 'start': Timestamp('2021-08-09 00:00:00', freq='7D'), 'end': Timestamp('2021-08-16 00:00:00', freq='7D'), 'train': 4649162, 'train_users': 850489, 'train_items': 15415, 'test': 279422, 'test_users': 103511, 'test_items': 6698}

{'i_split': 2, 'start': Timestamp('2021-08-16 00:00:00', freq='7D'), 'end': Timestamp('2021-08-23 00:00:00', freq='7D'), 'train': 5051815, 'train_users': 906071, 'train_items': 15577, 'test': 298878, 'test_users': 110076, 'test_items': 6679}
CPU times: user 1min 49s, sys: 2.31 s, total: 1min 52s
Wall time: 1min 52s


In [None]:
# Average every metric over the folds: one row per model (in first-seen order),
# columns are (metric, aggregation) pairs.
pivot_results = (
    pd.DataFrame(cross_val_results)
    .drop(columns="fold")
    .groupby(["model"], sort=False)
    .agg(["mean"])
)

# Columns holding fold means; these are the ones that get highlighted.
mean_metric_subset = [column for column in pivot_results.columns if column[1] == 'mean']

# Per column: worst (minimum) value in red, best (maximum) value in green.
pivot_results.style.highlight_min(
    subset=mean_metric_subset, color='lightcoral', axis=0
).highlight_max(
    subset=mean_metric_subset, color='lightgreen', axis=0
)

Unnamed: 0_level_0,prec@1,recall@1,prec@5,recall@5,prec@10,recall@10,MRR@1,MRR@5,MRR@10,MAP@1,MAP@5,MAP@10,novelty@1,novelty@5,novelty@10,serendipity@1,serendipity@5,serendipity@10
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
random,0.000221,7.2e-05,0.000202,0.000365,0.000193,0.000693,0.000221,0.000485,0.000604,7.2e-05,0.000169,0.000211,15.614137,15.612989,15.613009,6e-06,7e-06,7e-06
popular,0.076432,0.04272,0.052402,0.137413,0.033903,0.173492,0.076432,0.131669,0.138603,0.04272,0.078295,0.084109,2.377055,3.066979,3.71339,2e-06,3e-06,2e-06


# Визуальный анализ

In [None]:
def visualization_function(model, inter_data, user_id, item_data,item_columns, K_RECOS):
    """Display watch history and recommendations for the given users.

    For every user two tables are shown: the user's rows from ``inter_data`` and
    the user's recommendations, both enriched with item metadata and with the
    number of interactions each item has in ``inter_data`` (column ``count``).

    Args:
        model: Already fitted model exposing ``recommend``.
        inter_data: Interaction table with at least ``user_id`` and ``item_id``
            columns; a ``Dataset`` is built from it for ``model.recommend``.
        user_id: Iterable of user ids (despite the singular name); passed as
            ``users`` to ``model.recommend`` and iterated for display.
        item_data: Item metadata table.
        item_columns: Columns of ``item_data`` to show; must include ``'item_id'``
            because it is the merge key.
        K_RECOS: Number of recommendations requested per user.

    Returns:
        None. Output is produced via ``print`` and ``display``.
    """
    dataset = Dataset.construct(inter_data)
    recos = model.recommend(
        users=user_id,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )

    # The per-item interaction counts and item metadata do not depend on the
    # user, so build the lookup once instead of re-grouping the whole
    # interaction log twice per user. Only `user_id` is counted rather than
    # every column.
    item_counts = inter_data.groupby(['item_id'])['user_id'].agg(['count']).reset_index()
    item_info = item_data[item_columns].merge(item_counts, on='item_id')

    for user in user_id:
        # merge() is called without `how`, so rows whose item is missing from
        # item_info are not shown.
        print('История просмотров:')
        display(inter_data[inter_data.user_id == user].merge(item_info, on='item_id'))
        print('Рекомендации:')
        display(recos[recos.user_id == user].merge(item_info, on='item_id'))

In [None]:
# Fit a seeded random recommender on the full interaction table for the visual check.
model = RandomModel(random_state=32)
dataset = Dataset.construct(interactions_df)
model.fit(dataset)
# Users whose history and recommendations are inspected below.
test_users = [666262, 672861, 955527]

In [None]:
# Show history and 10 recommendations per selected user, with title and genres attached.
visualization_function(model,interactions_df,test_users,items,['item_id','title','genres'],10)

История просмотров:


Unnamed: 0,user_id,item_id,datetime,weight,watched_pct,title,genres,count
0,666262,7957,2021-05-12,2052.0,32.0,Последний викинг,"боевики, историческое, приключения",746
1,666262,4785,2021-05-12,1946.0,28.0,Робин Гуд: Начало,"боевики, триллеры, приключения",485
2,666262,12981,2021-05-14,10292.0,100.0,Томирис,"боевики, драмы, историческое, военные",10370


Рекомендации:


Unnamed: 0,user_id,item_id,score,rank,title,genres,count
0,666262,10101,10,1,Возвращение Будулая,мелодрамы,99
1,666262,619,9,2,Новые приключения Аладдина (жестовым языком),"зарубежные, комедии",1
2,666262,12618,8,3,Пропавшая грамота,"фэнтези, комедии",51
3,666262,5967,7,4,Братья вне игры,"драмы, спорт",262
4,666262,4041,6,5,Фрилансеры,"криминал, детективы, драмы, зарубежные, боевики",19
5,666262,5701,5,6,Алые паруса: Новая история,"комедии, мелодрамы",4
6,666262,9738,4,7,Женщина в беде 3,"детективы, мелодрамы",2
7,666262,15247,3,8,Гордость и предубеждение,"драмы, мелодрамы",150
8,666262,10004,2,9,Болванчики,"мультфильм, приключения, комедии",51
9,666262,2816,1,10,Избави нас от лукавого,"ужасы, триллеры, детективы",1370


История просмотров:


Unnamed: 0,user_id,item_id,datetime,weight,watched_pct,title,genres,count
0,672861,6870,2021-04-27,10.0,0.0,Красавица и чудовище,"драмы, фэнтези, музыкальные",1083
1,672861,8662,2021-05-04,6354.0,100.0,Он – дракон,фэнтези,643


Рекомендации:


Unnamed: 0,user_id,item_id,score,rank,title,genres,count
0,672861,9457,10,1,Комната (жестовым языком),"драмы, зарубежные, триллеры",5
1,672861,15730,9,2,Твое подтянутое тело,фитнес,2
2,672861,473,8,3,Кто такой Букабу?,"развлекательные, для детей, документальное",15
3,672861,12736,7,4,Палач,"драмы, зарубежные, комедии",3
4,672861,3927,6,5,Помни меня,"драмы, мелодрамы",2982
5,672861,3300,5,6,Антилопа Гну. Южная Африка,документальное,8
6,672861,5334,4,7,Boys and Toys,no_genre,3
7,672861,14273,3,8,Влюбленный скорпион,"драмы, зарубежные, спорт, триллеры, мелодрамы",2
8,672861,3087,2,9,Жуки - караоке,no_genre,1
9,672861,4416,1,10,Питер,"фэнтези, приключения",33


История просмотров:


Unnamed: 0,user_id,item_id,datetime,weight,watched_pct,title,genres,count
0,955527,1183,2021-06-02,40.0,1.0,Стань легендой! Бигфут Младший,"мультфильм, фэнтези, приключения, комедии",1587
1,955527,13371,2021-05-04,686.0,11.0,Пеле: Рождение легенды,"драмы, спорт, биография",945
2,955527,4725,2021-06-02,255.0,4.0,Лобановский навсегда,"спорт, биография, документальное",683
3,955527,1238,2021-06-02,556.0,7.0,Диего Марадона,"спорт, биография, документальное",691


Рекомендации:


Unnamed: 0,user_id,item_id,score,rank,title,genres,count
0,955527,496,10,1,Воскресший Эртугрул,"боевики, драмы, приключения",6167
1,955527,4205,9,2,Дело гастронома №1 (Операция Беркут),"драмы, русские",1
2,955527,10822,8,3,Она защищает Родину,"драмы, советские, военные",2
3,955527,10914,7,4,Великолепная,"зарубежные, комедии, мелодрамы",3
4,955527,3999,6,5,Джиперс криперс,"ужасы, триллеры",648
5,955527,15756,5,6,Ремнант: Всё ещё вижу тебя (жестовым языком),"фантастика, зарубежные, триллеры",2
6,955527,14961,4,7,Битва за Землю,"боевики, ужасы, фантастика, триллеры",2032
7,955527,13734,3,8,Сексуальный массаж и Фантазии,для взрослых,31
8,955527,3407,2,9,Черный капитан,"боевики, русские, военные",1
9,955527,14614,1,10,Настя,"мелодрамы, комедии",2
