# **Валидация и метрики**


# Общее

In [None]:
!pip install -r requirements.txt

In [4]:
from pprint import pprint
import pandas as pd
import time
import numpy as np

import requests
from tqdm.auto import tqdm

from rectools import Columns
from rectools.dataset import Interactions, Dataset
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics, MAP, MRR
from rectools.model_selection import TimeRangeSplitter
from rectools.models import RandomModel, PopularModel

In [5]:
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'

In [6]:
# Stream the KION archive to disk in 1 MiB chunks so it never has to fit in memory.
# The response is used as a context manager so the connection is released even if
# the download fails part-way through.
with requests.get(url, stream=True, timeout=60) as req:
    # Fail loudly on 4xx/5xx instead of silently saving an error page as kion.zip.
    req.raise_for_status()

    # Content-Length may be absent; total=None makes tqdm show a plain counter
    # instead of a misleading bar with a total of 0.
    total_size_in_bytes = int(req.headers.get('Content-Length', 0)) or None

    with open('kion.zip', 'wb') as fd, tqdm(
        desc='kion dataset download',
        total=total_size_in_bytes,
        unit='iB',
        unit_scale=True,
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)
            progress_bar.update(len(chunk))

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [7]:
import zipfile as zf

# Unpack the downloaded archive into the current working directory.
# The context manager closes the archive handle even if extraction raises.
with zf.ZipFile('kion.zip', 'r') as files:
    files.extractall()


In [8]:
# Load the interaction log and map the dataset-specific column names onto the
# standard RecTools column names (datetime / weight) in one expression.
interactions = pd.read_csv(
    'data_original/interactions.csv',
    parse_dates=["last_watch_dt"],
).rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight,
    }
)

In [9]:
# User and item metadata tables from the extracted archive.
# `items` is read as a module-level name by `visual_train` further down.
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')


In [10]:
def headtail(df, n=5):
    """Return the first and last ``n`` rows of ``df`` as one DataFrame.

    When the frame has at most ``2 * n`` rows the head and tail would overlap,
    so a copy of the whole frame is returned instead of repeating rows.

    Args:
        df: DataFrame to preview.
        n: Number of rows to take from each end (non-negative).

    Returns:
        A new DataFrame holding the preview rows in their original order.
    """
    if len(df) <= 2 * n:
        # head(n) and tail(n) would share rows here; avoid duplicating them.
        return df.copy()
    return pd.concat([df.head(n), df.tail(n)])

headtail(interactions)

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
5476246,648596,12225,2021-08-13,76,0.0
5476247,546862,9673,2021-04-13,2308,49.0
5476248,697262,15297,2021-08-20,18307,63.0
5476249,384202,16197,2021-04-19,6203,100.0
5476250,319709,4436,2021-08-15,3921,45.0


In [11]:
interactions.info(memory_usage='deep')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   item_id      int64         
 2   datetime     datetime64[ns]
 3   weight       int64         
 4   watched_pct  float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 208.9 MB


# Расчёт метрик

Функция будет принимать на вход:

* Словарь с инициализированными моделями. Модели: rectools.models.RandomModel(random_state=32), rectools.models.PopularModel() с параметрами по умолчанию
* Словарь с инициализированными метриками. 2 ранжирующие, 2 классификационные, 2 beyond-accuracy. Считаем по порогам 1, 5, 10. MAP обязательно
* Инициализированный Splitter для кросс-валидации. rectools.model_selection.TimeRangeSplitter, 3 фолда для кросс-валидации по неделе, исключение холодных юзеров и айтемов и просмотренных айтемов
* Количество рекомендаций для генерации (K)

Реализация обучения и валидации:

* Создаем RecTools Dataset через метод construct на train взаимодействиях для каждого фолда
* Обучаем модель (не забываем сделать deepcopy), рекомендуем K айтемов для каждого юзера, считаем метрики на test
* Дополнительно логируем время обучения
* Сохраняем метрики в отчёт

In [12]:
# Baseline models under comparison.
models = dict(
    random=RandomModel(random_state=32),
    popular=PopularModel(),
)

# Metric families, in report order: classification (precision, recall),
# ranking (MAP, MRR) and "beyond accuracy" (novelty, serendipity).
_METRIC_FAMILIES = (
    ("prec", Precision),
    ("recall", Recall),
    ("map", MAP),
    ("mrr", MRR),
    ("novelty", MeanInvUserFreq),
    ("serendipity", Serendipity),
)

# Every family is evaluated at the cut-offs 1, 5 and 10; keys look like "map@5".
metrics = {
    f"{family}@{k}": metric_cls(k=k)
    for family, metric_cls in _METRIC_FAMILIES
    for k in (1, 5, 10)
}

# Number of recommendations generated per user.
K_RECOS = 10

In [13]:
def cross_validation(dataframe, n_splits):
    """Build the time-based splitter and the Interactions wrapper for a log.

    Prints the first rows of the wrapped interactions, the overall date range
    of ``dataframe`` and the test-fold borders produced by the splitter.

    Args:
        dataframe: Interaction log with RecTools column names; its "datetime"
            column is used for the date-range printout.
        n_splits: Number of consecutive 7-day test folds.

    Returns:
        Tuple ``(splitter, interactions)``: the configured ``TimeRangeSplitter``
        and the ``Interactions`` object built from ``dataframe``.
    """
    wrapped = Interactions(dataframe)
    print(wrapped.df.head())

    # Weekly test folds; cold users, cold items and already-seen items are
    # filtered out of each test part.
    weekly_splitter = TimeRangeSplitter(
        test_size="7D",
        n_splits=n_splits,
        filter_cold_users=True,
        filter_cold_items=True,
        filter_already_seen=True,
    )

    timestamps = dataframe["datetime"]
    print("\nWhole dataset time period: ")
    print(timestamps.min(), timestamps.max())

    print('\nDataset was splitted on these folds:')
    fold_borders = weekly_splitter.get_test_fold_borders(wrapped)
    pprint(fold_borders)

    return weekly_splitter, wrapped

In [14]:
def train_val(intr, models, metrics, splitter, k_recommends):
    """Cross-validate every model on every fold and collect metric values.

    For each fold produced by ``splitter`` a RecTools ``Dataset`` is built from
    the train interactions, every model is fitted on it, ``k_recommends`` items
    are recommended to each test user and ``metrics`` are computed on the test
    interactions.

    Args:
        intr: ``Interactions`` object holding the full interaction log.
        models: Mapping of model name to model instance.
        metrics: Mapping of metric name to metric instance for ``calc_metrics``.
        splitter: Splitter providing ``split(...)`` and ``n_splits``.
        k_recommends: Number of recommendations generated per user.

    Returns:
        List of dicts, one per (fold, model) pair, with the keys "fold",
        "model" and one key per computed metric.

    Note:
        Models are fitted in place (no copy is made). The visual-analysis cell
        passes its model dict here and afterwards calls ``recommend`` on the
        same instance, so it relies on this side effect.
    """
    started_at = time.time()

    metrics_results = []

    # Use the splitter that was passed in rather than a module-level one.
    fold_iterator = splitter.split(intr, collect_fold_stats=True)

    for train_ids, test_ids, fold_info in tqdm(fold_iterator,
                                               total=splitter.n_splits):
        print(f"\n==================== Fold {fold_info['i_split']}")
        pprint(fold_info)

        df_train = intr.df.iloc[train_ids]
        dataset = Dataset.construct(df_train)

        df_test = intr.df.iloc[test_ids][Columns.UserItem]
        test_users = np.unique(df_test[Columns.User])

        # Catalog is the set of items we can recommend (here: all train items).
        catalog = df_train[Columns.Item].unique()

        for model_name, model in models.items():
            fit_started_at = time.time()
            model.fit(dataset)
            fit_seconds = time.time() - fit_started_at
            print(f"{model_name}: fit time {fit_seconds:.2f} seconds")

            recos = model.recommend(
                users=test_users,
                dataset=dataset,
                k=k_recommends,  # honour the argument, not the global K_RECOS
                filter_viewed=True,
            )
            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )

            res = {"fold": fold_info["i_split"], "model": model_name}
            res.update(metric_values)
            metrics_results.append(res)

    # Read the clock once so both printed figures describe the same interval.
    elapsed = time.time() - started_at
    print(f"Total time: {elapsed:.2f} seconds or {elapsed / 60:.2f} minutes")
    return metrics_results

In [15]:
# Three weekly folds over the full interaction log; `cv` is the splitter and
# `intr` the Interactions wrapper returned by cross_validation.
n_splits = 3
cv, intr = cross_validation(interactions, n_splits)

   user_id  item_id   datetime   weight  watched_pct
0   176549     9506 2021-05-11   4250.0         72.0
1   699317     1659 2021-05-29   8317.0        100.0
2   656683     7107 2021-05-09     10.0          0.0
3   864613     7638 2021-07-05  14483.0        100.0
4   964868     9506 2021-04-30   6725.0        100.0

Whole dataset time period: 
2021-03-13 00:00:00 2021-08-22 00:00:00

Dataset was splitted on these folds:
[(Timestamp('2021-08-02 00:00:00', freq='7D'),
  Timestamp('2021-08-09 00:00:00', freq='7D')),
 (Timestamp('2021-08-09 00:00:00', freq='7D'),
  Timestamp('2021-08-16 00:00:00', freq='7D')),
 (Timestamp('2021-08-16 00:00:00', freq='7D'),
  Timestamp('2021-08-23 00:00:00', freq='7D'))]


In [16]:
res = train_val(intr, models, metrics, cv, K_RECOS)

  0%|          | 0/3 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'i_split': 0,
 'start': Timestamp('2021-08-02 00:00:00', freq='7D'),
 'test': 263681,
 'test_items': 6602,
 'test_users': 98184,
 'train': 4266013,
 'train_items': 15237,
 'train_users': 797423}

{'end': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'i_split': 1,
 'start': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'test': 279422,
 'test_items': 6698,
 'test_users': 103511,
 'train': 4649162,
 'train_items': 15415,
 'train_users': 850489}

{'end': Timestamp('2021-08-23 00:00:00', freq='7D'),
 'i_split': 2,
 'start': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'test': 298878,
 'test_items': 6679,
 'test_users': 110076,
 'train': 5051815,
 'train_items': 15577,
 'train_users': 906071}
Total time: 124.28 seconds or 2.07 minutes


In [17]:
# Average every metric over the folds (mean and std) and compare the models.
per_fold = pd.DataFrame(res).drop(columns="fold")
pivot_results = per_fold.groupby(["model"], sort=False).agg(["mean", "std"])

# Only the "mean" columns are colour-coded: red = worst model, green = best.
mean_metric_subset = [column for column in pivot_results.columns if column[1] == 'mean']
(
    pivot_results.style
    .highlight_min(subset=mean_metric_subset, color='lightcoral', axis=0)
    .highlight_max(subset=mean_metric_subset, color='lightgreen', axis=0)
)

Unnamed: 0_level_0,prec@1,prec@1,recall@1,recall@1,prec@5,prec@5,recall@5,recall@5,prec@10,prec@10,recall@10,recall@10,mrr@1,mrr@1,mrr@5,mrr@5,mrr@10,mrr@10,map@1,map@1,map@5,map@5,map@10,map@10,novelty@1,novelty@1,novelty@5,novelty@5,novelty@10,novelty@10,serendipity@1,serendipity@1,serendipity@5,serendipity@5,serendipity@10,serendipity@10
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2
random,0.000221,3.3e-05,7.2e-05,1.9e-05,0.000202,2.8e-05,0.000365,8.4e-05,0.000193,1.9e-05,0.000693,7.6e-05,0.000221,3.3e-05,0.000485,5.5e-05,0.000604,6.1e-05,7.2e-05,1.9e-05,0.000169,3.3e-05,0.000211,3.2e-05,15.614137,0.022585,15.612989,0.01957,15.613009,0.019786,6e-06,2e-06,7e-06,1e-06,7e-06,0.0
popular,0.076432,0.006826,0.04272,0.004366,0.052402,0.001618,0.137413,0.005346,0.033903,0.001443,0.173492,0.007987,0.076432,0.006826,0.131669,0.006167,0.138603,0.006728,0.04272,0.004366,0.078295,0.00437,0.084109,0.004921,2.377055,0.023002,3.066979,0.012316,3.71339,0.002076,2e-06,0.0,3e-06,0.0,2e-06,0.0


# Визуальный анализ

Визуальный анализ

Функция будет принимать на вход:

* Инициализированную (и обученную!) модель
* Датасет
* Список отобранных user_id для просмотра
* item_data - данные об айтемах, которые важно отразить для визуального анализа (названия, жанры и количество просмотров в датасете (как для айтемов из истории взаимодействий каждого юзера, так и для айтемов из его рекомендаций))

Реализация:

* Генерим рекомендации для отобранных user_id
* Получаем из датасета истории просмотров юзеров
* Отображаем в любом удобном виде, который позволит смотреть на историю просмотра юзера + на его рекомендации от модели

In [18]:
# Model dict used for the visual analysis; it is fitted in place by train_val
# below and then handed to visual_train.
model_vis = {'popular': PopularModel()}
n_splits = 3

# Users whose history and recommendations are inspected by hand.
sample_users = [666262, 672861, 955527]

# Sort by user_id (ascending) and, within each user, by weight (descending)
df_sorted = interactions.sort_values(by=['user_id', 'weight'], ascending=[True, False])
# Keep only the 3 highest-weight interactions of every user
df_top_N_items = df_sorted.groupby('user_id').head(3)
df_final = df_top_N_items.copy()

# NOTE: this rebinds the module-level `cv`, `intr` and `res` from the metrics section.
cv, intr = cross_validation(df_final, n_splits)
res = train_val(intr, model_vis, metrics, cv, K_RECOS)

         user_id  item_id   datetime   weight  watched_pct
90113          0    15297 2021-07-19    459.0          0.0
620            0     7102 2021-07-19    169.0          3.0
67070          0    14359 2021-07-19    130.0          2.0
2354936        1    10440 2021-08-13  19579.0         80.0
1625374        1     3669 2021-08-16   1593.0         26.0

Whole dataset time period: 
2021-03-13 00:00:00 2021-08-22 00:00:00

Dataset was splitted on these folds:
[(Timestamp('2021-08-02 00:00:00', freq='7D'),
  Timestamp('2021-08-09 00:00:00', freq='7D')),
 (Timestamp('2021-08-09 00:00:00', freq='7D'),
  Timestamp('2021-08-16 00:00:00', freq='7D')),
 (Timestamp('2021-08-16 00:00:00', freq='7D'),
  Timestamp('2021-08-23 00:00:00', freq='7D'))]


  0%|          | 0/3 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'i_split': 0,
 'start': Timestamp('2021-08-02 00:00:00', freq='7D'),
 'test': 53486,
 'test_items': 3273,
 'test_users': 46072,
 'train': 1562183,
 'train_items': 10390,
 'train_users': 787156}

{'end': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'i_split': 1,
 'start': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'test': 60644,
 'test_items': 3502,
 'test_users': 51474,
 'train': 1700204,
 'train_items': 10585,
 'train_users': 843641}

{'end': Timestamp('2021-08-23 00:00:00', freq='7D'),
 'i_split': 2,
 'start': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'test': 74891,
 'test_items': 3697,
 'test_users': 61021,
 'train': 1852024,
 'train_items': 10780,
 'train_users': 902981}
Total time: 23.68 seconds or 0.39 minutes


In [19]:
def visual_train(model, dataset, selected_users, item_data, k=10, model_name='popular'):
    """Collect watch history and recommendations for a handful of users.

    Args:
        model: An already fitted model, or a dict of fitted models from which
            the entry ``model_name`` is taken.
        dataset: Interactions DataFrame; it is filtered by user and also
            passed to ``Dataset.construct``.
        selected_users: User ids to inspect.
        item_data: Column names of the module-level ``items`` table to attach;
            must contain "item_id", which is the merge key.
        k: Number of recommendations per user.
        model_name: Key to look up when ``model`` is a dict.

    Returns:
        Tuple ``(user_viewed, user_recos)``: the selected users' interactions
        and their recommendations, each merged with the chosen item columns.
    """
    # Accept both a bare model and the {name: model} dicts used in this notebook.
    fitted_model = model[model_name] if isinstance(model, dict) else model

    filtered_result = (
        dataset[dataset[Columns.User].isin(selected_users)]
        .sort_values("user_id")
        .reset_index(drop=True)
    )

    # NOTE(review): the model is queried against a Dataset built here from
    # `dataset`, which may differ from the Dataset it was fitted on; presumably
    # the internal id mappings need to agree - confirm against RecTools docs.
    dataset_vis = Dataset.construct(dataset)

    recos = fitted_model.recommend(
        users=filtered_result[Columns.User].unique(),
        dataset=dataset_vis,
        k=k,
        filter_viewed=True,
    )

    # `items` is the item metadata table loaded at module level.
    items_selected = items[item_data]
    user_viewed = filtered_result.merge(items_selected, on="item_id")
    user_recos = recos.merge(items_selected, on="item_id")

    return user_viewed, user_recos

In [20]:
# Item columns shown next to every viewed / recommended item ('item_id' is the merge key).
selected_columns = ['item_id', 'title', 'title_orig', 'genres']
user_viewed, user_recos = visual_train(model_vis, df_final, sample_users, selected_columns)

In [21]:
user_viewed.query("user_id == 666262")

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct,title,title_orig,genres
0,666262,12981,2021-05-14,10292.0,100.0,Томирис,Tomiris,"боевики, драмы, историческое, военные"
1,666262,7957,2021-05-12,2052.0,32.0,Последний викинг,The Lost Viking,"боевики, историческое, приключения"
2,666262,4785,2021-05-12,1946.0,28.0,Робин Гуд: Начало,Robin Hood,"боевики, триллеры, приключения"


In [22]:
user_recos.query("user_id == 666262")

Unnamed: 0,user_id,item_id,score,rank,title,title_orig,genres
0,666262,10440,160889.0,1,Хрустальный,Khrustal'nyy,"триллеры, детективы"
3,666262,15297,155583.0,2,Клиника счастья,Klinika schast'ya,"драмы, мелодрамы"
6,666262,6809,68753.0,3,Дуров,,документальное
9,666262,10464,63967.0,4,Вирус страха,Before the fire,"драмы, триллеры"
12,666262,12841,49922.0,5,Стражи Галактики,GUARDIANS OF THE GALAXY,"боевики, фантастика, приключения, комедии"
15,666262,16341,38847.0,6,Другой,Drugoy,мелодрамы
18,666262,2657,31480.0,7,Подслушано,Podslushano,"драмы, триллеры"
21,666262,2802,25458.0,8,Starперцы,Last Vegas,"драмы, комедии"
24,666262,8710,21103.0,9,Острые козырьки. 5 сезон,Peaky Blinders. Season 5,"драмы, криминал"
27,666262,13167,20089.0,10,Рейд 2,The Raid 2: Berandal,боевики




---



In [23]:
user_viewed.query("user_id == 672861")

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct,title,title_orig,genres
3,672861,8662,2021-05-04,6354.0,100.0,Он – дракон,Drunk Parents,фэнтези
4,672861,6870,2021-04-27,10.0,0.0,Красавица и чудовище,Beauty and the Beast,"драмы, фэнтези, музыкальные"


In [24]:
user_recos.query("user_id == 672861")

Unnamed: 0,user_id,item_id,score,rank,title,title_orig,genres
1,672861,10440,160889.0,1,Хрустальный,Khrustal'nyy,"триллеры, детективы"
4,672861,15297,155583.0,2,Клиника счастья,Klinika schast'ya,"драмы, мелодрамы"
7,672861,6809,68753.0,3,Дуров,,документальное
10,672861,10464,63967.0,4,Вирус страха,Before the fire,"драмы, триллеры"
13,672861,12841,49922.0,5,Стражи Галактики,GUARDIANS OF THE GALAXY,"боевики, фантастика, приключения, комедии"
16,672861,16341,38847.0,6,Другой,Drugoy,мелодрамы
19,672861,2657,31480.0,7,Подслушано,Podslushano,"драмы, триллеры"
22,672861,2802,25458.0,8,Starперцы,Last Vegas,"драмы, комедии"
25,672861,8710,21103.0,9,Острые козырьки. 5 сезон,Peaky Blinders. Season 5,"драмы, криминал"
28,672861,13167,20089.0,10,Рейд 2,The Raid 2: Berandal,боевики




---



In [25]:
user_viewed.query("user_id == 955527")

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct,title,title_orig,genres
5,955527,13371,2021-05-04,686.0,11.0,Пеле: Рождение легенды,Pele: Birth of a Legend(aka Pele),"драмы, спорт, биография"
6,955527,1238,2021-06-02,556.0,7.0,Диего Марадона,Diego Maradona,"спорт, биография, документальное"
7,955527,4725,2021-06-02,255.0,4.0,Лобановский навсегда,Lobanovskiy Forever,"спорт, биография, документальное"


In [26]:
user_recos.query("user_id == 955527")

Unnamed: 0,user_id,item_id,score,rank,title,title_orig,genres
2,955527,10440,160889.0,1,Хрустальный,Khrustal'nyy,"триллеры, детективы"
5,955527,15297,155583.0,2,Клиника счастья,Klinika schast'ya,"драмы, мелодрамы"
8,955527,6809,68753.0,3,Дуров,,документальное
11,955527,10464,63967.0,4,Вирус страха,Before the fire,"драмы, триллеры"
14,955527,12841,49922.0,5,Стражи Галактики,GUARDIANS OF THE GALAXY,"боевики, фантастика, приключения, комедии"
17,955527,16341,38847.0,6,Другой,Drugoy,мелодрамы
20,955527,2657,31480.0,7,Подслушано,Podslushano,"драмы, триллеры"
23,955527,2802,25458.0,8,Starперцы,Last Vegas,"драмы, комедии"
26,955527,8710,21103.0,9,Острые козырьки. 5 сезон,Peaky Blinders. Season 5,"драмы, криминал"
29,955527,13167,20089.0,10,Рейд 2,The Raid 2: Berandal,боевики
