# Импорт библиотек

In [None]:
import pandas as pd
import numpy as np

import requests
from tqdm.auto import tqdm
import zipfile as zf

from rectools import Columns
from rectools.models import RandomModel,PopularModel
from rectools.model_selection import TimeRangeSplitter
from rectools.metrics import Precision, Recall, MAP, MRR, Serendipity,MeanInvUserFreq,calc_metrics
from rectools.dataset import Interactions, Dataset

# Чтение данных

In [None]:
# Location of the zipped KION dataset; the URL addresses a fixed commit hash of the repository.
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'

In [None]:
# Stream the archive to 'kion.zip' in 1 MiB chunks so it is never held in memory whole.
# The response, the output file and the progress bar are all context-managed, so they
# are released even if the download is interrupted.
with requests.get(url, stream=True) as req:
    # Fail right away on an HTTP error instead of saving an error page as 'kion.zip'.
    req.raise_for_status()

    # Content-Length may be missing; 0 then means "size unknown".
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))

    with open('kion.zip', 'wb') as fd, tqdm(
        desc='kion dataset download',
        # None makes tqdm show plain counters instead of a bar with a bogus 0 total.
        total=total_size_in_bytes or None,
        unit='iB',
        unit_scale=True,
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)
            progress_bar.update(len(chunk))

In [None]:
# Unpack the downloaded archive into the current working directory.
# The context manager closes the archive handle even if extraction fails.
with zf.ZipFile('kion.zip', 'r') as files:
    files.extractall()

In [None]:
# Load the interaction log; the last-watch column is parsed into datetimes on read.
interactions_df = pd.read_csv(
    'data_original/interactions.csv',
    parse_dates=["last_watch_dt"],
)

# Map the dataset's own column names onto the names held in rectools' Columns.
interactions_df = interactions_df.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight,
    },
)

In [None]:
# Load the user and item tables from the data_original directory.
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

In [None]:
# Wrap the renamed frame in a rectools Interactions object.
interactions = Interactions(interactions_df)
# Last expression of the cell: shows the first rows of the wrapped frame.
interactions.df.head()

# Расчёт метрик


In [None]:
# Baseline recommenders to compare, keyed by the name used in the results.
models = {
    "random": RandomModel(random_state=32),
    "popular": PopularModel(),
}

# Ranking metrics (MAP, MRR, precision, recall) plus "beyond accuracy" metrics
# (MeanInvUserFreq reported as novelty, Serendipity), each at cutoffs 1, 5 and 10.
# Keys come out grouped by metric, then by cutoff: "MAP@1", "MAP@5", "MAP@10", "MRR@1", ...
metrics = {
    f"{label}@{k}": metric_cls(k=k)
    for label, metric_cls in (
        ("MAP", MAP),
        ("MRR", MRR),
        ("prec", Precision),
        ("recall", Recall),
        ("novelty", MeanInvUserFreq),
        ("serendipity", Serendipity),
    )
    for k in (1, 5, 10)
}

# Length of the recommendation list requested from each model.
K_RECOS = 10

In [None]:
# Number of cross-validation folds.
n_splits = 3

# Time-based splitter with a test window of "7D" (7 days) per fold.
# NOTE(review): per the rectools docs the filter_* flags remove from each test fold
# the (user, item) pairs already present in train and the users/items absent from
# train — confirm against the installed rectools version.
cv = TimeRangeSplitter(
    test_size="7D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [None]:
# Cell output: the test-fold borders the splitter derives for these interactions.
cv.get_test_fold_borders(interactions)

In [None]:
def cross_validation_function(interactions, models, metrics, K_RECOS, splitter, n_splits):
    """Fit and score every model on every fold produced by ``splitter``.

    Args:
        interactions: Object exposing the interactions frame as ``.df``; it is
            also what ``splitter.split`` receives.
        models: Mapping of model name to an object with ``fit`` and ``recommend``.
        metrics: Metrics handed to ``calc_metrics``.
        K_RECOS: Number of recommendations requested per user.
        splitter: Object whose ``split`` yields ``(train_ids, test_ids, fold_info)``.
        n_splits: Expected number of folds; used only as the progress-bar total.

    Returns:
        A list of dicts, one per (fold, model) pair, with the keys "fold" and
        "model" followed by the computed metric values.
    """
    results = []
    folds = splitter.split(interactions, collect_fold_stats=True)

    for train_ids, test_ids, fold_info in tqdm(folds, total=n_splits):
        print(f"\n==================== Fold {fold_info['i_split']}")
        print(fold_info)

        # Train part of the fold and the dataset every model is fitted on.
        df_train = interactions.df.iloc[train_ids]
        dataset = Dataset.construct(df_train)

        # Test part: only the user/item columns are kept.
        df_test = interactions.df.iloc[test_ids][Columns.UserItem]
        test_users = np.unique(df_test[Columns.User])

        # Items seen in train; passed to calc_metrics as the catalog.
        catalog = df_train[Columns.Item].unique()

        for model_name, model in models.items():
            model.fit(dataset)
            recos = model.recommend(
                users=test_users,
                dataset=dataset,
                k=K_RECOS,
                filter_viewed=True,
            )
            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )
            results.append(
                {"fold": fold_info["i_split"], "model": model_name, **metric_values}
            )

    return results

In [None]:
%%time

# Evaluate all models on all folds; the %%time cell magic above must stay the first
# line of the cell and reports how long the cell took to run.
cross_val_results = cross_validation_function(interactions, models, metrics, K_RECOS, cv, n_splits)

In [None]:
# Average every metric over the folds, one row per model (in first-seen order).
pivot_results = (
    pd.DataFrame(cross_val_results)
    .drop(columns="fold")
    .groupby(["model"], sort=False)
    .agg(["mean"])
)

# Columns are (metric, aggregation) pairs; keep those aggregated with 'mean'.
mean_metric_subset = [column for column in pivot_results.columns if column[1] == 'mean']

# Cell output: per column, the minimum is shaded lightcoral and the maximum lightgreen.
(
    pivot_results.style
    .highlight_min(subset=mean_metric_subset, color='lightcoral', axis=0)
    .highlight_max(subset=mean_metric_subset, color='lightgreen', axis=0)
)

# Визуальный анализ

In [None]:
def visualization_function(model, inter_data, user_id, item_data, item_columns, K_RECOS):
    """Display the watch history and the recommendations of each given user.

    For every user two tables are shown: the user's rows of ``inter_data`` and
    the user's recommendations, both joined with the selected item columns and
    with the number of interactions each item has in ``inter_data``.

    Args:
        model: Object with a ``recommend`` method; it is not fitted here.
        inter_data: Interactions frame with ``user_id`` and ``item_id`` columns.
        user_id: Iterable of user ids to recommend for and to display.
        item_data: Item frame containing ``item_columns``.
        item_columns: Columns of ``item_data`` to show; must include
            ``item_id`` because it is the merge key.
        K_RECOS: Number of recommendations requested per user.

    Returns:
        None. The tables are rendered with ``display``.
    """
    dataset = Dataset.construct(inter_data)
    recos = model.recommend(
        users=user_id,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )

    # Per-item interaction counts and the item attributes joined to them are the
    # same for every user, so they are built once instead of twice per user.
    # Counting only `user_id` yields the same `count` column as aggregating the
    # whole frame and then selecting `user_id`.
    item_counts = inter_data.groupby('item_id')['user_id'].agg(['count']).reset_index()
    item_info = item_data[item_columns].merge(item_counts, on='item_id')

    for user in user_id:
        print('История просмотров:')
        display(inter_data[inter_data.user_id == user].merge(item_info, on='item_id'))
        print('Рекомендации:')
        display(recos[recos.user_id == user].merge(item_info, on='item_id'))

In [None]:
# Fit a random model on a dataset built from the full interactions frame.
model = RandomModel(random_state=32)
dataset = Dataset.construct(interactions_df)
model.fit(dataset)
# User ids passed to visualization_function in the next cell.
test_users = [666262, 672861, 955527]

In [None]:
# Show history and 10 recommendations per selected user, with item id, title and genres.
visualization_function(model,interactions_df,test_users,items,['item_id','title','genres'],10)