# Установка библиотек

In [1]:
!pip install rectools

Collecting rectools
  Downloading rectools-0.4.1-py3-none-any.whl (99 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/99.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.0/99.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting implicit<0.8.0,>=0.7.1 (from rectools)
  Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl (8.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.0.1 (from rectools)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, implicit, rectools
Successfully installed implicit-0.7.2 rectools-0.4.1 typeguard-2.13.3


In [2]:
import pandas as pd
import numpy as np
import zipfile as zf

import requests
from tqdm.auto import tqdm

from rectools import Columns
from rectools.models import PopularModel
from rectools.model_selection import TimeRangeSplitter
from rectools.metrics import MAP,Serendipity,MeanInvUserFreq, calc_metrics
from rectools.dataset import Interactions, Dataset

from implicit.nearest_neighbours import CosineRecommender,ItemItemRecommender,BM25Recommender,TFIDFRecommender



# Чтение данных

In [3]:
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'


In [4]:
req = requests.get(url, stream=True)

with open('kion.zip', 'wb') as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [5]:


files = zf.ZipFile('kion.zip','r')
files.extractall()
files.close()


In [6]:
interactions_df = pd.read_csv('data_original/interactions.csv', parse_dates=["last_watch_dt"])

interactions_df.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight
    },
    inplace=True)

In [7]:
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

In [8]:
interactions = Interactions(interactions_df)
interactions.df.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0


# Функция валидации

In [9]:
def cross_validation_function(interactions, models, metrics, K_RECOS, splitter,n_splits):

  results = []

  fold_iterator = splitter.split(interactions, collect_fold_stats=True)

  for train_ids, test_ids, fold_info in tqdm((fold_iterator), total=n_splits):
      print(f"\n==================== Fold {fold_info['i_split']}")
      print(fold_info)

      df_train = interactions.df.iloc[train_ids]
      dataset = Dataset.construct(df_train)

      df_test = interactions.df.iloc[test_ids][Columns.UserItem]
      test_users = np.unique(df_test[Columns.User])

      catalog = df_train[Columns.Item].unique()

      for model_name, model in models.items():
          model.fit(df_train)
          recos = model.predict(
              test=df_test,
              N_recs=K_RECOS,
          )
          metric_values = calc_metrics(
              metrics,
              reco=recos,
              interactions=df_test,
              prev_interactions=df_train,
              catalog=catalog,
          )
          res = {"fold": fold_info["i_split"], "model": model_name}
          res.update(metric_values)
          results.append(res)
  return results

# Эксперименты с кастомной моделю kNN с семинара



In [10]:
from typing import Dict
from collections import Counter

import pandas as pd
import numpy as np
import scipy as sp
from implicit.nearest_neighbours import ItemItemRecommender


class UserKnn():
    """Class for fit-perdict UserKNN model
       based on ItemKNN model from implicit.nearest_neighbours
    """

    def __init__(self, model: ItemItemRecommender, N_users: int = 50):
        self.N_users = N_users
        self.model = model
        self.is_fitted = False

    def get_mappings(self, train):
        self.users_inv_mapping = dict(enumerate(train['user_id'].unique()))
        self.users_mapping = {v: k for k, v in self.users_inv_mapping.items()}

        self.items_inv_mapping = dict(enumerate(train['item_id'].unique()))
        self.items_mapping = {v: k for k, v in self.items_inv_mapping.items()}

    def get_matrix(self, df: pd.DataFrame,
                   user_col: str = 'user_id',
                   item_col: str = 'item_id',
                   weight_col: str = None,
                   users_mapping: Dict[int, int] = None,
                   items_mapping: Dict[int, int] = None):

        if weight_col:
            weights = df[weight_col].astype(np.float32)
        else:
            weights = np.ones(len(df), dtype=np.float32)

        self.interaction_matrix = sp.sparse.coo_matrix((
            weights,
            (
                df[item_col].map(self.items_mapping.get),
                df[user_col].map(self.users_mapping.get)
            )
            ))

        self.watched = df\
            .groupby(user_col, as_index=False)\
            .agg({item_col: list})\
            .rename(columns={user_col: 'sim_user_id'})

        return self.interaction_matrix

    def idf(self, n: int, x: float):
        return np.log((1 + n) / (1 + x) + 1)

    def _count_item_idf(self, df: pd.DataFrame):
        item_cnt = Counter(df['item_id'].values)
        item_idf = pd.DataFrame.from_dict(item_cnt, orient='index',
                                          columns=['doc_freq']).reset_index()
        item_idf['idf'] = item_idf['doc_freq'].apply(lambda x: self.idf(self.n, x))
        self.item_idf = item_idf

    def fit(self, train: pd.DataFrame):
        self.user_knn = self.model
        self.get_mappings(train)
        self.weights_matrix = self.get_matrix(train,
                                              users_mapping=self.users_mapping,
                                              items_mapping=self.items_mapping)

        self.n = train.shape[0]
        self._count_item_idf(train)

        self.user_knn.fit(self.weights_matrix)
        self.is_fitted = True

    def _generate_recs_mapper(self, model: ItemItemRecommender, user_mapping: Dict[int, int],
                              user_inv_mapping: Dict[int, int], N: int):
        def _recs_mapper(user):
            user_id = self.users_mapping[user]
            users, sim = model.similar_items(user_id, N=N)
            return [self.users_inv_mapping[user] for user in users], sim
        return _recs_mapper

    def predict(self, test: pd.DataFrame, N_recs: int = 10):

        if not self.is_fitted:
            raise ValueError("Please call fit before predict")

        mapper = self._generate_recs_mapper(
            model=self.user_knn,
            user_mapping=self.users_mapping,
            user_inv_mapping=self.users_inv_mapping,
            N=self.N_users
        )

        recs = pd.DataFrame({'user_id': test['user_id'].unique()})
        recs['sim_user_id'], recs['sim'] = zip(*recs['user_id'].map(mapper))
        recs = recs.set_index('user_id').apply(pd.Series.explode).reset_index()

        recs = recs[~(recs['user_id'] == recs['sim_user_id'])]\
            .merge(self.watched, on=['sim_user_id'], how='left')\
            .explode('item_id')\
            .sort_values(['user_id', 'sim'], ascending=False)\
            .drop_duplicates(['user_id', 'item_id'], keep='first')\
            .merge(self.item_idf, left_on='item_id', right_on='index', how='left')

        recs['score'] = recs['sim'] * recs['idf']
        recs = recs.sort_values(['user_id', 'score'], ascending=False)
        recs['rank'] = recs.groupby('user_id').cumcount() + 1
        return recs[recs['rank'] <= N_recs][['user_id', 'item_id', 'score', 'rank']]

В models1 посмотрим, как параметр N_users влияет на точность модели

В models2 посмотрим, как виляет тип модели на точность

Также проведем эксперимент с параметром офлайн валидации и посмотрим, как влияет test_size

In [16]:
models1 = {
    "userknn_TFIDFRecommender_30": UserKnn(TFIDFRecommender(),30),
    "userknn_TFIDFRecommender_50": UserKnn(TFIDFRecommender(),50),
    "userknn_TFIDFRecommender_70": UserKnn(TFIDFRecommender(),70)
}

models2 = {
    "userknn_TFIDFRecommender_50": UserKnn(TFIDFRecommender(),50),
    "userknn_CosineRecommender_50": UserKnn(CosineRecommender(),50),
    "userknn_BM25Recommender_50": UserKnn(BM25Recommender(),50),
}


metrics = {
    "MAP@10": MAP(k=10),
    "novelty@10": MeanInvUserFreq(k=10),
    "serendipity@10": Serendipity(k=10),
}

K_RECOS = 10

In [12]:
n_splits = 3

cv = TimeRangeSplitter(
    test_size="7D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [13]:
cross_val_results = cross_validation_function(interactions, models1, metrics, K_RECOS, cv, n_splits)

  0%|          | 0/3 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-08-02 00:00:00', freq='7D'), 'end': Timestamp('2021-08-09 00:00:00', freq='7D'), 'train': 4266013, 'train_users': 797423, 'train_items': 15237, 'test': 263681, 'test_users': 98184, 'test_items': 6602}




  0%|          | 0/797423 [00:00<?, ?it/s]



  0%|          | 0/797423 [00:00<?, ?it/s]



  0%|          | 0/797423 [00:00<?, ?it/s]


{'i_split': 1, 'start': Timestamp('2021-08-09 00:00:00', freq='7D'), 'end': Timestamp('2021-08-16 00:00:00', freq='7D'), 'train': 4649162, 'train_users': 850489, 'train_items': 15415, 'test': 279422, 'test_users': 103511, 'test_items': 6698}




  0%|          | 0/850489 [00:00<?, ?it/s]



  0%|          | 0/850489 [00:00<?, ?it/s]



  0%|          | 0/850489 [00:00<?, ?it/s]


{'i_split': 2, 'start': Timestamp('2021-08-16 00:00:00', freq='7D'), 'end': Timestamp('2021-08-23 00:00:00', freq='7D'), 'train': 5051815, 'train_users': 906071, 'train_items': 15577, 'test': 298878, 'test_users': 110076, 'test_items': 6679}




  0%|          | 0/906071 [00:00<?, ?it/s]



  0%|          | 0/906071 [00:00<?, ?it/s]



  0%|          | 0/906071 [00:00<?, ?it/s]

In [14]:
pivot_results = pd.DataFrame(cross_val_results).drop(columns="fold").groupby(["model"], sort=False).agg(["mean"])
mean_metric_subset = [(metric, agg) for metric, agg in pivot_results.columns if agg == 'mean']
(
    pivot_results.style
    .highlight_min(subset=mean_metric_subset, color='lightcoral', axis=0)
    .highlight_max(subset=mean_metric_subset, color='lightgreen', axis=0)
)

Unnamed: 0_level_0,MAP@10,novelty@10,serendipity@10
Unnamed: 0_level_1,mean,mean,mean
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
userknn_TFIDFRecommender_30,0.006334,7.638598,6.5e-05
userknn_TFIDFRecommender_50,0.006334,7.638598,6.5e-05
userknn_TFIDFRecommender_70,0.006334,7.638598,6.5e-05


Как видим параметр N_users не повлиял на конечную точность модели

In [15]:
cross_val_results = cross_validation_function(interactions, models2, metrics, K_RECOS, cv, n_splits)

  0%|          | 0/3 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-08-02 00:00:00', freq='7D'), 'end': Timestamp('2021-08-09 00:00:00', freq='7D'), 'train': 4266013, 'train_users': 797423, 'train_items': 15237, 'test': 263681, 'test_users': 98184, 'test_items': 6602}




  0%|          | 0/797423 [00:00<?, ?it/s]



  0%|          | 0/797423 [00:00<?, ?it/s]



  0%|          | 0/797423 [00:00<?, ?it/s]


{'i_split': 1, 'start': Timestamp('2021-08-09 00:00:00', freq='7D'), 'end': Timestamp('2021-08-16 00:00:00', freq='7D'), 'train': 4649162, 'train_users': 850489, 'train_items': 15415, 'test': 279422, 'test_users': 103511, 'test_items': 6698}




  0%|          | 0/850489 [00:00<?, ?it/s]



  0%|          | 0/850489 [00:00<?, ?it/s]



  0%|          | 0/850489 [00:00<?, ?it/s]


{'i_split': 2, 'start': Timestamp('2021-08-16 00:00:00', freq='7D'), 'end': Timestamp('2021-08-23 00:00:00', freq='7D'), 'train': 5051815, 'train_users': 906071, 'train_items': 15577, 'test': 298878, 'test_users': 110076, 'test_items': 6679}




  0%|          | 0/906071 [00:00<?, ?it/s]



  0%|          | 0/906071 [00:00<?, ?it/s]



  0%|          | 0/906071 [00:00<?, ?it/s]

In [16]:
pivot_results = pd.DataFrame(cross_val_results).drop(columns="fold").groupby(["model"], sort=False).agg(["mean"])
mean_metric_subset = [(metric, agg) for metric, agg in pivot_results.columns if agg == 'mean']
(
    pivot_results.style
    .highlight_min(subset=mean_metric_subset, color='lightcoral', axis=0)
    .highlight_max(subset=mean_metric_subset, color='lightgreen', axis=0)
)

Unnamed: 0_level_0,MAP@10,novelty@10,serendipity@10
Unnamed: 0_level_1,mean,mean,mean
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
userknn_TFIDFRecommender_50,0.006334,7.638598,6.5e-05
userknn_CosineRecommender_50,0.003905,7.540823,6e-05
userknn_BM25Recommender_50,0.002699,9.283169,9.9e-05


Среди разных моделей лучшую точность по метрике MAP показала модель TF-IDF, ее и будем использовать в дальнейшем обучении

In [13]:
cv2 = TimeRangeSplitter(
    test_size="14D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [14]:
cross_val_results = cross_validation_function(interactions, models2, metrics, K_RECOS, cv2, n_splits)

  0%|          | 0/3 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-07-12 00:00:00', freq='14D'), 'end': Timestamp('2021-07-26 00:00:00', freq='14D'), 'train': 3239125, 'train_users': 646423, 'train_items': 14730, 'test': 398993, 'test_users': 122488, 'test_items': 7394}




  0%|          | 0/646423 [00:00<?, ?it/s]



  0%|          | 0/646423 [00:00<?, ?it/s]



  0%|          | 0/646423 [00:00<?, ?it/s]


{'i_split': 1, 'start': Timestamp('2021-07-26 00:00:00', freq='14D'), 'end': Timestamp('2021-08-09 00:00:00', freq='14D'), 'train': 3892558, 'train_users': 742256, 'train_items': 15085, 'test': 458757, 'test_users': 135624, 'test_items': 7711}




  0%|          | 0/742256 [00:00<?, ?it/s]



  0%|          | 0/742256 [00:00<?, ?it/s]



  0%|          | 0/742256 [00:00<?, ?it/s]


{'i_split': 2, 'start': Timestamp('2021-08-09 00:00:00', freq='14D'), 'end': Timestamp('2021-08-23 00:00:00', freq='14D'), 'train': 4649162, 'train_users': 850489, 'train_items': 15415, 'test': 521381, 'test_users': 151629, 'test_items': 7705}




  0%|          | 0/850489 [00:00<?, ?it/s]



  0%|          | 0/850489 [00:00<?, ?it/s]



  0%|          | 0/850489 [00:00<?, ?it/s]

In [15]:
pivot_results = pd.DataFrame(cross_val_results).drop(columns="fold").groupby(["model"], sort=False).agg(["mean"])
mean_metric_subset = [(metric, agg) for metric, agg in pivot_results.columns if agg == 'mean']
(
    pivot_results.style
    .highlight_min(subset=mean_metric_subset, color='lightcoral', axis=0)
    .highlight_max(subset=mean_metric_subset, color='lightgreen', axis=0)
)

Unnamed: 0_level_0,MAP@10,novelty@10,serendipity@10
Unnamed: 0_level_1,mean,mean,mean
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
userknn_TFIDFRecommender_50,0.006994,7.398688,6.6e-05
userknn_CosineRecommender_50,0.004255,7.320403,6.3e-05
userknn_BM25Recommender_50,0.002627,9.025611,0.000103


При использовании test_size 14 дней, а не 7, метрики MAP и serendipity стали чуть лучше, метрика novelty наоборот ухудшилась




# Что сделано:

реализован тюнинг гиперпараметров - выбрана оптимальная моделе по метрике MAP

проведены эксперименты с параметрами оффлайн валидации