# Домашнее задание №3

### Что необходимо сделать:

* сделать кол-во рекомендаций не меньше N 
* наличие тюнинга гиперпараметров (например, векторного расстояния или типов kNN моделей (implicit/rectools/...))
* другие варианты ранжирования айтемов похожих пользователей
* эксперименты с оффлайн валидацией
* в тесте вас ждут холодные пользователи. Сделайте рекомендации для них (обратите внимание на rectools.models.popular)
* блендинг моделей

## Импорты

In [1]:
# import the necessary modules
import pandas as pd
import numpy as np
import scipy as sp
import requests
from tqdm.auto import tqdm
from scipy.stats import mode 
from pprint import pprint
from pathlib import Path
from implicit.nearest_neighbours import (
    CosineRecommender, 
    TFIDFRecommender, 
)    
import warnings
warnings.filterwarnings("ignore")

from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import PopularModel, ImplicitItemKNNWrapperModel
from rectools.model_selection import TimeRangeSplit

from userknn import UserKnn


# Show every column and allow long cell text when DataFrames are displayed
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 200

# Подготовка данных

In [2]:
import os

# Directory that holds interactions.csv, users.csv and items.csv.
# Set the KION_DATA_PATH environment variable to point at your own copy of the data;
# the fallback is the original author's local directory, kept for backward compatibility.
DATA_PATH = Path(
    os.environ.get('KION_DATA_PATH', '/Users/nikitaborisov/Desktop/userknn/kion_train')
)

In [3]:
# Load the three dataset tables from CSV files into pandas DataFrames
interactions, users, items = (
    pd.read_csv(DATA_PATH / f'{table_name}.csv')
    for table_name in ('interactions', 'users', 'items')
)

In [4]:
# Rename the raw columns to the rectools `Columns` constants, then parse the timestamps
column_mapping = {'last_watch_dt': Columns.Datetime, 'total_dur': Columns.Weight}
interactions = interactions.rename(columns=column_mapping)

interactions['datetime'] = pd.to_datetime(interactions['datetime'])

In [5]:
# Earliest and latest interaction timestamps found in the interactions frame
min_date, max_date = interactions['datetime'].agg(['min', 'max'])

In [6]:
# Count missing values in each column
interactions.isna().sum()

user_id          0
item_id          0
datetime         0
weight           0
watched_pct    828
dtype: int64

In [7]:
# Keep only interactions with watched_pct above 10.
# Rows where watched_pct is missing fail the comparison and are dropped as well.
watched_enough = interactions['watched_pct'].gt(10)
interactions = interactions.loc[watched_enough]

# Создание моделей

## Модель Popular

In [8]:
# Build the dataset used for popular recommendations: interactions only,
# without user or item features
dataset = Dataset.construct(interactions, user_features_df=None, item_features_df=None)

In [9]:
# Create the popularity model and fit it on the dataset (fit returns the model itself)
popular_model = PopularModel().fit(dataset)

In [10]:
# Get 10 popular recommendations for the first user in the dataset's id map
# (already-viewed items are not filtered out) and attach item titles for readability
first_user = dataset.user_id_map.external_ids[:1]
item_titles = items[['item_id', 'title']]

popular_recos = popular_model.recommend(
    first_user, dataset=dataset, k=10, filter_viewed=False
)
popular_recos = popular_recos.merge(item_titles, on='item_id', how='left')

In [11]:
# Display the popular recommendations together with the item titles
popular_recos

Unnamed: 0,user_id,item_id,score,rank,title
0,1,15297,163104.0,1,Клиника счастья
1,1,10440,135640.0,2,Хрустальный
2,1,13865,99253.0,3,Девятаев
3,1,9728,96735.0,4,Гнев человеческий
4,1,4151,72083.0,5,Секреты семейной жизни
5,1,3734,64640.0,6,Прабабушка легкого поведения
6,1,2657,45662.0,7,Подслушано
7,1,142,35580.0,8,Маша
8,1,6809,29546.0,9,Дуров
9,1,8636,26487.0,10,Белый снег


## Модель (itemkNN -> userkNN)

## Параметры кросс-валидации

In [12]:
# Cross-validation runs on a random subset of users only
sample_fraction = 0.4
# Fixed seed so the sampled users, and therefore the reported metrics, are reproducible
random_state = 42
rng = np.random.default_rng(random_state)

# A dedicated name keeps the `users` DataFrame loaded from users.csv intact
unique_user_ids = interactions[Columns.User].unique()
size = int(sample_fraction * len(unique_user_ids))
users_sample = rng.choice(unique_user_ids, size=size, replace=False)

# NOTE(review): `interactions` itself is overwritten here, so every later cell that reads
# `interactions` (including the final model training) sees only the sampled users.
# Confirm that this is intended.
interactions = interactions[interactions[Columns.User].isin(users_sample)]

In [13]:
# Cross-validation layout: number of folds and the length of one fold (n_units * unit)
n_folds = 3
unit = "W"
n_units = 1

last_date = interactions[Columns.Datetime].max().normalize()
# Step back n_folds * n_units units plus one extra unit from the last date
lookback = pd.Timedelta(n_folds * n_units + 1, unit=unit)
start_date = last_date - lookback
print(f"Start date and last date of the test fold: {start_date, last_date}")

Start date and last date of the test fold: (Timestamp('2021-07-25 00:00:00'), Timestamp('2021-08-22 00:00:00'))


## Определение границ фолдов

In [14]:
# n_folds test folds need n_folds + 1 border dates
periods = n_folds + 1
freq = f"{n_units}{unit}"
summary_lines = [
    f"start_date: {start_date}",
    f"last_date: {last_date}",
    f"periods: {periods}",
    f"freq: {freq}",
    "",  # keeps the trailing blank line in the printed summary
]
print("\n".join(summary_lines))

date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
border_days = date_range.values.astype('datetime64[D]')
print(f"Test fold borders: {border_days}")

# Fold generator: all three filtering switches of the splitter are enabled
splitter_filters = dict(
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
cv = TimeRangeSplit(date_range=date_range, **splitter_filters)
n_real_folds = cv.get_n_splits(interactions)
print(f"Real number of folds: {n_real_folds}")

start_date: 2021-07-25 00:00:00
last_date: 2021-08-22 00:00:00
periods: 4
freq: 1W

Test fold borders: ['2021-07-25' '2021-08-01' '2021-08-08' '2021-08-15']
Real number of folds: 3


In [15]:
from rectools.metrics import Precision, Recall, MAP, MeanInvUserFreq, Serendipity, calc_metrics

# Metrics to compute; all of them use the same top-10 cut-off
TOP_K = 10
metrics = {
    "prec@10": Precision(k=TOP_K),
    "recall@10": Recall(k=TOP_K),
    "MAP@10": MAP(k=TOP_K),
    "novelty": MeanInvUserFreq(k=TOP_K),
    "serendipity": Serendipity(k=TOP_K),
}

# The two item-kNN models to compare
models = dict(
    cosine_itemknn=CosineRecommender(),
    tfidf_itemknn=TFIDFRecommender(),
)

# Neighbour counts to try
N_users = [10, 50, 100]

## Тренировка моделей по фолдам

In [16]:
%%time

results = []

cos_recos = []
tfidf_recos = []

fold_iterator = cv.split(interactions, collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    pprint(fold_info)

    df_train = interactions.iloc[train_ids].copy()
    df_test = interactions.iloc[test_ids][Columns.UserItem].copy()

    catalog = df_train[Columns.Item].unique() 
    
    for model_name, model in models.items():
        for n_users in N_users:
            userknn_model = UserKnn(model=model, N_users=n_users)
            userknn_model.fit(df_train)
    
            recos = userknn_model.predict(df_test)
            
            if model_name == 'cosine_itemknn':
                cos_recos.append(recos)
            elif model_name == 'tfidf_itemknn':
                tfidf_recos.append(recos)

                
            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )
            fold = {
                "fold": i_fold,
                "model": model_name,
                "n_users": n_users,
            }
            fold.update(metric_values)
            results.append(fold) 


{'End date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'),
 'Start date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'),
 'Test': 65742,
 'Test items': 4280,
 'Test users': 28280,
 'Train': 1027409,
 'Train items': 10930,
 'Train users': 229939}


  0%|          | 0/229939 [00:00<?, ?it/s]

  0%|          | 0/229939 [00:00<?, ?it/s]

  0%|          | 0/229939 [00:00<?, ?it/s]

  0%|          | 0/229939 [00:00<?, ?it/s]

  0%|          | 0/229939 [00:00<?, ?it/s]

  0%|          | 0/229939 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'),
 'Start date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'),
 'Test': 66805,
 'Test items': 4300,
 'Test users': 29068,
 'Train': 1122897,
 'Train items': 11108,
 'Train users': 246383}


  0%|          | 0/246383 [00:00<?, ?it/s]

  0%|          | 0/246383 [00:00<?, ?it/s]

  0%|          | 0/246383 [00:00<?, ?it/s]

  0%|          | 0/246383 [00:00<?, ?it/s]

  0%|          | 0/246383 [00:00<?, ?it/s]

  0%|          | 0/246383 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'),
 'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'),
 'Test': 71205,
 'Test items': 4424,
 'Test users': 30830,
 'Train': 1221886,
 'Train items': 11345,
 'Train users': 262950}


  0%|          | 0/262950 [00:00<?, ?it/s]

  0%|          | 0/262950 [00:00<?, ?it/s]

  0%|          | 0/262950 [00:00<?, ?it/s]

  0%|          | 0/262950 [00:00<?, ?it/s]

  0%|          | 0/262950 [00:00<?, ?it/s]

  0%|          | 0/262950 [00:00<?, ?it/s]

CPU times: user 43min 47s, sys: 46.1 s, total: 44min 33s
Wall time: 45min 42s


## Метрики

In [17]:
# Turn the list of per-run metric dicts into a table with one row per run
df_metrics = pd.DataFrame.from_records(results)

In [18]:
# Display the metrics of every (fold, model, n_users) run
df_metrics

Unnamed: 0,fold,model,n_users,prec@10,recall@10,MAP@10,novelty,serendipity
0,0,cosine_itemknn,10,0.003338,0.019392,0.004046,7.71205,4e-05
1,0,cosine_itemknn,50,0.003784,0.022524,0.004346,8.115777,4.8e-05
2,0,cosine_itemknn,100,0.003784,0.022524,0.004346,8.115777,4.8e-05
3,0,tfidf_itemknn,10,0.005569,0.031485,0.00646,7.784196,4.3e-05
4,0,tfidf_itemknn,50,0.006595,0.038579,0.007611,7.976103,5.7e-05
5,0,tfidf_itemknn,100,0.006595,0.038579,0.007611,7.976103,5.7e-05
6,1,cosine_itemknn,10,0.003258,0.018873,0.003476,7.753006,3.4e-05
7,1,cosine_itemknn,50,0.003705,0.021947,0.00385,8.151605,4.1e-05
8,1,cosine_itemknn,100,0.003705,0.021947,0.00385,8.151605,4.1e-05
9,1,tfidf_itemknn,10,0.005363,0.030935,0.006065,7.896434,4.1e-05


In [19]:
# Each list holds one recommendations DataFrame per run, in the order the training loop
# appended them (fold by fold, and within a fold by n_users).
# pd.DataFrame(list_of_frames) does not stack those frames row-wise, so they are
# concatenated instead; the position of the run in the list becomes the outer "run"
# index level, which keeps the rows of different runs distinguishable.
cos_recos = pd.concat(cos_recos, keys=range(len(cos_recos)), names=["run"])
tfidf_recos = pd.concat(tfidf_recos, keys=range(len(tfidf_recos)), names=["run"])

# Persist the stacked recommendations for later analysis
cos_recos.to_pickle("cos_recos.pkl")
tfidf_recos.to_pickle("tfidf_recos.pkl")

In [20]:
# Average every metric over the folds for each (model, n_users) combination
df_metrics = (
    df_metrics
    .drop(columns='fold')
    .groupby(['model', 'n_users'])
    .mean()
)

In [21]:
# Display the fold-averaged metrics per (model, n_users)
df_metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,prec@10,recall@10,MAP@10,novelty,serendipity
model,n_users,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
cosine_itemknn,10,0.003247,0.018726,0.00371,7.75964,3.7e-05
cosine_itemknn,50,0.003667,0.021568,0.004001,8.161817,4.5e-05
cosine_itemknn,100,0.003667,0.021568,0.004001,8.161817,4.5e-05
tfidf_itemknn,10,0.005258,0.030231,0.006039,7.876614,4.1e-05
tfidf_itemknn,50,0.006271,0.036642,0.007081,8.042001,5.3e-05
tfidf_itemknn,100,0.006271,0.036642,0.007081,8.042001,5.3e-05


Результаты кросс-валидации для двух моделей представлены выше. Можно сделать вывод, что лучшей моделью является TFIDFRecommender при 50 соседях (метрики для 100 соседей с ними совпадают). По результатам предварительных экспериментов метрики перестают меняться при числе соседей N_users >= 30, поэтому можно использовать модель с N_users = 30. Её мы и будем использовать в качестве основной для внедрения в сервис.

## Модель TFIDFRecommender (K = 30) 

Обучим модель на данных и сохраним ее для онлайн выдачи, а также получим рекомендации для офлайн выдачи

In [24]:
# Shrink the frame (and with it the saved model) by moving columns to lighter dtypes.
# A fixed astype(np.uint16) silently wraps any value above 65535, which would corrupt
# large weights; pd.to_numeric(downcast=...) instead picks the smallest dtype that can
# still hold every value actually present in the column.
unsigned_columns = ["user_id", "item_id", Columns.Weight]
lighter_columns = {
    column: pd.to_numeric(interactions[column], downcast="unsigned")
    for column in unsigned_columns
}
lighter_columns["watched_pct"] = pd.to_numeric(interactions["watched_pct"], downcast="float")

# assign() returns a new frame, so nothing is written into a filtered slice of the
# original data
interactions = interactions.assign(**lighter_columns)

In [33]:
# Train on the last month of interactions only. The cut-off is derived from the newest
# timestamp in the data instead of a hard-coded date, so it stays correct when the
# dataset is refreshed (for data ending on 2021-08-22 it is 2021-07-22 00:00:00).
training_window = pd.DateOffset(months=1)
window_start = interactions['datetime'].max() - training_window
interactions = interactions[interactions['datetime'] > window_start]

In [37]:
# Find warm and hot users (at least 5 interactions) and collect their interactions
users_count = interactions.groupby("user_id")["item_id"].count()
has_enough_history = users_count.ge(5)
warm_users = users_count.loc[has_enough_history].index.values

is_warm_row = interactions["user_id"].isin(warm_users)
warm_interactions = interactions.loc[is_warm_row].sort_values(["user_id", "datetime"])

In [39]:
# Display the interactions of warm users, sorted by user and then by time
warm_interactions

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
247496,3,9728,2021-07-23,10448,100.0
1815773,3,10440,2021-07-23,44827,90.0
4193217,3,9550,2021-07-23,2309,13.0
547297,3,16406,2021-07-24,1305,3.0
2334750,3,4151,2021-07-24,17606,88.0
...,...,...,...,...,...
4803008,1097544,4689,2021-08-09,11,0.0
2064423,1097544,16499,2021-08-20,75,1.0
3447035,1097544,15464,2021-08-20,13236,100.0
537061,1097544,13973,2021-08-21,3089,58.0


In [40]:
# Train the final user-kNN model (TF-IDF item-kNN inside, 30 neighbours) on the warm interactions
tfidf_recommender = TFIDFRecommender()
userknn_model = UserKnn(tfidf_recommender, N_users=30)
userknn_model.fit(warm_interactions)

  0%|          | 0/104883 [00:00<?, ?it/s]

In [41]:
# Save the fitted model so the service can produce online recommendations
import dill

with Path('userknn_model.dill').open('wb') as model_file:
    dill.dump(userknn_model, model_file)

In [45]:
# Compute recommendations for the warm users and save them for offline serving
warm_users_frame = pd.DataFrame({"user_id": warm_users})
userknn_recos = userknn_model.predict(warm_users_frame)

with Path('userknn_recos.dill').open('wb') as recos_file:
    dill.dump(userknn_recos, recos_file)

## Количество рекомендаций не менее N (UserKNN + Popular)

Реализовано в сервисе: для пользователей, у которых количество рекомендаций меньше N, используются рекомендации популярного.