In [None]:
!pip install rectools==0.2.0 implicit >> None

In [None]:
# Mount Google Drive (Colab only); the data and model paths used in this notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import requests
from tqdm.auto import tqdm
import scipy as sp
from scipy.stats import mode
from scipy.sparse import csr_matrix
from itertools import islice, cycle
from pprint import pprint
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
from rectools.metrics import MAP, Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics
import warnings
warnings.filterwarnings("ignore")

from rectools import Columns
from rectools.dataset import Dataset
from rectools.model_selection import TimeRangeSplit

from userknn1 import UserKnn
import dill
import joblib

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

In [None]:
np.random.seed(13)

# Подгружаем данные KION 

In [None]:
%%time
# Directory that holds the three preprocessed KION tables; change it in one place when the data moves.
DATA_DIR = '/content/drive/MyDrive/Recsys (MTC)'

# Column dtypes are passed explicitly instead of being inferred by pandas.
users = pd.read_csv(
    f'{DATA_DIR}/users_processed.csv',
    dtype={
        'user_id': 'int64',
        'age': 'category',
        'income': 'category',
        'sex': 'category',
        'kids_flg': 'bool',
    },
)
items = pd.read_csv(
    f'{DATA_DIR}/items_processed.csv',
    dtype={
        'item_id': 'int64',
        'content_type': 'category',
        'title': 'object',
        'genres': 'category',
        'countries': 'category',
        'age_rating': 'category',
        'release_year_cat': 'category',
    },
)
# last_watch_dt is parsed as a datetime while the file is read.
interactions = pd.read_csv(
    f'{DATA_DIR}/interactions_processed.csv',
    parse_dates=["last_watch_dt"],
    dtype={
        'user_id': 'int64',
        'item_id': 'int64',
        'total_dur': 'int64',
        'watched_pct': 'float16',
    },
)

CPU times: user 2.95 s, sys: 304 ms, total: 3.25 s
Wall time: 6.14 s


In [None]:
# Rename the raw columns to the names held in the rectools `Columns` constants.
# The frame is rebound instead of mutated with inplace=True, so the cell is a plain assignment.
interactions = interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight,
    }
)

# Address the column through Columns.Datetime (not the literal 'datetime') so this line
# cannot drift from the rename above.
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime])

In [None]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   item_id      int64         
 2   datetime     datetime64[ns]
 3   weight       int64         
 4   watched_pct  float16       
dtypes: datetime64[ns](1), float16(1), int64(3)
memory usage: 177.6 MB


# Эксперименты (userkNN)





## Cравнение моделей на фолдах. Количество соседей 30

In [None]:
# split into folds: 7 test folds, each n_units * unit (one week) long
n_folds = 7
unit = "W"
n_units = 1

# Midnight of the latest interaction date.
last_date = interactions[Columns.Datetime].max().normalize()
# NOTE(review): the "+ 1" moves the start one extra week back, so the 8 weekly borders built from
# it in the next cell end at 2021-08-15, one week before last_date — confirm this is intended.
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")

Start date and last date of the test fold: (Timestamp('2021-06-27 00:00:00'), Timestamp('2021-08-22 00:00:00'))


### Подготовка тестовой выборки

In [None]:
# n_folds test folds need n_folds + 1 borders, spaced `freq` apart and starting at start_date.
periods = n_folds + 1
freq = f"{n_units}{unit}"
print(
    f"start_date: {start_date}\n"
    f"last_date: {last_date}\n"
    f"periods: {periods}\n"
    f"freq: {freq}\n"
)
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# fold generator
cv = TimeRangeSplit(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
# Sanity check: how many folds the splitter actually yields for this data.
print(f"Real number of folds: {cv.get_n_splits(interactions)}")

start_date: 2021-06-27 00:00:00
last_date: 2021-08-22 00:00:00
periods: 8
freq: 1W

Test fold borders: ['2021-06-27' '2021-07-04' '2021-07-11' '2021-07-18' '2021-07-25'
 '2021-08-01' '2021-08-08' '2021-08-15']
Real number of folds: 7


In [None]:
# define the metrics; every metric is computed at k=10
metrics = {
    "MAP@10": MAP(k=10),
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

In [None]:
# define the model
# implicit's KNN recommenders keep only K neighbours per row and K defaults to 20, so with the
# default the N_users=30 requested from UserKnn in the evaluation loop is presumably capped at 20
# (TODO confirm against userknn1.UserKnn). K is set to the neighbour count the experiment is named after.
models = {
    "cosine": CosineRecommender(K=30)
}

# Обучение моделей

## Модель CosineRecommender

In [None]:
%%time
# Cross-validate every model in `models`; one metrics row per (fold, model) is collected in `results`.
results = []

fold_iterator = cv.split(interactions, collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}", fold_info, sep="\n")

    # Train keeps every column; test is reduced to the Columns.UserItem columns.
    df_train = interactions.iloc[train_ids].copy()
    df_test = interactions.iloc[test_ids].loc[:, Columns.UserItem].copy()

    # Items present in the training part of this fold.
    catalog = df_train[Columns.Item].unique()

    for model_name, model in models.items():
        userknn_model = UserKnn(model=model, N_users=30)
        userknn_model.fit(df_train)
        recos = userknn_model.predict(df_test)

        metric_values = calc_metrics(
            metrics, reco=recos, interactions=df_test, prev_interactions=df_train, catalog=catalog
        )

        # Fold number and model name first, then the metric columns.
        fold = {"fold": i_fold, "model": model_name, **metric_values}
        results.append(fold)


{'Start date': Timestamp('2021-06-27 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-07-04 00:00:00', freq='W-SUN'), 'Train': 2533586, 'Train users': 536802, 'Train items': 14092, 'Test': 237414, 'Test users': 98930, 'Test items': 5947}


  0%|          | 0/536802 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-07-04 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-07-11 00:00:00', freq='W-SUN'), 'Train': 2886800, 'Train users': 595902, 'Train items': 14357, 'Test': 211146, 'Test users': 86167, 'Test items': 6209}


  0%|          | 0/595902 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-07-11 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-07-18 00:00:00', freq='W-SUN'), 'Train': 3192875, 'Train users': 640144, 'Train items': 14711, 'Test': 214489, 'Test users': 84234, 'Test items': 6313}


  0%|          | 0/640144 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-07-18 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'Train': 3506106, 'Train users': 687200, 'Train items': 14928, 'Test': 231207, 'Test users': 87632, 'Test items': 6491}


  0%|          | 0/687200 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'Train': 3838180, 'Train users': 734701, 'Train items': 15061, 'Test': 249396, 'Test users': 93092, 'Test items': 6611}


  0%|          | 0/734701 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'Train': 4203885, 'Train users': 788721, 'Train items': 15212, 'Test': 264039, 'Test users': 98161, 'Test items': 6609}


  0%|          | 0/788721 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}


  0%|          | 0/842129 [00:00<?, ?it/s]

CPU times: user 1h 26min 59s, sys: 20.6 s, total: 1h 27min 19s
Wall time: 1h 4min 12s


In [None]:
df_metrics = pd.DataFrame(results)
df_metrics

Unnamed: 0,fold,model,prec@10,recall@10,MAP@10,novelty,serendipity
0,0,cosine,0.004879,0.027282,0.004696,7.783925,3.1e-05
1,1,cosine,0.004807,0.028028,0.004765,7.8136,3.3e-05
2,2,cosine,0.004103,0.023102,0.004082,7.95327,3.7e-05
3,3,cosine,0.003865,0.020485,0.003623,8.063779,4.4e-05
4,4,cosine,0.0037,0.019591,0.003579,8.118989,4.7e-05
5,5,cosine,0.003669,0.01941,0.003403,8.126134,4.3e-05
6,6,cosine,0.00333,0.017233,0.003142,8.185844,4.3e-05


## Модель TFIDFRecommender на фолдах

In [None]:
# define the model
# implicit's KNN recommenders keep only K neighbours per row and K defaults to 20, so with the
# default the N_users=30 requested from UserKnn in the evaluation loop is presumably capped at 20
# (TODO confirm against userknn1.UserKnn). K is set to the neighbour count used by the experiment.
models = {
    "tfidf": TFIDFRecommender(K=30)
}

In [None]:
%%time
# This cell extends `results` from the previous experiment so both models end up in one table.
# Drop rows left by an earlier run of this cell for the models evaluated here; otherwise every
# re-run would stack duplicate rows. Rows of other models are kept.
results = [row for row in results if row.get("model") not in models]

fold_iterator = cv.split(interactions, collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    print(fold_info)

    # Train keeps every column; test is reduced to the Columns.UserItem columns.
    df_train = interactions.iloc[train_ids].copy()
    df_test = interactions.iloc[test_ids][Columns.UserItem].copy()

    # Items present in the training part of this fold.
    catalog = df_train[Columns.Item].unique()

    for model_name, model in models.items():
        userknn_model = UserKnn(model=model, N_users=30)
        userknn_model.fit(df_train)
        recos = userknn_model.predict(df_test)

        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )

        fold = {"fold": i_fold, "model": model_name}
        fold.update(metric_values)
        results.append(fold)


{'Start date': Timestamp('2021-06-27 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-07-04 00:00:00', freq='W-SUN'), 'Train': 2533586, 'Train users': 536802, 'Train items': 14092, 'Test': 237414, 'Test users': 98930, 'Test items': 5947}


  0%|          | 0/536802 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-07-04 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-07-11 00:00:00', freq='W-SUN'), 'Train': 2886800, 'Train users': 595902, 'Train items': 14357, 'Test': 211146, 'Test users': 86167, 'Test items': 6209}


  0%|          | 0/595902 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-07-11 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-07-18 00:00:00', freq='W-SUN'), 'Train': 3192875, 'Train users': 640144, 'Train items': 14711, 'Test': 214489, 'Test users': 84234, 'Test items': 6313}


  0%|          | 0/640144 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-07-18 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'Train': 3506106, 'Train users': 687200, 'Train items': 14928, 'Test': 231207, 'Test users': 87632, 'Test items': 6491}


  0%|          | 0/687200 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'Train': 3838180, 'Train users': 734701, 'Train items': 15061, 'Test': 249396, 'Test users': 93092, 'Test items': 6611}


  0%|          | 0/734701 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'Train': 4203885, 'Train users': 788721, 'Train items': 15212, 'Test': 264039, 'Test users': 98161, 'Test items': 6609}


  0%|          | 0/788721 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}


  0%|          | 0/842129 [00:00<?, ?it/s]

CPU times: user 1h 28min 24s, sys: 19.5 s, total: 1h 28min 44s
Wall time: 1h 5min 35s


In [None]:
df_metrics = pd.DataFrame(results)
df_metrics

Unnamed: 0,fold,model,prec@10,recall@10,MAP@10,novelty,serendipity
0,0,cosine,0.004879,0.027282,0.004696,7.783925,3.1e-05
1,1,cosine,0.004807,0.028028,0.004765,7.8136,3.3e-05
2,2,cosine,0.004103,0.023102,0.004082,7.95327,3.7e-05
3,3,cosine,0.003865,0.020485,0.003623,8.063779,4.4e-05
4,4,cosine,0.0037,0.019591,0.003579,8.118989,4.7e-05
5,5,cosine,0.003669,0.01941,0.003403,8.126134,4.3e-05
6,6,cosine,0.00333,0.017233,0.003142,8.185844,4.3e-05
7,0,tfidf,0.008546,0.04834,0.008829,7.799313,3.5e-05
8,1,tfidf,0.008504,0.05056,0.009345,7.827066,3.9e-05
9,2,tfidf,0.00683,0.038295,0.007287,7.952497,4.2e-05


# Вывод: модель TFIDF даёт лучшие метрики, выбираем её.
Далее обучим выбранную модель на одном фолде, чтобы использовать все имеющиеся данные.

## Эксперименты с итоговой моделью

### 1. Итоговая модель на объединенной выборке (на одном фолде)

In [None]:
# train/test split with a single one-week test fold
# NOTE(review): because of the "+ 1" in start_date below, the fold borders for this data are
# 2021-08-08..2021-08-15 — the week before the last one, not the last week. Confirm this is intended.
from rectools.model_selection import TimeRangeSplit

n_folds = 1
unit = "W"
n_units = 1
periods = n_folds + 1
freq = f"{n_units}{unit}"

# Midnight of the latest interaction date.
last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds (this rebinds the `cv` used by the cross-validation cells above)
cv = TimeRangeSplit(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(interactions)}")

Start date and last date of the test fold: (Timestamp('2021-08-08 00:00:00'), Timestamp('2021-08-22 00:00:00'))
Test fold borders: ['2021-08-08' '2021-08-15']
Real number of folds: 1


In [None]:
# Only one test fold exists, so pull the first split from the generator instead of looping.
train_ids, test_ids, fold_info = next(cv.split(interactions, collect_fold_stats=True))

In [None]:
# The cross-validation cells index with .iloc (row positions), so do the same here.
# .loc only gave the same rows because `interactions` has a default RangeIndex.
train = interactions.iloc[train_ids]
test = interactions.iloc[test_ids]

In [None]:
# define the metrics; every metric is computed at k=10
metrics = {
    "MAP@10": MAP(k=10),
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

# define the model
# implicit's KNN recommenders keep only K neighbours per row and K defaults to 20. The experiments
# below ask UserKnn for up to 50 neighbours, and the recorded runs with 20, 30 and 50 neighbours
# produced identical metrics, which is consistent with the pool being capped at 20
# (TODO confirm against userknn1.UserKnn). K=50 covers the largest neighbour count tried.
models = {
    "tfidf_itemknn": TFIDFRecommender(K=50),
}


In [None]:
%%time
# Single-fold evaluation with 30 neighbours; `results` is restarted for this series of experiments.
results = []
n_users = 30  # neighbours requested from UserKnn in this run

print(f"one fold")
print(fold_info)

df_train = train.copy()
df_test = test.copy()

# Items present in the training part.
catalog = df_train[Columns.Item].unique()

for model_name, model in models.items():
    userknn_model = UserKnn(model=model, N_users=n_users)
    userknn_model.fit(df_train)
    recos = userknn_model.predict(df_test)

    metric_values = calc_metrics(
        metrics,
        reco=recos,
        interactions=df_test,
        prev_interactions=df_train,
        catalog=catalog,
    )

    # Label the row: an unlabeled row cannot be told apart once other results land in the same
    # list (it used to surface as a row with an empty `fold` in later tables).
    fold = {"fold": 0, "model": model_name, "n_users": n_users}
    fold.update(metric_values)
    results.append(fold)

one fold
{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}


  0%|          | 0/842129 [00:00<?, ?it/s]

CPU times: user 20min 56s, sys: 3.92 s, total: 21min
Wall time: 15min 12s


In [None]:
metric_values

{'prec@10': 0.005880391830010884,
 'recall@10': 0.03125837090719986,
 'MAP@10': 0.005903546874166698,
 'novelty': 8.143470656676735,
 'serendipity': 5.783574655047179e-05}

#### Вывод: на последних фолдах модель даёт более низкий скор, чем на ранних.

### Итоговая модель на одном фолде 50 соседей

In [None]:
%%time
# Single-fold evaluation with 50 neighbours; the metrics are printed, not stored in `results`.
print("one fold", fold_info, sep="\n")

df_train, df_test = train.copy(), test.copy()

# Items present in the training part.
catalog = df_train[Columns.Item].unique()

for model_name, model in models.items():
    userknn_model = UserKnn(model=model, N_users=50)
    userknn_model.fit(df_train)
    recos = userknn_model.predict(df_test)

    metric_values = calc_metrics(
        metrics, reco=recos, interactions=df_test, prev_interactions=df_train, catalog=catalog
    )

    print(model_name, metric_values, sep="\n")

one fold
{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}


  0%|          | 0/842129 [00:00<?, ?it/s]

tfidf_itemknn
{'prec@10': 0.005880391830010884, 'recall@10': 0.03125837090719986, 'MAP@10': 0.005903546874166698, 'novelty': 8.143470656676735, 'serendipity': 5.783574655047179e-05}
CPU times: user 19min 50s, sys: 3.69 s, total: 19min 54s
Wall time: 14min 34s


In [None]:
metric_values

{'prec@10': 0.005880391830010884,
 'recall@10': 0.03125837090719986,
 'MAP@10': 0.005903546874166698,
 'novelty': 8.143470656676735,
 'serendipity': 5.783574655047179e-05}

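###### Разницы между 30 и 50 соседями на одном фолде по метрикам нет

Значения совпадают до последнего знака. Вероятная причина: в запуске, результаты которого приведены выше, `TFIDFRecommender()` создавался с параметром по умолчанию `K=20`, поэтому модель хранит не более 20 соседей независимо от `N_users` (стоит проверить по коду `UserKnn`).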

### Итоговая модель на одном фолде 20 соседей

In [None]:
%%time
# Single-fold evaluation with 20 neighbours; the metrics are printed, not stored in `results`.
print("one fold", fold_info, sep="\n")

df_train, df_test = train.copy(), test.copy()

# Items present in the training part.
catalog = df_train[Columns.Item].unique()

for model_name, model in models.items():
    userknn_model = UserKnn(model=model, N_users=20)
    userknn_model.fit(df_train)
    recos = userknn_model.predict(df_test)

    metric_values = calc_metrics(
        metrics, reco=recos, interactions=df_test, prev_interactions=df_train, catalog=catalog
    )

    print(model_name, metric_values, sep="\n")

one fold
{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}


  0%|          | 0/842129 [00:00<?, ?it/s]

tfidf_itemknn
{'prec@10': 0.005880391830010884, 'recall@10': 0.03125837090719986, 'MAP@10': 0.005903546874166698, 'novelty': 8.143470656676735, 'serendipity': 5.783574655047179e-05}
CPU times: user 20min 51s, sys: 4.65 s, total: 20min 56s
Wall time: 15min 41s


In [None]:
metric_values

{'prec@10': 0.005880391830010884,
 'recall@10': 0.03125837090719986,
 'MAP@10': 0.005903546874166698,
 'novelty': 8.143470656676735,
 'serendipity': 5.783574655047179e-05}

### Итоговая модель на одном фолде 5 соседей

In [None]:
%%time
# Single-fold evaluation with 5 neighbours; the metrics are printed, not stored in `results`.
print("one fold", fold_info, sep="\n")

df_train, df_test = train.copy(), test.copy()

# Items present in the training part.
catalog = df_train[Columns.Item].unique()

for model_name, model in models.items():
    userknn_model = UserKnn(model=model, N_users=5)
    userknn_model.fit(df_train)
    recos = userknn_model.predict(df_test)

    metric_values = calc_metrics(
        metrics, reco=recos, interactions=df_test, prev_interactions=df_train, catalog=catalog
    )

    print(model_name, metric_values, sep="\n")

one fold
{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}


  0%|          | 0/842129 [00:00<?, ?it/s]

tfidf_itemknn
{'prec@10': 0.004011452889207025, 'recall@10': 0.020469959572834, 'MAP@10': 0.00404768595489729, 'novelty': 7.998270672477217, 'serendipity': 3.401568242417922e-05}
CPU times: user 20min 4s, sys: 3.81 s, total: 20min 7s
Wall time: 14min 33s


In [None]:
metric_values

{'prec@10': 0.004011452889207025,
 'recall@10': 0.020469959572834,
 'MAP@10': 0.00404768595489729,
 'novelty': 7.998270672477217,
 'serendipity': 3.401568242417922e-05}

### Вывод: 
Эксперимент с количеством соседей для модели userknn с tfidf выполняли для 30, 50, 20 и 5 соседей на одном фолде. Время расчётов практически одинаковое и составляет порядка 14–15 минут. Для 20, 30 и 50 соседей метрики не изменялись (вероятно, из-за ограничения `K=20` по умолчанию в `TFIDFRecommender` — стоит проверить), для 5 соседей — стали ниже. Для дальнейшей работы сохраним модель для 20 соседей.

In [None]:
# save model
# NOTE(review): `model` is the loop variable left over from the experiment cells. If the cells ran
# top to bottom it is the TFIDFRecommender from `models`, last used in the N_users=5 run — not a
# UserKnn wrapper configured for 20 neighbours. Confirm this is the object the service should
# load as "knn_20".
import dill

with open('/content/drive/MyDrive/ITMO_MTC_course/hw_3/knn_20.dill', 'wb') as f:
    dill.dump(model, f)

# Блендинг моделей
Если модель выдаёт меньше 10 рекомендаций, будем добавлять к ним популярные айтемы так, чтобы общее количество рекомендаций было равно 10.

# Популярное на фолдах

In [None]:
class PopularReco():
    """Popularity baseline: the most frequent items of the last `days` days of the fitted frame.

    max_K       -- how many top items `fit` keeps.
    days        -- length of the look-back window, counted back from the latest date in the data.
    item_column -- name of the column with item ids.
    dt_column   -- name of the column with interaction datetimes.
    """

    def __init__(self, max_K=10, days=7, item_column='item_id', dt_column=Columns.Datetime):
        self.max_K, self.days = max_K, days
        self.item_column, self.dt_column = item_column, dt_column
        # Empty until fit() is called.
        self.recommendations = []

    def fit(self, df):
        """Store the `max_K` most frequent items among rows newer than the window start."""
        timestamps = df[self.dt_column]
        # Window start: midnight of the latest date minus `days` days.
        min_date = timestamps.max().normalize() - pd.DateOffset(days=self.days)
        recent_items = df.loc[timestamps > min_date, self.item_column]
        # value_counts() sorts by frequency, so head() keeps the most popular items.
        self.recommendations = recent_items.value_counts().head(self.max_K).index.values

    def recommend(self, users=None, N=10):
        """Return the top-N items, or one (shared) copy of that top-N per user when `users` is given."""
        recs = self.recommendations[:N]
        if users is not None:
            # Every user gets the same popularity list.
            return [recs for _ in range(len(users))]
        return recs

In [None]:
# settings for cv: 7 one-week test folds
# Unlike the earlier split settings there is no "+ 1" here, so the fold borders run up to last_date.
n_folds = 7
unit = "W"
n_units = 1

# Midnight of the latest interaction date.
last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")

Start date and last date of the test fold: (Timestamp('2021-07-04 00:00:00'), Timestamp('2021-08-22 00:00:00'))


In [None]:
# Same dates computed in days instead of weeks (7 folds * 7 days); the displayed result matches
# the start_date/last_date printed by the previous cell.
last_date = interactions[Columns.Datetime].max().normalize()
folds = 7
start_date = last_date - pd.Timedelta(days=folds*7)
start_date, last_date

(Timestamp('2021-07-04 00:00:00'), Timestamp('2021-08-22 00:00:00'))

In [None]:
# n_folds test folds need n_folds + 1 borders, spaced `freq` apart and starting at start_date.
periods = n_folds + 1
freq = f"{n_units}{unit}"
print(
    f"start_date: {start_date}\n"
    f"last_date: {last_date}\n"
    f"periods: {periods}\n"
    f"freq: {freq}\n"
)
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds (rebinds `cv` for the popularity baseline)
cv = TimeRangeSplit(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

start_date: 2021-07-04 00:00:00
last_date: 2021-08-22 00:00:00
periods: 8
freq: 1W

Test fold borders: ['2021-07-04' '2021-07-11' '2021-07-18' '2021-07-25' '2021-08-01'
 '2021-08-08' '2021-08-15' '2021-08-22']


In [None]:
# Turn the splitter's output into a list of (train ids, test ids, fold stats) tuples.
folds_with_stats = [*cv.split(interactions, collect_fold_stats=True)]

# One row of fold statistics per fold (the third element of each tuple).
folds_info_with_stats = pd.DataFrame([fold[2] for fold in folds_with_stats])

In [None]:
folds_info_with_stats

Unnamed: 0,Start date,End date,Train,Train users,Train items,Test,Test users,Test items
0,2021-07-04,2021-07-11,2886800,595902,14357,211146,86167,6209
1,2021-07-11,2021-07-18,3192875,640144,14711,214489,84234,6313
2,2021-07-18,2021-07-25,3506106,687200,14928,231207,87632,6491
3,2021-07-25,2021-08-01,3838180,734701,15061,249396,93092,6611
4,2021-08-01,2021-08-08,4203885,788721,15212,264039,98161,6609
5,2021-08-08,2021-08-15,4587708,842129,15404,276699,101983,6715
6,2021-08-15,2021-08-22,4985269,896791,15565,297228,109382,6705


In [None]:
top_N = 10
last_n_days = 7

In [None]:
%%time
# Evaluate the popularity baseline on every fold in `folds_with_stats`.
# Rows go to `final_results`, not the `results` list left over from the userkNN experiments,
# so the table below contains only this baseline.
final_results = []

for i_fold, (train_idx, test_idx, info) in enumerate(folds_with_stats):
    print(f"\n==================== Fold {i_fold}")
    print(info)

    # Use the ids of the CURRENT fold. The stale `train_ids` / `test_ids` from the single-fold
    # split made every iteration score the same data.
    # NOTE: this rebinds the notebook-level `train` / `test` frames, as the original cell did.
    train = interactions.iloc[train_idx].copy()
    test = interactions.iloc[test_idx][Columns.UserItem].copy()

    # Items present in the training part of this fold.
    catalog = train[Columns.Item].unique()

    pop_model = PopularReco(days=last_n_days, dt_column=Columns.Datetime)
    pop_model.fit(train)

    # One row per test user holding the shared top-N list, exploded to one row per (user, item).
    recs = pd.DataFrame({'user_id': test['user_id'].unique()})
    recs['item_id'] = pop_model.recommend(recs['user_id'], N=top_N)
    recs = recs.explode('item_id')
    # Rank within a user follows the order of the popularity list (1 = most popular).
    recs['rank'] = recs.groupby('user_id').cumcount() + 1

    metric_values = calc_metrics(
        metrics,
        reco=recs,
        interactions=test,
        prev_interactions=train,
        catalog=catalog,
    )

    fold = {"fold": i_fold, "model": "popular"}
    fold.update(metric_values)
    final_results.append(fold)

# Build the summary table once, after all folds are evaluated.
df_metrics = pd.DataFrame(final_results)


{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}

{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}

{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}

{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}

{'Start date': Time

In [None]:
df_metrics

Unnamed: 0,prec@10,recall@10,MAP@10,novelty,serendipity,fold
0,0.00588,0.031258,0.005904,8.143471,5.8e-05,
1,0.037411,0.196584,0.075371,4.262035,2.7e-05,0.0
2,0.037411,0.196584,0.075371,4.262035,2.7e-05,1.0
3,0.037411,0.196584,0.075371,4.262035,2.7e-05,2.0
4,0.037411,0.196584,0.075371,4.262035,2.7e-05,3.0
5,0.037411,0.196584,0.075371,4.262035,2.7e-05,4.0
6,0.037411,0.196584,0.075371,4.262035,2.7e-05,5.0
7,0.037411,0.196584,0.075371,4.262035,2.7e-05,6.0


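Популярные ожидаемо дают высокую метрику. Обратите внимание на таблицу выше: строки 1–7 полностью совпадают, потому что цикл на каждой итерации брал одни и те же индексы `train_ids`/`test_ids` (от сплита на одном фолде), а строка 0 с пустым `fold` осталась в списке `results` от эксперимента с userkNN. Фактически это метрики одного фолда (2021-08-08 — 2021-08-15), а не семи.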

# Для сервиса

In [None]:
class PopularRecoS():
    """Service variant of the popularity model: `recommend` takes no users and returns one top-N list.

    max_K       -- how many top items `fit` keeps.
    days        -- length of the look-back window, counted back from the latest date in the data.
    item_column -- name of the column with item ids.
    dt_column   -- name of the column with interaction datetimes.
    """

    def __init__(self, max_K=10, days=7, item_column='item_id', dt_column=Columns.Datetime):
        self.max_K, self.days = max_K, days
        self.item_column, self.dt_column = item_column, dt_column
        # Empty until fit() is called.
        self.recommendations = []

    def fit(self, df):
        """Store the `max_K` most frequent items among rows newer than the window start."""
        timestamps = df[self.dt_column]
        # Window start: midnight of the latest date minus `days` days.
        min_date = timestamps.max().normalize() - pd.DateOffset(days=self.days)
        recent_items = df.loc[timestamps > min_date, self.item_column]
        # value_counts() sorts by frequency, so head() keeps the most popular items.
        self.recommendations = recent_items.value_counts().head(self.max_K).index.values

    def recommend(self, N=10):
        """Return the first N stored items."""
        return self.recommendations[:N]

In [None]:
# Popularity model for the service: 7-day window, fitted on the whole interaction log.
pop_model_7 = PopularRecoS(days=7)
pop_model_7.fit(interactions)

In [None]:
pop_model_7.recommend(10)

array([ 9728, 15297, 10440, 14488, 13865, 12192,   341,  4151,  3734,
         512])

In [None]:
# Persist the fitted popularity model with dill.
with open('/content/drive/MyDrive/ITMO_MTC_course/hw_3/pop_model_week.dill', 'wb') as f:
    dill.dump(pop_model_7, f)

У нас две модели: knn_20 и pop_week

In [None]:
# Load back the object that was dumped to knn_20.dill earlier in this notebook.
with open('/content/drive/MyDrive/ITMO_MTC_course/hw_3/knn_20.dill', 'rb') as f:
    knn_20 = dill.load(f)

In [None]:
joblib.dump(knn_20, '/content/drive/MyDrive/ITMO_MTC_course/hw_3/knn_20.joblib') # for the service

['/content/drive/MyDrive/ITMO_MTC_course/hw_3/knn_20.joblib']

In [None]:
# Load back the popularity model that was dumped to pop_model_week.dill above.
with open('/content/drive/MyDrive/ITMO_MTC_course/hw_3/pop_model_week.dill', 'rb') as f:
    pop_7 = dill.load(f)

In [None]:
joblib.dump(pop_7, '/content/drive/MyDrive/ITMO_MTC_course/hw_3/pop_week.joblib') # for the service

['/content/drive/MyDrive/ITMO_MTC_course/hw_3/pop_week.joblib']

Создадим функцию, которая дополняет рекомендации популярными айтемами до 10 штук.

In [None]:
def get_knn_pop(user_id, N=10):
    """Return at most N ids for `user_id`: knn_20 results first, popular items as padding.

    The ids returned by `knn_20.similar_items` come first, in their original order. If there are
    fewer than N of them, items from `pop_7.recommend` are appended in popularity order, skipping
    ids that are already in the list. The result is always cut to N and contains no duplicates
    introduced by the padding; it can be shorter than N when both sources run out.

    NOTE(review): in the sample call shown in this notebook the first returned id equals the
    queried user id, which suggests `similar_items` yields neighbour ids rather than item ids,
    and that `user_id` is used as the model's internal index — verify before serving.
    """
    # Each entry is indexed as a pair whose first element is the id.
    neighbours = knn_20.similar_items(user_id, N=N)
    recs = [pair[0] for pair in neighbours][:N]

    if len(recs) < N:
        seen = set(recs)
        for item in pop_7.recommend(N=N):
            if item in seen:
                continue  # already recommended by the knn model
            seen.add(item)
            recs.append(item)
            if len(recs) == N:
                break

    return recs

In [None]:
get_knn_pop(1000)

[1000, 6009, 89972, 37543, 79860, 115378, 62591, 58236, 6022, 13156]

In [None]:
pop_7.recommend()

array([ 9728, 15297, 10440, 14488, 13865, 12192,   341,  4151,  3734,
         512])

In [None]:
# Returns a list of recommendations: knn_20 results padded with popular items for users present
# in `interactions`, the popular list for everyone else.
def get_rec(user_id):
    """Return recommendations for `user_id` as a list.

    Users that occur in `interactions` are served by `get_knn_pop`; unknown users get the output
    of `pop_7.recommend()` converted to a list, so both branches return the same type.

    The set of known user ids is built on the first call and cached on the function object:
    the original code recomputed `unique()` over the whole interaction log on every call.
    Delete `get_rec._known_users` to force a rebuild after `interactions` changes.
    """
    known_users = getattr(get_rec, "_known_users", None)
    if known_users is None:
        known_users = set(interactions['user_id'].unique().tolist())
        get_rec._known_users = known_users

    if user_id in known_users:
        return get_knn_pop(user_id)
    return list(pop_7.recommend())

In [None]:
get_rec(1000)

[1000, 6009, 89972, 37543, 79860, 115378, 62591, 58236, 6022, 13156]