In [1]:
import pandas as pd
import numpy as np
import pyarrow.feather as feather
from rectools.model_selection import TimeRangeSplit
from rectools.dataset import Interactions
from rectools import Columns
from rectools.metrics import MAP, calc_metrics
from userknn import UserKnn
from implicit.nearest_neighbours import CosineRecommender
import scipy as sp

In [2]:
# Directory that holds the preprocessed tables. It is spelled out once so that
# running the notebook on another machine means editing a single line.
DATA_DIR = '/Users/dmitry/Library/CloudStorage/GoogleDrive-ceo@gangai.pro/Мой диск/Проекты/recsys/data/processed_data'

# NOTE(review): the files carry a .csv extension but are parsed with
# feather.read_feather — confirm they really are stored in Feather format.
users = feather.read_feather(f'{DATA_DIR}/users_process.csv')
items = feather.read_feather(f'{DATA_DIR}/items_process.csv')
interactions = feather.read_feather(f'{DATA_DIR}/interactions_process.csv')

In [3]:
# Map the raw column names onto the names exposed by rectools' `Columns`.
# The renamed frame is re-bound to `interactions` instead of being mutated
# with inplace=True, so the object produced by the loading cell is never
# modified in place.
interactions = interactions.rename(columns={
    'user_id': Columns.User,
    'item_id': Columns.Item,
    'last_watch_dt': Columns.Datetime,
    'total_dur': Columns.Weight,
})

In [4]:
# Print a summary of the interactions frame: index range, column names,
# dtypes and memory usage.
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   item_id      int64         
 2   datetime     datetime64[ns]
 3   weight       int64         
 4   watched_pct  Int8          
dtypes: Int8(1), datetime64[ns](1), int64(3)
memory usage: 177.6 MB


In [5]:
# Cross-validation layout: number of folds, the time unit used for stepping,
# and how many such units make up one step.
n_folds, unit, n_units = 7, "W", 1

# Midnight of the day on which the most recent interaction happened.
last_date = interactions[Columns.Datetime].max().normalize()
# Step back (n_folds * n_units + 1) units from that day.
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)
print(f"Start date and last date of the test fold: {start_date, last_date}")

Start date and last date of the test fold: (Timestamp('2021-06-27 00:00:00'), Timestamp('2021-08-22 00:00:00'))


In [6]:
# One more boundary timestamp than there are folds.
periods = 1 + n_folds
# Frequency string made of the step count followed by the unit, e.g. "1W".
freq = str(n_units) + unit

# Evenly spaced boundaries starting at `start_date`, in the same timezone
# (if any) as `last_date`.
date_range = pd.date_range(
    start_date,
    periods=periods,
    freq=freq,
    tz=last_date.tz,
)
print(date_range)

DatetimeIndex(['2021-06-27', '2021-07-04', '2021-07-11', '2021-07-18',
               '2021-07-25', '2021-08-01', '2021-08-08', '2021-08-15'],
              dtype='datetime64[ns]', freq='W-SUN')


In [7]:
# Build the time-based splitter from the boundary timestamps, then report how
# many folds it yields for this interaction log.
cv = TimeRangeSplit(date_range=date_range)
print("Real number of folds: " + str(cv.get_n_splits(interactions)))

Real number of folds: 7


In [8]:
# Evaluate the user-based kNN recommender on every time fold and collect one
# result row (fold index, model name, metric values) per fold and model.
results = []
fold_iterator = cv.split(interactions, collect_fold_stats=True)
metrics = {'MAP@10': MAP(k=10)}
# Mapping from a human-readable model name to the underlying estimator.
model = {'CosineRecommender': CosineRecommender()}

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    print(fold_info)
    df_train = interactions.iloc[train_ids]
    # Only the user/item columns of the test rows are kept.
    df_test = interactions.iloc[test_ids][Columns.UserItem]
    # Distinct items present in the training part of this fold.
    catalog = df_train[Columns.Item].unique()

    for model_name, base_model in model.items():
        # NOTE(review): the same estimator instance is passed to UserKnn on
        # every fold — confirm that UserKnn.fit fully re-initialises it.
        # N_users is presumably the neighbourhood size; verify in userknn.
        userknn_model = UserKnn(model=base_model, N_users=50)
        userknn_model.fit(df_train)

        recos = userknn_model.predict(df_test, df_train)

        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )

        # Store the model's name, not the whole `model` dict: the dict holds
        # estimator objects and would make the results impossible to group
        # or tabulate by model.
        fold = {"fold": i_fold, "model": model_name}
        fold.update(metric_values)
        results.append(fold)


{'Start date': Timestamp('2021-06-27 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-07-04 00:00:00', freq='W-SUN'), 'Train': 2533586, 'Train users': 536802, 'Train items': 14092, 'Test': 237414, 'Test users': 98930, 'Test items': 5947}


  0%|          | 0/536802 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-07-04 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-07-11 00:00:00', freq='W-SUN'), 'Train': 2886800, 'Train users': 595902, 'Train items': 14357, 'Test': 211146, 'Test users': 86167, 'Test items': 6209}


  0%|          | 0/595902 [00:00<?, ?it/s]

KeyboardInterrupt: 