In [1]:
!pip install rectools==0.2.0 >> None
!pip install optuna >> None

In [2]:
import pandas as pd
import numpy as np
import pyarrow.feather as feather
from rectools.model_selection import TimeRangeSplit
from rectools.dataset import Interactions
from rectools import Columns
from rectools.metrics import MAP, calc_metrics
from userknn import UserKnn
from implicit.nearest_neighbours import CosineRecommender
import scipy as sp
import joblib
import optuna
from google.colab import drive
# Mount Google Drive so that the /content/drive/... data and model paths used below resolve.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
# Locations of the three preprocessed tables. The files carry a .csv suffix
# but are read with the Feather reader below.
users_path = '/content/drive/MyDrive/Проекты/recsys/data/processed_data/users_process.csv'
items_path = '/content/drive/MyDrive/Проекты/recsys/data/processed_data/items_process.csv'
interactions_path = '/content/drive/MyDrive/Проекты/recsys/data/processed_data/interactions_process.csv'

users = feather.read_feather(users_path)
items = feather.read_feather(items_path)
interactions = feather.read_feather(interactions_path)

In [4]:
# Map the raw column names onto the names exposed by rectools' Columns.
column_mapping = {
    'user_id': Columns.User,
    'item_id': Columns.Item,
    'last_watch_dt': Columns.Datetime,
    'total_dur': Columns.Weight,
}
interactions.rename(columns=column_mapping, inplace=True)

In [8]:
# Show the column dtypes and memory usage of the interactions frame.
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   item_id      int64         
 2   datetime     datetime64[ns]
 3   weight       int64         
 4   watched_pct  Int8          
dtypes: Int8(1), datetime64[ns](1), int64(3)
memory usage: 177.6 MB


In [None]:
# Cross-validation layout: number of test folds and the length of one fold
# (n_units of the pandas frequency unit).
n_folds = 1
unit = "W"
n_units = 1

# Latest interaction timestamp, truncated to midnight.
last_date = interactions[Columns.Datetime].max().normalize()
# Step back one unit more than the folds themselves span.
lookback = pd.Timedelta(n_folds * n_units + 1, unit=unit)
start_date = last_date - lookback
print(f"Start date and last date of the test fold: {start_date, last_date}")

In [None]:
# One boundary more than the number of folds; the frequency string is e.g. "1W".
periods = n_folds + 1
freq = f"{n_units}{unit}"

date_range_kwargs = dict(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
date_range = pd.date_range(**date_range_kwargs)
print(date_range)

In [None]:
# Time-based splitter over the fold boundaries computed above.
cv = TimeRangeSplit(date_range=date_range)
n_real_folds = cv.get_n_splits(interactions)
print(f"Real number of folds: {n_real_folds}")

In [None]:
# Load the previously saved baseline UserKnn model from Drive.
# NOTE(review): this file is only written by the joblib.dump cell further down, so on a
# fresh kernel this cell works only if the artifact already exists from an earlier session.
userknn_model = joblib.load('/content/drive/MyDrive/Проекты/recsys/models/userknn_baseline.joblib')

In [None]:
# Score the pre-loaded baseline model on every time-based fold and collect
# one record (fold index, model name, MAP@10) per fold.
results = []
fold_iterator = cv.split(interactions, collect_fold_stats=True)
metrics = {'MAP@10': MAP(k=10)}
model = {'CosineRecommender': CosineRecommender()}
model_name = next(iter(model))

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    print(fold_info)

    # Training rows keep every column; test rows keep only the user/item pair.
    df_train = interactions.iloc[train_ids]
    df_test = interactions.iloc[test_ids][Columns.UserItem]
    catalog = df_train[Columns.Item].unique()

    # Per-fold re-fitting is disabled here: the already loaded userknn_model is scored as is.
    # NOTE(review): if that model was fitted on data overlapping df_test, the metric is
    # optimistic — confirm what the saved model was trained on.
    recos = userknn_model.predict(df_test, df_train, N_recs=10)

    metric_values = calc_metrics(
        metrics,
        reco=recos,
        interactions=df_test,
        prev_interactions=df_train,
        catalog=catalog,
    )

    fold = {"fold": i_fold, "model": model_name, **metric_values}
    results.append(fold)

In [None]:
# Turn the per-fold records collected in `results` into a DataFrame.
df_metrics = pd.DataFrame(results)

In [None]:
# Display the per-fold metrics table.
df_metrics

In [None]:
# Persist the metrics table in Feather format.
# NOTE(review): the target file has a .csv suffix although write_feather is used — confirm this is intended.
feather.write_feather(df_metrics, '/content/drive/MyDrive/Проекты/recsys/models/baseline_metrics.csv')

In [None]:
# Save the baseline model to Drive, at the same path it is loaded from earlier in the notebook.
joblib.dump(userknn_model, '/content/drive/MyDrive/Проекты/recsys/models/userknn_baseline.joblib')

In [None]:
# Reload the saved baseline model. This rebinds `model`, which held the
# {'CosineRecommender': ...} dict in the evaluation cell.
model = joblib.load('/content/drive/MyDrive/Проекты/recsys/models/userknn_baseline.joblib')

# Tuning

In [9]:
def objective(trial,
              data: pd.DataFrame,
              N_Folds: int,
              random_state: int) -> float:
    """
    Objective for Optuna: cross-validated MAP@10 of a UserKnn model.

    Samples the fold length (unit, n_units) and the UserKnn settings
    (n_users, n_recs), builds a time-based split over ``data``, fits a fresh
    UserKnn on every training fold and scores its recommendations.

    :param trial: optuna trial used to sample the hyperparameters
    :param data: interactions frame; the rectools user, item and datetime
        columns are read from it
    :param N_Folds: number of folds for cross validation
    :param random_state: random state; kept for interface compatibility,
        the body does not use it
    :return: MAP@10 averaged over the folds
    """
    # NOTE(review): 'bool' is only forwarded as collect_fold_stats, whose result is just
    # printed, so it looks like it cannot change the metric. The metric is MAP@10 while
    # n_recs goes up to 30, so ranks beyond 10 presumably do not contribute either.
    # Both are kept so existing studies stay compatible — consider dropping them.
    params = {'unit': trial.suggest_categorical('unit', ['D', 'W']),
              'n_units': trial.suggest_categorical('n_units', [1, 3, 5]),
              'n_users': trial.suggest_categorical('n_users', [10, 20, 50, 100]),
              'bool': trial.suggest_categorical('bool', [True, False]),
              'n_recs': trial.suggest_categorical('n_recs', [10, 20, 30])}

    # Derive the fold boundaries from `data` itself rather than from a notebook
    # global, so the positional ids yielded by the splitter always refer to the
    # same frame they are applied to below.
    periods = N_Folds + 1
    freq = f"{params['n_units']}{params['unit']}"
    last_date = data[Columns.Datetime].max().normalize()
    start_date = last_date - pd.Timedelta(N_Folds * params['n_units'] + 1, unit=params['unit'])
    date_range = pd.date_range(start=start_date,
                               periods=periods,
                               freq=freq,
                               tz=last_date.tz)

    cv = TimeRangeSplit(date_range=date_range)
    fold_iterator = cv.split(data, collect_fold_stats=params['bool'])
    metrics = {'MAP@10': MAP(k=10)}
    base_model = CosineRecommender(num_threads=4)

    results = []

    for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):

        print(f"\n==================== Fold {i_fold}")
        print(fold_info)
        # Training rows keep every column; test rows keep only the user/item pair.
        df_train = data.iloc[train_ids]
        df_test = data.iloc[test_ids][Columns.UserItem]
        catalog = df_train[Columns.Item].unique()

        userknn_model = UserKnn(model=base_model, N_users=params['n_users'])
        userknn_model.fit(df_train)

        recos = userknn_model.predict(df_test, df_train, N_recs=params['n_recs'])

        metric_values = calc_metrics(metrics,
                                     reco=recos,
                                     interactions=df_test,
                                     prev_interactions=df_train,
                                     catalog=catalog)
        results.append(metric_values['MAP@10'])

    # Plain Python float, matching the annotated return type.
    return float(np.mean(results))

In [10]:
# Number of CV folds and the random-state value handed to the objective.
n_folds, RAND = 1, 10

In [11]:
# Seed the (default) TPE sampler so that repeated runs of the search propose the same trials.
sampler = optuna.samplers.TPESampler(seed=RAND)
study = optuna.create_study(direction='maximize', study_name='Userknn', sampler=sampler)


def func(trial):
    """Bind the dataset, fold count and seed to the Optuna objective."""
    return objective(trial, interactions, n_folds, RAND)


study.optimize(func, n_trials=10, show_progress_bar=True)

[32m[I 2022-11-25 11:21:19,913][0m A new study created in memory with name: Userknn[0m
  self._init_valid()


  0%|          | 0/10 [00:00<?, ?it/s]


{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}


  0%|          | 0/842129 [00:00<?, ?it/s]

[32m[I 2022-11-25 12:08:28,996][0m Trial 0 finished with value: 0.003057269728051922 and parameters: {'unit': 'W', 'n_units': 1, 'n_users': 10, 'bool': True, 'n_recs': 20}. Best is trial 0 with value: 0.003057269728051922.[0m

{'Start date': Timestamp('2021-07-11 00:00:00', freq='5W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='5W-SUN'), 'Train': 3192875, 'Train users': 640144, 'Train items': 14711, 'Test': 850743, 'Test users': 181374, 'Test items': 8903}


  0%|          | 0/640144 [00:00<?, ?it/s]

[32m[I 2022-11-25 12:28:10,384][0m Trial 1 finished with value: 0.0037966601888796895 and parameters: {'unit': 'W', 'n_units': 5, 'n_users': 20, 'bool': True, 'n_recs': 20}. Best is trial 1 with value: 0.0037966601888796895.[0m

{'Start date': Timestamp('2021-08-18 00:00:00', freq='3D'), 'End date': Timestamp('2021-08-21 00:00:00', freq='3D'), 'Train': 5160672, 'Train users': 920722, 'Train items': 15614, 'Test': 135446, 'Test users': 63260, 'Test items': 5446}


  0%|          | 0/920722 [00:00<?, ?it/s]

[32m[I 2022-11-25 13:15:13,880][0m Trial 2 finished with value: 0.0030547392754061687 and parameters: {'unit': 'D', 'n_units': 3, 'n_users': 50, 'bool': True, 'n_recs': 10}. Best is trial 1 with value: 0.0037966601888796895.[0m

{'Start date': Timestamp('2021-08-16 00:00:00', freq='5D'), 'End date': Timestamp('2021-08-21 00:00:00', freq='5D'), 'Train': 5051815, 'Train users': 906071, 'Train items': 15577, 'Test': 207727, 'Test users': 85168, 'Test items': 6109}


  0%|          | 0/906071 [00:00<?, ?it/s]

[32m[I 2022-11-25 14:00:31,299][0m Trial 3 finished with value: 0.002908264830034007 and parameters: {'unit': 'D', 'n_units': 5, 'n_users': 20, 'bool': True, 'n_recs': 30}. Best is trial 1 with value: 0.0037966601888796895.[0m

{'Start date': Timestamp('2021-07-25 00:00:00', freq='3W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='3W-SUN')}


  0%|          | 0/734701 [00:00<?, ?it/s]

[32m[I 2022-11-25 14:27:31,565][0m Trial 4 finished with value: 0.0035037692446204865 and parameters: {'unit': 'W', 'n_units': 3, 'n_users': 50, 'bool': False, 'n_recs': 10}. Best is trial 1 with value: 0.0037966601888796895.[0m

{'Start date': Timestamp('2021-08-18 00:00:00', freq='3D'), 'End date': Timestamp('2021-08-21 00:00:00', freq='3D'), 'Train': 5160672, 'Train users': 920722, 'Train items': 15614, 'Test': 135446, 'Test users': 63260, 'Test items': 5446}


  0%|          | 0/920722 [00:00<?, ?it/s]

[32m[I 2022-11-25 15:10:42,034][0m Trial 5 finished with value: 0.002789937116049367 and parameters: {'unit': 'D', 'n_units': 3, 'n_users': 10, 'bool': True, 'n_recs': 30}. Best is trial 1 with value: 0.0037966601888796895.[0m

{'Start date': Timestamp('2021-08-18 00:00:00', freq='3D'), 'End date': Timestamp('2021-08-21 00:00:00', freq='3D')}


  0%|          | 0/920722 [00:00<?, ?it/s]

[32m[I 2022-11-25 15:52:10,523][0m Trial 6 finished with value: 0.002789937116049367 and parameters: {'unit': 'D', 'n_units': 3, 'n_users': 10, 'bool': False, 'n_recs': 20}. Best is trial 1 with value: 0.0037966601888796895.[0m

{'Start date': Timestamp('2021-08-20 00:00:00', freq='D'), 'End date': Timestamp('2021-08-21 00:00:00', freq='D'), 'Train': 5276101, 'Train users': 935851, 'Train items': 15648, 'Test': 48687, 'Test users': 29514, 'Test items': 4098}


  0%|          | 0/935851 [00:00<?, ?it/s]

[32m[I 2022-11-25 16:36:33,350][0m Trial 7 finished with value: 0.0028873064532654656 and parameters: {'unit': 'D', 'n_units': 1, 'n_users': 10, 'bool': True, 'n_recs': 30}. Best is trial 1 with value: 0.0037966601888796895.[0m

{'Start date': Timestamp('2021-07-11 00:00:00', freq='5W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='5W-SUN')}


  0%|          | 0/640144 [00:00<?, ?it/s]

[32m[I 2022-11-25 16:55:03,997][0m Trial 8 finished with value: 0.0037966601888796895 and parameters: {'unit': 'W', 'n_units': 5, 'n_users': 100, 'bool': False, 'n_recs': 30}. Best is trial 1 with value: 0.0037966601888796895.[0m

{'Start date': Timestamp('2021-07-25 00:00:00', freq='3W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='3W-SUN')}


  0%|          | 0/734701 [00:00<?, ?it/s]

[32m[I 2022-11-25 17:21:29,205][0m Trial 9 finished with value: 0.0035037692446204865 and parameters: {'unit': 'W', 'n_units': 3, 'n_users': 20, 'bool': False, 'n_recs': 20}. Best is trial 1 with value: 0.0037966601888796895.[0m


In [12]:
# Persist the finished study so the best parameters can be reloaded without re-running the search.
joblib.dump(study, '/content/drive/MyDrive/Проекты/recsys/models/study.joblib')

['/content/drive/MyDrive/Проекты/recsys/models/study.joblib']

In [13]:
# Parameters of the best trial found by the study.
study.best_params

{'unit': 'W', 'n_units': 5, 'n_users': 20, 'bool': True, 'n_recs': 20}

In [14]:
# Objective value (fold-averaged MAP@10) of the best trial.
study.best_value

0.0037966601888796895

# Train with best params


In [5]:
# Reload the saved study. The execution counts restart at this point, so this
# section appears to have been run in a separate session from the search above.
study = joblib.load('/content/drive/MyDrive/Проекты/recsys/models/study.joblib')

In [6]:
# Best parameters of the reloaded study; they drive the training cell below.
study.best_params

{'unit': 'W', 'n_units': 5, 'n_users': 20, 'bool': True, 'n_recs': 20}

In [7]:
# Rebuild the time-based split with the fold length chosen by the study, then
# fit and score a UserKnn model using the best n_users / n_recs.
best_params = study.best_params

n_folds = 1
unit = best_params['unit']
n_units = best_params['n_units']
periods = n_folds + 1
freq = f"{n_units}{unit}"

# Latest interaction timestamp truncated to midnight, then stepped back one
# unit more than the folds themselves span.
last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)
date_range_kwargs = dict(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
date_range = pd.date_range(**date_range_kwargs)

cv = TimeRangeSplit(date_range=date_range)

fold_iterator = cv.split(interactions, collect_fold_stats=True)
metrics = {'MAP@10': MAP(k=10)}
model = {'CosineRecommender': CosineRecommender(num_threads=4)}

results = []

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):

    print(f"\n==================== Fold {i_fold}")
    print(fold_info)

    # Training rows keep every column; test rows keep only the user/item pair.
    df_train = interactions.iloc[train_ids]
    df_test = interactions.iloc[test_ids][Columns.UserItem]
    catalog = df_train[Columns.Item].unique()

    # The model left in `userknn_model` after the loop is the one fitted on the
    # last fold's training part.
    userknn_model = UserKnn(model=model['CosineRecommender'], N_users=best_params['n_users'])
    userknn_model.fit(df_train)

    recos = userknn_model.predict(df_test, df_train, N_recs=best_params['n_recs'])

    metric_values = calc_metrics(
        metrics,
        reco=recos,
        interactions=df_test,
        prev_interactions=df_train,
        catalog=catalog,
    )
    results.append(metric_values['MAP@10'])



{'Start date': Timestamp('2021-07-11 00:00:00', freq='5W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='5W-SUN'), 'Train': 3192875, 'Train users': 640144, 'Train items': 14711, 'Test': 850743, 'Test users': 181374, 'Test items': 8903}


  0%|          | 0/640144 [00:00<?, ?it/s]

In [8]:
# Per-fold MAP@10 values collected by the training loop.
results

[0.0037966601888796895]

In [9]:
# Save the model fitted with the best parameters.
# NOTE(review): "tined" in the file name looks like a typo for "tuned"; the path is left
# unchanged because other code may already read this artifact — confirm before renaming.
joblib.dump(userknn_model, '/content/drive/MyDrive/Проекты/recsys/models/userknn_tined.joblib')

['/content/drive/MyDrive/Проекты/recsys/models/userknn_tined.joblib']