# Libraries

In [71]:
!pip install rectools==0.4.2
!pip install lightfm
!pip install nmslib
!pip install optuna
!pip install hnswlib

Collecting hnswlib
  Downloading hnswlib-0.8.0.tar.gz (36 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: hnswlib
  Building wheel for hnswlib (pyproject.toml) ... [?25l[?25hdone
  Created wheel for hnswlib: filename=hnswlib-0.8.0-cp310-cp310-linux_x86_64.whl size=2287620 sha256=4097b86e417cf8d2dbad8b295ed1165142d9823081e4758a6ff9c35163a4c240
  Stored in directory: /root/.cache/pip/wheels/af/a9/3e/3e5d59ee41664eb31a4e6de67d1846f86d16d93c45f277c4e7
Successfully built hnswlib
Installing collected packages: hnswlib
Successfully installed hnswlib-0.8.0


In [27]:
import os
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

import warnings
warnings.filterwarnings('ignore')

In [77]:
import pandas as pd
import numpy as np
import zipfile as zf

import requests
from pathlib import Path
import typing as tp
from tqdm import tqdm

from rectools import Columns
from rectools.models import PopularModel, ImplicitALSWrapperModel, LightFMWrapperModel
from rectools.model_selection import TimeRangeSplitter
from rectools.metrics import Precision, Recall, MAP, Serendipity,MeanInvUserFreq, calc_metrics
from rectools.dataset import Dataset, Interactions
from rectools.tools import UserToItemAnnRecommender

from lightfm import LightFM

from implicit.als import AlternatingLeastSquares

import optuna

import matplotlib.pyplot as plt
import seaborn as sns

# Read data

In [29]:
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'


In [30]:
req = requests.get(url, stream=True)

with open('kion.zip', 'wb') as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

kion dataset download: 100%|██████████| 78.8M/78.8M [1:19:25<00:00, 16.5kiB/s]
kion dataset download:  98%|█████████▊| 77.6M/78.8M [00:00<00:00, 105MiB/s]

In [31]:
files = zf.ZipFile('kion.zip','r')
files.extractall()
files.close()

In [32]:
interactions= pd.read_csv('data_original/interactions.csv')

users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

In [33]:
interactions.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [34]:
Columns.Datetime = 'last_watch_dt'

interactions.drop(interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True)

interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')

max_date = interactions[Columns.Datetime].max()

interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

kion dataset download: 100%|██████████| 78.8M/78.8M [00:14<00:00, 105MiB/s]

In [35]:
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 6)
test: (490982, 6)


In [36]:
train.drop(train.query("total_dur < 300").index, inplace=True)

In [37]:
# отфильтруем холодных пользователей из теста
cold_users = set(test[Columns.User]) - set(train[Columns.User])
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

In [38]:
TEST_USERS = test[Columns.User].unique()

# Prepare features for train

## User features

In [39]:
users.fillna('Unknown', inplace=True)
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()
user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


## Item features

In [40]:
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"

content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"

item_features = pd.concat((genre_feature, content_feature))

# Create dataset

In [41]:
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

# Metrics

In [42]:
metrics_name = {
    'Precision': Precision,
    'Recall': Recall,
    'MAP': MAP,
    "novelty": MeanInvUserFreq,
    "serendipity": Serendipity,
}

metrics = {}
for metric_name, metric in metrics_name.items():
    for k in range(5, 11, 5):
        metrics[f'{metric_name}@{k}'] = metric(k=k)

metrics

{'Precision@5': Precision(k=5),
 'Precision@10': Precision(k=10),
 'Recall@5': Recall(k=5),
 'Recall@10': Recall(k=10),
 'MAP@5': MAP(k=5, divide_by_k=False),
 'MAP@10': MAP(k=10, divide_by_k=False),
 'novelty@5': MeanInvUserFreq(k=5),
 'novelty@10': MeanInvUserFreq(k=10),
 'serendipity@5': Serendipity(k=5),
 'serendipity@10': Serendipity(k=10)}

# Models

In [43]:
K_RECOS = 10
RANDOM_STATE = 42
N_EPOCHS = 1 # Lightfm

In [55]:
models = {
    'popular': PopularModel(),
}

## Hyperparameter tuning

In [48]:
def als_tuning(trial,dataset,train,test,TEST_USERS):
    metrics = {'MAP@10': MAP(k=10)}

    factors = trial.suggest_int('factors', 8, 33, 8)
    regularization = trial.suggest_float('regularization', 0.01, 0.1)
    num_threads = trial.suggest_int('num_threads', 1, 3)
    fit_features_together = trial.suggest_categorical("fit_features_together", [True, False])

    # Создание и обучение модели
    model = ImplicitALSWrapperModel(model = AlternatingLeastSquares(
                                                                    factors=factors,
                                                                    regularization=regularization,
                                                                    num_threads=num_threads),
                                    fit_features_together=fit_features_together)
    model.fit(dataset)

    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)

    return metric_values["MAP@10"]

# Создание объекта study и запуск оптимизации
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: als_tuning(trial, dataset, train, test, TEST_USERS), n_trials=20)

# Получение оптимальных параметров
best_params = study.best_params
print("Best Parameters:", best_params)

# После настройки параметров, обучите модель на полном наборе данных
als_model = AlternatingLeastSquares(factors=best_params['factors'],
                                       regularization=best_params['regularization'],
                                       num_threads=best_params['num_threads'])


[I 2023-12-05 14:44:08,749] A new study created in memory with name: no-name-1fcfeec1-ea35-4b77-9604-b3abda61f16d


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-05 14:46:09,338] Trial 0 finished with value: 0.06336022354608616 and parameters: {'factors': 32, 'regularization': 0.05381632474280163, 'num_threads': 2, 'fit_features_together': False}. Best is trial 0 with value: 0.06336022354608616.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-05 14:48:20,282] Trial 1 finished with value: 0.06317296936492346 and parameters: {'factors': 32, 'regularization': 0.06765402356673039, 'num_threads': 3, 'fit_features_together': False}. Best is trial 0 with value: 0.06336022354608616.
[I 2023-12-05 14:52:44,617] Trial 2 finished with value: 0.07386162383844731 and parameters: {'factors': 16, 'regularization': 0.08381637200302797, 'num_threads': 1, 'fit_features_together': True}. Best is trial 2 with value: 0.07386162383844731.
[I 2023-12-05 14:56:22,021] Trial 3 finished with value: 0.07475693234656557 and parameters: {'factors': 16, 'regularization': 0.013670127942617483, 'num_threads': 2, 'fit_features_together': True}. Best is trial 3 with value: 0.07475693234656557.
[I 2023-12-05 15:01:10,677] Trial 4 finished with value: 0.07422601174944209 and parameters: {'factors': 32, 'regularization': 0.044262495950964556, 'num_threads': 1, 'fit_features_together': True}. Best is trial 3 with value: 0.07475693234656557.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-05 15:03:00,147] Trial 5 finished with value: 0.06270600916535003 and parameters: {'factors': 16, 'regularization': 0.06242142123323594, 'num_threads': 3, 'fit_features_together': False}. Best is trial 3 with value: 0.07475693234656557.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-05 15:05:04,664] Trial 6 finished with value: 0.06323795828876816 and parameters: {'factors': 24, 'regularization': 0.04987582929565832, 'num_threads': 3, 'fit_features_together': False}. Best is trial 3 with value: 0.07475693234656557.
[I 2023-12-05 15:09:10,037] Trial 7 finished with value: 0.07410240560164374 and parameters: {'factors': 32, 'regularization': 0.09009368428796828, 'num_threads': 3, 'fit_features_together': True}. Best is trial 3 with value: 0.07475693234656557.
[I 2023-12-05 15:12:53,118] Trial 8 finished with value: 0.07458468746434949 and parameters: {'factors': 8, 'regularization': 0.06176816591366261, 'num_threads': 3, 'fit_features_together': True}. Best is trial 3 with value: 0.07475693234656557.
[I 2023-12-05 15:16:57,648] Trial 9 finished with value: 0.0745975969639648 and parameters: {'factors': 16, 'regularization': 0.04516333749235525, 'num_threads': 1, 'fit_features_together': True}. Best is trial 3 with value: 0.07475693234656557.
[I 2023-12-05

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-05 15:49:00,767] Trial 18 finished with value: 0.06269132896169459 and parameters: {'factors': 16, 'regularization': 0.03906549721218996, 'num_threads': 1, 'fit_features_together': False}. Best is trial 10 with value: 0.07486965500188066.
[I 2023-12-05 15:52:40,204] Trial 19 finished with value: 0.07438215525022633 and parameters: {'factors': 8, 'regularization': 0.014846321093541772, 'num_threads': 2, 'fit_features_together': True}. Best is trial 10 with value: 0.07486965500188066.


Best Parameters: {'factors': 8, 'regularization': 0.013675826109010468, 'num_threads': 2, 'fit_features_together': True}


In [52]:
def light_fm__tuning(trial,dataset,train,test,TEST_USERS):
    metrics = {'MAP@10': MAP(k=10)}

    no_components = trial.suggest_int('factors', 8, 65, 8)
    loss = trial.suggest_categorical("loss", ["logistic", "bpr", "warp"])
    learning_rate = trial.suggest_float("lr", 0.01, 0.1, log=True)
    num_threads = trial.suggest_int("num_threads", 1, 3)
    user_alpha = trial.suggest_float("user_alpha", 0, 1)
    item_alpha = trial.suggest_float("item_alpha", 0, 1)

    # Создание и обучение модели
    model = LightFMWrapperModel(LightFM(
                                        no_components=no_components,
                                        loss=loss,
                                        random_state=42,
                                        learning_rate=learning_rate,
                                        user_alpha=user_alpha,
                                        item_alpha=item_alpha),
                                epochs=1,
                                num_threads=num_threads)
    model.fit(dataset)

    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)

    return metric_values["MAP@10"]

# Создание объекта study и запуск оптимизации
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: light_fm__tuning(trial, dataset, train, test, TEST_USERS), n_trials=20)

# Получение оптимальных параметров
best_params = study.best_params
print("Best Parameters:", best_params)


[I 2023-12-05 16:02:43,919] A new study created in memory with name: no-name-c9552d22-1932-4bd8-88f2-7e239f3387b3
[I 2023-12-05 16:04:53,068] Trial 0 finished with value: 0.00014369398389893156 and parameters: {'factors': 32, 'loss': 'logistic', 'lr': 0.057861405775290274, 'num_threads': 3, 'user_alpha': 0.7722886634203069, 'item_alpha': 0.5370602033491655}. Best is trial 0 with value: 0.00014369398389893156.
[I 2023-12-05 16:07:14,301] Trial 1 finished with value: 0.06995007464787781 and parameters: {'factors': 48, 'loss': 'warp', 'lr': 0.06944051882815695, 'num_threads': 2, 'user_alpha': 0.7904668242389691, 'item_alpha': 0.601344633543883}. Best is trial 1 with value: 0.06995007464787781.
[I 2023-12-05 16:08:33,844] Trial 2 finished with value: 0.0006336876346684376 and parameters: {'factors': 48, 'loss': 'warp', 'lr': 0.013153532568004845, 'num_threads': 1, 'user_alpha': 0.9173586630267543, 'item_alpha': 0.5634872589293479}. Best is trial 1 with value: 0.06995007464787781.
[I 2023-1

Best Parameters: {'factors': 48, 'loss': 'warp', 'lr': 0.06944051882815695, 'num_threads': 2, 'user_alpha': 0.7904668242389691, 'item_alpha': 0.601344633543883}


## Add models

In [56]:
models['als']=ImplicitALSWrapperModel(
                    model=AlternatingLeastSquares(
                        factors=8,
                        random_state=RANDOM_STATE,
                        num_threads=2,
                    ),
                    fit_features_together=True)

models['lightfm']=LightFMWrapperModel(
                        model=LightFM(
                            no_components=48,
                            loss='warp',
                            random_state=RANDOM_STATE,
                            learning_rate=0.06944051882815695,
                            user_alpha=0.7904668242389691,
                            item_alpha=0.601344633543883,
                        ),
                        epochs=N_EPOCHS,
                        num_threads=2,
                    )

## ANN model

In [None]:
model = models['lightfm']
model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7b6484bb0ca0>

In [80]:
user_embeddings, item_embeddings = model.get_vectors(dataset)

In [84]:
lightfm_ann = UserToItemAnnRecommender(
    user_vectors = user_embeddings,
    item_vectors = item_embeddings,
    user_id_map = dataset.user_id_map,
    item_id_map = dataset.item_id_map
)
lightfm_ann.fit()

<rectools.tools.ann.UserToItemAnnRecommender at 0x7b6484b8f790>

# Cross validation

In [59]:
n_splits = 3

cv = TimeRangeSplitter(
    test_size="7D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [62]:
interactions_1 = Interactions(interactions)
interactions_1.df.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,weight
0,176549,9506,2021-05-11,4250,72.0,3.0
1,699317,1659,2021-05-29,8317,100.0,3.0
2,656683,7107,2021-05-09,10,0.0,1.0
3,864613,7638,2021-07-05,14483,100.0,3.0
4,964868,9506,2021-04-30,6725,100.0,3.0


In [63]:
cv.get_test_fold_borders(interactions_1)

[(Timestamp('2021-08-02 00:00:00', freq='7D'),
  Timestamp('2021-08-09 00:00:00', freq='7D')),
 (Timestamp('2021-08-09 00:00:00', freq='7D'),
  Timestamp('2021-08-16 00:00:00', freq='7D')),
 (Timestamp('2021-08-16 00:00:00', freq='7D'),
  Timestamp('2021-08-23 00:00:00', freq='7D'))]

In [64]:
def cross_validation_function(interactions, models, metrics, K_RECOS, splitter,n_splits):

  results = []

  fold_iterator = splitter.split(interactions, collect_fold_stats=True)

  for train_ids, test_ids, fold_info in tqdm((fold_iterator), total=n_splits):
      print(f"\n==================== Fold {fold_info['i_split']}")
      print(fold_info)

      df_train = interactions.df.iloc[train_ids]
      dataset = Dataset.construct(df_train)

      df_test = interactions.df.iloc[test_ids][Columns.UserItem]
      test_users = np.unique(df_test[Columns.User])

      catalog = df_train[Columns.Item].unique()

      for model_name, model in models.items():
          model.fit(dataset)
          recos = model.recommend(
              users=test_users,
              dataset=dataset,
              k=K_RECOS,
              filter_viewed=True,
          )
          metric_values = calc_metrics(
              metrics,
              reco=recos,
              interactions=df_test,
              prev_interactions=df_train,
              catalog=catalog,
          )
          res = {"fold": fold_info["i_split"], "model": model_name}
          res.update(metric_values)
          results.append(res)
  return results

In [66]:
%%time

cross_val_results = cross_validation_function(interactions_1, models, metrics, K_RECOS, cv, n_splits)


  0%|          | 0/3 [00:00<?, ?it/s][A


{'i_split': 0, 'start': Timestamp('2021-08-02 00:00:00', freq='7D'), 'end': Timestamp('2021-08-09 00:00:00', freq='7D'), 'train': 4266013, 'train_users': 797423, 'train_items': 15237, 'test': 263681, 'test_users': 98184, 'test_items': 6602}



 33%|███▎      | 1/3 [12:50<25:40, 770.22s/it][A


{'i_split': 1, 'start': Timestamp('2021-08-09 00:00:00', freq='7D'), 'end': Timestamp('2021-08-16 00:00:00', freq='7D'), 'train': 4649162, 'train_users': 850489, 'train_items': 15415, 'test': 279422, 'test_users': 103511, 'test_items': 6698}



 67%|██████▋   | 2/3 [27:23<13:50, 830.98s/it][A


{'i_split': 2, 'start': Timestamp('2021-08-16 00:00:00', freq='7D'), 'end': Timestamp('2021-08-23 00:00:00', freq='7D'), 'train': 5051815, 'train_users': 906071, 'train_items': 15577, 'test': 298878, 'test_users': 110076, 'test_items': 6679}



100%|██████████| 3/3 [44:05<00:00, 881.91s/it]

CPU times: user 55min 26s, sys: 1min 31s, total: 56min 58s
Wall time: 44min 5s





In [67]:
pivot_results = pd.DataFrame(cross_val_results).drop(columns="fold").groupby(["model"], sort=False).agg(["mean"])
mean_metric_subset = [(metric, agg) for metric, agg in pivot_results.columns if agg == 'mean']
(
    pivot_results.style
    .highlight_min(subset=mean_metric_subset, color='lightcoral', axis=0)
    .highlight_max(subset=mean_metric_subset, color='lightgreen', axis=0)
)

Unnamed: 0_level_0,Precision@5,Recall@5,Precision@10,Recall@10,MAP@5,MAP@10,novelty@5,novelty@10,serendipity@5,serendipity@10
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
popular,0.052402,0.137413,0.033903,0.173492,0.078295,0.084109,3.066979,3.71339,3e-06,2e-06
als,0.032805,0.079965,0.026108,0.123969,0.04399,0.050484,4.678178,5.090215,2.2e-05,2.5e-05
lightfm,5.3e-05,0.000117,4.9e-05,0.000184,3.5e-05,4.4e-05,15.463867,14.43764,5e-06,5e-06


In [68]:
n_splits = 3

cv2 = TimeRangeSplitter(
    test_size="14D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [69]:
%%time

cross_val_results = cross_validation_function(interactions_1, models, metrics, K_RECOS, cv2, n_splits)


  0%|          | 0/3 [00:00<?, ?it/s][A


{'i_split': 0, 'start': Timestamp('2021-07-12 00:00:00', freq='14D'), 'end': Timestamp('2021-07-26 00:00:00', freq='14D'), 'train': 3239125, 'train_users': 646423, 'train_items': 14730, 'test': 398993, 'test_users': 122488, 'test_items': 7394}



 33%|███▎      | 1/3 [08:55<17:50, 535.35s/it][A


{'i_split': 1, 'start': Timestamp('2021-07-26 00:00:00', freq='14D'), 'end': Timestamp('2021-08-09 00:00:00', freq='14D'), 'train': 3892558, 'train_users': 742256, 'train_items': 15085, 'test': 458757, 'test_users': 135624, 'test_items': 7711}



 67%|██████▋   | 2/3 [20:28<10:28, 628.28s/it][A


{'i_split': 2, 'start': Timestamp('2021-08-09 00:00:00', freq='14D'), 'end': Timestamp('2021-08-23 00:00:00', freq='14D'), 'train': 4649162, 'train_users': 850489, 'train_items': 15415, 'test': 521381, 'test_users': 151629, 'test_items': 7705}



100%|██████████| 3/3 [35:54<00:00, 718.13s/it]

CPU times: user 45min 14s, sys: 1min 52s, total: 47min 7s
Wall time: 35min 54s





In [70]:
pivot_results = pd.DataFrame(cross_val_results).drop(columns="fold").groupby(["model"], sort=False).agg(["mean"])
mean_metric_subset = [(metric, agg) for metric, agg in pivot_results.columns if agg == 'mean']
(
    pivot_results.style
    .highlight_min(subset=mean_metric_subset, color='lightcoral', axis=0)
    .highlight_max(subset=mean_metric_subset, color='lightgreen', axis=0)
)

Unnamed: 0_level_0,Precision@5,Recall@5,Precision@10,Recall@10,MAP@5,MAP@10,novelty@5,novelty@10,serendipity@5,serendipity@10
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
popular,0.070339,0.160496,0.045964,0.203517,0.091013,0.098549,3.109863,3.722852,4e-06,3e-06
als,0.039507,0.082627,0.031748,0.129827,0.047675,0.054882,4.725368,5.094197,2.5e-05,2.7e-05
lightfm,6.3e-05,6.3e-05,7.4e-05,0.000132,2.3e-05,3e-05,15.187145,14.423801,4e-06,5e-06


При изменении test size с 7D на 14D метрики изменились незначительно

# Results for service

In [85]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [88]:
unique_users = interactions[Columns.User].unique()


In [86]:
model_als = models['als']
model_als.fit(dataset)

<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7b6484bb3d30>

In [90]:
als_recos = model_als.recommend(
    users=unique_users,
    dataset=dataset,
    k=10,
    filter_viewed=True,
)[[Columns.User, Columns.Item]]

In [106]:
als_recos.to_csv('/content/drive/MyDrive/Colab Notebooks/ИТМО/Recsys/als_recos.csv')

In [87]:
model_lightfm = models['lightfm']
model_lightfm.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7b6484bb0ca0>

In [107]:
lightfm_recos = model_lightfm.recommend(
    users=unique_users,
    dataset=dataset,
    k=10,
    filter_viewed=True,
)[[Columns.User, Columns.Item]]

In [108]:
lightfm_recos.to_csv('/content/drive/MyDrive/Colab Notebooks/ИТМО/Recsys/lightfm_recos.csv')

# Что сделано:

Реализован тюнинг гиперпараметров для моделей из implicit, lightfm и rectools

Для перебора гиперпараметров использована Optuna

Используется метод приближенного поиска соседей для выдачи рекомендаций

Сделаны рекомендации для холодных пользователей используя их фичи

Проведены эксперименты с параметрами оффлайн валидации и сделаны выводы