In [2]:
# import sys
# !{sys.executable} -m pip install rectools==0.2.0

In [None]:
# pip install mlflow
# pip install implicit
# pip install rectools

In [2]:
!databricks configure --host https://community.cloud.databricks.com/

Username: robert.zaraev@mail.ru
Password: 
Repeat for confirmation: 


In [1]:
!python3 --version

Python 3.9.16


In [7]:
import os
import pickle
import random
import warnings

import numpy as np
import pandas as pd
from implicit.nearest_neighbours import (
    BM25Recommender,
    CosineRecommender,
    TFIDFRecommender,
    ItemItemRecommender
)
# from implicit.nearest_neighbours import ItemItemRecommender
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import (
    MAP,
    MeanInvUserFreq,
    Precision,
    Recall,
    Serendipity,
    calc_metrics,
)
from rectools.model_selection import TimeRangeSplitter
from rectools.models import ImplicitItemKNNWrapperModel
from rectools.models.popular import PopularModel
from rectools.dataset import Interactions

from typing import Dict, List, Optional, Set, Tuple

import os
import time
warnings.filterwarnings("ignore")

In [8]:
import mlflow
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/Users/robert.zaraev@mail.ru/ITMO")

<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/3460239350097604', creation_time=1682435082087, experiment_id='3460239350097604', last_update_time=1682440668780, lifecycle_stage='active', name='/Users/robert.zaraev@mail.ru/ITMO', tags={'mlflow.experiment.sourceName': '/Users/robert.zaraev@mail.ru/ITMO',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'robert.zaraev@mail.ru',
 'mlflow.ownerId': '8185111949434688'}>

In [9]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

seed = 42
random.seed(seed)
np.random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)

In [19]:
# download dataset by chunks
# !wget https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip -O ../data/data_original.zip
# !unzip ../data/data_original.zip -d ../data

In [10]:
interactions = pd.read_csv('/content/interactions.csv')
users = pd.read_csv('/content/users.csv')
items = pd.read_csv('/content/items.csv')

# rename columns, convert timestamp
interactions.rename(columns={'last_watch_dt': Columns.Datetime,
                            'total_dur': Columns.Weight},
                    inplace=True)

interactions['datetime'] = pd.to_datetime(interactions['datetime'])

## Train test split

In [11]:
# train test split
# test = last 1 week
n_folds = 1
unit = "W"
n_units = 1
periods = n_folds + 1

last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  # TimeDelta возвращает длительность промежутка между датами
print(f"Start date and last date of the test fold: {start_date, last_date}")

date_range = pd.date_range(start=start_date, periods=periods, freq=unit, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(Interactions(interactions))}")

Start date and last date of the test fold: (Timestamp('2021-08-08 00:00:00'), Timestamp('2021-08-22 00:00:00'))
Test fold borders: ['2021-08-08' '2021-08-15']
Real number of folds: 1


In [12]:
(train_ids, test_ids, fold_info) = cv.split(Interactions(interactions), collect_fold_stats=True).__next__()

In [13]:
train = interactions.loc[train_ids].reset_index(drop=True)
test = interactions.loc[test_ids].reset_index(drop=True)

In [14]:
train.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0


In [25]:
# Create dataset
train_df = Dataset.construct(
    train,
)

In [26]:
metrics = {
    "mAP@10": MAP(k=10),
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

catalog = train['item_id'].unique()

In [27]:
N = 10 # Количество рекомендаций

In [28]:
def return_weight(file_name:str):
  os.stat(file_name)
  return(os.stat(file_name).st_size / (1024 * 1024))

### Трекинг (3 метрики качества, 3 тех метрики, артифакты, можно проследить качество моделей исходя из кол-ва соседей)

In [73]:
neighbours = [5, 10, 20] # Обучим модели на разном количестве соседей

# Метрики будем складывать в список, чтобы потом поместить их в таблицу
tfidf = []
bm25 = []
cossim = []

mlflow.start_run()

for i in neighbours:

    # Fit model
    start = time.time()
    model_tfidf = ImplicitItemKNNWrapperModel(TFIDFRecommender(K=i))
    model_tfidf.fit(train_df)
    time_tfidf = time.time() - start 
    mlflow.log_metric('learning_time_tfidf', time_tfidf)
    print('time_tfidf - ', time_tfidf)

    # Save model
    path_tfidf = 'model_tfidf_' + str(i) + '.pickle'
    with open(path_tfidf, 'wb') as pickle_out:
        pickle.dump(model_tfidf, pickle_out)
    mlflow.log_metric('weight_mb_tfidf', return_weight(path_tfidf))
    print(return_weight(path_tfidf))

    # Make recommendations
    start = time.time()
    recos_tfidf = model_tfidf.recommend(
        users=train[Columns.User].unique(),
        dataset=train_df,
        k=N,
        filter_viewed=True,
    )
    recos_time_tfidf = time.time() - start
    mlflow.log_metric("recos_time_tfidf", recos_time_tfidf)


    # Fit model
    start = time.time()
    model_bm25 = ImplicitItemKNNWrapperModel(BM25Recommender(K=i, K1=2)) # Изменение коэффициентов K1 и b особой роли не играет
    model_bm25.fit(train_df)
    time_bm25 = time.time() - start
    mlflow.log_metric('learning_time_bm25', time_tfidf)
    print('time_bm25 - ', time_bm25)

    # Save model
    path_bm25 = 'model_bm25_' + str(i) + '.pickle'
    with open(path_bm25, 'wb') as pickle_out:
        pickle.dump(model_tfidf, pickle_out)
    mlflow.log_metric('weight_mb_bm25', return_weight(path_bm25))
    print(return_weight(path_bm25))

    # Make recommendations
    start = time.time()
    recos_bm25 = model_bm25.recommend(
        users=train[Columns.User].unique(),
        dataset=train_df,
        k=N,
        filter_viewed=True,
    )
    recos_time_bm25 = time.time() - start
    mlflow.log_metric("recos_time_bm25", recos_time_bm25)

    # Fit model
    start = time.time()
    model_cossim = ImplicitItemKNNWrapperModel(CosineRecommender(K=i)) 
    model_cossim.fit(train_df)
    time_cossim = time.time() - start
    mlflow.log_metric('learning_time_cossim', time_cossim)
    print('time_cossim - ', time_cossim )

    # Save model
    path_cossim = 'model_cossim_' + str(i) + '.pickle'
    with open(path_cossim, 'wb') as pickle_out:
        pickle.dump(model_tfidf, pickle_out)
    mlflow.log_metric('weight_mb_cossim', return_weight(path_cossim))
    print(return_weight(path_cossim))

    # Make recommendations
    start = time.time()
    recos_cossim = model_cossim.recommend(
        users=train[Columns.User].unique(),
        dataset=train_df,
        k=N,
        filter_viewed=True,
    )
    recos_time_bm25 = time.time() - start
    mlflow.log_metric("recos_time_cossim", recos_time_bm25)

    metric_values_tfidf = calc_metrics(
        metrics,
        reco=recos_tfidf,
        interactions=test,
        prev_interactions=train,
        catalog=catalog
    )

    metric_values_bm25 = calc_metrics(
        metrics,
        reco=recos_bm25,
        interactions=test,
        prev_interactions=train,
        catalog=catalog
    )

    metric_values_cossim = calc_metrics(
        metrics,
        reco=recos_cossim,
        interactions=test,
        prev_interactions=train,
        catalog=catalog
    )
    mlflow.log_metric('prec@10_tfidf', metric_values_tfidf['prec@10'])
    mlflow.log_metric('recall@10_tfidf', metric_values_tfidf['recall@10'])
    mlflow.log_metric('mAP@10_tfidf', metric_values_tfidf['mAP@10'])

    mlflow.log_metric('prec@10_bm25', metric_values_bm25['prec@10'])
    mlflow.log_metric('recall@10_bm25', metric_values_bm25['recall@10'])
    mlflow.log_metric('mAP@10_bm25', metric_values_bm25['mAP@10'])

    mlflow.log_metric('prec@10_cossim', metric_values_cossim['prec@10'])
    mlflow.log_metric('recall@10_cossim', metric_values_cossim['recall@10'])
    mlflow.log_metric('mAP@10_cossim', metric_values_cossim['mAP@10'])

    mlflow.log_artifact(path_tfidf, 'kNN_tfidf_' + str(i) + 'neigh')
    mlflow.log_artifact(path_bm25, 'kNN_bm25_' + str(i) + 'neigh')
    mlflow.log_artifact(path_cossim, 'kNN_cossim_' + str(i) + 'neigh')

    tfidf.append(metric_values_tfidf)
    bm25.append(metric_values_bm25)
    cossim.append(metric_values_cossim)

mlflow.end_run()

time_tfidf -  2.2707202434539795
0.9073276519775391
time_bm25 -  2.2333545684814453
0.9073276519775391
time_cossim -  2.2912590503692627
0.9073276519775391
time_tfidf -  2.181783437728882
1.738351821899414
time_bm25 -  2.1892642974853516
1.738351821899414
time_cossim -  2.2243709564208984
1.738351821899414
time_tfidf -  2.127289295196533
3.356466293334961
time_bm25 -  2.072434902191162
3.356466293334961
time_cossim -  2.4899699687957764
3.356466293334961


In [32]:
dftfidf = pd.DataFrame(tfidf, index=['tfidf (k = 5)', 'tfidf (k = 10)', 'tfidf (k = 20)'])
dfbm25 = pd.DataFrame(bm25, index=['bm25 (k = 5)', 'bm25 (k = 10)', 'bm25 (k = 20)'])
dfcossim = pd.DataFrame(cossim, index=['cossim (k = 5)', 'cossim (k = 10)', 'cossim (k = 20)'])

In [33]:
metricstable = pd.concat([dftfidf, dfbm25, dfcossim])
metricstable

Unnamed: 0,prec@10,recall@10,mAP@10,novelty,serendipity
tfidf (k = 5),0.022427,0.118485,0.060637,8.533639,2.2e-05
tfidf (k = 10),0.02808,0.150242,0.0671,8.030628,1.9e-05
tfidf (k = 20),0.029198,0.155554,0.068653,7.796688,1.8e-05
bm25 (k = 5),0.028065,0.148859,0.080727,3.995606,1.3e-05
bm25 (k = 10),0.034657,0.187755,0.087131,4.208611,7e-06
bm25 (k = 20),0.034678,0.189522,0.087413,4.190081,5e-06
cossim (k = 5),0.014441,0.081689,0.043166,10.451231,1.2e-05
cossim (k = 10),0.018092,0.101092,0.047973,10.08327,1.1e-05
cossim (k = 20),0.020323,0.113794,0.050551,9.796017,1.1e-05


По метрикам лучше всего себя показал BM25 (ожидаемо) и хуже всего обычное косинусное расстояние.

# Выгружаем эксперементы и лучшую модель

In [45]:
mlflow.search_runs()

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.recall@10_cossim,metrics.prec@10_cossim,metrics.recall@10_tfidf,metrics.prec@10_tfidf,metrics.mAP@10_tfidf,metrics.prec@10_bm25,metrics.mAP@10_cossim,metrics.mAP@10_bm25,metrics.recall@10_bm25,metrics.bm25_recall@10_bm25,params.weight_mb_bm25_10neigh,params.weight_mb_tfidf_10neigh,params.weight_mb_cossim_10neigh,params.recos_time_cossim_5neigh,params.learning_time_cossim_20neigh,params.weight_mb_tfidf_5neigh,params.recos_time_bm25_20neigh,params.learning_time_tfidf_20neigh,params.learning_time_tfidf_5neigh,params.recos_time_bm25_5neigh,params.recos_time_tfidf_5neigh,params.weight_mb_cossim_20neigh,params.weight_mb_bm25_20neigh,params.learning_time_bm25_20neigh,params.learning_time_bm25_5neigh,params.recos_time_tfidf_10neigh,params.recos_time_tfidf_20neigh,params.learning_time_cossim_5neigh,params.recos_time_bm25_10neigh,params.learning_time_cossim_10neigh,params.weight_mb_tfidf_20neigh,params.recos_time_cossim_20neigh,params.recos_time_cossim_10neigh,params.weight_mb_cossim_5neigh,params.learning_time_bm25_10neigh,params.learning_time_tfidf_10neigh,params.weight_mb_bm25_5neigh,params.recall@10_cossim,params.prec@10_cossim,params.bm25_recall@10_bm25,params.recall@10_tfidf,params.prec@10_tfidf,params.mAP@10_tfidf,params.prec@10_bm25,params.mAP@10_cossim,params.mAP@10_bm25,tags.mlflow.source.type,tags.mlflow.user,tags.mlflow.runName,tags.mlflow.source.name
0,79efe1ca1e6e4b84a7d3480e236ff4d3,3460239350097604,FINISHED,dbfs:/databricks/mlflow-tracking/3460239350097604/79efe1ca1e6e4b84a7d3480e236ff4d3/artifacts,2023-04-25 22:59:57.929000+00:00,2023-04-25 23:20:11.659000+00:00,0.113794,0.020323,0.155554,0.029198,0.068653,0.034678,0.050551,0.087413,0.189522,,1.738351821899414,1.738351821899414,1.738351821899414,98.3430597782135,1.975185871124268,0.9073276519775392,126.1350440979004,2.7743396759033203,1.911855936050415,91.83148717880248,100.30975079536438,3.356466293334961,3.356466293334961,2.7743396759033203,1.911855936050415,123.38590145111084,130.7483367919922,1.8656017780303955,121.9310245513916,2.438255786895752,3.356466293334961,130.73950219154358,126.01592826843262,0.9073276519775392,2.425926685333252,2.425926685333252,0.9073276519775392,,,,,,,,,,LOCAL,robert.zaraev@mail.ru,thoughtful-bird-369,/usr/local/lib/python3.9/dist-packages/ipykernel_launcher.py
1,10836bbfda154627918fbcf9b1ccdef6,3460239350097604,FINISHED,dbfs:/databricks/mlflow-tracking/3460239350097604/10836bbfda154627918fbcf9b1ccdef6/artifacts,2023-04-25 16:37:48.780000+00:00,2023-04-25 16:56:22.199000+00:00,0.097986,0.016172,0.138842,0.024192,0.058766,0.030773,0.04121,0.079598,,0.178912,1.6741352081298828,1.6741352081298828,1.6741352081298828,92.49501061439514,1.847031831741333,0.8763904571533203,119.17588710784912,1.6906676292419434,1.6437160968780518,79.68813848495483,93.90615487098694,3.216157913208008,3.216157913208008,1.6906676292419434,1.6437160968780518,115.04857325553894,119.85497665405272,1.7613751888275146,113.0884759426117,1.9821107387542725,3.216157913208008,120.48272609710692,116.10895538330078,0.8763904571533203,2.039606809616089,2.039606809616089,0.8763904571533203,,,,,,,,,,LOCAL,robert.zaraev@mail.ru,unleashed-grub-357,/usr/local/lib/python3.9/dist-packages/ipykernel_launcher.py
2,504ac45d91b144438afbcc08c219a473,3460239350097604,FINISHED,dbfs:/databricks/mlflow-tracking/3460239350097604/504ac45d91b144438afbcc08c219a473/artifacts,2023-04-25 16:18:06.325000+00:00,2023-04-25 16:37:44.658000+00:00,,,,,,,,,,,1.6741352081298828,1.6741352081298828,1.6741352081298828,91.27032613754272,,0.8763904571533203,,,2.2018229961395264,79.56686115264893,91.22894167900084,,,,2.2018229961395264,116.28496193885805,,2.11674427986145,112.90123152732848,1.667322874069214,,,115.44647240638731,0.8763904571533203,3.2761800289154053,3.2761800289154053,0.8763904571533203,0.0687547140652856,0.0113622776864444,0.1383351664320282,0.1026939819330794,0.0180042843620784,0.0517594366165061,0.0243859911323668,0.0354073711558279,0.0732005968807303,LOCAL,robert.zaraev@mail.ru,gifted-panda-293,/usr/local/lib/python3.9/dist-packages/ipykernel_launcher.py


In [71]:
from mlflow.tracking import MlflowClient
MlflowClient().download_artifacts('79efe1ca1e6e4b84a7d3480e236ff4d3', 'kNN_bm25_10neigh/', '/content/' )

'/content/kNN_bm25_10neigh/'