In [None]:
!pip install rectools

In [4]:
import os
from pathlib import Path
import pickle
import random
import warnings
import requests

import numpy as np
import pandas as pd
import scipy as sp
import tqdm
from implicit.nearest_neighbours import (
    BM25Recommender,
    CosineRecommender,
    TFIDFRecommender,
)
from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.metrics import (
    MAP,
    MeanInvUserFreq,
    Precision,
    Recall,
    Serendipity,
    calc_metrics,
)
from rectools.model_selection import TimeRangeSplitter
from rectools.models import ImplicitItemKNNWrapperModel
from rectools.models.popular import PopularModel

from userknn import UserKnn

warnings.filterwarnings("ignore")

In [5]:
!mkdir data

In [None]:
%cd /content/data

**Get KION dataset**

In [None]:
 !wget https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip -O ../data/data_original.zip
 !unzip ../data/data_original.zip -d ../data

**EDA**

In [8]:
interactions = pd.read_csv('../data/kion_train/interactions.csv')
users = pd.read_csv('../data/kion_train/users.csv')
items = pd.read_csv('../data/kion_train/items.csv')

# rename columns, convert timestamp
interactions.rename(columns={'last_watch_dt': Columns.Datetime,
                            'total_dur': Columns.Weight},
                    inplace=True)

interactions['datetime'] = pd.to_datetime(interactions['datetime'])

**Simple popular by number of interactions**

In [9]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=None,
    item_features_df=None
)

In [10]:
popular_model = PopularModel()
popular_model.fit(dataset);

In [11]:
Iter_recos = 10
popular_recommendations = popular_model.recommend(
    dataset.user_id_map.external_ids[:1], 
    dataset=dataset, 
    k=Iter_recos, 
    filter_viewed=False
).merge(items[['item_id', 'title']], 
       on='item_id',
       how='left')

In [12]:
popular_recommendations.head()

Unnamed: 0,user_id,item_id,score,rank,title
0,176549,10440,202457.0,1,Хрустальный
1,176549,15297,193123.0,2,Клиника счастья
2,176549,9728,132865.0,3,Гнев человеческий
3,176549,13865,122119.0,4,Девятаев
4,176549,4151,91167.0,5,Секреты семейной жизни


**UserKNN Model**

In [13]:
# train test split 
# test = last 1 week 

n_folds = 1
unit = "W"
n_units = 1
periods = n_folds + 1
freq = f"{n_units}{unit}"

last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

# generator of folds
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(Interactions(interactions))}")

Start date and last date of the test fold: (Timestamp('2021-08-08 00:00:00'), Timestamp('2021-08-22 00:00:00'))
Test fold borders: ['2021-08-08' '2021-08-15']
Real number of folds: 1


In [14]:
(train_ids, test_ids, fold_info) = cv.split(Interactions(interactions), collect_fold_stats=True).__next__()

In [15]:
train = interactions.loc[train_ids]
test = interactions.loc[test_ids]

In [16]:
train.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0


In [17]:
users_inv_mapping = dict(enumerate(train['user_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}

items_inv_mapping = dict(enumerate(train['item_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}

print(f"users_mapping amount: {len(users_mapping)}")
print(f"items_mapping amount: {len(items_mapping)}")


users_mapping amount: 842129
items_mapping amount: 15404


In [18]:
# Get sparse matrix 
def get_coo_matrix(df, 
                   user_col='user_id', 
                   item_col='item_id', 
                   weight_col=None, 
                   users_mapping=None, 
                   items_mapping=None):
    if weight_col:
        weights = df[weight_col].astype(np.float32)
    else:
        weights = np.ones(len(df), dtype=np.float32)

    interaction_matrix = sp.sparse.coo_matrix((
        weights, 
        (
            df[user_col].map(users_mapping.get), 
            df[item_col].map(items_mapping.get)
        )
    ))
    return interaction_matrix

In [19]:
interaction_matrix = get_coo_matrix(train, weight_col='weight',
                                    users_mapping=users_mapping, 
                                    items_mapping=items_mapping)

In [20]:
df_train = Dataset.construct(
    train,
)

**CosineRecommender, TFIDFRecommender, BM25Recommender**

Тюнинг гиперпараметров

In [21]:
metrics = {
    "mAP@10": MAP(k=10),
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

catalog = train['item_id'].unique()

In [22]:
cosine = []
tfidf = []
bm25 = []

neighbours = [10, 20, 50]

for i in neighbours:

        # Fit model
    model_cosine = ImplicitItemKNNWrapperModel(CosineRecommender(K=i)) 
    model_cosine.fit(df_train)

    # Make recommendations
    reco_cosine = model_cosine.recommend(
        users=train[Columns.User].unique(),
        dataset=df_train,
        k=Iter_recos,
        filter_viewed=True,
    )

    # Fit model
    model_tfidf = ImplicitItemKNNWrapperModel(TFIDFRecommender(K=i))
    model_tfidf.fit(df_train)

    # Make recommendations
    reco_tfidf = model_tfidf.recommend(
        users=train[Columns.User].unique(),
        dataset=df_train,
        k=Iter_recos,
        filter_viewed=True,
    )

    # Fit model
    model_bm25 = ImplicitItemKNNWrapperModel(BM25Recommender(K=i, K1=2)) 
    model_bm25.fit(df_train)

    # Make recommendations
    reco_bm25 = model_bm25.recommend(
        users=train[Columns.User].unique(),
        dataset=df_train,
        k=Iter_recos,
        filter_viewed=True,
    )

    metric_values_cosine = calc_metrics(
        metrics,
        reco=reco_cosine,
        interactions=test,
        prev_interactions=train,
        catalog=catalog
    )

    metric_values_tfidf = calc_metrics(
        metrics,
        reco=reco_tfidf,
        interactions=test,
        prev_interactions=train,
        catalog=catalog
    )

    metric_values_bm25 = calc_metrics(
        metrics,
        reco=reco_bm25,
        interactions=test,
        prev_interactions=train,
        catalog=catalog
    )

    cosine.append(metric_values_cosine)
    tfidf.append(metric_values_tfidf)
    bm25.append(metric_values_bm25)

In [23]:
df_cosine = pd.DataFrame(cosine, index=['CosineModel (k = 10)', 'CosineModel (k = 20)', 'CosineModel (k = 50)'])
df_tfidf = pd.DataFrame(tfidf, index=['TFIDFModel (k = 10)', 'TFIDFModel (k = 20)', 'TFIDFModel (k = 50)'])
df_bm25 = pd.DataFrame(bm25, index=['BM25Model (k = 10)', 'BM25Model (k = 20)', 'BM25Model (k = 50)'])

In [24]:
metrics_table = pd.concat([df_cosine, df_tfidf, df_bm25])
metrics_table

Unnamed: 0,prec@10,recall@10,mAP@10,novelty,serendipity
CosineModel (k = 10),0.022915,0.119112,0.058292,9.589126,1.5e-05
CosineModel (k = 20),0.025375,0.131688,0.060869,9.272995,1.5e-05
CosineModel (k = 50),0.026411,0.135827,0.062209,9.155556,1.5e-05
TFIDFModel (k = 10),0.0338,0.167853,0.078074,7.445809,2.5e-05
TFIDFModel (k = 20),0.034653,0.171268,0.079425,7.221953,2.2e-05
TFIDFModel (k = 50),0.035659,0.175373,0.080948,7.119074,2.3e-05
BM25Model (k = 10),0.039377,0.198762,0.095629,4.052742,8e-06
BM25Model (k = 20),0.039099,0.199296,0.095693,4.031,6e-06
BM25Model (k = 50),0.039118,0.19986,0.095762,4.019591,5e-06


По метрикам BM25 показало себя лучше всех, поэтому его и будем использовать

In [39]:
import dill

with open('bm25_k50.dill', 'wb') as f:
    dill.dump(model_bm25, f)

Добавим к нашей лучшей модели популярные фильмы

In [26]:
popular_mod = PopularModel(popularity='n_users')
popular_mod.fit(df_train)

<rectools.models.popular.PopularModel at 0x7f94bcfc4040>

In [27]:
popular_recs = popular_mod.recommend(
    users=train[Columns.User].unique(),
    dataset=df_train,
    k=Iter_recos,
    filter_viewed=False
)

In [28]:
reco_best = pd.concat([reco_bm25, popular_recs])
reco_best = reco_best.drop_duplicates(keep='first', subset=['user_id', 'item_id'])
reco_best['rank'] = reco_best.groupby('user_id')['user_id'].rank(method='first')
reco_best = reco_best[reco_best['rank'] <= 10]

In [29]:
metric_values_bm25_popular = calc_metrics(
        metrics,
        reco=reco_bm25,
        interactions=test,
        prev_interactions=train,
        catalog=catalog
    )

In [30]:
bm25_popular = pd.Series(metric_values_bm25_popular)
bm25_popular

prec@10        0.039118
recall@10      0.199860
mAP@10         0.095762
novelty        4.019591
serendipity    0.000005
dtype: float64

In [31]:
reco_best.to_csv('BM25Model_popular.csv', index=False)

**Вариант с объединением предсказаний трех видов KNN. Другой вариант ранжирования - объединяем в другом порядке**

In [32]:
reco_blend = pd.concat([reco_tfidf, reco_cosine, reco_bm25])
reco_blend = reco_blend.drop_duplicates(keep='first', subset=['user_id', 'item_id'])
reco_blend['rank'] = reco_blend.groupby('user_id')['user_id'].rank(method='first')
reco_blend = reco_blend[reco_blend['rank'] <= 10]

In [33]:
metric_values_blend = calc_metrics(
    metrics,
    reco=reco_blend,
    interactions=test,
    prev_interactions=train,
    catalog=catalog
  )

In [34]:
blended = pd.Series(metric_values_blend)
blended

prec@10        0.035659
recall@10      0.175373
mAP@10         0.080948
novelty        7.119074
serendipity    0.000023
dtype: float64

In [35]:
reco_blend.to_csv('BlendedModel.csv', index=False)

In [51]:
model = UserKnn(dist_model=BM25Recommender(K=50, K1=2), n_neighbors=10)

In [52]:
with open('../data/BM25_KNN.dill', 'wb') as f:
      dill.dump(model, f)

**Рекомендации для холодных пользователей**

In [53]:
popular_for_cold = list(popular_recommendations.head(10)['item_id'].values) 
popular_for_cold

[10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809]