In [14]:
import dill
import warnings

import numpy as np
import pandas as pd
from rectools import Columns
from rectools.dataset import Dataset, Interactions, DenseFeatures
from rectools.metrics import (
    MAP,
    Serendipity,
    MeanInvUserFreq,
    calc_metrics,
)
from rectools.metrics.base import MetricAtK
from rectools.metrics import (
    Precision,
    Recall,
    NDCG,
    PairwiseHammingDistanceCalculator,
    MRR,
    serendipity,
    IntraListDiversity,
)
from rectools.model_selection import Splitter, TimeRangeSplitter
from rectools.models import RandomModel, PopularModel, ModelBase
from implicit.nearest_neighbours import (
    BM25Recommender,
    CosineRecommender,
    TFIDFRecommender,
)
from tqdm import tqdm
from copy import deepcopy
from scipy.stats import mode
from pprint import pprint

# Silence library warnings so they do not clutter the cell outputs.
warnings.filterwarnings('ignore')

# Pandas display: never truncate the column list, allow up to 200 characters
# per cell, and render floats with thousands separators and six decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.float_format', '{:,.6f}'.format)


from userknn import UserKnn


In [15]:
# Load the three raw tables from the working directory, in the same order as
# the variables they are assigned to.
interactions_df, users, items = (
    pd.read_csv(csv_name)
    for csv_name in ('interactions.csv', 'users.csv', 'items.csv')
)

In [16]:
# Metrics to evaluate, each at cut-offs 1, 5 and 10. Keys have the form
# "<metric>@<k>" and are inserted grouped by metric family, then by k.
metrics = {
    f'{prefix}@{k}': make_metric(k)
    for prefix, make_metric in (
        ('precision', lambda k: Precision(k=k)),
        ('recall', lambda k: Recall(k=k)),
        ('MAP', lambda k: MAP(k=k, divide_by_k=False)),
        ('NDCG', lambda k: NDCG(k=k, log_base=2)),
        ('novelty', lambda k: MeanInvUserFreq(k=k)),
        ('mrr', lambda k: MRR(k=k)),
    )
    for k in (1, 5, 10)
}


In [40]:
def cross_validate(models, metrics, interactions, splitter, k_recos):
    """
    Evaluate several models on the folds produced by ``splitter``.

    Parameters
    ----------
    models : dict
        Maps a model name to an object exposing ``fit(train_df)`` and
        ``predict(test_df, N_recs=...)``.
    metrics : dict
        Maps a metric name to a metric object accepted by ``calc_metrics``.
    interactions : object
        Interactions container; its ``.df`` attribute is sliced positionally
        with the indices yielded by the splitter.
    splitter : object
        Must provide ``split(interactions, collect_fold_stats=True)`` yielding
        ``(train_ids, test_ids, fold_info)`` triples.
    k_recos : int
        Number of recommendations requested from every model per user.

    Returns
    -------
    pd.DataFrame
        One row per (fold, model) with columns ``k_fold``, ``model`` and one
        column per metric name. Empty (but with these columns) when there is
        nothing to evaluate.
    """
    columns = ['k_fold', 'model'] + list(metrics)
    # Rows are collected in a list and turned into a frame once at the end;
    # growing a DataFrame with concat inside the loop is quadratic and can
    # leave the columns with object dtype.
    records = []

    fold_iterator = splitter.split(interactions, collect_fold_stats=True)

    for id_train, id_test, k_fold in fold_iterator:
        print(f"Split Index: {k_fold['i_split']}")
        print(f"Start Date: {k_fold['start']}")
        print(f"End Date: {k_fold['end']}")
        print(f"Train Set Size: {k_fold['train']}")
        print(f"Train Users: {k_fold['train_users']}")
        print(f"Train Items: {k_fold['train_items']}")
        print(f"Test Set Size: {k_fold['test']}")
        print(f"Test Users: {k_fold['test_users']}")
        print(f"Test Items: {k_fold['test_items']}")
        print("-" * 40)

        train = interactions.df.iloc[id_train]
        # Only the user/item pairs are needed as ground truth for the metrics.
        val = interactions.df.iloc[id_test][Columns.UserItem]

        # Items seen in the training part of the fold; passed to calc_metrics
        # as the catalog.
        catalog = train[Columns.Item].unique()

        for model_name, model in models.items():
            model.fit(train)
            # Forward the requested list length instead of relying on the
            # model's own default.
            recos = model.predict(val, N_recs=k_recos)
            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=val,
                prev_interactions=train,
                catalog=catalog,
            )
            records.append({"k_fold": k_fold["i_split"], "model": model_name, **metric_values})

    return pd.DataFrame(records, columns=columns)

# 1. Обучим модель для "холодных" пользователей, просто сделаем популярное

In [84]:
# Popular-items baseline, used as the fallback for "cold" users.
# Dataset.construct expects RecTools' standard column names. The raw CSV
# columns are mapped on a copy here so that this cell does not depend on the
# rename performed in a later cell; the mapping is a no-op when the columns
# have already been renamed.
popular_interactions = interactions_df.rename(
    columns={"last_watch_dt": Columns.Datetime, "total_dur": Columns.Weight}
)
dataset = Dataset.construct(
    interactions_df=popular_interactions,
    user_features_df=None,
    item_features_df=None,
)
model = PopularModel()
model.fit(dataset)

<rectools.models.popular.PopularModel at 0x2f98ff8e0>

In [113]:
# With filter_viewed=False the popular model returns the same top-10 list for
# every user, so a single user is enough to obtain the fallback items.
# Requesting all users would only produce one identical copy of the list per
# user.
hold_data = model.recommend(
    dataset.user_id_map.external_ids[:1],
    dataset=dataset,
    k=10,
    filter_viewed=False,
)

In [123]:
# Distinct recommended item ids, in order of first appearance.
hold_reco = hold_data[Columns.Item].unique()

In [125]:
# Display the popular fallback item ids.
hold_reco

array([10440, 15297,  9728, 13865,  4151,  3734,  2657,  4880,   142,
        6809])

In [128]:
# Users listed in the users table that never appear in the interactions log.
users.loc[~users['user_id'].isin(interactions_df['user_id'])]

Unnamed: 0,user_id,age,income,sex,kids_flg
2,1047345,age_45_54,income_40_60,Ж,0
6,391756,age_25_34,income_0_20,М,0
7,15878,age_25_34,income_40_60,М,1
10,99952,,,М,0
19,1067802,age_35_44,income_40_60,М,0
...,...,...,...,...,...
840180,157810,age_25_34,income_20_40,Ж,0
840185,1021814,age_45_54,income_20_40,Ж,0
840191,365945,age_25_34,income_20_40,Ж,0
840193,983617,age_18_24,income_20_40,Ж,1


### Вообще, пользователей без просмотров довольно много, поэтому на лидерборде популярное уже дает неплохой результат

# 3. Попробуем сделать cv на 3 фолдах на двух разных моделях и выберем разное K, по дефолту K = 50

In [47]:
# Candidate user-kNN models: cosine and TF-IDF similarity, each with K=10 and
# with the recommender's default K.
# NOTE(review): per the implicit documentation the nearest-neighbour
# recommenders default to K=20, so the "_50" entries may not actually use 50
# neighbours — confirm the intended K and pass it explicitly if needed.
models = {
    'cosine_userknn_10': UserKnn(CosineRecommender(K=10)), 
    'cosine_userknn_50': UserKnn(CosineRecommender()),
    'tfidf_userknn_10': UserKnn(TFIDFRecommender(K=10)),
    'tfidf_userknn_50': UserKnn(TFIDFRecommender())
}

In [65]:
# Three consecutive 7-day test windows.
splitter = TimeRangeSplitter(test_size="7D", n_splits=3)

In [31]:
# Map the raw CSV column names to RecTools' standard names.
# `Columns` is the name imported at the top of the notebook; the bare
# `rectools` module is never imported, so `rectools.Columns` would raise
# NameError on a fresh kernel. Re-assigning (instead of inplace=True) keeps
# the cell safe to re-run.
interactions_df = interactions_df.rename(
    columns={"last_watch_dt": Columns.Datetime, "total_dur": Columns.Weight}
)
interactions = Interactions(interactions_df)

In [68]:
# Run the time-based cross-validation for all candidate models (k_recos=10).
result = cross_validate(models, metrics, interactions, splitter, k_recos=10)

Split Index: 0
Start Date: 2021-08-02 00:00:00
End Date: 2021-08-09 00:00:00
Train Set Size: 4266013
Train Users: 797423
Train Items: 15237
Test Set Size: 263681
Test Users: 98184
Test Items: 6602
----------------------------------------


  0%|          | 0/797423 [00:00<?, ?it/s]

  0%|          | 0/797423 [00:00<?, ?it/s]

  0%|          | 0/797423 [00:00<?, ?it/s]

  0%|          | 0/797423 [00:00<?, ?it/s]

Split Index: 1
Start Date: 2021-08-09 00:00:00
End Date: 2021-08-16 00:00:00
Train Set Size: 4649162
Train Users: 850489
Train Items: 15415
Test Set Size: 279422
Test Users: 103511
Test Items: 6698
----------------------------------------


  0%|          | 0/850489 [00:00<?, ?it/s]

  0%|          | 0/850489 [00:00<?, ?it/s]

  0%|          | 0/850489 [00:00<?, ?it/s]

  0%|          | 0/850489 [00:00<?, ?it/s]

Split Index: 2
Start Date: 2021-08-16 00:00:00
End Date: 2021-08-23 00:00:00
Train Set Size: 5051815
Train Users: 906071
Train Items: 15577
Test Set Size: 298878
Test Users: 110076
Test Items: 6679
----------------------------------------


  0%|          | 0/906071 [00:00<?, ?it/s]

  0%|          | 0/906071 [00:00<?, ?it/s]

  0%|          | 0/906071 [00:00<?, ?it/s]

  0%|          | 0/906071 [00:00<?, ?it/s]

In [76]:
# Average every metric over the folds, per model. The fold index is dropped
# first so that it is not averaged as if it were a metric.
result.drop(columns='k_fold').groupby('model').mean()

Unnamed: 0_level_0,precision@1,precision@5,precision@10,recall@1,recall@5,recall@10,MAP@1,MAP@5,MAP@10,NDCG@1,NDCG@5,NDCG@10,novelty@1,novelty@5,novelty@10,mrr@1,mrr@5,mrr@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
cosine_userknn_10,0.000865,0.002873,0.004142,0.000335,0.007127,0.020041,0.000335,0.002222,0.003958,0.000865,0.002369,0.003442,9.120714,7.637649,6.992581,0.000865,0.004573,0.007867
cosine_userknn_50,0.00076,0.002603,0.004279,0.000291,0.006465,0.020965,0.000291,0.001994,0.003905,0.00076,0.002137,0.003467,9.655836,8.247415,7.540823,0.00076,0.004112,0.007659
tfidf_userknn_10,0.000293,0.004434,0.00589,9e-05,0.011585,0.029503,9e-05,0.003209,0.005659,0.000293,0.0034,0.004799,9.93581,8.201203,7.433605,0.000293,0.006054,0.010469
tfidf_userknn_50,0.000281,0.00477,0.006529,8.7e-05,0.012715,0.033253,8.7e-05,0.003548,0.006334,0.000281,0.003668,0.005292,10.007046,8.415495,7.638598,0.000281,0.00655,0.011438


### Лучше всего использовать tfidf_userknn_50, эту модель и отправим в сервис (выбрал по MAP@10)

In [None]:
# Fit the TF-IDF based UserKnn (N_users=50) on the full interactions table.
userknn_model = UserKnn(model=TFIDFRecommender(), N_users=50)
userknn_model.fit(interactions.df)

In [53]:
import pickle

# Persist the fitted model. The context manager guarantees the file is flushed
# and closed even if pickling fails part-way.
with open('baseknn.pkl', "wb") as model_file:
    pickle.dump(userknn_model, model_file)

In [138]:
def recommend(model, user_id:int, N_recs:int=10):
    """
    Return the item ids recommended to a single user.

    A one-row frame is built for ``user_id`` and passed to ``model.predict``
    together with ``N_recs``; the ``item_id`` column of the prediction is
    returned as a plain list.
    """
    # Both columns carry the user id, exactly as in the one-row query frame
    # the model is given.
    query = pd.DataFrame([{"user_id": user_id, "item_id": user_id}])
    predicted = model.predict(query, N_recs=N_recs)
    return predicted["item_id"].tolist()

In [131]:
# Reload the saved model and request recommendations for user 31.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook. The context manager closes the file handle after loading.
with open('baseknn.pkl', "rb") as model_file:
    pickled_model = pickle.load(model_file)
recommend(pickled_model, 31)

[10515]

# 2. Попробуем всегда отдавать минимум 5 рекомендаций (можно и побольше, просто тогда нужно собрать побольше популярных)

In [148]:
import pickle

def rec_fix(user_id, n, hold_reco, model=None):
    """
    Return at least ``n`` recommendations for a user, padding with popular items.

    Parameters
    ----------
    user_id : int
        User to recommend for.
    n : int
        Minimum number of items to return.
    hold_reco : iterable
        Fallback item ids (e.g. the most popular items), in priority order.
    model : optional
        Already loaded model to pass to ``recommend``. When omitted, the model
        is loaded from ``baseknn.pkl``; pass it explicitly to avoid reloading
        the file on every call.

    Returns
    -------
    list
        The model's recommendations followed by as many fallback items (not
        already recommended) as are needed to reach ``n``. The model's list
        is never truncated, and fewer than ``n`` items are returned only when
        the fallback list runs out.
    """
    if model is None:
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open('baseknn.pkl', "rb") as model_file:
            model = pickle.load(model_file)

    recommendations = list(recommend(model, user_id))

    # Guard against a negative slice bound: when the model already returns n
    # or more items, hold_reco[:n - len(...)] would silently append extras.
    missing = n - len(recommendations)
    if missing <= 0:
        return recommendations

    already_recommended = set(recommendations)
    fillers = [item for item in hold_reco if item not in already_recommended]
    return recommendations + fillers[:missing]

# Demo: ask for at least five items for user 31, padded from the popular list.
user_id = 31
result = rec_fix(user_id=user_id, n=5, hold_reco=hold_reco)
print(result)


[10515, 10440, 15297, 9728, 13865]


### Получается, в этой ситуации модель порекомендовала один айтем, а ещё 4 мы добавили из популярных

# 4. Возьмем какой-то другой способ, например BM25, который сделан для учета длины документов и частоты терминов в коллекции

In [None]:
# NOTE(review): at this point `models` still holds the cosine/TF-IDF
# candidates, so this call repeats the earlier cross-validation rather than
# evaluating BM25 — confirm whether this cell is still needed.
result = cross_validate(models, metrics, interactions, splitter, 10)

In [None]:
# Use the same three 7-day folds as for the cosine/TF-IDF models so the BM25
# metrics are comparable with them (the recorded output below also shows
# three splits).
splitter = TimeRangeSplitter(test_size="7D", n_splits=3)

In [151]:
# Single candidate for this experiment: UserKnn on top of BM25 weighting.
bm25_recommender = BM25Recommender()
models = {'bm25': UserKnn(model=bm25_recommender)}

In [152]:
# Cross-validate the BM25 candidate with the same metrics (k_recos=10).
bm25 = cross_validate(models, metrics, interactions, splitter, k_recos=10)

Split Index: 0
Start Date: 2021-08-02 00:00:00
End Date: 2021-08-09 00:00:00
Train Set Size: 4266013
Train Users: 797423
Train Items: 15237
Test Set Size: 263681
Test Users: 98184
Test Items: 6602
----------------------------------------


  0%|          | 0/797423 [00:00<?, ?it/s]

Split Index: 1
Start Date: 2021-08-09 00:00:00
End Date: 2021-08-16 00:00:00
Train Set Size: 4649162
Train Users: 850489
Train Items: 15415
Test Set Size: 279422
Test Users: 103511
Test Items: 6698
----------------------------------------


  0%|          | 0/850489 [00:00<?, ?it/s]

Split Index: 2
Start Date: 2021-08-16 00:00:00
End Date: 2021-08-23 00:00:00
Train Set Size: 5051815
Train Users: 906071
Train Items: 15577
Test Set Size: 298878
Test Users: 110076
Test Items: 6679
----------------------------------------


  0%|          | 0/906071 [00:00<?, ?it/s]

In [154]:
# Average every metric over the folds. The fold index is dropped first so that
# it is not averaged as if it were a metric.
bm25.drop(columns='k_fold').groupby('model').mean()

Unnamed: 0_level_0,precision@1,precision@5,precision@10,recall@1,recall@5,recall@10,MAP@1,MAP@5,MAP@10,NDCG@1,NDCG@5,NDCG@10,novelty@1,novelty@5,novelty@10,mrr@1,mrr@5,mrr@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
bm25,0.000577,0.001951,0.00305,0.000234,0.00446,0.013828,0.000234,0.00146,0.002699,0.000577,0.001626,0.002502,11.016226,9.899981,9.283169,0.000577,0.003151,0.005584


In [119]:
# NOTE(review): the right-hand side of this assignment was truncated in the
# notebook ("userknn_model = , N_users=50)"), which is a syntax error. It is
# restored here as a BM25-based UserKnn to match this section — confirm the
# intended recommender.
userknn_model = UserKnn(model=BM25Recommender(), N_users=50)
userknn_model.fit(interactions.df)

  0%|          | 0/962179 [00:00<?, ?it/s]

### Вообще, интересно, что bm25 дает метрики @1 лучше, чем tfidf, а в остальных в целом проигрывает, хотя я думал, что никакого улучшения она и не покажет вовсе. В целом, тогда можно было бы использовать tfidf для рекомендаций после первой, а первую рекомендацию отправлять от bm25 и так повышать качество.