In [7]:
import pandas as pd
import numpy as np
import scipy as sp
import requests
from tqdm.auto import tqdm
from scipy.stats import mode 
from pprint import pprint
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
import warnings
warnings.filterwarnings("ignore")

from rectools import Columns
from rectools.model_selection import TimeRangeSplitter
from rectools.dataset import Dataset, Interactions
from rectools.models.popular import PopularModel
from rectools.models.implicit_knn import ImplicitItemKNNWrapperModel
from rectools.metrics import Precision, Recall, MeanInvUserFreq, MAP, Serendipity, calc_metrics

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

In [8]:
interactions = pd.read_csv('../data/kion_train/interactions.csv')

interactions.rename(columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight
    }, 
    inplace=True
) 

interactions['datetime'] = pd.to_datetime(interactions['datetime'])

# Split

В соответствии с предположением из ноутбука "HW-3.1" сделаем **валидацию по 5 дней и по 7 дней**

In [4]:
def create_data_range(
    last_date: pd.Timestamp, 
    n_folds: int = 7, 
    unit: str = "W", 
    n_units: int = 1, 
    show: bool = True,
):
    periods = n_folds + 1
    freq = f"{n_units}{unit}"
    
    start_date = last_date - pd.Timedelta(n_folds * n_units + n_units, unit=unit)  
    
    date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
    
    if show:
        print(
            f"start_date: {start_date}\n"
            f"last_date: {last_date}\n"
            f"periods: {periods}\n"
            f"freq: {freq}\n"
            f"Test fold borders: {date_range.values.astype('datetime64[D]')}\n"
        )
        
    return date_range

In [5]:
CONFIG_CV = {
    "cv_v1": {
        "n_folds": 5,
        "unit": "W",
        "n_units": 1,
    },
    "cv_v2": {
        "n_folds": 5,
        "unit": "D",
        "n_units": 5,
    }, 
}

In [6]:
last_date = interactions[Columns.Datetime].max().normalize()
last_date

Timestamp('2021-08-22 00:00:00')

In [7]:
print("***Folds v1***")
date_range_v1 = create_data_range(
    last_date, 
    n_folds=CONFIG_CV["cv_v1"]["n_folds"], 
    unit=CONFIG_CV["cv_v1"]["unit"], 
    n_units=CONFIG_CV["cv_v1"]["n_units"]
)

print("***Folds v2***")
date_range_v2 = create_data_range(
    last_date, 
    n_folds=CONFIG_CV["cv_v2"]["n_folds"], 
    unit=CONFIG_CV["cv_v2"]["unit"], 
    n_units=CONFIG_CV["cv_v2"]["n_units"]
)

***Folds v1***
start_date: 2021-07-11 00:00:00
last_date: 2021-08-22 00:00:00
periods: 6
freq: 1W
Test fold borders: ['2021-07-11' '2021-07-18' '2021-07-25' '2021-08-01' '2021-08-08'
 '2021-08-15']

***Folds v2***
start_date: 2021-07-23 00:00:00
last_date: 2021-08-22 00:00:00
periods: 6
freq: 5D
Test fold borders: ['2021-07-23' '2021-07-28' '2021-08-02' '2021-08-07' '2021-08-12'
 '2021-08-17']



In [8]:
cv_v1 = TimeRangeSplitter(
    date_range=date_range_v1,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv_v1.get_n_splits(Interactions(interactions))}")

cv_v2 = TimeRangeSplitter(
    date_range=date_range_v2,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv_v2.get_n_splits(Interactions(interactions))}")

CV = [cv_v1, cv_v2]

Real number of folds: 5
Real number of folds: 5


In [12]:
metrics = {
    "prec@5": Precision(k=5),
    "recall@5": Recall(k=5),
    "MAP@5": MAP(k=5),
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "MAP@10": MAP(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

# Find best models

In [31]:
K = [30, 40, 50, 60]
models = {
    "popular": PopularModel(),
    "popular_mw": PopularModel(popularity="mean_weight")
}

for k in K:
    models[f"popular"]
    models[f"cosine_userknn_K{k}"] = ImplicitItemKNNWrapperModel(model=CosineRecommender(K=k))
    models[f"tfidf_userknn_K{k}"] = ImplicitItemKNNWrapperModel(model=TFIDFRecommender(K=k))
    models[f"bm25_userknn_K{k}"] = ImplicitItemKNNWrapperModel(model=BM25Recommender(K=k))

models

{'popular': <rectools.models.popular.PopularModel at 0x7f753b9a29d0>,
 'popular_mw': <rectools.models.popular.PopularModel at 0x7f753b9a2190>,
 'cosine_userknn_K30': <rectools.models.implicit_knn.ImplicitItemKNNWrapperModel at 0x7f753b9a2400>,
 'tfidf_userknn_K30': <rectools.models.implicit_knn.ImplicitItemKNNWrapperModel at 0x7f753b9a24f0>,
 'bm25_userknn_K30': <rectools.models.implicit_knn.ImplicitItemKNNWrapperModel at 0x7f753b9a2940>,
 'cosine_userknn_K40': <rectools.models.implicit_knn.ImplicitItemKNNWrapperModel at 0x7f753b982280>,
 'tfidf_userknn_K40': <rectools.models.implicit_knn.ImplicitItemKNNWrapperModel at 0x7f753b982430>,
 'bm25_userknn_K40': <rectools.models.implicit_knn.ImplicitItemKNNWrapperModel at 0x7f753b982520>,
 'cosine_userknn_K50': <rectools.models.implicit_knn.ImplicitItemKNNWrapperModel at 0x7f753b982670>,
 'tfidf_userknn_K50': <rectools.models.implicit_knn.ImplicitItemKNNWrapperModel at 0x7f753b9826d0>,
 'bm25_userknn_K50': <rectools.models.implicit_knn.Impli

In [32]:
%%time

results = []
K_RECOS = 10
for idx, cv in enumerate(CV):
    print(f"\n ***CV_{idx}***")
    fold_iterator = cv.split(Interactions(interactions), collect_fold_stats=True)

    for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
        print(f"\n==================== Fold {i_fold}")
        pprint(fold_info)

        train = interactions.iloc[train_ids].copy()
        test = interactions.iloc[test_ids][Columns.UserItem].copy()

        dataset = Dataset.construct(
            interactions_df=train,
            user_features_df=None,
            item_features_df=None
        )
        
        catalog = train[Columns.Item].unique()
        
        
        for model_name, model in tqdm(models.items()):
            model.fit(dataset)
            
            for filter_viewed in [False, True]: # предсказывать одинаковые item-ы для всех юзеров?
                model_quality = {"fold": i_fold, "model": f"{model_name}_view-{filter_viewed}"}
                recos_curr = model.recommend(
                    users=test[Columns.User].unique(),
                    dataset=dataset,
                    k=K_RECOS,
                    filter_viewed=filter_viewed,
                )

                metric_values = calc_metrics(
                    metrics,
                    reco=recos_curr,
                    interactions=test,
                    prev_interactions=train,
                    catalog=catalog
                )
                model_quality.update(metric_values)
                results.append(model_quality)


 ***CV_0***

{'End date': Timestamp('2021-07-18 00:00:00', freq='W-SUN'),
 'Start date': Timestamp('2021-07-11 00:00:00', freq='W-SUN'),
 'Test': 214489,
 'Test items': 6313,
 'Test users': 84234,
 'Train': 3192875,
 'Train items': 14711,
 'Train users': 640144}


  0%|          | 0/14 [00:00<?, ?it/s]


{'End date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'),
 'Start date': Timestamp('2021-07-18 00:00:00', freq='W-SUN'),
 'Test': 231207,
 'Test items': 6491,
 'Test users': 87632,
 'Train': 3506106,
 'Train items': 14928,
 'Train users': 687200}


  0%|          | 0/14 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'),
 'Start date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'),
 'Test': 249396,
 'Test items': 6611,
 'Test users': 93092,
 'Train': 3838180,
 'Train items': 15061,
 'Train users': 734701}


  0%|          | 0/14 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'),
 'Start date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'),
 'Test': 264039,
 'Test items': 6609,
 'Test users': 98161,
 'Train': 4203885,
 'Train items': 15212,
 'Train users': 788721}


  0%|          | 0/14 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'),
 'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'),
 'Test': 276699,
 'Test items': 6715,
 'Test users': 101983,
 'Train': 4587708,
 'Train items': 15404,
 'Train users': 842129}


  0%|          | 0/14 [00:00<?, ?it/s]


 ***CV_1***

{'End date': Timestamp('2021-07-28 00:00:00', freq='5D'),
 'Start date': Timestamp('2021-07-23 00:00:00', freq='5D'),
 'Test': 175051,
 'Test items': 6088,
 'Test users': 72916,
 'Train': 3740952,
 'Train items': 15017,
 'Train users': 720875}


  0%|          | 0/14 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-02 00:00:00', freq='5D'),
 'Start date': Timestamp('2021-07-28 00:00:00', freq='5D'),
 'Test': 194020,
 'Test items': 6211,
 'Test users': 80145,
 'Train': 3990413,
 'Train items': 15128,
 'Train users': 756988}


  0%|          | 0/14 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-07 00:00:00', freq='5D'),
 'Start date': Timestamp('2021-08-02 00:00:00', freq='5D'),
 'Test': 187734,
 'Test items': 6024,
 'Test users': 78089,
 'Train': 4266013,
 'Train items': 15237,
 'Train users': 797423}


  0%|          | 0/14 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-12 00:00:00', freq='5D'),
 'Start date': Timestamp('2021-08-07 00:00:00', freq='5D'),
 'Test': 206420,
 'Test items': 6179,
 'Test users': 83403,
 'Train': 4528392,
 'Train items': 15385,
 'Train users': 833905}


  0%|          | 0/14 [00:00<?, ?it/s]


{'End date': Timestamp('2021-08-17 00:00:00', freq='5D'),
 'Start date': Timestamp('2021-08-12 00:00:00', freq='5D'),
 'Test': 209972,
 'Test items': 6248,
 'Test users': 86532,
 'Train': 4811285,
 'Train items': 15499,
 'Train users': 872587}


  0%|          | 0/14 [00:00<?, ?it/s]

CPU times: user 40min 56s, sys: 14.7 s, total: 41min 11s
Wall time: 39min 47s


## Show metrics

In [42]:
df_metrics = pd.DataFrame(results)

df_metrics['cv'] = 'fold_1w'
df_metrics.loc[df_metrics[240:].index, 'cv'] = 'fold_5d'

df_metrics

Unnamed: 0,fold,model,prec@5,recall@5,prec@10,recall@10,MAP@5,MAP@10,novelty,serendipity,cv
0,0,popular_view-False,0.057927,0.165644,0.040750,0.225673,0.081512,0.091268,3.527632,0.000000,fold_1w
1,0,popular_view-True,0.067068,0.187785,0.043174,0.236855,0.106416,0.114674,3.770513,0.000003,fold_1w
2,0,popular_mw_view-False,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,18.225101,0.000000,fold_1w
3,0,popular_mw_view-True,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,18.225113,0.000000,fold_1w
4,0,cosine_userknn_K30_view-False,0.023309,0.073918,0.022143,0.129775,0.024364,0.032651,7.914110,0.000048,fold_1w
...,...,...,...,...,...,...,...,...,...,...,...
275,4,cosine_userknn_K60_view-True,0.030860,0.087797,0.023432,0.129578,0.052772,0.059091,9.152968,0.000122,fold_5d
276,4,tfidf_userknn_K60_view-False,0.019757,0.060871,0.021400,0.122260,0.019698,0.028557,6.651334,0.000095,fold_5d
277,4,tfidf_userknn_K60_view-True,0.042803,0.116265,0.032173,0.170458,0.069460,0.077912,6.727128,0.000180,fold_5d
278,4,bm25_userknn_K60_view-False,0.037006,0.107442,0.028958,0.162346,0.037895,0.046199,3.920584,0.000024,fold_5d


In [47]:
df_metrics_mean = df_metrics.groupby(['cv', 'model'])[
    'prec@5', 'recall@5', 'prec@10', 'recall@10', 'MAP@5', 'MAP@10', 'novelty', 'serendipity'
].mean()

df_metrics_mean.style.highlight_max(color='lightgreen', axis=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,prec@5,recall@5,prec@10,recall@10,MAP@5,MAP@10,novelty,serendipity
cv,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
fold_1w,bm25_userknn_K30_view-False,0.042185,0.119823,0.034154,0.185347,0.042525,0.052672,3.946248,2.5e-05
fold_1w,bm25_userknn_K30_view-True,0.057809,0.158462,0.039961,0.213474,0.095232,0.103942,4.022938,4e-05
fold_1w,bm25_userknn_K40_view-False,0.042144,0.119736,0.034143,0.185324,0.04248,0.052632,3.946874,2.4e-05
fold_1w,bm25_userknn_K40_view-True,0.057809,0.158459,0.039978,0.213672,0.095225,0.103958,4.019052,4e-05
fold_1w,bm25_userknn_K50_view-False,0.042758,0.121282,0.034677,0.187847,0.043014,0.053338,3.952222,2.4e-05
fold_1w,bm25_userknn_K50_view-True,0.058727,0.160635,0.040486,0.215923,0.096576,0.105357,4.019601,4e-05
fold_1w,bm25_userknn_K60_view-False,0.042737,0.12123,0.034666,0.187796,0.042991,0.053314,3.954001,2.4e-05
fold_1w,bm25_userknn_K60_view-True,0.058733,0.160645,0.040489,0.215989,0.096582,0.105369,4.019745,4e-05
fold_1w,cosine_userknn_K30_view-False,0.018253,0.056689,0.018359,0.10563,0.018688,0.025865,8.017523,5.9e-05
fold_1w,cosine_userknn_K30_view-True,0.035555,0.098933,0.026364,0.143056,0.060424,0.06728,9.255917,0.00011


Из результатов видно, что среднее значение метрик моделей **bmp** имеют **наилучшие** значения, причем на недельном фолде метрики выше, чем на 5 дневном 

- Следует проверить статистически различимы значения или нет. Для этого следует посмотреть дисперсию и если дисперсия меньше чем различия между средними значениями метрик, то можно сделать вывод, что значения метрик статистически различны

In [51]:
df_metrics_std = df_metrics.groupby(['cv', 'model'])[
    'prec@5', 'recall@5', 'prec@10', 'recall@10', 'MAP@5', 'MAP@10', 'novelty', 'serendipity'
].std()

df_metrics_std

Unnamed: 0_level_0,Unnamed: 1_level_0,prec@5,recall@5,prec@10,recall@10,MAP@5,MAP@10,novelty,serendipity
cv,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
fold_1w,bm25_userknn_K30_view-False,0.004042,0.01138,0.003319,0.01805,0.004144,0.005235,0.02973,1.677056e-06
fold_1w,bm25_userknn_K30_view-True,0.005348,0.014587,0.002787,0.01362,0.009823,0.009871,0.015006,3.357219e-06
fold_1w,bm25_userknn_K40_view-False,0.004036,0.011393,0.00331,0.018033,0.004149,0.005234,0.029613,1.666772e-06
fold_1w,bm25_userknn_K40_view-True,0.005334,0.014592,0.002774,0.013511,0.009824,0.009859,0.014709,3.359684e-06
fold_1w,bm25_userknn_K50_view-False,0.003777,0.011013,0.003087,0.017476,0.004039,0.005071,0.029329,1.807696e-06
fold_1w,bm25_userknn_K50_view-True,0.004894,0.013955,0.002467,0.012485,0.009562,0.009532,0.014861,3.329699e-06
fold_1w,bm25_userknn_K60_view-False,0.003774,0.011002,0.003088,0.017492,0.004038,0.005074,0.029368,1.805538e-06
fold_1w,bm25_userknn_K60_view-True,0.004894,0.013963,0.002466,0.012478,0.009562,0.009529,0.014738,3.34007e-06
fold_1w,cosine_userknn_K30_view-False,0.002393,0.007803,0.001918,0.011326,0.002518,0.003091,0.047337,5.30493e-06
fold_1w,cosine_userknn_K30_view-True,0.003624,0.0104,0.001786,0.00881,0.007,0.006842,0.047027,9.087191e-06


In [86]:
df_metrics_1w_mean = df_metrics_mean.loc["fold_1w"]
df_metrics_1w_std = df_metrics_std.loc["fold_1w"]

In [82]:
best_model = "bm25_userknn_K60_view-True"
col_metrics = list(metrics.keys())
std_best_metrics = df_metrics_1w_std[df_metrics_1w_std["model"] == best_model][col_metrics].values[0]

In [95]:
print(df_metrics_1w_std.loc[best_model])
for model in df_metrics_1w_mean.index:
    if model != best_model:
        print(f"\n===Сравнение с {model}")
        print(df_metrics_1w_mean.loc[best_model] - df_metrics_1w_mean.loc[model])
        print("=========================")

prec@5         0.004894
recall@5       0.013963
prec@10        0.002466
recall@10      0.012478
MAP@5          0.009562
MAP@10         0.009529
novelty        0.014738
serendipity    0.000003
Name: bm25_userknn_K60_view-True, dtype: float64

===Сравнение с bm25_userknn_K30_view-False
prec@5         0.000852
recall@5       0.002584
prec@10       -0.000854
recall@10     -0.005571
MAP@5          0.005418
MAP@10         0.004293
novelty       -0.014992
serendipity    0.000002
dtype: float64

===Сравнение с bm25_userknn_K30_view-True
prec@5        -4.545181e-04
recall@5      -6.232778e-04
prec@10       -3.208519e-04
recall@10     -1.141463e-03
MAP@5         -2.606624e-04
MAP@10        -3.423636e-04
novelty       -2.682734e-04
serendipity   -1.714842e-08
dtype: float64

===Сравнение с bm25_userknn_K40_view-False
prec@5         0.000858
recall@5       0.002570
prec@10       -0.000844
recall@10     -0.005554
MAP@5          0.005413
MAP@10         0.004294
novelty       -0.014875
serendipity   

Лучшей модели большинством из моделей видны статистические различия, кроме всех моделей bmp (логично, потому что лучшая модель bmp с k = 60) и моделью tfidf, где для рекомендаций стоял флаг filter_viewed = True, что означает рекомендовать не одинаковые элементы для всех пользователей

# Обучение на всех имеющихся данных и формирование оффлайн рекомендаций

In [98]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=None,
    item_features_df=None
)

bmp25_k60_model = ImplicitItemKNNWrapperModel(BM25Recommender(K=60))
bmp25_k60_model.fit(dataset)

K_RECOS = 30
    
recos_offline_bmp25 = bmp25_k60_model.recommend(
    users=interactions[Columns.User].unique(),
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)

In [100]:
recos_offline_bmp25.to_csv("../data/hw_3/bmp_25_k60_rectools.csv", index=False)

In [9]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=None,
    item_features_df=None
)

tfidf_k60_model = ImplicitItemKNNWrapperModel(TFIDFRecommender(K=60))
tfidf_k60_model.fit(dataset)

K_RECOS = 30
    
recos_offline_tfidf = tfidf_k60_model.recommend(
    users=interactions[Columns.User].unique(),
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)

In [11]:
recos_offline_tfidf.to_csv("../data/hw_3/tfidf_k60_rectools.csv", index=False)

# Формирование рекомендаций для cold users

По моделям на основе популярного наилучшего качества достигали метрики по модели popular на основе количества уникальных пользователей взаимодействовавших с элементом, НО по среднему весу взаимодействия с элементами модель показывает по метрики новелти очень высокие результаты, поэтому стоит попробовать обе из моделей

In [6]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=None,
    item_features_df=None
)

popular_model = PopularModel()
popular_model.fit(dataset)

item_inv = dict(enumerate(interactions["item_id"].unique()))
recos_pop = []
for item_pop in popular_model.popularity_list[0]:
    recos_pop.append(item_inv[item_pop])

df_pop_recos = pd.DataFrame({"item_id": recos_pop})

df_pop_recos.to_csv("../data/hw_3/popular_item.csv", index=False)

In [5]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=None,
    item_features_df=None
)

popular_model_mw = PopularModel(popularity="mean_weight")
popular_model_mw.fit(dataset)

item_inv = dict(enumerate(interactions["item_id"].unique()))
recos_pop = []
for item_pop in popular_model_mw.popularity_list[0]:
    recos_pop.append(item_inv[item_pop])

df_pop_recos_mw = pd.DataFrame({"item_id": recos_pop})

df_pop_recos_mw.to_csv("../data/hw_3/popular_mean_weight_item.csv", index=False)

# Блендинг результатов моделей

Механизм блендинга будет выглядить следующим образом:

1. Берутся рекомендации, сделанные моделями tfidf и bmp25, конкатятся результаты, удялются дубликаты item-ов
2. Берется заготовленный датаест items c полями item_id и idf
3. смотрится idf, чем он выше, тем выше будет стоять item в выдаче

Такой подход обусловлен тем, что idf показывает обратную частоту item, соответственно в выдаче наверх будут попадать item, с которым меньшее количество раз взаимодейстовали пользователи, т.е. в перспективе такой подход может предлагать item, с которыми ни один пользователь не взаимодействовал или взаимодействовали очень мало, т.е. может решиться проблема длинного хвоста.

In [1]:
import pandas as pd

In [19]:
df_bmp_recs = pd.read_csv("../data/hw_3/bmp_25_k60_rectools.csv")
df_tfidf_recs = pd.read_csv("../data/hw_3/tfidf_k60_rectools.csv") 

In [20]:
pd.concat([df_bmp_recs.head(), df_bmp_recs.tail()])

Unnamed: 0,user_id,item_id,score,rank
0,176549,13865,88995970000.0,1
1,176549,10440,81530850000.0,2
2,176549,15297,72046040000.0,3
3,176549,3734,69534730000.0,4
4,176549,4151,46745910000.0,5
28862258,697262,5434,16154190000.0,26
28862259,697262,1132,16051600000.0,27
28862260,697262,7476,15666970000.0,28
28862261,697262,11237,15469070000.0,29
28862262,697262,12995,15423080000.0,30


In [21]:
pd.concat([df_tfidf_recs.head(), df_tfidf_recs.tail()])

Unnamed: 0,user_id,item_id,score,rank
0,176549,11749,13575.660185,1
1,176549,16270,11946.872708,2
2,176549,11985,11355.693119,3
3,176549,13159,10375.500647,4
4,176549,15266,10269.01969,5
28862258,697262,6192,1294.342414,26
28862259,697262,11640,1277.332333,27
28862260,697262,7476,1262.919377,28
28862261,697262,14,1213.499281,29
28862262,697262,3784,1200.347785,30


In [22]:
del df_tfidf_recs['rank'], df_bmp_recs['rank'], df_tfidf_recs['score'], df_bmp_recs['score']

In [23]:
df_all_recs = pd.concat(
    [
        df_bmp_recs, df_tfidf_recs
    ],
    ignore_index=True
).sort_values(
    ["user_id"], ascending=False
).drop_duplicates(
    ["user_id", "item_id"]
).reset_index(drop=True)

pd.concat([df_all_recs.head(20), df_all_recs.tail(20)])

Unnamed: 0,user_id,item_id
0,1097557,1132
1,1097557,5658
2,1097557,142
3,1097557,3734
4,1097557,16228
5,1097557,12192
6,1097557,13865
7,1097557,2657
8,1097557,9728
9,1097557,4880


In [25]:
item_idf = pd.read_csv("../data/kion_train/items_idf.csv")
print(item_idf.shape)
item_idf.head()

(15706, 2)


Unnamed: 0,index,idf
0,9506,7.150811
1,1659,8.524953
2,7107,5.821207
3,7638,8.407093
4,6686,7.778734


In [26]:
df_all_recs = df_all_recs.merge(
    item_idf, left_on='item_id', right_on='index', how='left'
).sort_values(['user_id', 'idf'], ascending=False)

pd.concat([df_all_recs.head(20), df_all_recs.tail(20)])

Unnamed: 0,user_id,item_id,index,idf
14,1097557,5803,5803,6.840585
17,1097557,6382,6382,6.80609
25,1097557,7476,7476,6.545666
18,1097557,4716,4716,6.480408
34,1097557,14,14,6.467549
24,1097557,11640,11640,6.318255
21,1097557,5434,5434,6.226266
0,1097557,1132,1132,6.183141
10,1097557,11778,11778,6.134312
13,1097557,3935,3935,6.067242


In [9]:
del df_all_recs['index'], df_all_recs['idf']

In [34]:
count_recs_by_users = df_all_recs.user_id.value_counts()
print(f"Количество пользователей, у которорых рекомендаций меньше 10: {len(count_recs_by_users[count_recs_by_users < 10])}")

Количество пользователей, у которорых рекомендаций меньше 10: 21


Для пользователей, у которых будет меньше рекомендаций, чем k_recs, рекомендации **будут пополняться популярным**

In [2]:
df_popular = pd.read_csv('../data/hw_3/popular_item.csv')
users_need = count_recs_by_users[count_recs_by_users < 10].index

In [6]:
k_recs = 10
users, recs = [], []
for user, count in dict(count_recs_by_users[count_recs_by_users < 10]).items():
    need_recs = k_recs - count
    users.extend([user for _ in range(need_recs)])
    recs.extend(df_popular["item_id"][:need_recs].to_list())

In [10]:
df_all_recs

Unnamed: 0,user_id,item_id
0,1097557,5803
1,1097557,6382
2,1097557,7476
3,1097557,4716
4,1097557,14
...,...,...
41023387,0,2657
41023388,0,3734
41023389,0,4151
41023390,0,13865


In [11]:
df_need = pd.DataFrame({"user_id": users, "item_id": recs})
df_all_recs = pd.concat([df_all_recs, df_need], ignore_index=True).sort_values("user_id")

In [12]:
count_recs_by_users = df_all_recs.user_id.value_counts()
print(f"Количество пользователей, у которорых рекомендаций меньше 10: {len(count_recs_by_users[count_recs_by_users < 10])}")

Количество пользователей, у которорых рекомендаций меньше 10: 0


In [13]:
df_all_recs.to_csv("../data/hw_3/blending_tfidf_bmp25_idf_rectools.csv", index=False)

Offline рекомендации не работали с блендингом, решил уменьшить количество рекомендаций для одного юзера до 10 и заработало

In [14]:
df_all_recs['rank'] = df_all_recs.groupby('user_id').cumcount() + 1
df_all_recs_top10 = df_all_recs[df_all_recs['rank'] <= 10]
del df_all_recs_top10['rank']
df_all_recs_top10.shape

(9621050, 2)

In [15]:
df_all_recs_top10.to_csv("../data/hw_3/blending_tfidf_bmp25_idf_rectools_10.csv", index=False)