In [1]:
# !wget 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'
# !unzip data_original.zip

In [2]:
import copy
import time
from pprint import pprint

import pandas as pd
import numpy as np
from rectools import Columns
from rectools.models import RandomModel, PopularModel
from rectools.model_selection.time_split import TimeRangeSplitter
from rectools.metrics import MAP, NDCG, MeanInvUserFreq, Serendipity, Precision, Recall, calc_metrics
from rectools.dataset import Interactions, Dataset
from tqdm.auto import tqdm


pd.set_option('display.max_rows', 20)

# Metrics

In [3]:
inter = pd.read_csv("data_original/interactions.csv")
inter.rename(columns={"last_watch_dt": "datetime", "watched_pct": "weight"}, inplace=True)
interactions = Interactions(inter)
interactions.df.head()

Unnamed: 0,user_id,item_id,datetime,total_dur,weight
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [4]:
models = {
    "random": RandomModel(random_state=32),
    "pupular": PopularModel()
}
metrics = {
    "map@1": MAP(k=1),
    "map@5": MAP(k=5),
    "map@10": MAP(k=10),
    "ndcg@1": NDCG(k=1),
    "ndcg@5": NDCG(k=5),
    "ndcg@10": NDCG(k=10),
    "recall@1": Recall(k=1),
    "recall@5": Recall(k=5),
    "recall@10": Recall(k=10),
    "precision@1": Precision(k=1),
    "precision@5": Precision(k=5),
    "precision@10": Precision(k=10),
    "serendipity@1": Serendipity(k=1),
    "serendipity@5": Serendipity(k=5),
    "serendipity@10": Serendipity(k=10),
    "miuf@1": MeanInvUserFreq(k=1),
    "miuf@5": MeanInvUserFreq(k=5),
    "miuf@10": MeanInvUserFreq(k=10),
}
splitter = TimeRangeSplitter(
    test_size="7D",
    n_splits=3,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True
)
top_k = 10

In [5]:
def compute_metrics(models, metrics, splitter, top_k):
    results = []
    fold_iterator = splitter.split(interactions, collect_fold_stats=True)
    for train_ids, test_ids, fold_info in tqdm((fold_iterator), total=splitter.n_splits):
        print(f"\n==================== Fold {fold_info['i_split']}")
        pprint(fold_info)

        df_train = interactions.df.iloc[train_ids]
        dataset = Dataset.construct(df_train)

        df_test = interactions.df.iloc[test_ids][Columns.UserItem]
        test_users = np.unique(df_test[Columns.User])

        catalog = df_train[Columns.Item].unique()

        for model_name, model in models.items():
            current_model = copy.deepcopy(model)
            time_start = time.time()
            current_model.fit(dataset)
            training_time = time.time() - time_start
            recos = current_model.recommend(
                users=test_users,
                dataset=dataset,
                k=top_k,
                filter_viewed=True,
            )
            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )
            res = {"fold": fold_info["i_split"], "model": model_name, "training_time": training_time}
            res.update(metric_values)
            results.append(res)

    pivot_results = pd.DataFrame(results).drop(columns="fold").groupby(["model"], sort=False).agg(["mean", "std"])
    pivot_results = pivot_results.reindex(sorted(pivot_results.columns), axis=1)
    pivot_results.drop(("training_time", "std"), axis=1, inplace=True)
    mean_metric_subset = [(metric, agg)
                          for metric, agg in pivot_results.columns if (agg == "mean") & (metric != "training_time")]
    pivot_results = pivot_results.style.highlight_min(subset=mean_metric_subset, color="red", axis=0).highlight_max(
        subset=mean_metric_subset, color="green", axis=0)

    return pivot_results

In [6]:
pivot_results = compute_metrics(models, metrics, splitter, 10)
pivot_results

  0%|          | 0/3 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'i_split': 0,
 'start': Timestamp('2021-08-02 00:00:00', freq='7D'),
 'test': 263681,
 'test_items': 6602,
 'test_users': 98184,
 'train': 4266013,
 'train_items': 15237,
 'train_users': 797423}

{'end': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'i_split': 1,
 'start': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'test': 279422,
 'test_items': 6698,
 'test_users': 103511,
 'train': 4649162,
 'train_items': 15415,
 'train_users': 850489}

{'end': Timestamp('2021-08-23 00:00:00', freq='7D'),
 'i_split': 2,
 'start': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'test': 298878,
 'test_items': 6679,
 'test_users': 110076,
 'train': 5051815,
 'train_items': 15577,
 'train_users': 906071}


Unnamed: 0_level_0,map@1,map@1,map@10,map@10,map@5,map@5,miuf@1,miuf@1,miuf@10,miuf@10,miuf@5,miuf@5,ndcg@1,ndcg@1,ndcg@10,ndcg@10,ndcg@5,ndcg@5,precision@1,precision@1,precision@10,precision@10,precision@5,precision@5,recall@1,recall@1,recall@10,recall@10,recall@5,recall@5,serendipity@1,serendipity@1,serendipity@10,serendipity@10,serendipity@5,serendipity@5,training_time
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2
random,7.2e-05,1.9e-05,0.000211,3.2e-05,0.000169,3.3e-05,15.614137,0.022585,15.613009,0.019786,15.612989,0.01957,0.000221,3.3e-05,0.0002,2e-05,0.000208,2.5e-05,0.000221,3.3e-05,0.000193,1.9e-05,0.000202,2.8e-05,7.2e-05,1.9e-05,0.000693,7.6e-05,0.000365,8.4e-05,6e-06,2e-06,7e-06,0.0,7e-06,1e-06,1.4e-05
pupular,0.04272,0.004366,0.084109,0.004921,0.078295,0.00437,2.377055,0.023002,3.71339,0.002076,3.066979,0.012316,0.076432,0.006826,0.043084,0.001978,0.057932,0.002332,0.076432,0.006826,0.033903,0.001443,0.052402,0.001618,0.04272,0.004366,0.173492,0.007987,0.137413,0.005346,2e-06,0.0,2e-06,0.0,3e-06,0.0,0.787042


# Vizualisation

In [7]:
inter = pd.read_csv("data_original/interactions.csv")
inter.rename(columns={"last_watch_dt": "datetime", "watched_pct": "weight"}, inplace=True)
inter.drop(columns=["total_dur"], inplace=True)
interactions = Interactions(inter)
interactions.df.head()

Unnamed: 0,user_id,item_id,datetime,weight
0,176549,9506,2021-05-11,72.0
1,699317,1659,2021-05-29,100.0
2,656683,7107,2021-05-09,0.0
3,864613,7638,2021-07-05,100.0
4,964868,9506,2021-04-30,100.0


In [8]:
items = pd.read_csv("data_original/items.csv")[["item_id", "title", "genres"]]
items.head()

Unnamed: 0,item_id,title,genres
0,10711,Поговори с ней,"драмы, зарубежные, детективы, мелодрамы"
1,2508,Голые перцы,"зарубежные, приключения, комедии"
2,10716,Тактическая сила,"криминал, зарубежные, триллеры, боевики, комедии"
3,7868,45 лет,"драмы, зарубежные, мелодрамы"
4,16268,Все решает мгновение,"драмы, спорт, советские, мелодрамы"


In [9]:
dataset = Dataset.construct(interactions.df)
model = PopularModel()
model.fit(dataset)

<rectools.models.popular.PopularModel at 0x7f46361cd810>

In [10]:
def viz_recs(model, dataset, user_ids, item_data):
    recos = model.recommend(
        users=user_ids,
        dataset=dataset,
        k=10,
        filter_viewed=True,
    )
    dataset.interactions.df["item_id_count"] = dataset.interactions.df.groupby('item_id')['item_id'].transform('count')
    user_viewed = dataset.interactions.df[dataset.interactions.df.user_id.isin(
        dataset.user_id_map.convert_to_internal(user_ids))].merge(item_data, on="item_id")

    recos.item_id = dataset.item_id_map.convert_to_internal(recos.item_id.values)
    recos = recos.merge(dataset.interactions.df[["item_id", "item_id_count"]].drop_duplicates(), on="item_id")
    recos.item_id = dataset.item_id_map.convert_to_external(recos.item_id.values)
    user_recos = recos.merge(item_data, on="item_id")

    return user_recos, user_viewed

In [11]:
user_recos, user_viewed = viz_recs(model, dataset, [666262, 672861, 955527], items)
display(user_viewed)
display(user_recos)

Unnamed: 0,user_id,item_id,weight,datetime,item_id_count,title,genres
0,121898,894,1.0,2021-06-02,1587,Ночь в осаде,"боевики, триллеры, криминал"
1,311157,212,0.0,2021-04-27,1083,Та еще парочка,"мелодрамы, комедии"
2,121898,868,11.0,2021-05-04,945,Куклы беспокойства,"зарубежные, триллеры, ужасы"
3,121898,150,4.0,2021-06-02,683,Партия для чемпионки,"русские, мелодрамы"
4,723303,2195,32.0,2021-05-12,746,В небе ночные ведьмы,военные
5,311157,1416,100.0,2021-05-04,643,Крепкая спина,фитнес
6,121898,1249,7.0,2021-06-02,691,Хардкор,"боевики, фантастика"
7,723303,2901,28.0,2021-05-12,485,Напротив левого берега,документальное
8,723303,156,100.0,2021-05-14,10370,Лена и львёнок,"семейное, приключения, комедии"


Unnamed: 0,user_id,item_id,score,rank,item_id_count,title,genres
0,666262,10440,202457.0,1,202457,Хрустальный,"триллеры, детективы"
1,672861,10440,202457.0,1,202457,Хрустальный,"триллеры, детективы"
2,955527,10440,202457.0,1,202457,Хрустальный,"триллеры, детективы"
3,666262,15297,193123.0,2,193123,Клиника счастья,"драмы, мелодрамы"
4,672861,15297,193123.0,2,193123,Клиника счастья,"драмы, мелодрамы"
...,...,...,...,...,...,...,...
25,672861,142,45367.0,9,45367,Маша,"драмы, триллеры"
26,955527,142,45367.0,9,45367,Маша,"драмы, триллеры"
27,666262,6809,40372.0,10,40372,Дуров,документальное
28,672861,6809,40372.0,10,40372,Дуров,документальное
