In [1]:
# !wget 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'
# !unzip data_original.zip

In [2]:
import copy
import time
from pprint import pprint
from collections import OrderedDict

import pandas as pd
import numpy as np
from rectools import Columns
from rectools.models import RandomModel, PopularModel
from rectools.model_selection.time_split import TimeRangeSplitter
from rectools.metrics import MAP, NDCG, MeanInvUserFreq, Serendipity, Precision, Recall, calc_metrics
from rectools.dataset import Interactions, Dataset
from tqdm.auto import tqdm


pd.set_option('display.max_rows', 20)

# Metrics

In [3]:
inter = pd.read_csv("data_original/interactions.csv")
inter.rename(columns={"last_watch_dt": "datetime", "watched_pct": "weight"}, inplace=True)
interactions = Interactions(inter)
interactions.df.head()

Unnamed: 0,user_id,item_id,datetime,total_dur,weight
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [4]:
models = {
    "random": RandomModel(random_state=32),
    "pupular": PopularModel()
}
metrics = {
    "map@1": MAP(k=1),
    "map@5": MAP(k=5),
    "map@10": MAP(k=10),
    "ndcg@1": NDCG(k=1),
    "ndcg@5": NDCG(k=5),
    "ndcg@10": NDCG(k=10),
    "recall@1": Recall(k=1),
    "recall@5": Recall(k=5),
    "recall@10": Recall(k=10),
    "precision@1": Precision(k=1),
    "precision@5": Precision(k=5),
    "precision@10": Precision(k=10),
    "serendipity@1": Serendipity(k=1),
    "serendipity@5": Serendipity(k=5),
    "serendipity@10": Serendipity(k=10),
    "miuf@1": MeanInvUserFreq(k=1),
    "miuf@5": MeanInvUserFreq(k=5),
    "miuf@10": MeanInvUserFreq(k=10),
}
splitter = TimeRangeSplitter(
    test_size="7D",
    n_splits=3,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True
)
top_k = 10

In [5]:
def group_metrics(metrics):
    new_metrics = {}
    for key, value in metrics.items():
        key = tuple(key.split("@"))
        new_metrics[(key[0], "@" + key[1])] = value

    return OrderedDict(sorted(new_metrics.items()))


def compute_metrics(models, metrics, splitter, top_k):
    results = []
    fold_iterator = splitter.split(interactions, collect_fold_stats=True)
    for train_ids, test_ids, fold_info in tqdm((fold_iterator), total=splitter.n_splits):
        print(f"\n==================== Fold {fold_info['i_split']}")
        pprint(fold_info)

        df_train = interactions.df.iloc[train_ids]
        dataset = Dataset.construct(df_train)

        df_test = interactions.df.iloc[test_ids][Columns.UserItem]
        test_users = np.unique(df_test[Columns.User])

        catalog = df_train[Columns.Item].unique()

        for model_name, model in models.items():
            current_model = copy.deepcopy(model)
            time_start = time.time()
            current_model.fit(dataset)
            training_time = time.time() - time_start
            recos = current_model.recommend(
                users=test_users,
                dataset=dataset,
                k=top_k,
                filter_viewed=True,
            )
            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )
            metric_values = group_metrics(metric_values)
            res = {"fold": fold_info["i_split"], "model": model_name, ("training", "time"): training_time}
            res.update(metric_values)
            results.append(res)
    pivot_results = pd.DataFrame(results).drop(columns="fold").groupby(["model"]).agg("mean")
    pivot_results.columns = pd.MultiIndex.from_tuples(pivot_results.columns, names=['', ''])
    mean_metric_subset = [(metric, agg)
                          for metric, agg in pivot_results.columns if metric != "training"]
    pivot_results = pivot_results.style.highlight_min(subset=mean_metric_subset, color="red", axis=0).highlight_max(
        subset=mean_metric_subset, color="green", axis=0)

    return pivot_results

In [6]:
pivot_results = compute_metrics(models, metrics, splitter, 10)
pivot_results

  0%|          | 0/3 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'i_split': 0,
 'start': Timestamp('2021-08-02 00:00:00', freq='7D'),
 'test': 263681,
 'test_items': 6602,
 'test_users': 98184,
 'train': 4266013,
 'train_items': 15237,
 'train_users': 797423}

{'end': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'i_split': 1,
 'start': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'test': 279422,
 'test_items': 6698,
 'test_users': 103511,
 'train': 4649162,
 'train_items': 15415,
 'train_users': 850489}

{'end': Timestamp('2021-08-23 00:00:00', freq='7D'),
 'i_split': 2,
 'start': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'test': 298878,
 'test_items': 6679,
 'test_users': 110076,
 'train': 5051815,
 'train_items': 15577,
 'train_users': 906071}


Unnamed: 0_level_0,training,map,map,map,miuf,miuf,miuf,ndcg,ndcg,ndcg,precision,precision,precision,recall,recall,recall,serendipity,serendipity,serendipity
Unnamed: 0_level_1,time,@1,@10,@5,@1,@10,@5,@1,@10,@5,@1,@10,@5,@1,@10,@5,@1,@10,@5
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
pupular,0.797273,0.04272,0.084109,0.078295,2.377055,3.71339,3.066979,0.076432,0.043084,0.057932,0.076432,0.033903,0.052402,0.04272,0.173492,0.137413,2e-06,2e-06,3e-06
random,1.8e-05,7.2e-05,0.000211,0.000169,15.614137,15.613009,15.612989,0.000221,0.0002,0.000208,0.000221,0.000193,0.000202,7.2e-05,0.000693,0.000365,6e-06,7e-06,7e-06


# Vizualisation

In [7]:
inter = pd.read_csv("data_original/interactions.csv")
inter.rename(columns={"last_watch_dt": "datetime", "watched_pct": "weight"}, inplace=True)
inter.drop(columns=["total_dur"], inplace=True)
interactions = Interactions(inter)
interactions.df.head()

Unnamed: 0,user_id,item_id,datetime,weight
0,176549,9506,2021-05-11,72.0
1,699317,1659,2021-05-29,100.0
2,656683,7107,2021-05-09,0.0
3,864613,7638,2021-07-05,100.0
4,964868,9506,2021-04-30,100.0


In [8]:
items = pd.read_csv("data_original/items.csv")[["item_id", "title", "genres"]]
items.head()

Unnamed: 0,item_id,title,genres
0,10711,Поговори с ней,"драмы, зарубежные, детективы, мелодрамы"
1,2508,Голые перцы,"зарубежные, приключения, комедии"
2,10716,Тактическая сила,"криминал, зарубежные, триллеры, боевики, комедии"
3,7868,45 лет,"драмы, зарубежные, мелодрамы"
4,16268,Все решает мгновение,"драмы, спорт, советские, мелодрамы"


In [9]:
dataset = Dataset.construct(interactions.df)
model = PopularModel()
model.fit(dataset)

<rectools.models.popular.PopularModel at 0x7ffb0972b6a0>

In [10]:
def viz_recs(model, dataset, user_ids, item_data):
    top_k = 10
    recos = model.recommend(
        users=user_ids,
        dataset=dataset,
        k=top_k,
        filter_viewed=True,
    )
    dataset.interactions.df["item_id_count"] = dataset.interactions.df.groupby('item_id')['item_id'].transform('count')
    user_viewed = dataset.interactions.df[dataset.interactions.df.user_id.isin(
        dataset.user_id_map.convert_to_internal(user_ids))].merge(item_data, on="item_id")
    user_viewed.user_id = dataset.user_id_map.convert_to_external(user_viewed.user_id)

    recos.item_id = dataset.item_id_map.convert_to_internal(recos.item_id.values)
    recos = recos.merge(dataset.interactions.df[["item_id", "item_id_count"]].drop_duplicates(), on="item_id")
    recos.item_id = dataset.item_id_map.convert_to_external(recos.item_id.values)
    user_recos = recos.merge(item_data, on="item_id").drop(columns=["score"])

    for user_id in user_ids:
        print("#"*100)
        print(f"user_id: {user_id}\nUser recommendation:")
        display(user_recos[user_recos.user_id == user_id].drop("user_id", axis=1))
        print("Last user history:")
        display(user_viewed[(user_viewed.user_id == user_id) & (user_viewed.weight >= 5)].sort_values(
            by=["datetime"]).drop(["user_id", "datetime"], axis=1).tail(top_k))
    print("#"*100)

In [11]:
user_ids = [666262, 672861, 955527]
viz_recs(model, dataset, user_ids, items)

####################################################################################################
user_id: 666262
User recommendation:


Unnamed: 0,item_id,rank,item_id_count,title,genres
0,10440,1,202457,Хрустальный,"триллеры, детективы"
3,15297,2,193123,Клиника счастья,"драмы, мелодрамы"
6,9728,3,132865,Гнев человеческий,"боевики, триллеры"
9,13865,4,122119,Девятаев,"драмы, военные, приключения"
12,4151,5,91167,Секреты семейной жизни,комедии
15,3734,6,74803,Прабабушка легкого поведения,комедии
18,2657,7,68581,Подслушано,"драмы, триллеры"
21,4880,8,55043,Афера,комедии
24,142,9,45367,Маша,"драмы, триллеры"
27,6809,10,40372,Дуров,документальное


Last user history:


Unnamed: 0,item_id,weight,item_id_count,title,genres
4,2195,32.0,746,В небе ночные ведьмы,военные
7,2901,28.0,485,Напротив левого берега,документальное
8,156,100.0,10370,Лена и львёнок,"семейное, приключения, комедии"


####################################################################################################
user_id: 672861
User recommendation:


Unnamed: 0,item_id,rank,item_id_count,title,genres
1,10440,1,202457,Хрустальный,"триллеры, детективы"
4,15297,2,193123,Клиника счастья,"драмы, мелодрамы"
7,9728,3,132865,Гнев человеческий,"боевики, триллеры"
10,13865,4,122119,Девятаев,"драмы, военные, приключения"
13,4151,5,91167,Секреты семейной жизни,комедии
16,3734,6,74803,Прабабушка легкого поведения,комедии
19,2657,7,68581,Подслушано,"драмы, триллеры"
22,4880,8,55043,Афера,комедии
25,142,9,45367,Маша,"драмы, триллеры"
28,6809,10,40372,Дуров,документальное


Last user history:


Unnamed: 0,item_id,weight,item_id_count,title,genres
5,1416,100.0,643,Крепкая спина,фитнес


####################################################################################################
user_id: 955527
User recommendation:


Unnamed: 0,item_id,rank,item_id_count,title,genres
2,10440,1,202457,Хрустальный,"триллеры, детективы"
5,15297,2,193123,Клиника счастья,"драмы, мелодрамы"
8,9728,3,132865,Гнев человеческий,"боевики, триллеры"
11,13865,4,122119,Девятаев,"драмы, военные, приключения"
14,4151,5,91167,Секреты семейной жизни,комедии
17,3734,6,74803,Прабабушка легкого поведения,комедии
20,2657,7,68581,Подслушано,"драмы, триллеры"
23,4880,8,55043,Афера,комедии
26,142,9,45367,Маша,"драмы, триллеры"
29,6809,10,40372,Дуров,документальное


Last user history:


Unnamed: 0,item_id,weight,item_id_count,title,genres
2,868,11.0,945,Куклы беспокойства,"зарубежные, триллеры, ужасы"
6,1249,7.0,691,Хардкор,"боевики, фантастика"


####################################################################################################
