In [None]:
!wget https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip
!unzip data_original.zip

In [None]:
!pip install rectools

In [27]:
import pandas as pd
import numpy as np

from time import time
from tqdm import tqdm

from rectools import Columns
from rectools.dataset import Interactions, Dataset
from rectools.metrics import (
    Precision,
    Recall,
    MAP,
    MRR,
    MeanInvUserFreq,
    Serendipity,
    calc_metrics,
)
from rectools.models import RandomModel, PopularModel
from rectools.model_selection import TimeRangeSplitter

In [53]:
interactions = pd.read_csv("data_original/interactions.csv")
interactions.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [54]:
interactions["weight"] = interactions["watched_pct"] / 100
interactions["datetime"] = pd.to_datetime(interactions["last_watch_dt"])
interactions.drop(
    labels=["last_watch_dt", "watched_pct", "total_dur"], axis=1, inplace=True
)

In [55]:
interactions = Interactions(interactions)

In [56]:
n_splits = 3

In [44]:
def eval(models: dict, metrics: dict, splitter, k_recos: int) -> None:
    results = []

    fold_iterator = splitter.split(interactions, collect_fold_stats=True)

    for train_ids, test_ids, fold_info in tqdm((fold_iterator), total=n_splits):
        print(f"\n==================== Fold {fold_info['i_split']}")

        df_train = interactions.df.iloc[train_ids]
        dataset = Dataset.construct(df_train)

        df_test = interactions.df.iloc[test_ids][Columns.UserItem]
        test_users = np.unique(df_test[Columns.User])

        catalog = df_train[Columns.Item].unique()

        for model_name, model in models.items():
            time_start = time()
            model.fit(dataset)
            time_end = time()
            print(f"{model_name} model train time: {time_end - time_start:.6f}s")
            recos = model.recommend(
                users=test_users,
                dataset=dataset,
                k=k_recos,
                filter_viewed=True,
            )
            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )
            res = {"fold": fold_info["i_split"], "model": model_name}
            res.update(metric_values)
            results.append(res)

    results = pd.DataFrame(results).drop(columns="fold").groupby("model").mean()

    return results

In [98]:
def visualize(model, dataset, user_ids: list, item_data: list):
    recos = model.recommend(users=user_ids, dataset=dataset, filter_viewed=True, k=10)
    recos = recos.drop(columns=["score", "rank"])
    recos["watched"] = 0
    interactions = dataset.interactions.df.drop(columns=["weight", "datetime"])
    interactions = interactions.loc[interactions["user_id"].isin(user_ids)]
    interactions["watched"] = 1
    interactions = pd.concat((interactions, recos), axis=0, ignore_index=True)
    history = pd.merge(interactions, items[["item_id"] + item_data], on="item_id")
    return history

In [76]:
models = {"random": RandomModel(random_state=32), "popular": PopularModel()}

metrics = {
    "prec@1": Precision(k=1),
    "prec@5": Precision(k=5),
    "prec@10": Precision(k=10),
    "recall@1": Recall(k=1),
    "recall@5": Recall(k=5),
    "recall@10": Recall(k=10),
    "map@1": MAP(k=1),
    "map@5": MAP(k=5),
    "map@10": MAP(k=10),
    "mrr@1": MRR(k=1),
    "mrr@5": MRR(k=5),
    "mrr@10": MRR(k=10),
    "novelty@1": MeanInvUserFreq(k=1),
    "novelty@5": MeanInvUserFreq(k=5),
    "novelty@10": MeanInvUserFreq(k=10),
    "serendipity@1": Serendipity(k=1),
    "serendipity@5": Serendipity(k=5),
    "serendipity@10": Serendipity(k=10),
}

splitter = TimeRangeSplitter(
    test_size="14D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [78]:
res = eval(models, metrics, splitter, k_recos=10)
res

  0%|          | 0/3 [00:00<?, ?it/s]


random model train time: 0.000070s
popular model train time: 2.242789s


 33%|███▎      | 1/3 [00:41<01:23, 41.68s/it]


random model train time: 0.000075s
popular model train time: 2.079313s


 67%|██████▋   | 2/3 [01:29<00:45, 45.10s/it]


random model train time: 0.000082s
popular model train time: 2.841973s


100%|██████████| 3/3 [02:20<00:00, 46.88s/it]


Unnamed: 0_level_0,prec@1,recall@1,prec@5,recall@5,prec@10,recall@10,mrr@1,mrr@5,mrr@10,map@1,map@5,map@10,novelty@1,novelty@5,novelty@10,serendipity@1,serendipity@5,serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
popular,0.097211,0.047186,0.070339,0.160496,0.045964,0.203517,0.097211,0.168272,0.17672,0.047186,0.091013,0.098549,2.422735,3.109863,3.722852,3e-06,4e-06,3e-06
random,0.000203,5.5e-05,0.000225,0.000293,0.000229,0.000634,0.000203,0.000499,0.000647,5.5e-05,0.000135,0.000179,15.567096,15.555586,15.557718,8e-06,8e-06,8e-06


In [57]:
dataset = Dataset.construct(interactions.df)
# model = RandomModel(random_state=32)
user_ids = [666262, 672861, 955527]
item_data = ["title", "genres", "views"]

In [87]:
items = pd.read_csv("data_original/items.csv")
items = pd.merge(
    items,
    interactions.df.groupby("item_id").count()["user_id"].rename("views"),
    on="item_id",
    how="left",
)
items.head()

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords,views
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ...",5.0
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео...",9.0
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг...",6.0
3,7868,film,45 лет,45 Years,2015.0,"драмы, зарубежные, мелодрамы",Великобритания,,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю...",2.0
4,16268,film,Все решает мгновение,,1978.0,"драмы, спорт, советские, мелодрамы",СССР,,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж...",1.0


In [99]:
res = visualize(models["random"], dataset, user_ids, item_data)
res

Unnamed: 0,user_id,item_id,watched,title,genres,views
0,666262,93,1,Дом ночных призраков,"зарубежные, криминал, детективы, ужасы",1.0
1,672861,25,1,Медвежонок Винни и его друзья,"мюзиклы, мультфильм, приключения, комедии",47.0
2,672861,32,1,В ритме сердца,"драмы, мюзиклы, мелодрамы",181.0
3,955527,21,1,Признание 5,для взрослых,12.0
4,666262,10101,0,Возвращение Будулая,мелодрамы,99.0
5,666262,619,0,Новые приключения Аладдина (жестовым языком),"зарубежные, комедии",1.0
6,666262,12618,0,Пропавшая грамота,"фэнтези, комедии",51.0
7,666262,5967,0,Братья вне игры,"драмы, спорт",262.0
8,666262,4041,0,Фрилансеры,"криминал, детективы, драмы, зарубежные, боевики",19.0
9,666262,5701,0,Алые паруса: Новая история,"комедии, мелодрамы",4.0
