In [None]:
!pip install rectools

Collecting rectools
  Downloading rectools-0.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting implicit<0.8.0,>=0.7.1 (from rectools)
  Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl.metadata (6.1 kB)
Collecting scipy<2.0.0,>=1.5.4 (from rectools)
  Downloading scipy-1.11.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m487.3 kB/s[0m eta [36m0:00:00[0m kB/s[0m eta [36m0:00:01[0m:01[0m
[?25hCollecting tqdm<5.0.0,>=4.27.0 (from rectools)
  Downloading tqdm-4.66.1-py3-none-any.whl.metadata (57 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typeguard<3.0.0,>=2.0.1 (from rectools)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Collecting threadpoolctl (from implicit<0.8.0,>=0.7.1->rectools)
  Downloading threadpoolctl-3.2.0-py3

In [7]:
import time
import requests
import numpy as np
import pandas as pd
from copy import deepcopy, copy
from tqdm.auto import tqdm
from IPython.display import display, HTML
from typing import Dict, List, Tuple, Union, Callable, Any

In [8]:
import rectools
from rectools.dataset import Interactions, Dataset, DenseFeatures
from rectools.model_selection import Splitter, TimeRangeSplitter
from rectools.models.base import ModelBase
from rectools.models import RandomModel, PopularModel
from rectools.metrics.base import MetricAtK
from rectools.metrics import (
    Precision,
    Recall,
    MAP,
    NDCG,
    Serendipity,
    MeanInvUserFreq,
    IntraListDiversity,
    PairwiseHammingDistanceCalculator,
    calc_metrics,
)

# Validation

In [14]:
def add_thresholds(metrics: Dict[str, Union[Callable, Tuple[Callable, Dict[str, Any]]]], K: List[int]):
    metrics_with_thresholds = {}
    for name, metric in metrics.items():
        kwargs = {}
        if isinstance(metric, tuple):
            kwargs.update(**metric[1])
            metric = metric[0]
        for k in K:
            metrics_with_thresholds.update({f"{name}@{k}": metric(k=k, **kwargs)})
    return metrics_with_thresholds

In [15]:
def calc_coverage(reco):
    return reco.item_id.nunique() / len(reco)

In [16]:
def cross_val(
    dataset: pd.DataFrame,
    models: Dict[str, ModelBase],
    metrics: Dict[str, Union[MetricAtK, Callable]],
    splitter: Splitter,
    k: int,
    num_splits: int = 0,
):
    rectool_metrics = {k: v for k, v in metrics.items() if isinstance(v, MetricAtK)}
    custom_metrics = {k: v for k, v in metrics.items() if k not in rectool_metrics}
    metric_values = []
    interactions = Interactions(dataset)
    pbar = tqdm(total=len(models) * num_splits)
    pbar.set_description(f"splitting")
    splits = splitter.split(interactions)
    for train_ids, test_ids, i in splits:
        num_fold = i["i_split"]
        train = Dataset.construct(dataset.iloc[train_ids])
        test = Dataset.construct(dataset.iloc[test_ids])
        prev_interactions = train.interactions.df
        for model_name, orig_model in models.items():
            pbar_prefix = f"fold {num_fold}, {model_name}"
            pbar.set_description(f"{pbar_prefix} training")
            model = deepcopy(orig_model)
            start = time.time()
            model.fit(train)
            end = time.time()
            pbar.set_description(f"{pbar_prefix} predicting")
            reco = model.recommend(test.user_id_map.external_ids, train, k, True)
            del model
            pbar.set_description(f"{pbar_prefix} evaluating")
            cur_metrics = {
                "model": model_name,
                "time": end - start,
                **calc_metrics(
                    rectool_metrics, reco=reco, interactions=interactions_df, prev_interactions=interactions_df
                ),
            }
            for name, metric in custom_metrics.items():
                cur_metrics.update({name: metric(reco)})
            metric_values.append(cur_metrics)
            pbar.update(1)
    pbar.close()
    return pd.DataFrame(metric_values).groupby("model").mean()

In [11]:
interactions_df = pd.read_csv("../kion_train/interactions.csv", parse_dates=["last_watch_dt"])
interactions_df.rename(
    columns={"last_watch_dt": rectools.Columns.Datetime, "total_dur": rectools.Columns.Weight}, inplace=True
)
interactions = Interactions(interactions_df)

In [12]:
metrics = add_thresholds(
    {
        "precision": Precision,
        "recall": Recall,
        "MAP": MAP,
        "NDCG": NDCG,
        "novelty": MeanInvUserFreq,
    },
    [1, 5, 10],
)
metrics.update({"coverage": calc_coverage})
metrics

{'precision@1': Precision(k=1),
 'precision@5': Precision(k=5),
 'precision@10': Precision(k=10),
 'recall@1': Recall(k=1),
 'recall@5': Recall(k=5),
 'recall@10': Recall(k=10),
 'MAP@1': MAP(k=1, divide_by_k=False),
 'MAP@5': MAP(k=5, divide_by_k=False),
 'MAP@10': MAP(k=10, divide_by_k=False),
 'NDCG@1': NDCG(k=1, log_base=2),
 'NDCG@5': NDCG(k=5, log_base=2),
 'NDCG@10': NDCG(k=10, log_base=2),
 'novelty@1': MeanInvUserFreq(k=1),
 'novelty@5': MeanInvUserFreq(k=5),
 'novelty@10': MeanInvUserFreq(k=10),
 'coverage': <function __main__.calc_coverage(reco)>}

In [17]:
NUM_RECOS = 10
NUM_SPLITS = 3
SEED = 32
splitter = rectools.model_selection.time_split.TimeRangeSplitter("1D", NUM_SPLITS)
models = {"random": RandomModel(random_state=SEED), "popular": PopularModel()}

results = cross_val(interactions_df, models, metrics, splitter, NUM_RECOS, num_splits=NUM_SPLITS)

  0%|          | 0/6 [00:00<?, ?it/s]

In [18]:
results

Unnamed: 0_level_0,time,precision@1,recall@1,precision@5,recall@5,precision@10,recall@10,NDCG@1,NDCG@5,NDCG@10,MAP@1,MAP@5,MAP@10,novelty@1,novelty@5,novelty@10,coverage
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
popular,4.80696,0.00171,0.0002967446,0.001278,0.001124,0.000839,0.001412,0.00171,0.001401,0.001051,0.0002967446,0.000614,0.00066,2.348193,3.053777,3.727302,0.00011
random,9.7e-05,6e-06,7.788373e-07,6e-06,3e-06,5e-06,6e-06,6e-06,6e-06,5e-06,7.788373e-07,2e-06,2e-06,15.650227,15.65159,15.652433,0.049638


# Visualization

In [23]:
class Visualizer:
    def __init__(
        self,
        model: ModelBase,
        dataset: Dataset,
        item_data: List[str] = ["title"],
        items: Union[pd.DataFrame, None] = None,
        items_df_path: str = "../kion_train/items.csv",
    ):
        self.model = model
        self.dataset = dataset
        if items is None:
            self.items = pd.read_csv(items_df_path)
        else:
            self.items = items
        for d in item_data:
            assert (
                d in self.items.columns
            ), f'Unknown column "{d}". Can only visualize information about {", ".join(self.items.columns)}.'
        self.item_data = item_data

    def display_item_data(self, interactions):
        return pd.merge(interactions, self.items)[["user_id"] + self.item_data]

    def pretty_print(self, df):
        display(HTML(df.to_html()))

    def get_reco(self, user_ids: List[int], k: int = 10):
        reco = self.model.recommend(np.array(user_ids), self.dataset, k, True)
        return self.display_item_data(reco)

    def get_history(self, user_ids: List[int]):
        df = self.dataset.interactions.df
        history = df[df.user_id.isin(user_ids)]
        return self.display_item_data(history)

    def eval_recos(self, user_ids: List[int]):
        for user_id in user_ids:
            print(f"User {user_id} watched these film:")
            self.pretty_print(self.get_history([user_id]))
            print(f"And got these films as recomendations")
            self.pretty_print(self.get_reco([user_id]))
            print("\n")

In [20]:
items = pd.read_csv("../kion_train/items.csv")

In [21]:
model = RandomModel(random_state=32)
dataset = Dataset.construct(interactions_df)
model.fit(dataset)
USER_IDS = [666262, 672861, 955527]

In [24]:
viz = Visualizer(model, dataset, item_data=["title", "genres"], items=items)
viz.eval_recos(USER_IDS)

User 666262 watched these film:


Unnamed: 0,user_id,title,genres
0,666262,Дом ночных призраков,"зарубежные, криминал, детективы, ужасы"


And got these films as recomendations


Unnamed: 0,user_id,title,genres
0,666262,Возвращение Будулая,мелодрамы
1,666262,Новые приключения Аладдина (жестовым языком),"зарубежные, комедии"
2,666262,Пропавшая грамота,"фэнтези, комедии"
3,666262,Братья вне игры,"драмы, спорт"
4,666262,Фрилансеры,"криминал, детективы, драмы, зарубежные, боевики"
5,666262,Алые паруса: Новая история,"комедии, мелодрамы"
6,666262,Женщина в беде 3,"детективы, мелодрамы"
7,666262,Гордость и предубеждение,"драмы, мелодрамы"
8,666262,Болванчики,"мультфильм, приключения, комедии"
9,666262,Избави нас от лукавого,"ужасы, триллеры, детективы"




User 672861 watched these film:


Unnamed: 0,user_id,title,genres
0,672861,Медвежонок Винни и его друзья,"мюзиклы, мультфильм, приключения, комедии"
1,672861,В ритме сердца,"драмы, мюзиклы, мелодрамы"


And got these films as recomendations


Unnamed: 0,user_id,title,genres
0,672861,Возвращение Будулая,мелодрамы
1,672861,Новые приключения Аладдина (жестовым языком),"зарубежные, комедии"
2,672861,Пропавшая грамота,"фэнтези, комедии"
3,672861,Братья вне игры,"драмы, спорт"
4,672861,Фрилансеры,"криминал, детективы, драмы, зарубежные, боевики"
5,672861,Алые паруса: Новая история,"комедии, мелодрамы"
6,672861,Женщина в беде 3,"детективы, мелодрамы"
7,672861,Гордость и предубеждение,"драмы, мелодрамы"
8,672861,Болванчики,"мультфильм, приключения, комедии"
9,672861,Избави нас от лукавого,"ужасы, триллеры, детективы"




User 955527 watched these film:


Unnamed: 0,user_id,title,genres
0,955527,Признание 5,для взрослых


And got these films as recomendations


Unnamed: 0,user_id,title,genres
0,955527,Возвращение Будулая,мелодрамы
1,955527,Новые приключения Аладдина (жестовым языком),"зарубежные, комедии"
2,955527,Пропавшая грамота,"фэнтези, комедии"
3,955527,Братья вне игры,"драмы, спорт"
4,955527,Фрилансеры,"криминал, детективы, драмы, зарубежные, боевики"
5,955527,Алые паруса: Новая история,"комедии, мелодрамы"
6,955527,Женщина в беде 3,"детективы, мелодрамы"
7,955527,Гордость и предубеждение,"драмы, мелодрамы"
8,955527,Болванчики,"мультфильм, приключения, комедии"
9,955527,Избави нас от лукавого,"ужасы, триллеры, детективы"




