# Connect Google Drive

In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive/ so the files under MyDrive used
# below (requirements.txt and the dataset CSVs) can be read by path.
drive.mount("/content/drive/")

Mounted at /content/drive/


# Install requirements

Install the requirements from the lecture 2 repository.

In [2]:
!pip install -r /content/drive/MyDrive/itmo_recsys/lesson_2/requirements.txt

Collecting appnope==0.1.3 (from -r /content/drive/MyDrive/itmo_recsys/lesson_2/requirements.txt (line 1))
  Downloading appnope-0.1.3-py2.py3-none-any.whl (4.4 kB)
Collecting asttokens==2.4.1 (from -r /content/drive/MyDrive/itmo_recsys/lesson_2/requirements.txt (line 2))
  Downloading asttokens-2.4.1-py2.py3-none-any.whl (27 kB)
Collecting comm==0.2.0 (from -r /content/drive/MyDrive/itmo_recsys/lesson_2/requirements.txt (line 6))
  Downloading comm-0.2.0-py3-none-any.whl (7.0 kB)
Collecting debugpy==1.8.0 (from -r /content/drive/MyDrive/itmo_recsys/lesson_2/requirements.txt (line 9))
  Downloading debugpy-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting decorator==5.1.1 (from -r /content/drive/MyDrive/itmo_recsys/lesson_2/requirements.txt (line 10))
  Downloading decorator-5.1.1-py3-none-any.whl (9.1 kB)
Collecting executing==2

# Imports

In [14]:
import time
from pprint import pprint
from typing import Tuple

import numpy as np
import pandas as pd
import rectools
from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.metrics import (MAP, NDCG, MeanInvUserFreq, Precision, Recall,
                              Serendipity, calc_metrics)
from rectools.model_selection import TimeRangeSplitter
from rectools.models import (ImplicitItemKNNWrapperModel, PopularModel,
                             RandomModel)
from tqdm.auto import tqdm

# Number of cross-validation folds (passed to TimeRangeSplitter as n_splits).
N_FOLDS = 3
# Seed passed to RandomModel as random_state.
RANDOM_STATE = 32
# Number of recommendations requested per user during cross-validation.
K_RECS = 10
# User ids inspected in the "Visual analysis" section.
USERS = [666262, 672861, 955527]

# Get datasets

In [22]:
# Interactions log: parse the watch date while reading, then give the date and
# duration columns the names defined by RecTools' `Columns`.
interactions = pd.read_csv(
    "/content/drive/MyDrive/itmo_recsys/kion_dataset/interactions.csv",
    parse_dates=["last_watch_dt"],
).rename(
    columns={"last_watch_dt": Columns.Datetime, "total_dur": Columns.Weight},
)

# Item catalogue, read as is.
items = pd.read_csv("/content/drive/MyDrive/itmo_recsys/kion_dataset/items.csv")

In [6]:
interactions.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


# Calculate metrics

In [7]:
def calculate_metrics(
    models: dict,
    metrics: dict,
    cv: TimeRangeSplitter,
    k_recs: int,
    interactions: pd.DataFrame,
) -> Tuple[list, pd.DataFrame]:
    """Cross-validate every model and collect per-fold metric values.

    For each fold produced by ``cv`` a ``Dataset`` is built from the train
    part, every model is fitted on it, ``k_recs`` items are recommended to the
    users of the test part (with ``filter_viewed=True``) and the
    recommendations are scored with ``calc_metrics``.

    Parameters
    ----------
    models : dict
        Mapping ``model name -> model``. Each model must provide ``fit`` and
        ``recommend``; the same instances are re-fitted on every fold.
    metrics : dict
        Mapping ``metric name -> metric`` handed to ``calc_metrics`` unchanged.
    cv : TimeRangeSplitter
        Splitter that yields ``(train_ids, test_ids, fold_info)`` triples.
    k_recs : int
        Number of recommendations requested per user.
    interactions : pd.DataFrame
        Interactions table; it is wrapped into ``Interactions`` internally and
        the caller's frame is not modified.

    Returns
    -------
    Tuple[list, pd.DataFrame]
        A list with one dict per (fold, model) pair holding ``"fold"``,
        ``"model"``, all metric values and ``"time_seconds"`` (duration of
        ``fit`` only), and the same records as a DataFrame.
    """
    results = []
    # Keep the wrapper under its own name instead of rebinding the parameter
    # to an object of a different type.
    interactions_wrapped = Interactions(interactions)
    fold_iterator = cv.split(interactions_wrapped, collect_fold_stats=True)
    # Take the progress-bar length from the splitter rather than hard-coding
    # it; tqdm accepts ``None`` when the attribute is unavailable.
    n_folds = getattr(cv, "n_splits", None)

    for train_ids, test_ids, fold_info in tqdm(fold_iterator, total=n_folds):
        print(f"\n==================== Fold {fold_info['i_split']}")
        pprint(fold_info)

        df_train = interactions_wrapped.df.iloc[train_ids]
        dataset = Dataset.construct(df_train)

        # Only the (user, item) pairs of the test part are needed for scoring.
        df_test = interactions_wrapped.df.iloc[test_ids][Columns.UserItem]
        test_users = np.unique(df_test[Columns.User])

        # The catalog is limited to items present in the train part.
        catalog = df_train[Columns.Item].unique()

        for model_name, model in models.items():
            # perf_counter is monotonic, which makes it the right clock for
            # measuring elapsed time.
            fit_started = time.perf_counter()
            model.fit(dataset)
            fit_seconds = time.perf_counter() - fit_started

            recos = model.recommend(
                users=test_users,
                dataset=dataset,
                k=k_recs,
                filter_viewed=True,
            )
            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )
            results.append(
                {
                    "fold": fold_info["i_split"],
                    "model": model_name,
                    **metric_values,
                    "time_seconds": fit_seconds,
                }
            )

    results_pd = pd.DataFrame(results)

    return results, results_pd

In [8]:
# Time-based splitter: N_FOLDS folds with a test window of "7D" each; all three
# filtering flags of the splitter are switched on.
cv = TimeRangeSplitter(
    n_splits=N_FOLDS,
    test_size="7D",
    filter_cold_users=True,
    filter_cold_items=True,
    filter_already_seen=True,
)

# Models to compare, keyed by the name used in the results table.
models = {
    "random": RandomModel(random_state=RANDOM_STATE),
    "popular": PopularModel(),
}

# Every metric family is evaluated at k = 1, 5 and 10; keys look like "prec@5".
metrics = {
    f"{prefix}@{k}": metric_cls(k=k)
    for prefix, metric_cls in (
        ("prec", Precision),
        ("recall", Recall),
        ("map", MAP),
        ("ndcg", NDCG),
        ("novelty", MeanInvUserFreq),
        ("serendipity", Serendipity),
    )
    for k in (1, 5, 10)
}

In [9]:
# Run the cross-validation for all models. `results` is the list of
# per-(fold, model) records, `results_pd` is the same data as a DataFrame.
results, results_pd = calculate_metrics(
    models=models, metrics=metrics, cv=cv, k_recs=K_RECS, interactions=interactions
)

  0%|          | 0/3 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'i_split': 0,
 'start': Timestamp('2021-08-02 00:00:00', freq='7D'),
 'test': 263681,
 'test_items': 6602,
 'test_users': 98184,
 'train': 4266013,
 'train_items': 15237,
 'train_users': 797423}

{'end': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'i_split': 1,
 'start': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'test': 279422,
 'test_items': 6698,
 'test_users': 103511,
 'train': 4649162,
 'train_items': 15415,
 'train_users': 850489}

{'end': Timestamp('2021-08-23 00:00:00', freq='7D'),
 'i_split': 2,
 'start': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'test': 298878,
 'test_items': 6679,
 'test_users': 110076,
 'train': 5051815,
 'train_items': 15577,
 'train_users': 906071}


In [10]:
# Aggregation spec: take the mean of every metric column.
agg_dict = {metric_name: "mean" for metric_name in metrics}

# Average metric values over the folds, one row per model.
mean_results = results_pd.groupby("model", as_index=False).agg(agg_dict)

# Average fit time over the folds, one row per model.
mean_time = results_pd.groupby("model", as_index=False).agg({"time_seconds": "mean"})

In [11]:
mean_results

Unnamed: 0,model,prec@1,prec@5,prec@10,recall@1,recall@5,recall@10,map@1,map@5,map@10,ndcg@1,ndcg@5,ndcg@10,novelty@1,novelty@5,novelty@10,serendipity@1,serendipity@5,serendipity@10
0,popular,0.076432,0.052402,0.033903,0.04272,0.137413,0.173492,0.04272,0.078295,0.084109,0.076432,0.057932,0.043084,2.377055,3.066979,3.71339,2e-06,3e-06,2e-06
1,random,0.000221,0.000202,0.000193,7.2e-05,0.000365,0.000693,7.2e-05,0.000169,0.000211,0.000221,0.000208,0.0002,15.614137,15.612989,15.613009,6e-06,7e-06,7e-06


In [12]:
mean_time

Unnamed: 0,model,time_seconds
0,popular,2.659522
1,random,5.9e-05


# Visual analysis

In [23]:
def visual_analysis(
    model: "rectools.models.base.ModelBase",
    interactions: pd.DataFrame,
    dataset: Dataset,
    users: list,
    item_data: pd.DataFrame,
    k_recs: int = 10,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Collect viewing history and recommendations for selected users.

    Both results are joined with ``item_data`` on the item id so that each
    row carries the item attributes next to the interaction / recommendation.

    Parameters
    ----------
    model
        Fitted model providing ``recommend(users, dataset, k, filter_viewed)``.
    interactions : pd.DataFrame
        Interactions table containing the user and item id columns.
    dataset : Dataset
        Dataset passed to ``model.recommend``.
    users : list
        User ids to inspect. Ids that do not occur in ``interactions`` add no
        rows to either result.
    item_data : pd.DataFrame
        Item attributes; must contain the item id column.
    k_recs : int, default 10
        Number of recommendations requested per user.

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        ``(user_viewed_all, user_recos_all)``: per-user blocks stacked in the
        order of ``users``. An empty DataFrame is returned in place of a
        result for which there is nothing to stack.

    Notes
    -----
    Recommendations are requested only for the users of interest instead of
    every user in ``interactions``. For deterministic models the rows are the
    same; a model that draws random recommendations may return different
    items than it would when asked about all users at once.
    """
    requested_users = list(users)

    # Restrict the request to ids that really occur in `interactions`, the
    # same population the recommendations were previously computed for.
    user_column = interactions[Columns.User]
    target_users = user_column[user_column.isin(requested_users)].unique()

    recos = None
    if len(target_users) > 0:
        recos = model.recommend(
            users=target_users,
            dataset=dataset,
            k=k_recs,
            filter_viewed=True,
        )

    # Collect the per-user pieces and concatenate once at the end instead of
    # growing a DataFrame inside the loop.
    viewed_parts = []
    recos_parts = []
    for user in requested_users:
        viewed_parts.append(
            interactions[user_column == user].merge(item_data, on=Columns.Item)
        )
        if recos is not None:
            recos_parts.append(
                recos[recos[Columns.User] == user].merge(item_data, on=Columns.Item)
            )

    # pd.concat raises on an empty list, so fall back to an empty frame.
    user_viewed_all = pd.concat(viewed_parts, axis=0) if viewed_parts else pd.DataFrame()
    user_recos_all = pd.concat(recos_parts, axis=0) if recos_parts else pd.DataFrame()

    return user_viewed_all, user_recos_all

Calculate the number of views for each `item_id`.

In [24]:
# One row per item: how many interaction rows refer to it.
count_of_views = interactions.groupby("item_id", as_index=False).agg(
    count_of_views=("user_id", "count")
)

In [25]:
# Attach the view counts to the item table (items without views are dropped by
# the inner join). A previously merged `count_of_views` column is removed first
# so that re-running this cell does not produce `_x` / `_y` duplicates.
items = items.drop(columns="count_of_views", errors="ignore").merge(
    count_of_views, on="item_id", how="inner"
)

For example, train a simple model.

In [26]:
# Fit a PopularModel on the full interactions table (no train/test split here;
# this cell is only used to eyeball recommendations).
simple_model = PopularModel()
dataset = Dataset.construct(interactions)
simple_model.fit(dataset)

# Viewing history and recommendations for the users listed in USERS,
# each joined with the item table.
user_viewed_all, user_recos_all = visual_analysis(
    model=simple_model,
    interactions=interactions,
    dataset=dataset,
    users=USERS,
    item_data=items,
)

In [27]:
# Keep only the columns that are useful when reading the tables by eye.
user_viewed_all = user_viewed_all.loc[
    :, ["user_id", "item_id", "datetime", "title", "genres", "count_of_views"]
]
user_recos_all = user_recos_all.loc[
    :, ["user_id", "item_id", "score", "rank", "title", "genres", "count_of_views"]
]

In [28]:
user_viewed_all.head()

Unnamed: 0,user_id,item_id,datetime,title,genres,count_of_views
0,666262,7957,2021-05-12,Последний викинг,"боевики, историческое, приключения",746
1,666262,4785,2021-05-12,Робин Гуд: Начало,"боевики, триллеры, приключения",485
2,666262,12981,2021-05-14,Томирис,"боевики, драмы, историческое, военные",10370
0,672861,6870,2021-04-27,Красавица и чудовище,"драмы, фэнтези, музыкальные",1083
1,672861,8662,2021-05-04,Он – дракон,фэнтези,643


In [29]:
user_recos_all.head()

Unnamed: 0,user_id,item_id,score,rank,title,genres,count_of_views
0,666262,10440,202457.0,1,Хрустальный,"триллеры, детективы",202457
1,666262,15297,193123.0,2,Клиника счастья,"драмы, мелодрамы",193123
2,666262,9728,132865.0,3,Гнев человеческий,"боевики, триллеры",132865
3,666262,13865,122119.0,4,Девятаев,"драмы, военные, приключения",122119
4,666262,4151,91167.0,5,Секреты семейной жизни,комедии,91167
