In [1]:
import copy
import time
from collections import OrderedDict

import requests
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from scipy.stats import mode
from pprint import pprint
import warnings

from rectools import Columns
from rectools.dataset import Interactions, Dataset
from rectools.models.popular import PopularModel
from rectools.models.popular_in_category import PopularInCategoryModel
from rectools.metrics import MAP, MeanInvUserFreq, calc_metrics
from rectools.model_selection import TimeRangeSplitter

warnings.filterwarnings("ignore")
pd.set_option('display.float_format', lambda x: f'{x:,.6f}')

In [2]:
# url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'
# req = requests.get(url, stream=True)

# with open('kion_train.zip', "wb") as fd:
#     total_size_in_bytes = int(req.headers.get('Content-Length', 0))
#     progress_bar = tqdm(desc='Downloading the kion dataset...', 
#                         total=total_size_in_bytes, 
#                         unit='iB', unit_scale=True)
#     for chunk in req.iter_content(chunk_size=2 ** 20):
#         progress_bar.update(len(chunk))
#         fd.write(chunk)

In [3]:
# !unzip kion_train.zip -x '__MACOSX/*'

In [4]:
interactions_df = pd.read_csv('data_original/interactions.csv')
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

interactions_df.rename(columns={'last_watch_dt': Columns.Datetime,
                                'total_dur': Columns.Weight}, inplace=True)
print(interactions_df.shape)
# will cast types and save new pd.DataFrame inside in Interactions.df
interactions = Interactions(interactions_df)

# ! если хотите быстро прогнать этот ноутбук - раскомментируйте эту строку - она уменьшает данные
# interactions = Interactions(interactions_df.sample(frac=0.01))

interactions.df.head()

(5476251, 5)


Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0


In [5]:
N_SPLITS = 3
TEST_SIZE = '7D'

In [6]:
cv = TimeRangeSplitter(
    test_size=TEST_SIZE,
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [11]:
metrics = {
    'map@10': MAP(k=10),
    'novelty@10': MeanInvUserFreq(k=10),
}

# few simple models to compare
models = {
    'popular': PopularModel(),  # implicit
    'popular_genre_5': PopularInCategoryModel(category_feature='genre', n_categories=5),
    'popular_genre_3': PopularInCategoryModel(category_feature='genre', n_categories=3),
}

In [14]:
def group_metrics(metrics):
    new_metrics = {}
    for key, value in metrics.items():
        key = tuple(key.split("@"))
        new_metrics[(key[0], "@" + key[1])] = value

    return OrderedDict(sorted(new_metrics.items()))


def compute_metrics(models, metrics, splitter, top_k):
    items["genre"] = items["genres"].str.split(",")
    results = []
    fold_iterator = splitter.split(interactions, collect_fold_stats=True)
    for train_ids, test_ids, fold_info in tqdm((fold_iterator), total=splitter.n_splits):
        print(f"\n==================== Fold {fold_info['i_split']}")
        pprint(fold_info)

        df_train = interactions.df.iloc[train_ids]
        genre_feature = items[["item_id", "genre"]].explode("genre")
        genre_feature.columns = ["id", "value"]
        genre_feature["feature"] = "genre"
        genre_feature = genre_feature[genre_feature['id'].isin(df_train['item_id'])]
        dataset = Dataset.construct(
            interactions_df=df_train,
            user_features_df=None,
            item_features_df=genre_feature,
            cat_item_features=['genre']
        )

        df_test = interactions.df.iloc[test_ids][Columns.UserItem]
        test_users = np.unique(df_test[Columns.User])

        catalog = df_train[Columns.Item].unique()

        for model_name, model in models.items():
            current_model = copy.deepcopy(model)
            time_start = time.time()
            current_model.fit(dataset)
            training_time = time.time() - time_start
            recos = current_model.recommend(
                users=test_users,
                dataset=dataset,
                k=top_k,
                filter_viewed=True,
            )
            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )
            metric_values = group_metrics(metric_values)
            res = {"fold": fold_info["i_split"], "model": model_name, ("training", "time"): training_time}
            res.update(metric_values)
            results.append(res)
    pivot_results = pd.DataFrame(results).drop(columns="fold").groupby(["model"]).agg("mean")
    pivot_results.columns = pd.MultiIndex.from_tuples(pivot_results.columns, names=['', ''])
    mean_metric_subset = [(metric, agg)
                          for metric, agg in pivot_results.columns if metric != "training"]
    pivot_results = pivot_results.style.highlight_min(subset=mean_metric_subset, color="red", axis=0).highlight_max(
        subset=mean_metric_subset, color="green", axis=0)

    return pivot_results

In [15]:
pivot_results = compute_metrics(models, metrics, cv, 10)
pivot_results

  0%|          | 0/3 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'i_split': 0,
 'start': Timestamp('2021-08-02 00:00:00', freq='7D'),
 'test': 263681,
 'test_items': 6602,
 'test_users': 98184,
 'train': 4266013,
 'train_items': 15237,
 'train_users': 797423}

{'end': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'i_split': 1,
 'start': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'test': 279422,
 'test_items': 6698,
 'test_users': 103511,
 'train': 4649162,
 'train_items': 15415,
 'train_users': 850489}

{'end': Timestamp('2021-08-23 00:00:00', freq='7D'),
 'i_split': 2,
 'start': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'test': 298878,
 'test_items': 6679,
 'test_users': 110076,
 'train': 5051815,
 'train_items': 15577,
 'train_users': 906071}


Unnamed: 0_level_0,training,map,novelty
Unnamed: 0_level_1,time,@10,@10
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
popular,0.81076,0.084109,3.71339
popular_genre_3,5.691122,0.06642,4.440021
popular_genre_5,5.847628,0.065656,4.576782


# Train full popular model

In [16]:
interactions_df = pd.read_csv('data_original/interactions.csv')
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

interactions_df.rename(columns={'last_watch_dt': Columns.Datetime,
                                'total_dur': Columns.Weight}, inplace=True)
print(interactions_df.shape)
# will cast types and save new pd.DataFrame inside in Interactions.df
interactions = Interactions(interactions_df)

# ! если хотите быстро прогнать этот ноутбук - раскомментируйте эту строку - она уменьшает данные
# interactions = Interactions(interactions_df.sample(frac=0.01))

interactions.df.head()

(5476251, 5)


Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0


In [17]:
model = PopularModel()
df_train = interactions.df
dataset = Dataset.construct(
    interactions_df=df_train,
    user_features_df=None,
    item_features_df=None,
)
catalog = df_train[Columns.Item].unique()
model.fit(dataset)

<rectools.models.popular.PopularModel at 0x7f27b3f0bd90>

In [25]:
test_users = np.unique(df_train[Columns.User])
recos = model.recommend(
    users=test_users,
    dataset=dataset,
    k=10,
    filter_viewed=False,
)

In [26]:
recos

Unnamed: 0,user_id,item_id,score,rank
0,0,10440,202457.000000,1
1,0,15297,193123.000000,2
2,0,9728,132865.000000,3
3,0,13865,122119.000000,4
4,0,4151,91167.000000,5
...,...,...,...,...
9621785,1097557,3734,74803.000000,6
9621786,1097557,2657,68581.000000,7
9621787,1097557,4880,55043.000000,8
9621788,1097557,142,45367.000000,9


In [27]:
recos.groupby("user_id")["item_id"].agg(list).to_dict()

{0: [10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809],
 1: [10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809],
 2: [10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809],
 3: [10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809],
 4: [10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809],
 5: [10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809],
 7: [10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809],
 8: [10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809],
 9: [10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809],
 10: [10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809],
 11: [10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809],
 12: [10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809],
 13: [10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809],
 14: [10440, 15297, 9728, 13865, 4151, 3734, 2657, 4880, 142, 6809],
 15: [10440, 15297, 9728, 13865, 4151, 3734

In [24]:
import json

with open("popular_model.json", "w") as f:
    json.dump(recos.groupby("user_id")["item_id"].agg(list).to_dict(), f)