<a href="https://colab.research.google.com/github/doronin99/RecoServiceTemplate/blob/task2/notebooks/metrics_and_visual_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing necessary libraries

In [None]:
pip install -q implicit

In [None]:
pip install -q rectools

In [None]:
import time

from pprint import pprint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.auto import tqdm

from implicit.nearest_neighbours import TFIDFRecommender, BM25Recommender
from implicit.als import AlternatingLeastSquares

from rectools import Columns
from rectools.dataset import Interactions, Dataset
from rectools.models import ImplicitItemKNNWrapperModel, RandomModel, PopularModel
from rectools.metrics import MAP, MRR, Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics
from rectools.model_selection import TimeRangeSplitter

## Data loading

In [None]:
# Location of the KION dataset archive (pinned to a specific commit)
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'

In [None]:
%%time
# Download the dataset archive, unpack it (overwriting any previous copy),
# then delete the archive so only the extracted files remain.
!wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O data_original.zip
!unzip -o data_original.zip
!rm data_original.zip

Archive:  data_original.zip
  inflating: data_original/interactions.csv  
  inflating: __MACOSX/data_original/._interactions.csv  
  inflating: data_original/users.csv  
  inflating: __MACOSX/data_original/._users.csv  
  inflating: data_original/items.csv  
  inflating: __MACOSX/data_original/._items.csv  
CPU times: user 66.3 ms, sys: 9.15 ms, total: 75.4 ms
Wall time: 6.06 s


In [None]:
%%time
interactions = pd.read_csv(
    "data_original/interactions.csv",
    sep=",",
)
print(interactions.shape)
interactions.head()

(5476251, 5)
CPU times: user 2.48 s, sys: 543 ms, total: 3.02 s
Wall time: 3.02 s


Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [None]:
%%time
items = pd.read_csv(
    "data_original/items.csv",
    sep=",",
)
print(items.shape)
items.head()

(15963, 14)
CPU times: user 673 ms, sys: 29.9 ms, total: 703 ms
Wall time: 707 ms


Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."
3,7868,film,45 лет,45 Years,2015.0,"драмы, зарубежные, мелодрамы",Великобритания,,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю..."
4,16268,film,Все решает мгновение,,1978.0,"драмы, спорт, советские, мелодрамы",СССР,,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж..."


In [None]:
%%time
users = pd.read_csv(
    "data_original/users.csv",
    sep=",",
)
print(users.shape)
users.head()

(840197, 5)
CPU times: user 479 ms, sys: 19.9 ms, total: 499 ms
Wall time: 505 ms


Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0


## RecommenderEvaluator class

In [None]:
class RecommenderEvaluator:
    """Cross-validate a set of recommendation models and tabulate their metrics.

    The constructor arguments are stored as-is. Two result stores are filled
    by ``train_and_evaluate``:

    * ``results`` - list of dicts, one per (fold, model) pair, holding the
      fold index, model name, elapsed time and every computed metric.
    * ``trained_models`` - ``{(model_name, fold_index): fitted model snapshot}``.
    """

    def __init__(self, models, metrics, cv, K_RECOS):
        """Store the evaluation configuration.

        Args:
            models: mapping ``name -> model``; each model must provide
                ``fit(dataset)`` and ``recommend(...)``.
            metrics: mapping ``name -> metric`` passed to ``calc_metrics``.
            cv: splitter providing ``split(interactions, collect_fold_stats=True)``.
            K_RECOS: number of recommendations requested per user.
        """
        self.models = models
        self.metrics = metrics
        self.cv = cv
        self.K_RECOS = K_RECOS
        self.results = []  # One record per (fold, model)
        self.trained_models = {}  # (model_name, fold index) -> fitted model snapshot

    def train_and_evaluate(self, interactions):
        """Fit and score every model on every cross-validation fold.

        Results of a previous call are discarded first, so the returned table
        always describes exactly one run.

        Args:
            interactions: container whose ``.df`` attribute is the interactions
                DataFrame; it is also what ``self.cv.split`` receives.

        Returns:
            A pandas ``Styler`` over the per-model mean/std of each metric and
            of the elapsed time. Within every ``mean`` column the smallest
            value is highlighted in light coral and the largest in light green.
        """
        import copy  # local import: only needed for the per-fold model snapshots

        # Start from a clean slate so repeated calls do not mix runs in the aggregates
        self.results = []
        self.trained_models = {}

        fold_iterator = self.cv.split(interactions, collect_fold_stats=True)

        # Take the progress-bar length from the splitter itself instead of a
        # notebook global; tqdm accepts None when the attribute is missing.
        n_folds = getattr(self.cv, "n_splits", None)

        for train_ids, test_ids, fold_info in tqdm(fold_iterator, total=n_folds):
            fold_idx = fold_info["i_split"]
            print(f"\n==================== Fold {fold_idx}")
            pprint(fold_info)

            # Training part of the fold and the dataset built from it
            df_train = interactions.df.iloc[train_ids]
            dataset = Dataset.construct(interactions_df=df_train)

            # Test part of the fold: only the user/item pairs are needed for scoring
            df_test = interactions.df.iloc[test_ids][Columns.UserItem]
            test_users = np.unique(df_test[Columns.User])

            # Catalog is the set of items that may be recommended (items seen in train)
            catalog = df_train[Columns.Item].unique()

            for model_name, model in self.models.items():
                # Time fitting and recommending together
                start_time = time.perf_counter()

                model.fit(dataset)
                recos = model.recommend(
                    users=test_users,
                    dataset=dataset,
                    k=self.K_RECOS,
                    filter_viewed=True,
                )

                elapsed_time = time.perf_counter() - start_time

                # Keep an independent snapshot: the same model object is re-fitted
                # on the next fold, which would otherwise make every fold's entry
                # point at the last fit.
                self.trained_models[(model_name, fold_idx)] = copy.deepcopy(model)

                metric_values = calc_metrics(
                    self.metrics,
                    reco=recos,
                    interactions=df_test,
                    prev_interactions=df_train,
                    catalog=catalog,
                )

                res = {"fold": fold_idx, "model": model_name, "elapsed_time": elapsed_time}
                res.update(metric_values)
                self.results.append(res)

        # Aggregate over folds: mean and std of every numeric column per model
        pivot_results = (
            pd.DataFrame(self.results)
            .drop(columns="fold")
            .groupby(["model"], sort=False)
            .agg(["mean", "std"])
        )
        mean_metric_subset = [(metric, agg) for metric, agg in pivot_results.columns if agg == 'mean']

        return (
            pivot_results.style
            .highlight_min(subset=mean_metric_subset, color='lightcoral', axis=0)
            .highlight_max(subset=mean_metric_subset, color='lightgreen', axis=0)
        )

## RecommenderEvaluator evaluation

In [None]:
# Keep only the id columns and derive the two extra columns in one step:
# weight is the watched duration scaled by the watched percentage,
# datetime is the date of the last watch.
interactions_upd = interactions[['user_id', 'item_id']].assign(
    weight=interactions['total_dur'] * interactions['watched_pct'] / 100,
    datetime=interactions['last_watch_dt'],
)

# Wrap the resulting table into a RecTools Interactions object
interactions_upd = Interactions(interactions_upd)

In [None]:
# Set the number of splits for cross-validation and the number of recommendations to generate (K)
n_splits = 3
k_recos = 10

# Initialize a TimeRangeSplitter for cross-validation
cv = TimeRangeSplitter(
    test_size="7D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

# Define a dictionary of models for comparison
models = {
    "random": RandomModel(random_state=32),  # Random recommendation model
    "popular": PopularModel(),  # Popular items recommendation model
}

# Define a dictionary of metrics to evaluate the models.
# Every metric uses k_recos so that the "@K" in its label always matches the K it is computed at.
metrics = {
    f"MAP@{k_recos}": MAP(k=k_recos),  # Mean Average Precision at K
    f"MRR@{k_recos}": MRR(k=k_recos),  # Mean Reciprocal Rank at K
    f"precision@{k_recos}": Precision(k=k_recos),  # Precision at K
    f"recall@{k_recos}": Recall(k=k_recos),  # Recall at K
    f"novelty@{k_recos}": MeanInvUserFreq(k=k_recos),  # Novelty at K
    f"serendipity@{k_recos}": Serendipity(k=k_recos),  # Serendipity at K
}

In [None]:
# Initialize the RecommenderEvaluator with models, metrics, cross-validation, and K.
# K comes from k_recos so the recommendation length matches the "@K" metrics.
evaluator = RecommenderEvaluator(models, metrics, cv, k_recos)

# Train and evaluate the models using the provided interactions data
results_table = evaluator.train_and_evaluate(interactions_upd)

# Display the results table with highlighted metrics
results_table

  0%|          | 0/3 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'i_split': 0,
 'start': Timestamp('2021-08-02 00:00:00', freq='7D'),
 'test': 263681,
 'test_items': 6602,
 'test_users': 98184,
 'train': 4266013,
 'train_items': 15237,
 'train_users': 797423}

{'end': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'i_split': 1,
 'start': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'test': 279422,
 'test_items': 6698,
 'test_users': 103511,
 'train': 4649162,
 'train_items': 15415,
 'train_users': 850489}

{'end': Timestamp('2021-08-23 00:00:00', freq='7D'),
 'i_split': 2,
 'start': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'test': 298878,
 'test_items': 6679,
 'test_users': 110076,
 'train': 5051815,
 'train_items': 15577,
 'train_users': 906071}


Unnamed: 0_level_0,elapsed_time,elapsed_time,precision@10,precision@10,recall@10,recall@10,MRR@10,MRR@10,MAP@10,MAP@10,novelty@10,novelty@10,serendipity@10,serendipity@10
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
random,15.643648,0.439835,0.000193,1.9e-05,0.000693,7.6e-05,0.000604,6.1e-05,0.000211,3.2e-05,15.613009,0.019786,7e-06,0.0
popular,11.522067,0.428386,0.033903,0.001443,0.173492,0.007987,0.138603,0.006728,0.084109,0.004921,3.71339,0.002076,2e-06,0.0


## VisualAnalyzer class

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

class VisualAnalyzer:
    """Show selected users' viewing history next to a model's recommendations."""

    def __init__(self, model, dataset, selected_user_ids, item_data, k_recos=10):
        """Store everything needed to build the comparison table.

        Args:
            model: fitted model providing ``recommend(users, dataset, k, filter_viewed)``.
            dataset: dataset the model was fitted on; must expose
                ``interactions.df``, ``user_id_map`` and ``item_id_map``.
            selected_user_ids: external ids of the users to inspect.
            item_data: DataFrame with an ``item_id`` column plus descriptive
                item columns to attach to every row.
            k_recos: number of recommendations per user (defaults to 10).
        """
        self.model = model
        self.dataset = dataset
        self.selected_user_ids = selected_user_ids
        self.item_data = item_data
        self.k_recos = k_recos

    def generate_recommendations(self):
        """Return the recommendations for the selected users, enriched with item data."""
        user_recommendations = self.model.recommend(
            users=self.selected_user_ids,
            dataset=self.dataset,
            k=self.k_recos,
            filter_viewed=True,
        )

        # Left join keeps every recommended item even if it has no item metadata
        return pd.merge(user_recommendations, self.item_data, on='item_id', how='left')

    def visualize_history_and_recommendations(self):
        """Return one table with each user's history followed by their recommendations.

        Rows are grouped per user in the order of ``selected_user_ids``;
        history rows come first, recommendation rows second. An empty
        DataFrame is returned when no users are selected.
        """
        users_recommendations = self.generate_recommendations()

        # NOTE: per the RecTools documentation, a Dataset stores interactions
        # with *internal* user/item ids. Look users up by internal id and map
        # both id columns back to external ids, so the history really belongs
        # to the requested user and joins correctly with item_data.
        interactions = self.dataset.interactions.df
        user_id_map = self.dataset.user_id_map
        item_id_map = self.dataset.item_id_map
        internal_user_ids = user_id_map.convert_to_internal(self.selected_user_ids)

        frames = []
        for user_id, internal_user_id in zip(self.selected_user_ids, internal_user_ids):
            # The user's viewing history, translated back to external ids
            user_history = interactions[interactions['user_id'] == internal_user_id].copy()
            user_history['user_id'] = user_id
            user_history['item_id'] = item_id_map.convert_to_external(user_history['item_id'].values)
            user_history = pd.merge(user_history, self.item_data, on='item_id', how='left')

            # The recommendations generated for the same user
            user_recommendations = users_recommendations[users_recommendations['user_id'] == user_id]

            frames.extend([user_history, user_recommendations])

        if not frames:
            return pd.DataFrame()

        # Concatenate once instead of growing a DataFrame inside the loop
        return pd.concat(frames, ignore_index=True)

## VisualAnalyzer evaluation

In [None]:
# Build the dataset from the full interactions table
dataset = Dataset.construct(interactions_df=interactions_upd.df)

# Fit a random baseline with a fixed seed
random_model = RandomModel(random_state=32)
random_model.fit(dataset)

# Fit a popularity baseline
popular_model = PopularModel()
popular_model.fit(dataset)

# Users whose history and recommendations will be inspected
selected_user_ids = [666262, 672861, 955527]

# Per-item interaction counts taken from the raw interactions table
views_counted = (
    interactions
    .groupby('item_id')['user_id']
    .count()
    .rename('views_counted')
    .reset_index()
)

# Item id, title and genres, extended with the view counts (items without views get NaN)
item_data = items.loc[:, ['item_id', 'title', 'genres']].merge(
    views_counted,
    on='item_id',
    how='left',
)



In [None]:
# Build the analyzer around the popularity model
popular_visualizer = VisualAnalyzer(
    model=popular_model,
    dataset=dataset,
    selected_user_ids=selected_user_ids,
    item_data=item_data,
)

# Show the selected users' history together with the PopularModel recommendations
popular_visualizer.visualize_history_and_recommendations()

Unnamed: 0,user_id,item_id,weight,datetime,title,genres,views_counted,score,rank
0,666262,93,803.55,2021-07-21,Дом ночных призраков,"зарубежные, криминал, детективы, ужасы",1.0,,
1,666262,10440,,NaT,Хрустальный,"триллеры, детективы",202457.0,202457.0,1.0
2,666262,15297,,NaT,Клиника счастья,"драмы, мелодрамы",193123.0,193123.0,2.0
3,666262,9728,,NaT,Гнев человеческий,"боевики, триллеры",132865.0,132865.0,3.0
4,666262,13865,,NaT,Девятаев,"драмы, военные, приключения",122119.0,122119.0,4.0
5,666262,4151,,NaT,Секреты семейной жизни,комедии,91167.0,91167.0,5.0
6,666262,3734,,NaT,Прабабушка легкого поведения,комедии,74803.0,74803.0,6.0
7,666262,2657,,NaT,Подслушано,"драмы, триллеры",68581.0,68581.0,7.0
8,666262,4880,,NaT,Афера,комедии,55043.0,55043.0,8.0
9,666262,142,,NaT,Маша,"драмы, триллеры",45367.0,45367.0,9.0


In [None]:
# Build the analyzer around the random model, reusing the same dataset, users and item data
random_visualizer = VisualAnalyzer(
    model=random_model,
    dataset=dataset,
    selected_user_ids=selected_user_ids,
    item_data=item_data,
)

# Show the selected users' history together with the RandomModel recommendations
random_visualizer.visualize_history_and_recommendations()

Unnamed: 0,user_id,item_id,weight,datetime,title,genres,views_counted,score,rank
0,666262,93,803.55,2021-07-21,Дом ночных призраков,"зарубежные, криминал, детективы, ужасы",1.0,,
1,666262,10101,,NaT,Возвращение Будулая,мелодрамы,99.0,10.0,1.0
2,666262,619,,NaT,Новые приключения Аладдина (жестовым языком),"зарубежные, комедии",1.0,9.0,2.0
3,666262,12618,,NaT,Пропавшая грамота,"фэнтези, комедии",51.0,8.0,3.0
4,666262,5967,,NaT,Братья вне игры,"драмы, спорт",262.0,7.0,4.0
5,666262,4041,,NaT,Фрилансеры,"криминал, детективы, драмы, зарубежные, боевики",19.0,6.0,5.0
6,666262,5701,,NaT,Алые паруса: Новая история,"комедии, мелодрамы",4.0,5.0,6.0
7,666262,9738,,NaT,Женщина в беде 3,"детективы, мелодрамы",2.0,4.0,7.0
8,666262,15247,,NaT,Гордость и предубеждение,"драмы, мелодрамы",150.0,3.0,8.0
9,666262,10004,,NaT,Болванчики,"мультфильм, приключения, комедии",51.0,2.0,9.0
