<a href="https://colab.research.google.com/github/doronin99/RecoServiceTemplate/blob/task4/ALS_with_ANN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installing dependencies

In [None]:
!pip -q install dill
!pip -q install implicit
!pip -q install lightfm
!pip -q install nmslib
!pip -q install rectools

In [None]:
import os

In [None]:
# Restrict OpenBLAS to a single thread; this cell runs before the library
# imports below so the setting is already in place when they are loaded.
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

In [None]:
import warnings
# Silence every warning for the rest of the session to keep cell outputs short.
# Note that this also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
import dill

import pandas as pd
import numpy as np
from pathlib import Path
import typing as tp
from tqdm import tqdm

from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.lmf import LogisticMatrixFactorization

from lightfm import LightFM

from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import MAP, MeanInvUserFreq, Precision, Recall, Serendipity, calc_metrics
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel, LightFMWrapperModel
from rectools.tools import UserToItemAnnRecommender

## Data loading

In [None]:
# Download the KION dataset archive (the URL is pinned to a specific commit),
# unpack it into ./data_original (overwriting existing files), then delete the archive.
!wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O data_original.zip
!unzip -o data_original.zip
!rm data_original.zip

Archive:  data_original.zip
  inflating: data_original/interactions.csv  
  inflating: __MACOSX/data_original/._interactions.csv  
  inflating: data_original/users.csv  
  inflating: __MACOSX/data_original/._users.csv  
  inflating: data_original/items.csv  
  inflating: __MACOSX/data_original/._items.csv  


In [None]:
# Read the three raw tables of the dataset in one pass.
interactions, users, items = map(
    pd.read_csv,
    (
        'data_original/interactions.csv',
        'data_original/users.csv',
        'data_original/items.csv',
    ),
)

In [None]:
# Map the raw column names onto the names rectools expects and parse the watch
# date into a pandas datetime so it can be compared with timestamps later on.
# The column is always addressed through Columns.Datetime (never a hard-coded
# string) to stay consistent with the rename and with the cells below.
interactions_df = (
    interactions
    .rename(columns={'total_dur': Columns.Weight,
                     'last_watch_dt': Columns.Datetime})
    .assign(**{Columns.Datetime: lambda frame: pd.to_datetime(frame[Columns.Datetime])})
)

# Preview of the prepared interactions (pandas truncates the display)
interactions_df

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
...,...,...,...,...,...
5476246,648596,12225,2021-08-13,76,0.0
5476247,546862,9673,2021-04-13,2308,49.0
5476248,697262,15297,2021-08-20,18307,63.0
5476249,384202,16197,2021-04-19,6203,100.0


## Train/Test splitting

In [None]:
# Time span covered by the interactions; max_date anchors the train/test split below
watch_dates = interactions_df['datetime']
min_date, max_date = watch_dates.min(), watch_dates.max()

# Report both ends of the observed period
print(f"min date in interactions: {min_date}")
print(f"max date in interactions: {max_date}")

min date in interactions: 2021-03-13 00:00:00
max date in interactions: 2021-08-22 00:00:00


In [None]:
# Time-based split: everything strictly before the boundary is train, the rest is test.
# The boundary day itself belongs to test, so test covers max_date and the 7 days before it.
split_date = max_date - pd.Timedelta(days=7)
event_dates = interactions_df[Columns.Datetime]

train = interactions_df.loc[event_dates < split_date].copy()
test = interactions_df.loc[event_dates >= split_date].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 5)
test: (490982, 5)


In [None]:
# Users that appear in test but were never seen in train ("cold" users)
cold_users = set(test[Columns.User]).difference(train[Columns.User])

# Keep only test interactions of users known from train
test = test[~test[Columns.User].isin(cold_users)].copy()

## Models

In [None]:
# Experiment-wide settings shared by the cells below.
K_RECOS = 10  # number of items recommended per user
RANDOM_STATE = 42  # seed passed to every model that accepts one
NUM_THREADS = 16  # thread count passed to the implicit and LightFM models
N_FACTORS = (4, 10, 22)  # latent dimension sizes tried for each factorization model

In [None]:
# rectools dataset built from the training interactions only
dataset = Dataset.construct(interactions_df=train)

# Registry of models to evaluate, keyed by display name; starts with the baselines
models = {}
models["random"] = RandomModel(random_state=RANDOM_STATE)  # uniformly random items
models["popular"] = PopularModel()  # default popularity ranking
models["most_raited"] = PopularModel(popularity="sum_weight")  # ranking by summed interaction weight

In [None]:
# Matrix-factorization algorithms from `implicit`, keyed by the prefix used in model names
implicit_models = {
    'ALS': AlternatingLeastSquares,
    'BPR': BayesianPersonalizedRanking,
    'LMF': LogisticMatrixFactorization,
}

# Register one wrapped model per (algorithm, latent size) pair, named "<ALGO>_<factors>".
# NOTE(review): BPR and LMF are wrapped in ImplicitALSWrapperModel as well —
# confirm the wrapper is meant to be used with non-ALS implicit models.
models.update({
    f"{algo_name}_{factors}": ImplicitALSWrapperModel(
        model=algo_cls(
            factors=factors,
            random_state=RANDOM_STATE,
            num_threads=NUM_THREADS,
        )
    )
    for algo_name, algo_cls in implicit_models.items()
    for factors in N_FACTORS
})

In [None]:
# LightFM loss functions to compare
lightfm_losses = ('logistic', 'bpr', 'warp')

# Register one wrapped LightFM model per (loss, latent size) pair, named
# "LightFM_<loss>_<factors>"; every model is trained for the same 10 epochs.
models.update({
    f"LightFM_{loss_name}_{factors}": LightFMWrapperModel(
        LightFM(
            no_components=factors,
            loss=loss_name,
            random_state=RANDOM_STATE,
        ),
        epochs=10,
        num_threads=NUM_THREADS,
    )
    for loss_name in lightfm_losses
    for factors in N_FACTORS
})

In [None]:
# Metric classes to evaluate, keyed by the prefix used in the result table
metrics_name = {
    'Precision': Precision,             # Precision metric
    'Recall': Recall,                   # Recall metric
    'MAP': MAP,                         # Mean Average Precision metric
    'novelty': MeanInvUserFreq,         # Novelty metric based on mean inverse user frequency
    'serendipity': Serendipity,         # Serendipity metric
}

# Catalog of items that can be recommended: every item id present in train.
# calc_metrics expects *item* ids here (in the same id space as the
# recommendations), not user ids.
catalog = train[Columns.Item].unique()

# One metric instance per (metric, k) pair for k = 1..K_RECOS, named "<metric>@<k>"
metrics = {
    f'{metric_name}@{k}': metric(k=k)
    for metric_name, metric in metrics_name.items()
    for k in range(1, K_RECOS + 1)
}

In [None]:
# Users to score: everyone who still has interactions in the test period
test_users = test[Columns.User].unique()

# One row of metric values per model
results = []

# Fit and evaluate every registered model; tqdm shows progress because the loop is slow
for model_name, model in tqdm(models.items()):
    # Train on the training dataset
    model.fit(dataset)

    # Top-K recommendations for the test users, excluding items they already interacted with
    recos = model.recommend(
        users=test_users,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )

    # Score the recommendations against test, with train as the interaction history
    metric_values = calc_metrics(metrics, recos, test, train, catalog=catalog)

    # Store the model name together with its metric values
    results.append({'model': model_name, **metric_values})

100%|██████████| 21/21 [40:35<00:00, 115.99s/it]


In [None]:
# Metrics as rows, models as columns. The model name is moved into the index
# *before* transposing so the table keeps a numeric dtype instead of turning
# into object columns that mix strings and floats.
df_quality = pd.DataFrame(results).set_index('model').T

# Highlight the best model for each metric (the maximum within each row)
df_quality.style.highlight_max(color='lightgreen', axis=1)

model,random,popular,most_raited,ALS_4,ALS_10,ALS_22,BPR_4,BPR_10,BPR_22,LMF_4,LMF_10,LMF_22,LightFM_logistic_4,LightFM_logistic_10,LightFM_logistic_22,LightFM_bpr_4,LightFM_bpr_10,LightFM_bpr_22,LightFM_warp_4,LightFM_warp_10,LightFM_warp_22
Precision@1,0.000149,0.073308,0.072163,0.008049,0.017209,0.018221,0.039446,0.039911,0.041919,8e-06,5.8e-05,3.3e-05,0.0,0.0,3.3e-05,0.0,8e-06,0.0,0.08475,0.085754,0.085115
Recall@1,3.3e-05,0.038149,0.038087,0.003325,0.008714,0.008985,0.021454,0.022024,0.022167,2e-06,2.6e-05,1.3e-05,0.0,0.0,1.4e-05,0.0,4e-06,0.0,0.043638,0.043826,0.042947
Precision@2,0.000145,0.069263,0.063413,0.017827,0.016566,0.016056,0.0317,0.032709,0.035277,4.6e-05,8.3e-05,4.1e-05,4e-06,0.0,2.1e-05,0.0,4e-06,4e-06,0.071964,0.07198,0.072731
Recall@2,8e-05,0.071011,0.066715,0.019287,0.016824,0.015526,0.033883,0.034819,0.03665,2.8e-05,5.4e-05,2.3e-05,2e-06,0.0,1.4e-05,0.0,4e-06,0.0,0.072387,0.071393,0.071983
Precision@3,0.00016,0.066225,0.049303,0.020273,0.015237,0.014642,0.02619,0.028424,0.030867,4.4e-05,7.7e-05,5e-05,3e-06,0.0,2.5e-05,0.0,3e-06,3e-06,0.064972,0.064338,0.064233
Recall@3,0.000134,0.1004,0.07634,0.03285,0.023101,0.021336,0.041223,0.044171,0.046899,3.8e-05,7.6e-05,4.8e-05,2e-06,0.0,2.1e-05,0.0,4e-06,0.0,0.096289,0.094608,0.094151
Precision@4,0.000166,0.059383,0.039987,0.019026,0.014377,0.013421,0.022443,0.025504,0.027701,5e-05,8.9e-05,5.6e-05,4e-06,0.0,3.5e-05,0.0,2e-06,2e-06,0.059049,0.058155,0.057783
Recall@4,0.000177,0.118878,0.081875,0.040092,0.029109,0.025652,0.046344,0.051987,0.055089,6.4e-05,9.9e-05,7.6e-05,1e-05,0.0,3.5e-05,0.0,4e-06,0.0,0.115817,0.11278,0.111484
Precision@5,0.000181,0.052735,0.036461,0.017856,0.013737,0.01279,0.020017,0.023313,0.025496,5.5e-05,9e-05,6.3e-05,5e-06,0.0,4e-05,0.0,5e-06,2e-06,0.053097,0.052583,0.052448
Recall@5,0.000272,0.130473,0.092314,0.046598,0.034722,0.030101,0.05102,0.05879,0.062436,9.9e-05,0.000127,9.7e-05,1.8e-05,0.0,4.5e-05,0.0,6e-06,0.0,0.128674,0.125963,0.125163


## Model saving

In [None]:
# Rebuild the dataset from ALL interactions (train + test) for the final model.
# NOTE: this rebinds `dataset`; re-running the evaluation cells after this one
# would train on test data.
dataset = Dataset.construct(interactions_df=interactions_df)

# LightFM with WARP loss and 10 components, trained with the same epochs/threads as above
final_lightfm = LightFM(
    no_components=10,
    loss='warp',
    random_state=RANDOM_STATE,
)
output_model = LightFMWrapperModel(final_lightfm, epochs=10, num_threads=NUM_THREADS)

# Train the final model on the full dataset
output_model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x78a25ab5e020>

In [None]:
# Latent vectors of the fitted model for the users and items of the dataset
user_vectors, item_vectors = output_model.get_vectors(dataset)

# ANN recommender over those vectors; the id maps come from the same dataset
ann_model = UserToItemAnnRecommender(
    user_vectors=user_vectors, item_vectors=item_vectors,
    user_id_map=dataset.user_id_map, item_id_map=dataset.item_id_map,
)

# Fit the recommender so it can answer queries
ann_model.fit()

<rectools.tools.ann.UserToItemAnnRecommender at 0x78a25ab5f760>

In [None]:
# Serialize the fitted ANN recommender to disk with dill
with Path('LightFM_warp_10.dill').open('wb') as model_file:
    dill.dump(ann_model, model_file)

In [None]:
# Load the model back from the saved file to check that it survives a round trip.
# NOTE: dill.load can execute arbitrary code — only load files you created yourself.
with open('LightFM_warp_10.dill', 'rb') as f:
    model = dill.load(f)

# Example: the top K_RECOS recommended items for a single user.
# NOTE(review): the recommender was built with user_id_map/item_id_map, so user_id=11
# is presumably an external (dataset) user id rather than an internal index — confirm.
list(model.get_item_list_for_user(user_id=11, top_n=K_RECOS))

[15297, 2720, 4151, 10440, 12192, 6192, 6809, 142, 9728, 2657]