In [2]:
%pwd


'/content'

In [72]:
!pip install  rectools implicit requests tqdm optuna lightfm rectools[lightfm] annoy

# Importing libraries

In [46]:
from pprint import pprint

import numpy as np
import pandas as pd

import requests
from tqdm.auto import tqdm
from rectools import Columns
from rectools.dataset import Dataset,Interactions
from rectools.models import ImplicitItemKNNWrapperModel

import optuna

from lightfm.cross_validation import random_train_test_split
from implicit.als import AlternatingLeastSquares

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel

import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.pyplot as plt
from pathlib import Path
import typing as tp
from tqdm import tqdm

from lightfm.cross_validation import random_train_test_split
from implicit.als import AlternatingLeastSquares
from lightfm import LightFM

from implicit.bpr import BayesianPersonalizedRanking

from implicit.lmf import LogisticMatrixFactorization

from annoy import AnnoyIndex

# Getting the data

In [5]:
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'

In [None]:
req = requests.get(url, stream=True)

with open('kion.zip', 'wb') as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [None]:
import zipfile as zf

files = zf.ZipFile('kion.zip','r')
files.extractall()
files.close()

In [6]:
interactions = pd.read_csv('data_original/interactions.csv')
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

# Some preprossesing

In [7]:
interactions = pd.read_csv('data_original/interactions.csv', parse_dates=["last_watch_dt"])

interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight
    },
    inplace=True)
interactions.head()


Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [8]:
max_date = interactions['datetime'].max()

train = interactions[(interactions['datetime'] < max_date - pd.Timedelta(days=7))]
test = interactions[(interactions['datetime'] >= max_date - pd.Timedelta(days=7))]


# оставляем только теплых пользователей в тесте
test = test[test['user_id'].isin(train['user_id'].unique())]

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 5)
test: (349088, 5)


In [19]:
dataset = Dataset.construct(
    interactions_df=train,

)
dataset

Dataset(user_id_map=IdMap(external_ids=array([176549, 699317, 656683, ..., 882138, 805174, 648596])), item_id_map=IdMap(external_ids=array([ 9506,  1659,  7107, ..., 13516, 13019, 10542])), interactions=Interactions(df=         user_id  item_id   weight   datetime
0              0        0   4250.0 2021-05-11
1              1        1   8317.0 2021-05-29
2              2        2     10.0 2021-05-09
3              3        3  14483.0 2021-07-05
4              4        0   6725.0 2021-04-30
...          ...      ...      ...        ...
5476244    69627      219   6804.0 2021-08-02
5476245    40052      132    753.0 2021-05-12
5476246   896790      318     76.0 2021-08-13
5476247   206604     2546   2308.0 2021-04-13
5476249     7236     1609   6203.0 2021-04-19

[4985269 rows x 4 columns]), user_features=None, item_features=None)

In [26]:
TEST_USERS = test[Columns.User].unique() #warm users

# Hyperparameters

In [10]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 16
N_FACTORS = (32,)
N_EPOCHS = 1 # Lightfm
USER_ALPHA = 0 # Lightfm
ITEM_ALPHA = 0 # Lightfm
LEARNING_RATE = 0.05 # Lightfm

In [30]:
metrics = {
    'MAP@10': MAP(k=10)
}

# 1. Model Improvement and Hyperparameter Tuning

# 1.1 Implicit Model Tuning

In [None]:
def implicit_als_objective(trial):
    factors = trial.suggest_int("factors", 10, 300)
    regularization = trial.suggest_loguniform("regularization", 1e-5, 1e1)
    iterations = trial.suggest_int("iterations", 10, 50)

    model = ImplicitALSWrapperModel(
        AlternatingLeastSquares(
            factors=factors,
            regularization=regularization,  # Add regularization parameter
            iterations=iterations,  # Add iterations parameter
            random_state=RANDOM_STATE,
            num_threads=NUM_THREADS
        )
    )

    model.fit(dataset)

    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )

    metric_values = calc_metrics(metrics, recos, test, train)['MAP@10']

    return metric_values

study = optuna.create_study(direction="maximize")
study.optimize(implicit_als_objective, n_trials=50)

best_params = study.best_params
print(f"Best Hyperparameters: {best_params}")


[I 2023-12-05 21:21:50,459] A new study created in memory with name: no-name-42fa2bf6-b047-4c12-ae3c-19ce0dada505
  regularization = trial.suggest_loguniform("regularization", 1e-5, 1e1)


## 1.2 LightFM Model Tuning

In [32]:
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split


def objective(trial):
    num_components = trial.suggest_int("num_components", 10, 200)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.1)
    item_alpha = trial.suggest_float("item_alpha", 1e-6, 1e-3)

    model =  LightFMWrapperModel(
            LightFM(
                no_components=num_components,
                loss='warp',
                random_state=RANDOM_STATE,
                learning_rate=LEARNING_RATE,
                user_alpha=USER_ALPHA,
                item_alpha=ITEM_ALPHA,
            ),
            epochs=N_EPOCHS,
            num_threads=NUM_THREADS,
        )
    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)['MAP@10']


    return metric_values

In [33]:
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())

study.optimize(objective, n_trials=10)

best_params = study.best_params
print(f"Best Hyperparameters: {best_params}")

[I 2023-12-05 17:38:48,978] A new study created in memory with name: no-name-0cc02030-fb85-432c-9b9b-339e6506bcac
[I 2023-12-05 17:39:47,927] Trial 0 finished with value: 0.08143342209224737 and parameters: {'num_components': 71, 'learning_rate': 0.09867552071845359, 'item_alpha': 0.00014837725367366}. Best is trial 0 with value: 0.08143342209224737.
[I 2023-12-05 17:41:32,756] Trial 1 finished with value: 0.08114170218577212 and parameters: {'num_components': 198, 'learning_rate': 0.02081866845144093, 'item_alpha': 5.576693749011004e-05}. Best is trial 0 with value: 0.08143342209224737.
[I 2023-12-05 17:43:08,681] Trial 2 finished with value: 0.08183190210800283 and parameters: {'num_components': 182, 'learning_rate': 0.061219890707954625, 'item_alpha': 0.00042705411927839403}. Best is trial 2 with value: 0.08183190210800283.
[I 2023-12-05 17:44:00,710] Trial 3 finished with value: 0.08206323218689399 and parameters: {'num_components': 43, 'learning_rate': 0.03616602495588512, 'item_a

Best Hyperparameters: {'num_components': 46, 'learning_rate': 0.02592156907197242, 'item_alpha': 0.0006750010130431169}


In [37]:
#Using the best Hyperparameters: {'num_components': 46, 'learning_rate': 0.02592156907197242, 'item_alpha': 0.0006750010130431169}
#We train the model
#Train model with the data
model = LightFMWrapperModel(
            LightFM(
                no_components=46,
                loss='warp',
                random_state=RANDOM_STATE,
                learning_rate=0.02592156907197242
            ),
            epochs=10,
            num_threads=NUM_THREADS,
        )

model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7e1a2ccb5f60>

# 1.3 Approximate Nearest Neighbors (ANN)

In [59]:
# Create Annoy index for items
def create_ann(item_embedding: np.array, num_trees: int = 10):
    item_index = AnnoyIndex(item_embedding.shape[1], 'dot')
    for i, item_embedding in enumerate(item_embedding):
        item_index.add_item(i, item_embedding)
    item_index.build(num_trees)
    return item_index

In [65]:
def generate_custom_recommendations(user_ids: np.array, data: Dataset, model: LightFMWrapperModel):
    # Get user embeddings
    internal_user_ids = data.user_id_map.to_internal.loc[user_ids].values
    user_embeddings, item_embeddings = model.get_vectors(data)
    user_embeddings_norm = user_embeddings / np.linalg.norm(user_embeddings, axis=1).reshape(-1, 1)
    need_user_embeddings_norm = user_embeddings_norm[internal_user_ids]

    # Create Annoy index for items
    item_index = create_ann(item_embeddings)

    # Make recommendations
    custom_recs = []
    for i, user_emb in tqdm(enumerate(need_user_embeddings_norm), total=need_user_embeddings_norm.shape[0]):
        similar_items = item_index.get_nns_by_vector(user_emb, K_RECOS)
        custom_recs.append({
            'inner_user_id': i,
            'custom_item_id': list(similar_items)
        })

    # Formatted DataFrame
    user_mapping_df = data.user_id_map.to_external.reset_index().rename(columns={'index': 'inner_user_id', 0: 'user_id'})

    custom_recs_df = pd.DataFrame(custom_recs)
    custom_recs_df = custom_recs_df.explode(column='custom_item_id').reset_index(drop=True).merge(
        user_mapping_df, on='inner_user_id')[['user_id', 'custom_item_id']]

    custom_recs_df['custom_rank'] = custom_recs_df.groupby('user_id').cumcount()
    return custom_recs_df

In [71]:
annoy_recommendations = generate_custom_recommendations(TEST_USERS, dataset, model)
calc_metrics(metrics, annoy_recommendations, test, train)

100%|██████████| 120519/120519 [00:02<00:00, 43814.89it/s]


{'MAP@10': 4.5060313170568964e-05}

In [None]:
model = LightFMWrapperModel(
            LightFM(
                no_components=46,
                loss='warp',
                random_state=RANDOM_STATE,
                learning_rate=0.02592156907197242
            ),
            epochs=10,
            num_threads=NUM_THREADS,
        )

model.fit(dataset)

# Save model with pickle for online service

In [68]:
import pickle
# Save the model to a pickle file
with open('model_hw_4.pkl', 'wb') as file:
    pickle.dump(model, file)