In [5]:
import typing as tp
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from implicit.als import AlternatingLeastSquares
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import (
    MAP
)
import implicit
import os
from rectools.model_selection import TimeRangeSplitter, cross_validate
from rectools.models import ImplicitALSWrapperModel, PopularModel
from rectools.tools import UserToItemAnnRecommender
from tqdm import tqdm

In [6]:
# Read the three raw KION tables (users, items, interactions) from the Kaggle
# input directory, in that order.
users, items, interactions = map(
    pd.read_csv,
    (
        "/kaggle/input/kion-dataset/users.csv",
        "/kaggle/input/kion-dataset/items.csv",
        "/kaggle/input/kion-dataset/interactions.csv",
    ),
)

In [7]:
# Give the interaction log the 'datetime' / 'weight' column names used by the
# rest of the notebook, then parse the timestamp column into real datetimes.
interactions = (
    interactions
    .rename(columns={'last_watch_dt': 'datetime', 'total_dur': 'weight'})
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
)

In [8]:
# Time-based hold-out: everything from the last 7 days of the log is the test set.
test_start_date = interactions['datetime'].max() - pd.Timedelta(days=7)

# Train only on earlier interactions whose weight (the renamed `total_dur`
# column) is at least 300; the test set is not filtered by weight.
train = interactions.loc[
    interactions['datetime'].lt(test_start_date) & interactions['weight'].ge(300)
]
test = interactions.loc[interactions['datetime'].ge(test_start_date)]

# Warm users appear in train, cold users do not.
train_users = train['user_id'].unique()
seen_in_train = test['user_id'].isin(train_users)
warm_test = test[seen_in_train]
cold_test = test[~seen_in_train]

In [23]:
# RecTools dataset built from the training interactions only (no user/item features).
dataset = Dataset.construct(interactions_df=train)

In [17]:
# Baseline ALS: 32 latent factors, fixed seed, 2 threads, wrapped for RecTools.
als_estimator = AlternatingLeastSquares(factors=32, random_state=42, num_threads=2)
model = ImplicitALSWrapperModel(model=als_estimator)
# Kept as the last expression so the cell still displays the fitted model.
model.fit(dataset)

<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x78c5ac9f93c0>

In [56]:
# Metrics passed to calc_metrics: MAP at cut-offs 5 and 10, keyed 'MAP@5' / 'MAP@10'.
metric = {f'MAP@{k}': MAP(k=k) for k in (5, 10)}

### **Реализуем функцию для Optuna. В неё можно передавать и другие модели и перебирать их гиперпараметры. Работаю на Kaggle с GPU, поэтому для ALS можно быстро перебрать несколько вариантов.**

In [16]:
import optuna
from implicit.als import AlternatingLeastSquares
# calc_metrics is called by `objective` but was never imported in this notebook,
# so the cell failed with NameError on a fresh kernel.
from rectools.metrics import calc_metrics

K_RECOS = 10
RANDOM_STATE = 42
# NOTE(review): NUM_THREADS and N_FACTORS are not referenced in this cell; they
# are kept in case other cells rely on them.
NUM_THREADS = 16
N_FACTORS = 4


def objective(trial):
    """Optuna objective: fit ALS with sampled hyperparameters and score it.

    Reads the notebook-level `dataset`, `warm_test`, `train` and `metric`
    objects defined in other cells.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample `factors` (10..100, step 10) and
        `regularization` (0.01..0.1, step 0.01).

    Returns
    -------
    The 'MAP@10' entry of calc_metrics, computed on recommendations for the
    warm test users.
    """
    factors = trial.suggest_int('factors', 10, 100, step=10)
    regularization = trial.suggest_float('regularization', 0.01, 0.1, step=0.01)

    model = ImplicitALSWrapperModel(
        model=AlternatingLeastSquares(
            factors=factors,
            regularization=regularization,
            random_state=RANDOM_STATE,
            num_threads=2,
        )
    )
    model.fit(dataset)

    # Recommend only for users that exist in the training dataset.
    recs = model.recommend(
        users=warm_test['user_id'].unique(),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True
    )
    return calc_metrics(metric, recs, warm_test, train)['MAP@10']


# Seed the sampler so the search is not driven by an unseeded RNG.
# NOTE(review): with n_jobs=2 the trial order can still vary between runs;
# use n_jobs=1 if exact reproducibility is required.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study.optimize(objective, n_trials=10, n_jobs=2)

best_params = study.best_params
best_metric_value = study.best_value

print("Best MAP@10:", best_metric_value)
print("Best Hyperparameters:", best_params)


[I 2023-12-05 17:33:15,856] A new study created in memory with name: no-name-88be229f-ad35-4025-8d46-765ee8a694da
[I 2023-12-05 17:34:43,082] Trial 0 finished with value: 0.009340079047551512 and parameters: {'factors': 90, 'regularization': 0.03}. Best is trial 0 with value: 0.009340079047551512.
[I 2023-12-05 17:34:43,139] Trial 1 finished with value: 0.0069911689091521545 and parameters: {'factors': 80, 'regularization': 0.08}. Best is trial 0 with value: 0.009340079047551512.




[I 2023-12-05 17:36:12,884] Trial 2 finished with value: 0.007641304978851978 and parameters: {'factors': 100, 'regularization': 0.06999999999999999}. Best is trial 0 with value: 0.009340079047551512.
[I 2023-12-05 17:36:12,917] Trial 3 finished with value: 0.00746093202122258 and parameters: {'factors': 90, 'regularization': 0.06999999999999999}. Best is trial 0 with value: 0.009340079047551512.
[I 2023-12-05 17:37:40,214] Trial 4 finished with value: 0.007460577617827046 and parameters: {'factors': 90, 'regularization': 0.09}. Best is trial 0 with value: 0.009340079047551512.
[I 2023-12-05 17:37:40,240] Trial 5 finished with value: 0.01488696682378543 and parameters: {'factors': 40, 'regularization': 0.03}. Best is trial 5 with value: 0.01488696682378543.
[I 2023-12-05 17:39:03,524] Trial 7 finished with value: 0.020141784512617993 and parameters: {'factors': 30, 'regularization': 0.01}. Best is trial 7 with value: 0.020141784512617993.
[I 2023-12-05 17:39:03,555] Trial 6 finished wi

Best MAP@10: 0.020141784512617993
Best Hyperparameters: {'factors': 30, 'regularization': 0.01}


### **Воспользуемся приближённым поиском ближайших соседей**

In [34]:
# Extract the user and item vectors from the ALS wrapper; they are passed to
# UserToItemAnnRecommender in the next cell.
# NOTE(review): relies on `model` having been fitted in an earlier cell.
user_vectors, item_vectors = model.get_vectors()

In [57]:
# Approximate-nearest-neighbour recommender over the ALS vectors. The id maps
# come from the dataset so lookups can use the original user ids.
alsup = UserToItemAnnRecommender(
    user_id_map=dataset.user_id_map,
    item_id_map=dataset.item_id_map,
    user_vectors=user_vectors,
    item_vectors=item_vectors,
)
# Kept as the last expression so the cell still displays the fitted recommender.
alsup.fit()

<rectools.tools.ann.UserToItemAnnRecommender at 0x78c5ad77b250>

In [61]:
# Smoke test: ask the ANN recommender for 10 items for a single user.
# NOTE(review): 123123 is presumably a user id present in dataset.user_id_map — confirm.
alsup.get_item_list_for_user(123123, top_n=10)

array([11132, 12598,  5651, 15423,   254,   111,   297,  3419,  1986,
         623])

### **Воспользуемся информацией о пользователях и об айтемах**

In [63]:
# Keep only the users and items that occur in the interaction log.
selected_users = users.loc[users[Columns.User].isin(interactions[Columns.User])]
selected_items = items.loc[items[Columns.Item].isin(interactions[Columns.Item])]

def create_user_feature_frame(users, feature):
    """Reshape one user feature column into long (id, value, feature) form.

    Parameters
    ----------
    users : pd.DataFrame
        Frame containing the `Columns.User` column and the `feature` column.
    feature : str
        Name of the column to extract; also stored in the "feature" column.

    Returns
    -------
    pd.DataFrame
        New frame with columns "id", "value" and "feature"; `users` is not modified.
    """
    long_frame = users[[Columns.User, feature]].rename(
        columns={Columns.User: "id", feature: "value"}
    )
    return long_frame.assign(feature=feature)

# Stack the sex, income and age features into one long-format user feature table.
user_features = pd.concat(
    [
        create_user_feature_frame(selected_users, column)
        for column in ('sex', 'income', 'age')
    ]
)

# Turn the comma-separated `genres` string into a list of lower-cased genre names.
# `.str.replace` is required here: plain `Series.replace(..., regex=False)` only
# swaps cells whose entire value equals ", ", so the separators were never
# normalised and the split produced genres with a leading space.
# `assign` builds a new frame instead of writing a column into a filtered slice
# of `items`, which triggered SettingWithCopyWarning.
selected_items = selected_items.assign(
    genre=selected_items["genres"]
    .str.lower()
    .str.replace(", ", ",", regex=False)
    .str.split(",")
)

def create_item_feature_frame(items, feature_name, column_name):
    """Reshape one item feature column into long (id, value, feature) form.

    Parameters
    ----------
    items : pd.DataFrame
        Frame containing the `Columns.Item` column and the `feature_name` column.
    feature_name : str
        Column to extract. When it is "genre" the column is exploded first, so
        every list element gets its own row.
    column_name : str
        Label written into the "feature" column of the result.

    Returns
    -------
    pd.DataFrame
        New frame with columns "id", "value" and "feature".
    """
    if feature_name == "genre":
        source = items.explode(feature_name)
    else:
        source = items
    long_frame = source[[Columns.Item, feature_name]].rename(
        columns={Columns.Item: "id", feature_name: "value"}
    )
    return long_frame.assign(feature=column_name)

# Stack the genre and content_type features into one long-format item feature
# table; each feature is labelled with its own column name.
item_features = pd.concat(
    [
        create_item_feature_frame(selected_items, name, name)
        for name in ("genre", "content_type")
    ]
)


In [48]:
# Rebuild `dataset` on the complete interaction log (not just the train part),
# this time with categorical user and item features. This rebinds the `dataset`
# name used by the earlier cells.
dataset = Dataset.construct(
    interactions_df=interactions,
    # user side
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    # item side
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [49]:
# Refit ALS (32 factors, fixed seed, 2 threads) on the current `dataset`.
als_estimator = AlternatingLeastSquares(factors=32, random_state=42, num_threads=2)
model = ImplicitALSWrapperModel(model=als_estimator)
# Kept as the last expression so the cell still displays the fitted model.
model.fit(dataset)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]



<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x78c5adc4c730>

**Ну и, как всегда, сделаем PopularModel, чтобы было что-то для холодных пользователей, о которых нет никакой информации**

In [50]:
# Popularity baseline fitted on the same dataset. `fit` returns the model
# itself (its repr is what the original cell displayed), so the call can be chained.
pop_model = PopularModel().fit(dataset)
# Last expression keeps the cell output showing the fitted model.
pop_model

<rectools.models.popular.PopularModel at 0x78c5adc4c940>

Сохраним в csv

In [None]:
# Top-10 ALS recommendations for every user in the interaction log, excluding
# items they have already watched; only the user and item columns are saved.
all_recos = model.recommend(
    users=interactions[Columns.User].unique(),
    dataset=dataset,
    k=10,
    filter_viewed=True,
).loc[:, [Columns.User, Columns.Item]]
all_recos.to_csv('rico.csv')

In [None]:
# Top-10 popularity-based recommendations for every user in the interaction
# log, excluding items they have already watched.
hold_recos = pop_model.recommend(
    users=interactions[Columns.User].unique(),
    dataset=dataset,
    k=10,
    filter_viewed=True,
)[[Columns.User, Columns.Item]]
# Write the popular-model output. The original saved `all_recos` (the ALS
# recommendations) here, so hold.csv duplicated the ALS file.
hold_recos.to_csv('hold.csv')