# Домашнее задание № 4 часть 1 (курс МТС, команда 33)

## Подготовка окружения для проведения эксперимента

In [None]:
# устанавливаем необходимые зависимости
!pip install rectools > None
!pip install optuna > None

In [None]:
# импортируем зависимости

import datetime
import optuna
import dill
import nmslib
import pandas as pd
import numpy as np

from implicit.als import AlternatingLeastSquares
from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel
from rectools import Columns
from rectools.dataset import Dataset
from lightfm import LightFM
from lightfm.data import Dataset as LFMDataset

In [None]:
# Download the datasets (the kion_train.zip archive)

!wget https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip

In [None]:
!ls

In [None]:
!unzip ./kion_train.zip 

In [None]:
# Number of recommendations requested per user
N = 10
# Ranking metrics computed for every trial, keyed by the name used to read them back
METRICS = {
    "map@10": MAP(k=10),
    "recall@10": Recall(k=10),
}

## Подготовка датасета

In [None]:
def get_datasets(data_dir='kion_train'):
    """Load the interactions, users and items tables from CSV files.

    Args:
        data_dir: Directory that contains ``interactions.csv``, ``users.csv``
            and ``items.csv``. Defaults to ``'kion_train'``.

    Returns:
        Tuple ``(inter_df, user_df, item_df)`` of DataFrames. In ``inter_df``
        the ``last_watch_dt`` column is renamed to ``datetime`` and parsed as
        datetimes, and the weight column is 3 where ``watched_pct`` > 10 and
        1 otherwise.
    """
    inter_df = pd.read_csv(f'{data_dir}/interactions.csv')
    user_df = pd.read_csv(f'{data_dir}/users.csv')
    item_df = pd.read_csv(f'{data_dir}/items.csv')
    # NOTE: `total_dur` is renamed to 'weight' here, but its values are replaced
    # by the binary 3/1 weight below, so the raw duration is not kept.
    inter_df.rename(
        columns={'last_watch_dt': 'datetime', 'total_dur': 'weight'},
        inplace=True,
    )
    inter_df['datetime'] = pd.to_datetime(inter_df['datetime'])
    # Rows with a missing `watched_pct` fail the comparison and get weight 1.
    inter_df[Columns.Weight] = np.where(inter_df['watched_pct'] > 10, 3, 1)
    return inter_df, user_df, item_df

In [None]:
inter_df, user_df, item_df = get_datasets()

In [None]:
# визуализируем датасеты

inter_df.head(10)

In [None]:
item_df.head(10)

In [None]:
user_df.head(10)

In [None]:
# Build the train and test sets with a time-based split

# Latest interaction timestamp, normalized
last_date = inter_df[Columns.Datetime].max().normalize()

# Train: interactions strictly before (last_date - 7 days); test: that cutoff and later
train = inter_df[inter_df[Columns.Datetime] < last_date - pd.Timedelta(days=7)].copy()
test = inter_df[inter_df[Columns.Datetime] >= last_date - pd.Timedelta(days=7)].copy()

print(f"Размерность трейна : {train.shape}")
print(f"Размерность теста: {test.shape}")

In [None]:
# Remove cold users from test, i.e. users that have no interactions in train

cold_users = set(test[Columns.User]).difference(train[Columns.User])
test.drop(index=test.index[test[Columns.User].isin(cold_users).to_numpy()], inplace=True)

In [None]:
print(user_df.isna().sum())

In [None]:
# Fill missing values with the 'Unknown' placeholder

user_df.fillna('Unknown', inplace=True)
# Keep only the users that occur in train
user_df = user_df.loc[user_df[Columns.User].isin(train[Columns.User])].copy()
user_df.head()

In [None]:
# Build the long-format user feature table (id, value, feature) for age, sex and income

user_features_frames = [
    user_df.reindex(columns=[Columns.User, feature])
    .rename(columns={Columns.User: "id", feature: "value"})
    .assign(feature=feature)
    for feature in ["age", "sex", "income"]
]
user_features = pd.concat(user_features_frames)
user_features.head()

In [None]:
# Keep only the items that occur in train
item_df = item_df.loc[item_df[Columns.Item].isin(train[Columns.Item])].copy()
item_df.head()

In [None]:
# определяем тип контента

print(item_df.content_type.unique())

In [None]:
# Extract the genres of every film and series

# Lower-case the comma-separated genre string and split it into a list per item
item_df["genre"] = item_df["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
# One row per (item, genre) pair
genre_feature = item_df[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
genre_feature.head()

In [None]:
# Extract the content type of every item

content_feature = item_df.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"
content_feature.head()

In [None]:
# Extract the release-year quantile bin of every item

# Ten quantile bins over release_year, labelled 0..9
item_df['binned_r_year'] = pd.qcut(item_df['release_year'], q=10, labels=list(range(10)))
release_year_feature = item_df.reindex(columns=[Columns.Item, "binned_r_year"])
release_year_feature.columns = ["id", "value"]
release_year_feature["feature"] = "binned_r_year"
release_year_feature.head()

In [None]:
# Extract the production countries of every item

# Lower-case the comma-separated country string and split it into a list per item
item_df["country"] = item_df["countries"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
# One row per (item, country) pair
country_feature = item_df[["item_id", "country"]].explode("country")
country_feature.columns = ["id", "value"]
country_feature["feature"] = "country"
country_feature.head()

In [None]:
# Concatenate the item feature tables and spot-check the result for one arbitrary id

item_features = pd.concat((genre_feature, content_feature, country_feature, release_year_feature))
item_features[item_features['id'] == 222]

In [None]:
# Build the dataset from the train interactions plus user and item features
# (all features are declared categorical)

DATASET = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["age", "sex", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "country", 'binned_r_year', 'content_type'],
)

In [None]:
DATASET

In [None]:
# Users to generate recommendations for: every distinct user in test

TEST_USERS = test[Columns.User].unique()

## Обучение факторизационных машин

### 1. ALS - Alternating Least Squares

In [None]:
def objective_train_als_model(trial):
    """Optuna objective for the ALS model.

    Samples ``n_factors`` and ``regularization`` from the trial, fits the
    wrapped ALS model on the global ``DATASET``, recommends ``N`` items to
    ``TEST_USERS`` (already viewed items filtered out) and scores the result
    with ``calc_metrics`` using ``test`` and ``train``.

    Returns:
        Tuple ``(map@10, recall@10)``.
    """
    als_params = {
        "factors": trial.suggest_int("n_factors", low=32, high=128, step=32),
        "regularization": trial.suggest_float("regularization", low=0.01, high=0.51, step=0.1),
    }
    wrapper = ImplicitALSWrapperModel(
        model=AlternatingLeastSquares(random_state=100, **als_params),
        fit_features_together=True,
    )
    wrapper.fit(DATASET)
    recos = wrapper.recommend(users=TEST_USERS, dataset=DATASET, k=N, filter_viewed=True)
    scores = calc_metrics(METRICS, recos, test, train)
    return scores['map@10'], scores['recall@10']

def save_best_model(trial):
    """Refit ALS with the trial's hyperparameters and dump it to 'als_model.dill'.

    The hyperparameters are read through the same ``suggest_*`` calls as in the
    objective. After fitting on the global ``DATASET``, the wrapper's ``model``
    attribute is serialized with dill.
    """
    factors = trial.suggest_int("n_factors", low=32, high=128, step=32)
    reg = trial.suggest_float("regularization", low=0.01, high=0.51, step=0.1)

    als = AlternatingLeastSquares(factors=factors, regularization=reg, random_state=100)
    wrapped = ImplicitALSWrapperModel(model=als, fit_features_together=True)
    wrapped.fit(DATASET)

    with open('als_model.dill', 'wb') as out_file:
        dill.dump(wrapped.model, out_file)

In [None]:
def print_train_loop_inf(train_loop):
    """Print the number of trials and the parameters of every best trial.

    Args:
        train_loop: Object exposing ``trials`` (sized) and ``best_trials``
            (iterable of objects with a ``params`` mapping), e.g. an Optuna study.
    """
    finished = len(train_loop.trials)
    print("Завершенных сессий: ", finished)
    print("Результаты: ")
    for best in train_loop.best_trials:
        print("  Параметры: ")
        param_lines = ["    {}: {}".format(*pair) for pair in best.params.items()]
        for line in param_lines:
            print(line)

In [None]:
%%time

# Tune the ALS model: 10 trials, both objective values are maximized

study = optuna.create_study(directions=["maximize", "maximize"])
study.optimize(objective_train_als_model, n_trials=10)

In [None]:
print_train_loop_inf(study)

In [None]:
# Show the best trials

study.best_trials

In [None]:
# Save the best candidate (the first entry of the best trials)

save_best_model(study.best_trials[0])

### 2. Обучение LightFM

In [None]:
def _build_light_fm_model(trial, epochs):
    """Create an unfitted LightFM wrapper from the trial's hyperparameters.

    Args:
        trial: Optuna trial (live or frozen) providing the ``suggest_*`` API.
        epochs: Number of training epochs passed to the wrapper.

    Returns:
        A ``LightFMWrapperModel`` configured with the sampled hyperparameters.
    """
    n_factors = trial.suggest_int("n_factors", low=32, high=128, step=32)
    loss = trial.suggest_categorical("loss", choices=['logistic', 'bpr', 'warp'])
    lr = trial.suggest_float("lr", low=0.05, high=0.25, step=0.05)
    item_alpha = trial.suggest_float("item_alpha", low=0.0, high=0.1, step=0.05)
    # Sampled under its own name: reusing "item_alpha" here made Optuna return
    # the very same value, so user_alpha was never tuned independently.
    user_alpha = trial.suggest_float("user_alpha", low=0.0, high=0.1, step=0.05)
    return LightFMWrapperModel(
        model=LightFM(
            no_components=n_factors,
            loss=loss,
            random_state=100,
            learning_rate=lr,
            user_alpha=user_alpha,
            item_alpha=item_alpha,
        ),
        epochs=epochs,
        num_threads=1,
    )


def objective_train_light_fm(trial):
    """Optuna objective for the LightFM model.

    Fits the model built from the trial's hyperparameters on the global
    ``DATASET`` for 2 epochs, recommends ``N`` items to ``TEST_USERS``
    (already viewed items filtered out) and scores the result with
    ``calc_metrics`` using ``test`` and ``train``.

    Returns:
        Tuple ``(map@10, recall@10)``.
    """
    model = _build_light_fm_model(trial, epochs=2)
    model.fit(DATASET)
    rec = model.recommend(
        users=TEST_USERS,
        dataset=DATASET,
        k=N,
        filter_viewed=True,
    )
    metrics = calc_metrics(METRICS, rec, test, train)
    return metrics['map@10'], metrics['recall@10']


def save_best_model(trial):
    """Refit LightFM with the trial's hyperparameters and dump it to 'lightfm.dill'.

    The trial must come from a study optimized with ``objective_train_light_fm``
    so that every parameter name requested here (including ``user_alpha``) exists.
    After fitting on the global ``DATASET``, the wrapper's ``model`` attribute is
    serialized with dill.
    """
    # NOTE(review): the saved model trains for 3 epochs while tuning used 2 —
    # kept as in the original; confirm this difference is intentional.
    model = _build_light_fm_model(trial, epochs=3)
    model.fit(DATASET)
    with open('lightfm.dill', 'wb') as f:
        dill.dump(model.model, f)

In [None]:
%%time

# Tune the LightFM model: 4 trials, both objective values are maximized

study = optuna.create_study(directions=["maximize", "maximize"])
study.optimize(objective_train_light_fm, n_trials=4)

In [None]:
print_train_loop_inf(study)

In [None]:
study.best_trials

In [None]:
save_best_model(study.best_trials[0])

## Создание аватаров

Для выполнения данного задания мы решили создать 3 аватара: 1) молодой мужчина, интересующийся немецкими ужастиками; 2) пожилой мужчина, увлекающийся американскими боевиками; 3) женщина среднего возраста, предпочитающая французские комедии.

In [None]:
def create_watched_list(user_id, country, genre, items=None, n_items=7, random_state=100):
    """Sample a watch history for an avatar from items matching a country and genre.

    Args:
        user_id: Identifier written to the ``user_id`` column of every row.
        country: Pattern searched for in the ``countries`` column.
        genre: Pattern searched for in the ``genres`` column.
        items: Items table with ``item_id``, ``title``, ``genres`` and
            ``countries`` columns. Defaults to the notebook-level ``item_df``.
        n_items: Number of items to sample. Defaults to 7.
        random_state: Seed for the sampling. Defaults to 100.

    Returns:
        DataFrame with columns ``user_id``, ``title``, ``item_id``, ``genres``
        and one row per sampled item.

    Raises:
        ValueError: If fewer than ``n_items`` items match the filters.
    """
    if items is None:
        items = item_df
    # na=False: items with a missing country/genre never match.
    matches = items[
        items['countries'].str.contains(country, na=False)
        & items['genres'].str.contains(genre, na=False)
    ]
    if len(matches) < n_items:
        raise ValueError(
            f"Only {len(matches)} items match country={country!r} and "
            f"genre={genre!r}; cannot sample {n_items}"
        )
    sampled = matches.sample(n_items, random_state=random_state)
    # Take the sampled rows directly instead of merging back on `title`:
    # titles are not guaranteed unique, so a title merge can pull in other
    # items that merely share a title with a sampled one.
    avatar_watched_list = sampled[["title", "item_id", "genres"]].reset_index(drop=True)
    avatar_watched_list.insert(0, "user_id", user_id)
    return avatar_watched_list

def create_avatar(user_id, sex, age, income):
    """Build the long-format feature rows (id, value, feature) for one avatar.

    Returns:
        DataFrame with three rows — the ``sex``, ``age`` and ``income``
        features, in that order — all carrying ``user_id`` in the ``id`` column.
    """
    feature_values = {'sex': sex, 'age': age, 'income': income}
    rows = [
        {'id': user_id, 'value': val, 'feature': name}
        for name, val in feature_values.items()
    ]
    return pd.DataFrame(rows)

In [None]:
# Avatar 1 (ger_horror): watch history sampled from German horror titles
ger_horror_watched = create_watched_list('ger_horror', 'Германия', 'ужасы')
ger_horror_watched

In [None]:
# Avatar 1 features: male, age group 18-24, income group 0-20
ger_horror = create_avatar('ger_horror', 'М', 'age_18_24', 'income_0_20')
ger_horror

In [None]:
# Avatar 2 (rambo): watch history sampled from US action titles
rambo_watched = create_watched_list('rambo', 'США', 'боевик')
rambo_watched

In [None]:
# Avatar 2 features: male, age group 65+, income group 60-90
rambo = create_avatar('rambo', 'М', 'age_65_inf', 'income_60_90')
rambo

In [None]:
# Avatar 3 (madam_hihi): watch history sampled from French comedy titles
madam_hihi_watched = create_watched_list('madam_hihi', 'Франция', 'комедии')
madam_hihi_watched

In [None]:
# Avatar 3 features: female, age group 35-44, income group 90-150
madam_hihi = create_avatar('madam_hihi', 'Ж', 'age_35_44', 'income_90_150')
madam_hihi

In [None]:
# Append the avatars' watch histories (user_id and item_id only) to the train interactions
avatar_train = pd.concat([train, ger_horror_watched.drop(['title', 'genres'], axis=1),
                          rambo_watched.drop(['title', 'genres'], axis=1), 
                          madam_hihi_watched.drop(['title', 'genres'], axis=1),], sort=False)
avatar_train.head(10)

In [None]:
avatar_train.tail(10)

In [None]:
# The appended avatar rows have no datetime / weight / watched_pct values,
# so fill those columns with fixed placeholder values
avatar_train['datetime'] = avatar_train['datetime'].fillna(datetime.datetime(2021, 1, 1))
avatar_train['weight'] = avatar_train['weight'].fillna(3)
avatar_train['watched_pct'] = avatar_train['watched_pct'].fillna(80)

In [None]:
# NOTE(review): this rebinds the global DATASET (previously the rectools Dataset)
# to a LightFM Dataset; the objective functions above read the same global, so
# they should not be re-run after this cell — confirm or rename.
DATASET = LFMDataset()
DATASET.fit(users=avatar_train["user_id"].values,items=avatar_train["item_id"].values,)
# Only (user_id, item_id) pairs are passed; the second returned matrix is discarded
inter_matrix, _ = DATASET.build_interactions(zip(*avatar_train[["user_id", "item_id"]].values.T))

In [None]:
# Plain LightFM model (WARP loss, 64 components) trained on the interaction
# matrix that includes the avatars; no user/item features are passed here
model = LightFM(learning_rate=0.05, 
                loss='warp', 
                no_components=64,
                random_state=100,
)
model.fit(interactions = inter_matrix, 
                        epochs=2,
                        num_threads=20,
)

In [None]:
# Invert the dataset's item mapping so mapped values can be translated back to keys
# NOTE(review): relies on the private `_item_id_mapping` attribute — presumably
# external item id -> internal index; verify against the LightFM Dataset API.
id_item_map = {v: k for k, v in DATASET._item_id_mapping.items()}

In [None]:
def get_recos(user_id, model, inter_matrix, user_to_id, id_to_item, n_recommendations=N, num_threads=20):
    """Return the top-scored, not yet watched items for one user.

    Args:
        user_id: External user id; must be a key of ``user_to_id``.
        model: Object with a ``predict(user_ids, item_ids, num_threads)`` method
            that returns one score per requested item.
        inter_matrix: Interaction matrix exposing ``shape``, ``row`` and ``col``;
            rows are internal user ids, columns are internal item ids.
        user_to_id: Mapping from external user id to internal user id.
        id_to_item: Mapping from internal item id to external item id.
        n_recommendations: Maximum number of items to return.
        num_threads: Number of threads passed to ``model.predict``.

    Returns:
        List of external item ids ordered by descending score. Items the user
        has already interacted with are never returned, so the list is shorter
        than ``n_recommendations`` when too few unwatched items exist.
    """
    inner_id = user_to_id[user_id]
    n_items = inter_matrix.shape[1]
    scores = model.predict(user_ids=inner_id,
                           item_ids=np.arange(n_items),
                           num_threads=num_threads)
    # Push already watched items to the bottom of the ranking.
    watched_items = np.unique(inter_matrix.col[inter_matrix.row == inner_id])
    scores[watched_items] = -np.inf

    n_top = min(n_recommendations, n_items - watched_items.size)
    if n_top <= 0:
        return []

    # Partition on the n_top-th largest element so that the whole top-n tail is
    # correct. The previous kth list (0, -1, ..., -(n-1)) never covered position
    # -n, which left the last recommendation arbitrary.
    top = np.argpartition(scores, -n_top)[-n_top:]
    top = top[np.argsort(scores[top])[::-1]]
    return [id_to_item[x] for x in top]


In [None]:
# Recommendations for the ger_horror avatar

user_id = "ger_horror"

recommended_items = get_recos(user_id=user_id,
                              model=model,
                              inter_matrix=inter_matrix,
                              user_to_id=DATASET._user_id_mapping,
                              id_to_item=id_item_map,
                              n_recommendations=N
)

# Attach title and genres to the recommended item ids
ger_horror_recos = pd.DataFrame({"user_id": user_id, "item_id": recommended_items}).merge(item_df[["item_id", "title", "genres"]])
ger_horror_recos

In [None]:
# Recommendations for the rambo avatar

user_id = "rambo"

recommended_items = get_recos(user_id=user_id,
                              model=model,
                              inter_matrix=inter_matrix,
                              user_to_id=DATASET._user_id_mapping,
                              id_to_item=id_item_map,
                              n_recommendations=N
)

# Attach title and genres to the recommended item ids
rambo_recos = pd.DataFrame({"user_id": user_id, "item_id": recommended_items}).merge(item_df[["item_id", "title", "genres"]])
rambo_recos

In [None]:
# Recommendations for the madam_hihi avatar

user_id = "madam_hihi"

recommended_items = get_recos(user_id=user_id,
                              model=model,
                              inter_matrix=inter_matrix,
                              user_to_id=DATASET._user_id_mapping,
                              id_to_item=id_item_map,
                              n_recommendations=N
)

# Attach title and genres to the recommended item ids
madam_hihi_recos = pd.DataFrame({"user_id": user_id, "item_id": recommended_items}).merge(item_df[["item_id", "title", "genres"]])
madam_hihi_recos