# Байесовская оптимизация

In [67]:
import os
import timeit

In [68]:
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

In [69]:
import warnings

warnings.filterwarnings("ignore")

In [70]:
import pandas as pd
import numpy as np

from rectools.metrics import MAP
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel

from pathlib import Path
from tqdm import tqdm

from lightfm import LightFM

from implicit.als import AlternatingLeastSquares

import optuna

## Loading Data

In [71]:
%%time

interactions = pd.read_csv('kion_train/interactions.csv')
users = pd.read_csv('kion_train/users.csv')
items = pd.read_csv('kion_train/items.csv')

CPU times: user 1.67 s, sys: 1.67 s, total: 3.34 s
Wall time: 3.34 s


## Preprocessing

In [72]:
def headtail(df: pd.DataFrame):
    return pd.concat([df.head(), df.tail()])

In [73]:
Columns.Datetime = "last_watch_dt"

In [74]:
interactions.drop(
    interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True
)

In [75]:
interactions[Columns.Datetime] = pd.to_datetime(
    interactions[Columns.Datetime], format="%Y-%m-%d"
)

In [76]:
max_date = interactions[Columns.Datetime].max()

In [77]:
interactions[Columns.Weight] = np.where(interactions["watched_pct"] > 10, 3, 1)

In [78]:
train = interactions[
    interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)
].copy()
test = interactions[
    interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)
].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 6)
test: (490982, 6)


In [79]:
train.drop(train.query("total_dur < 300").index, inplace=True)

In [80]:
# отфильтруем холодных пользователей из теста
cold_users = set(test[Columns.User]) - set(train[Columns.User])

In [81]:
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

## Features Preparing

### User features

In [82]:
users.fillna("Unknown", inplace=True)

In [83]:
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

In [84]:
users = users.astype({"kids_flg": bool})

#### Generate user features dataset

In [85]:
user_features_frames = []
for feature in ["sex", "age", "income", "kids_flg"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


### Item features

In [86]:
# Full data
items = pd.read_csv(DATA_PATH / "items.csv")

In [87]:
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

#### Genre

In [88]:
# Explode genres to flatten table
items["genre"] = (
    items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
)
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


#### Content

#### Release year

In [89]:
items["release_year"].fillna(int(items["release_year"].max()), inplace=True)

Преобразуем

In [90]:
items = items.astype({"release_year": int})

In [91]:
year_from = 1977
step = 5
bins = [
    year
    for year in range(
        year_from, items["release_year"].max() + step, step
    )
]

Делаем интервалы (строгое вхождение)

In [92]:
bins_bias = [item + 1 for item in bins]

In [93]:
pairs_strict = list(zip(bins_bias, bins[1:]))


In [94]:
# Add the most first release year
bins = [items["release_year"].min()] + bins
# Add interval for it
pairs_strict = [(items["release_year"].min(), bins[1])] + pairs_strict
# Generate feature values
labels = [f'year_{item[0]}_{item[1]}' for item in pairs_strict]

In [95]:
year_bins = pd.cut(items["release_year"], bins=bins, labels=labels, include_lowest=True)

In [96]:
items['release_year'] = year_bins.astype(str)

#### Age rating

In [97]:
items['age_rating'].fillna(0, inplace=True);

In [98]:
items = items.astype({"age_rating": int})

#### For kids

Если возраст 12+, тогда взрослый контент, иначе - детский 

In [99]:
items["for_kids"].loc[items["age_rating"] > 12] = 0
items["for_kids"].loc[items["age_rating"] <= 12] = 1

In [100]:
items = items.astype({"for_kids": bool})

#### Generate item features dataset

In [101]:
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


In [102]:
item_features_frames = []
for feature in ["content_type", "release_year", "age_rating", "for_kids"]:
    feature_frame = items.reindex(columns=[Columns.Item, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    item_features_frames.append(feature_frame)
item_features_frames.append(genre_feature)
item_features = pd.concat(item_features_frames)
headtail(item_features)

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type
15960,10632,криминал,genre
15961,4538,драмы,genre
15961,4538,спорт,genre
15961,4538,криминал,genre
15962,3206,комедии,genre


### Model optimization

In [103]:
max_date = interactions[Columns.Datetime].max()
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

TRAIN = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
TEST = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

TRAIN.drop(TRAIN.query("total_dur < 300").index, inplace=True)

# отфильтруем холодных пользователей из теста
cold_users = set(TEST[Columns.User]) - set(TRAIN[Columns.User])
TEST.drop(TEST[TEST[Columns.User].isin(cold_users)].index, inplace=True)


In [104]:
item_features_train = item_features.loc[
    item_features.id.isin(
        TRAIN.item_id
    )
]
item_features_test = item_features.loc[
    item_features.id.isin(
        TEST.item_id
    )
]

In [105]:
user_features_train = user_features.loc[
    user_features.id.isin(
        TRAIN.user_id
    )
]
user_features_test = user_features.loc[
    user_features.id.isin(
        TEST.user_id
    )
]

In [106]:
# make dataset

DATASET = Dataset.construct(
    interactions_df=TRAIN,
    user_features_df=user_features_train,
    cat_user_features=["sex", "age", "income", "kids_flg"],
    item_features_df=item_features_train,
    cat_item_features=["genre", "content_type", "release_year", "for_kids", "age_rating"],
)

In [107]:
TEST_USERS = TEST[Columns.User].unique()
K_FOR_METRIC = 10
NUM_THREADS = 32

#### Make bayesian optimization for LightFM

Going to use MAP10 as black box function

In [66]:

def objective_function(
    model,
) -> float:

    model.fit(DATASET)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=DATASET,
        k=K_FOR_METRIC,
        filter_viewed=True,
    )

    return MAP(k=K_FOR_METRIC).calc(recos, TEST)

def objective(trial):


    N_FACTORS = trial.suggest_int("n_factors", 8, 64)
    LEARNING_RATE = trial.suggest_float("learning_rate", 0, 1)
    USER_ALPHA = trial.suggest_float("user_alpha", 0, 1)
    ITEM_ALPHA = trial.suggest_float("item_alpha", 0, 1)
    N_EPOCHS = trial.suggest_int("epochs", 1, 10)

    model = LightFMWrapperModel(
        LightFM(
            no_components=N_FACTORS, 
            loss='warp', 
            random_state=42,
            learning_rate=LEARNING_RATE,
            user_alpha=USER_ALPHA,
            item_alpha=ITEM_ALPHA,
        ),
        epochs=N_EPOCHS,
        num_threads=NUM_THREADS,
    )

    return objective_function(model)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2022-12-12 22:58:29,857][0m A new study created in memory with name: no-name-f644173e-5c92-4169-8d41-ad58fb79546c[0m
[32m[I 2022-12-12 22:59:00,232][0m Trial 0 finished with value: 0.0731286290468275 and parameters: {'n_factors': 13, 'learning_rate': 0.9047338921626018, 'user_alpha': 0.0270968677466501, 'item_alpha': 0.04904404627547998, 'epochs': 1}. Best is trial 0 with value: 0.0731286290468275.[0m
[32m[I 2022-12-12 23:08:56,857][0m Trial 1 finished with value: 0.000430126593557974 and parameters: {'n_factors': 40, 'learning_rate': 0.4052453780755447, 'user_alpha': 0.9475123455238704, 'item_alpha': 0.9466846587296105, 'epochs': 4}. Best is trial 0 with value: 0.0731286290468275.[0m
[32m[I 2022-12-12 23:15:24,025][0m Trial 2 finished with value: 0.0021169665641801766 and parameters: {'n_factors': 30, 'learning_rate': 0.11644543573618182, 'user_alpha': 0.9577078949136297, 'item_alpha': 0.47454145738571396, 'epochs': 9}. Best is trial 0 with value: 0.0731286290468275.

Number of finished trials: 100
Best trial:
  Value: 0.07712667232330807
  Params: 
    n_factors: 18
    learning_rate: 0.30112589099935444
    user_alpha: 0.20170808005297858
    item_alpha: 0.12523508146591408
    epochs: 2


### Make optimization for ALS

В целях экономии времени, т.к. LightFM считался практически 12 часов, пришлось уменьшить размер данных для ALS

In [85]:
def objective_function(
    model,
) -> float:

    model.fit(DATASET)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=DATASET,
        k=K_FOR_METRIC,
        filter_viewed=True,
    )

    return MAP(k=K_FOR_METRIC).calc(recos, TEST)

def objective(trial):


    N_FACTORS = trial.suggest_int("n_factors", 8, 64)
    REGULARIZATION = trial.suggest_float("regularization", 0, 1)
    ITERATIONS = trial.suggest_int("iterations", 10, 1000)
    VER_NEG_SAMPLES = trial.suggest_categorical("verify_negative_samples", [True, False])

    model = ImplicitALSWrapperModel(
        AlternatingLeastSquares(
            factors=N_FACTORS,
            random_state=42,
            iterations=ITERATIONS,
            regularization=REGULARIZATION,
            num_threads=NUM_THREADS
        ),
    )

    return objective_function(model)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2022-12-13 09:39:54,680][0m A new study created in memory with name: no-name-97e6eec7-1e84-4725-bfc7-1856cfdcc3fe[0m
[32m[I 2022-12-13 09:40:10,481][0m Trial 0 finished with value: 0.028644785009254936 and parameters: {'n_factors': 8, 'regularization': 0.14800262472442505, 'iterations': 171, 'verify_negative_samples': True}. Best is trial 0 with value: 0.028644785009254936.[0m
[32m[I 2022-12-13 09:41:01,932][0m Trial 1 finished with value: 0.02355095935326332 and parameters: {'n_factors': 52, 'regularization': 0.8249422934192033, 'iterations': 446, 'verify_negative_samples': True}. Best is trial 0 with value: 0.028644785009254936.[0m
[32m[I 2022-12-13 09:42:28,177][0m Trial 2 finished with value: 0.0258039396440153 and parameters: {'n_factors': 29, 'regularization': 0.9043450288395243, 'iterations': 776, 'verify_negative_samples': True}. Best is trial 0 with value: 0.028644785009254936.[0m
[32m[I 2022-12-13 09:43:27,018][0m Trial 3 finished with value: 0.0247187757

Number of finished trials: 100
Best trial:
  Value: 0.028788608580317746
  Params: 
    n_factors: 8
    regularization: 0.3404183964091668
    iterations: 232
    verify_negative_samples: False
