In [1]:
## DO NOT ERASE THIS. IMPORTANT TO CORRECTLY IMPORT MODULES
import sys

sys.path.append("../")
sys.executable

'/Users/kristina/Desktop/University/COURSE_WORK/RecSys_thesis/recsysvenv/bin/python3.12'

In [2]:
import logging
from datetime import datetime
from pathlib import Path
import pandas as pd

pd.set_option("display.max_rows", 102)
from tqdm.auto import tqdm

import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

import optuna

from src.utils import (
    load_MovieLens,
    train_test_val_split,
    seed_everything,
    split_test_df,
    train,
    predict,
    enrich_train_with_negatives,
)
from src.models import AutoInt
from src.metrics import reccomendation_report

## Constants

In [3]:
DATA_FOLDER = "../../data/ml-1m/"
RANDOM_STATE = 7
MODEL_NAME = "AutoInt"

In [4]:
seed_everything(RANDOM_STATE)

## Data

In [5]:
# subsample 30% of data for tuning
df_users, df_movies, df_ratings = load_MovieLens(DATA_FOLDER, sample_frac=0.3)

### Train-test split
Methodology: Last user interaction is a test item. The rest is train. Validation part is 20% of test.

In [6]:
df_train, df_test, df_val = train_test_val_split(df_ratings, df_movies, RANDOM_STATE)
df_test = pd.concat([df_test, df_val], ignore_index=True)

Enriching test:   0%|          | 0/48 [00:00<?, ?it/s]

Enriching val:   0%|          | 0/12 [00:00<?, ?it/s]

In [7]:
num_users = df_users["userId"].nunique()
num_items = df_movies["movieId"].nunique()

In [8]:
seed_everything(RANDOM_STATE)
df_train = enrich_train_with_negatives(df_train, df_movies)

  0%|          | 0/10585 [00:00<?, ?it/s]

### Loading additional data

In [9]:
# cos_dist = computeCosineSimilarities(df_train, "userId", "movieId", df_movies["movieId"].nunique())

# with open("../../data/cos_dist.pt", "wb") as f:
#     torch.save(cos_dist, f)

with open("../../data/cos_dist.pt", "rb") as f:
    cos_dist = torch.load(f)
popularity = torch.bincount(
    torch.tensor(df_train.movieId.values), minlength=df_movies["movieId"].nunique()
)
popularity = popularity / torch.max(popularity)

In [10]:
ord_user = OrdinalEncoder()
user_cat = pd.DataFrame(
    ord_user.fit_transform(df_users[["gender", "occupation"]]),
    columns=ord_user.feature_names_in_,
).assign(userId=df_users["userId"])

ord_movie = OrdinalEncoder()
movie_cat = pd.DataFrame(
    ord_movie.fit_transform(df_movies[["genre"]]), columns=ord_movie.feature_names_in_
).assign(movieId=df_movies["movieId"])

ss_user = StandardScaler()
user_num = pd.DataFrame(
    ss_user.fit_transform(df_users[["age"]]), columns=ss_user.feature_names_in_
).assign(userId=df_users["userId"])

df_train = (
    df_train[["userId", "movieId", "label"]]
    .merge(user_cat, on="userId")
    .merge(user_num, on="userId")
    .merge(movie_cat, on="movieId")
    .rename(columns={"label": "action"})
)
df_test = (
    df_test[["userId", "movieId", "action"]]
    .merge(user_cat, on="userId")
    .merge(user_num, on="userId")
    .merge(movie_cat, on="movieId")
)

## Tuning

In [11]:
def objective(trial):
    # Define range of values to be tested for the hyperparameters
    num_attn_blocks = trial.suggest_int("num_attn_blocks", 1, 10)
    n_mlp_layers = trial.suggest_int("n_mlp_layers", 3, 10)
    mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
    mlp_kwargs_dropout = trial.suggest_categorical("mlp_kwargs_dropout", [True, False])
    mlp_kwargs_batchnorm = trial.suggest_categorical(
        "mlp_kwargs_batchnorm", [True, False]
    )
    mlp_kwargs_dropout_rate = trial.suggest_float("mlp_kwargs_dropout_rate", 0.1, 0.9)

    lr = trial.suggest_float("lr", 1e-4, 5e-3)
    n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)

    # Generate the model
    seed_everything(RANDOM_STATE)
    autoint = AutoInt(
        task="classification",
        target=["action"],
        learning_rate=lr,
        head="LinearHead",
        max_epochs=int(n_epochs),
        layers="-".join([str(mlp_layers_dim) for _ in range(n_mlp_layers)]),
        num_attn_blocks=num_attn_blocks,
        verbose=False,
    )

    autoint.fit(df_train)

    # Evaluate
    df_test["pred"] = autoint.predict(df_test)
    pred, target, pred_items = split_test_df(
        df_test, "userId", "movieId", "pred", "action"
    )
    k = 15
    hit_rate = reccomendation_report(
        pred, target, pred_items, cos_dist, popularity, k=k
    )[f"Hit rate @ {k}"]

    return hit_rate.item()

In [12]:
# set up logging
directory = Path(f"optuna/{MODEL_NAME}")
if not directory.exists():
    directory.mkdir()
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

logger = logging.getLogger("optuna")
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
file_handler = logging.FileHandler(
    f"{directory.as_posix()}/optuna_logs_{current_time}.log"
)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

In [13]:
# launch tuning
study = optuna.create_study(direction="maximize")
study.optimize(objective, timeout=3600)

[I 2024-05-12 20:25:29,772] A new study created in memory with name: no-name-229d2507-68e2-4f53-8740-2131bca383ff
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
Seed set to 42
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col]

Output()

`Trainer.fit` stopped: `max_epochs=11` reached.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
[I 2024-05-12 20:25:55,939] Trial 0 finished with value: 0.44999998807907104 and parameters: {'num_attn_blocks': 9, 'n_mlp_layers': 7, 'mlp_layers_dim': 336.0, 'mlp_kwargs_dropout': False, 'mlp_kwargs_batchnorm': False, 'mlp_kwargs_dropout_rate': 0.6588939541623324, 'lr': 0.0018036696629278783, 'n_epochs': 11.0}. Best is trial 0 with value: 0.44999998807907104.
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
Seed set to 42
The behavior will change in pandas 3.0.

Output()

`Trainer.fit` stopped: `max_epochs=7` reached.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
[I 2024-05-12 20:26:02,208] Trial 1 finished with value: 0.18333333730697632 and parameters: {'num_attn_blocks': 2, 'n_mlp_layers': 10, 'mlp_layers_dim': 64.0, 'mlp_kwargs_dropout': True, 'mlp_kwargs_batchnorm': True, 'mlp_kwargs_dropout_rate': 0.13240345615750293, 'lr': 0.0010291036117014286, 'n_epochs': 7.0}. Best is trial 0 with value: 0.44999998807907104.
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
Seed set to 42
The behavior will change in pandas 3.0. T

Output()

`Trainer.fit` stopped: `max_epochs=7` reached.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
[I 2024-05-12 20:26:10,574] Trial 2 finished with value: 0.30000001192092896 and parameters: {'num_attn_blocks': 3, 'n_mlp_layers': 9, 'mlp_layers_dim': 32.0, 'mlp_kwargs_dropout': False, 'mlp_kwargs_batchnorm': True, 'mlp_kwargs_dropout_rate': 0.1571934377888784, 'lr': 0.0010850462491468307, 'n_epochs': 7.0}. Best is trial 0 with value: 0.44999998807907104.
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
Seed set to 42
The behavior will change in pandas 3.0. Th

Output()

`Trainer.fit` stopped: `max_epochs=9` reached.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
[I 2024-05-12 20:26:19,211] Trial 3 finished with value: 0.5666666626930237 and parameters: {'num_attn_blocks': 2, 'n_mlp_layers': 6, 'mlp_layers_dim': 112.0, 'mlp_kwargs_dropout': True, 'mlp_kwargs_batchnorm': True, 'mlp_kwargs_dropout_rate': 0.44576393026785144, 'lr': 0.004574395353575305, 'n_epochs': 9.0}. Best is trial 3 with value: 0.5666666626930237.
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
Seed set to 42
The behavior will change in pandas 3.0. This

Output()

`Trainer.fit` stopped: `max_epochs=13` reached.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
[I 2024-05-12 20:26:43,688] Trial 4 finished with value: 0.5 and parameters: {'num_attn_blocks': 7, 'n_mlp_layers': 6, 'mlp_layers_dim': 176.0, 'mlp_kwargs_dropout': False, 'mlp_kwargs_batchnorm': False, 'mlp_kwargs_dropout_rate': 0.19911918241493717, 'lr': 0.0017849742937394368, 'n_epochs': 13.0}. Best is trial 3 with value: 0.5666666626930237.
