In [14]:
## DO NOT ERASE THIS. IMPORTANT TO CORRECTLY IMPORT MODULES
import sys

sys.path.append("../")
sys.executable

'/Users/kristina/Desktop/University/COURSE_WORK/RecSys_thesis/recsysvenv/bin/python3.12'

In [15]:
import logging
from datetime import datetime
from pathlib import Path
import pandas as pd

pd.set_option("display.max_rows", 102)
from tqdm.auto import tqdm

import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

import optuna

from src.utils import (
    load_MovieLens,
    train_test_val_split,
    seed_everything,
    split_test_df,
    train,
    predict,
    enrich_train_with_negatives,
)
from src.models import AutoInt
from src.metrics import reccomendation_report

## Constants

In [16]:
DATA_FOLDER = "../../data/ml-1m/"
RANDOM_STATE = 7
MODEL_NAME = "AutoInt"

In [17]:
seed_everything(RANDOM_STATE)

## Data

In [18]:
# subsample 30% of data for tuning
df_users, df_movies, df_ratings = load_MovieLens(DATA_FOLDER, sample_frac=0.3)

### Train-test split
Methodology: Last user interaction is a test item. The rest is train. Validation part is 20% of test.

In [19]:
df_train, df_test, df_val = train_test_val_split(df_ratings, df_movies, RANDOM_STATE)
df_test = pd.concat([df_test, df_val], ignore_index=True)

Enriching test:   0%|          | 0/1449 [00:00<?, ?it/s]

Enriching val:   0%|          | 0/363 [00:00<?, ?it/s]

In [20]:
num_users = df_users["userId"].nunique()
num_items = df_movies["movieId"].nunique()

In [21]:
seed_everything(RANDOM_STATE)
df_train = enrich_train_with_negatives(df_train, df_movies)

  0%|          | 0/297511 [00:00<?, ?it/s]

### Loading additional data

In [22]:
# cos_dist = computeCosineSimilarities(df_train, "userId", "movieId", df_movies["movieId"].nunique())

# with open("../../data/cos_dist.pt", "wb") as f:
#     torch.save(cos_dist, f)

with open("../../data/cos_dist.pt", "rb") as f:
    cos_dist = torch.load(f)
popularity = torch.bincount(
    torch.tensor(df_train.movieId.values), minlength=df_movies["movieId"].nunique()
)
popularity = popularity / torch.max(popularity)

In [23]:
ord_user = OrdinalEncoder()
user_cat = pd.DataFrame(
    ord_user.fit_transform(df_users[["gender", "occupation"]]),
    columns=ord_user.feature_names_in_,
).assign(userId=df_users["userId"])

ord_movie = OrdinalEncoder()
movie_cat = pd.DataFrame(
    ord_movie.fit_transform(df_movies[["genre"]]), columns=ord_movie.feature_names_in_
).assign(movieId=df_movies["movieId"])

ss_user = StandardScaler()
user_num = pd.DataFrame(
    ss_user.fit_transform(df_users[["age"]]), columns=ss_user.feature_names_in_
).assign(userId=df_users["userId"])

df_train = (
    df_train[["userId", "movieId", "label"]]
    .merge(user_cat, on="userId")
    .merge(user_num, on="userId")
    .merge(movie_cat, on="movieId")
    .rename(columns={"label": "action"})
)
df_test = (
    df_test[["userId", "movieId", "action"]]
    .merge(user_cat, on="userId")
    .merge(user_num, on="userId")
    .merge(movie_cat, on="movieId")
)

## Tuning

In [24]:
def objective(trial):
    # Define range of values to be tested for the hyperparameters
    num_attn_blocks = trial.suggest_int("num_attn_blocks", 1, 10)
    n_mlp_layers = trial.suggest_int("n_mlp_layers", 3, 10)
    mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
    mlp_kwargs_dropout = trial.suggest_categorical("mlp_kwargs_dropout", [True, False])
    mlp_kwargs_batchnorm = trial.suggest_categorical(
        "mlp_kwargs_batchnorm", [True, False]
    )
    mlp_kwargs_dropout_rate = trial.suggest_float("mlp_kwargs_dropout_rate", 0.1, 0.9)

    lr = trial.suggest_float("lr", 1e-4, 5e-3)
    n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)

    # Generate the model
    seed_everything(RANDOM_STATE)
    autoint = AutoInt(
        task="classification",
        target=["action"],
        learning_rate=lr,
        head="LinearHead",
        max_epochs=int(n_epochs),
        layers="-".join([str(mlp_layers_dim) for _ in range(n_mlp_layers)]),
        num_attn_blocks=num_attn_blocks,
        verbose=False,
    )

    autoint.fit(df_train)

    # Evaluate
    df_test["pred"] = autoint.predict(df_test)
    pred, target, pred_items = split_test_df(
        df_test, "userId", "movieId", "pred", "action"
    )
    k = 15
    hit_rate = reccomendation_report(
        pred, target, pred_items, cos_dist, popularity, k=k
    )[f"Hit rate @ {k}"]

    return hit_rate.item()

In [25]:
# set up logging
directory = Path(f"optuna/{MODEL_NAME}")
if not directory.exists():
    directory.mkdir()
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

logger = logging.getLogger("optuna")
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
file_handler = logging.FileHandler(
    f"{directory.as_posix()}/optuna_logs_{current_time}.log"
)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

In [26]:
# launch tuning
study = optuna.create_study(direction="maximize")
study.optimize(objective, timeout=3600)

[I 2024-05-13 17:51:08,516] A new study created in memory with name: no-name-7c17a0d8-e644-47ff-9bbf-082f8222bac7
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
Seed set to 42
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col]

Output()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
[I 2024-05-13 18:00:03,927] Trial 0 finished with value: 0.5728476643562317 and parameters: {'num_attn_blocks': 10, 'n_mlp_layers': 7, 'mlp_layers_dim': 400.0, 'mlp_kwargs_dropout': False, 'mlp_kwargs_batchnorm': False, 'mlp_kwargs_dropout_rate': 0.260756289102412, 'lr': 0.00256962766874793, 'n_epochs': 13.0}. Best is trial 0 with value: 0.5728476643562317.
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
Seed set to 42
The behavior will change in pandas 3.0. Thi

Output()

`Trainer.fit` stopped: `max_epochs=17` reached.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
[I 2024-05-13 18:11:57,194] Trial 1 finished with value: 0.685982346534729 and parameters: {'num_attn_blocks': 7, 'n_mlp_layers': 6, 'mlp_layers_dim': 480.0, 'mlp_kwargs_dropout': False, 'mlp_kwargs_batchnorm': True, 'mlp_kwargs_dropout_rate': 0.4061264921701848, 'lr': 0.003073489226983736, 'n_epochs': 17.0}. Best is trial 1 with value: 0.685982346534729.
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
Seed set to 42
The behavior will change in pandas 3.0. This 

Output()

`Trainer.fit` stopped: `max_epochs=11` reached.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
[I 2024-05-13 18:14:08,421] Trial 2 finished with value: 0.6986755132675171 and parameters: {'num_attn_blocks': 1, 'n_mlp_layers': 7, 'mlp_layers_dim': 496.0, 'mlp_kwargs_dropout': False, 'mlp_kwargs_batchnorm': True, 'mlp_kwargs_dropout_rate': 0.6824616853932067, 'lr': 0.004633256565142509, 'n_epochs': 11.0}. Best is trial 2 with value: 0.6986755132675171.
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
Seed set to 42
The behavior will change in pandas 3.0. Thi

Output()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
[I 2024-05-13 18:19:38,975] Trial 3 finished with value: 0.573399543762207 and parameters: {'num_attn_blocks': 8, 'n_mlp_layers': 8, 'mlp_layers_dim': 448.0, 'mlp_kwargs_dropout': True, 'mlp_kwargs_batchnorm': False, 'mlp_kwargs_dropout_rate': 0.615339443684543, 'lr': 0.004249875941194546, 'n_epochs': 13.0}. Best is trial 2 with value: 0.6986755132675171.
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
Seed set to 42
The behavior will change in pandas 3.0. This 

Output()

`Trainer.fit` stopped: `max_epochs=5` reached.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
[I 2024-05-13 18:21:35,635] Trial 4 finished with value: 0.6147902607917786 and parameters: {'num_attn_blocks': 3, 'n_mlp_layers': 10, 'mlp_layers_dim': 256.0, 'mlp_kwargs_dropout': False, 'mlp_kwargs_batchnorm': False, 'mlp_kwargs_dropout_rate': 0.8061700155101121, 'lr': 0.0030466430045243825, 'n_epochs': 5.0}. Best is trial 2 with value: 0.6986755132675171.
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
Seed set to 42
The behavior will change in pandas 3.0. T

Output()

`Trainer.fit` stopped: `max_epochs=9` reached.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
[I 2024-05-13 18:26:17,123] Trial 5 finished with value: 0.6423841118812561 and parameters: {'num_attn_blocks': 5, 'n_mlp_layers': 4, 'mlp_layers_dim': 80.0, 'mlp_kwargs_dropout': False, 'mlp_kwargs_batchnorm': False, 'mlp_kwargs_dropout_rate': 0.8492085385801856, 'lr': 0.0049785008060468155, 'n_epochs': 9.0}. Best is trial 2 with value: 0.6986755132675171.
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
Seed set to 42
The behavior will change in pandas 3.0. Thi

Output()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
[I 2024-05-13 18:32:49,059] Trial 6 finished with value: 0.5789183378219604 and parameters: {'num_attn_blocks': 10, 'n_mlp_layers': 3, 'mlp_layers_dim': 112.0, 'mlp_kwargs_dropout': True, 'mlp_kwargs_batchnorm': False, 'mlp_kwargs_dropout_rate': 0.2091662035723502, 'lr': 0.0044008494597207685, 'n_epochs': 9.0}. Best is trial 2 with value: 0.6986755132675171.
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
Seed set to 42
The behavior will change in pandas 3.0. Th

Output()

`Trainer.fit` stopped: `max_epochs=17` reached.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
[I 2024-05-13 18:38:50,618] Trial 7 finished with value: 0.6423841118812561 and parameters: {'num_attn_blocks': 3, 'n_mlp_layers': 9, 'mlp_layers_dim': 400.0, 'mlp_kwargs_dropout': True, 'mlp_kwargs_batchnorm': False, 'mlp_kwargs_dropout_rate': 0.2558446764784378, 'lr': 0.0014025772185221178, 'n_epochs': 17.0}. Best is trial 2 with value: 0.6986755132675171.
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
Seed set to 42
The behavior will change in pandas 3.0. Th

Output()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
[I 2024-05-13 18:44:51,700] Trial 8 finished with value: 0.5722957849502563 and parameters: {'num_attn_blocks': 8, 'n_mlp_layers': 4, 'mlp_layers_dim': 288.0, 'mlp_kwargs_dropout': True, 'mlp_kwargs_batchnorm': False, 'mlp_kwargs_dropout_rate': 0.14192971821739686, 'lr': 0.0040521558358012815, 'n_epochs': 11.0}. Best is trial 2 with value: 0.6986755132675171.
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
Seed set to 42
The behavior will change in pandas 3.0. T

Output()

`Trainer.fit` stopped: `max_epochs=7` reached.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
[I 2024-05-13 18:50:49,637] Trial 9 finished with value: 0.5756070613861084 and parameters: {'num_attn_blocks': 9, 'n_mlp_layers': 4, 'mlp_layers_dim': 64.0, 'mlp_kwargs_dropout': False, 'mlp_kwargs_batchnorm': True, 'mlp_kwargs_dropout_rate': 0.8218342956116115, 'lr': 0.004137426038315936, 'n_epochs': 7.0}. Best is trial 2 with value: 0.6986755132675171.
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
Seed set to 42
The behavior will change in pandas 3.0. This 

Output()

`Trainer.fit` stopped: `max_epochs=19` reached.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
[I 2024-05-13 18:54:22,601] Trial 10 finished with value: 0.6683223247528076 and parameters: {'num_attn_blocks': 1, 'n_mlp_layers': 6, 'mlp_layers_dim': 272.0, 'mlp_kwargs_dropout': False, 'mlp_kwargs_batchnorm': True, 'mlp_kwargs_dropout_rate': 0.6143091642308008, 'lr': 0.0012284355704123694, 'n_epochs': 19.0}. Best is trial 2 with value: 0.6986755132675171.
