In [1]:
## DO NOT ERASE THIS. IMPORTANT TO CORRECTLY IMPORT MODULES
import sys

sys.path.append("../")
sys.executable

'/Users/kristina/Desktop/University/COURSE_WORK/RecSys_thesis/recsysvenv/bin/python3.12'

In [2]:
import logging
from datetime import datetime
from pathlib import Path
import pandas as pd

pd.set_option("display.max_rows", 102)
from tqdm.auto import tqdm

import torch
from torch.utils.data import DataLoader, TensorDataset

import optuna

from src.utils import (
    load_MovieLens,
    train_test_val_split,
    seed_everything,
    UserMovieDataset,
    split_test_df,
    train,
    predict,
)
from src.models import NCF
from src.metrics import reccomendation_report

## Constants

In [3]:
DATA_FOLDER = "../../data/ml-1m/"
RANDOM_STATE = 7
MODEL_NAME = "NCF"

In [4]:
seed_everything(RANDOM_STATE)

## Data

In [5]:
# subsample 30% of data for tuning
df_users, df_movies, df_ratings = load_MovieLens(DATA_FOLDER, sample_frac=0.3)

### Train-test split
Methodology: Last user interaction is a test item. The rest is train. Validation part is 20% of test.

In [6]:
df_train, df_test, df_val = train_test_val_split(df_ratings, df_movies, RANDOM_STATE)
df_test = pd.concat([df_test, df_val], ignore_index=True)

Enriching test:   0%|          | 0/1449 [00:00<?, ?it/s]

Enriching val:   0%|          | 0/363 [00:00<?, ?it/s]

In [7]:
num_users = df_users["userId"].nunique()
num_items = df_movies["movieId"].nunique()

### Loading additional data

In [8]:
# cos_dist = computeCosineSimilarities(df_train, "userId", "movieId", df_movies["movieId"].nunique())

# with open("../../data/cos_dist.pt", "wb") as f:
#     torch.save(cos_dist, f)

with open("../../data/cos_dist.pt", "rb") as f:
    cos_dist = torch.load(f)
popularity = torch.bincount(
    torch.tensor(df_train.movieId.values), minlength=df_movies["movieId"].nunique()
)
popularity = popularity / torch.max(popularity)

## Tuning

In [9]:
def objective(trial):
    # Define range of values to be tested for the hyperparameters
    n_mlp_layers = trial.suggest_int("n_mlp_layers", 3, 10)
    mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
    mlp_kwargs_dropout = trial.suggest_categorical("mlp_kwargs_dropout", [True, False])
    mlp_kwargs_batchnorm = trial.suggest_categorical(
        "mlp_kwargs_batchnorm", [True, False]
    )
    mlp_kwargs_dropout_rate = trial.suggest_float("mlp_kwargs_dropout_rate", 0.1, 0.9)

    lr = trial.suggest_float("lr", 1e-4, 5e-3)
    n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)

    # Generate the model
    seed_everything(RANDOM_STATE)
    model = NCF(
        num_users,
        num_items,
        n_mlp_layers=n_mlp_layers,
        mlp_layers_dim=int(mlp_layers_dim),
        mlp_kwargs={
            "activation": True,
            "dropout": mlp_kwargs_dropout,
            "batchnorm": mlp_kwargs_batchnorm,
            "dropout_rate": mlp_kwargs_dropout_rate,
        },
    )

    # Generate the optimizers
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8)
    criterion = torch.nn.BCELoss()
    n_epochs = int(n_epochs)
    device = "cpu"

    # Generate data
    seed_everything(RANDOM_STATE)
    train_loader = DataLoader(
        UserMovieDataset(df_train, df_movies["movieId"].nunique(), verbose=False),
        batch_size=2048,
        shuffle=True,
    )
    test_loader = DataLoader(
        TensorDataset(
            torch.tensor(df_test["userId"]), torch.tensor(df_test["movieId"])
        ),
        batch_size=4096,
        shuffle=False,
    )

    # Train model
    train(
        model,
        train_loader,
        optimizer,
        scheduler,
        criterion,
        n_epochs,
        verbose=False,
    )

    # Evaluate
    df_test["pred"] = predict(model, test_loader, verbose=False).numpy()
    pred, target, pred_items = split_test_df(
        df_test, "userId", "movieId", "pred", "action"
    )
    k = 15
    hit_rate = reccomendation_report(
        pred, target, pred_items, cos_dist, popularity, k=k
    )[f"Hit rate @ {k}"]

    return hit_rate.item()

In [10]:
# set up logging
directory = Path(f"optuna/{MODEL_NAME}")
if not directory.exists():
    directory.mkdir()
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

logger = logging.getLogger("optuna")
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
file_handler = logging.FileHandler(
    f"{directory.as_posix()}/optuna_logs_{current_time}.log"
)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

In [11]:
# launch tuning
study = optuna.create_study(direction="maximize")
study.optimize(objective, timeout=1800)

[I 2024-05-12 17:08:26,941] A new study created in memory with name: no-name-a06c15ff-130b-44b1-9a0f-8bb8c39a4797
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
[I 2024-05-12 17:12:48,003] Trial 0 finished with value: 0.5551876425743103 and parameters: {'n_mlp_layers': 4, 'mlp_layers_dim': 304.0, 'mlp_kwargs_dropout': False, 'mlp_kwargs_batchnorm': False, 'mlp_kwargs_dropout_rate': 0.6579776259944203, 'lr': 0.0011921813618379527, 'n_epochs': 11.0}. Best is trial 0 with value: 0.5551876425743103.
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
[I 2024-05-12 17:16:56,142] Trial 1 finished with value: 0.6512141227722168 and parameters: {'n_mlp_layers': 6, 'mlp_layers_dim': 496.0, 'mlp_kwargs_dropout': False, 'mlp_kwargs_batchnorm': False, 'mlp_kwargs_dropout_rate': 0.5587254241057561, 'lr': 0.