In [1]:
## DO NOT ERASE THIS. IMPORTANT TO CORRECTLY IMPORT MODULES
import sys

sys.path.append("../")
sys.executable

'/Users/kristina/Desktop/University/COURSE_WORK/RecSys_thesis/recsysvenv/bin/python3.12'

In [2]:
import logging
from datetime import datetime
from pathlib import Path
import pandas as pd

pd.set_option("display.max_rows", 102)
from tqdm.auto import tqdm

import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

import optuna

from src.utils import (
    load_MovieLens,
    train_test_val_split,
    seed_everything,
    trainDatasetWithNumCatFeatures,
    split_test_df,
    train,
    predict,
)
from src.models import DCNv2
from src.metrics import reccomendation_report

## Constants

In [3]:
DATA_FOLDER = "../../data/ml-1m/"
RANDOM_STATE = 7
MODEL_NAME = "DCNv2"

In [4]:
seed_everything(RANDOM_STATE)

## Data

In [5]:
# subsample 30% of data for tuning
df_users, df_movies, df_ratings = load_MovieLens(DATA_FOLDER, sample_frac=0.3)

### Train-test split
Methodology: Last user interaction is a test item. The rest is train. Validation part is 20% of test.

In [6]:
df_train, df_test, df_val = train_test_val_split(df_ratings, df_movies, RANDOM_STATE)
df_test = pd.concat([df_test, df_val], ignore_index=True)

Enriching test:   0%|          | 0/48 [00:00<?, ?it/s]

Enriching val:   0%|          | 0/12 [00:00<?, ?it/s]

In [7]:
num_users = df_users["userId"].nunique()
num_items = df_movies["movieId"].nunique()

### Loading additional data

In [8]:
# cos_dist = computeCosineSimilarities(df_train, "userId", "movieId", df_movies["movieId"].nunique())

# with open("../../data/cos_dist.pt", "wb") as f:
#     torch.save(cos_dist, f)

with open("../../data/cos_dist.pt", "rb") as f:
    cos_dist = torch.load(f)
popularity = torch.bincount(
    torch.tensor(df_train.movieId.values), minlength=df_movies["movieId"].nunique()
)
popularity = popularity / torch.max(popularity)

In [9]:
ord_user = OrdinalEncoder()
user_cat = torch.tensor(ord_user.fit_transform(df_users[["gender", "occupation"]]))

ord_movie = OrdinalEncoder()
movie_cat = torch.tensor(ord_movie.fit_transform(df_movies[["genre"]]))

ss_user = StandardScaler()
user_num = torch.tensor(ss_user.fit_transform(df_users[["age"]]))

# features for test
user_cat_test = user_cat[df_test["userId"].values].clone().detach().to(torch.long)
user_num_test = user_num[df_test["userId"].values].clone().detach().to(torch.float)
movie_cat_test = movie_cat[df_test["movieId"].values].clone().detach().to(torch.long)
test_cat = torch.hstack((user_cat_test, movie_cat_test))


num_numeric_feats = 1
cat_feature_vocab = [len(i) for i in ord_user.categories_] + [
    len(i) for i in ord_movie.categories_
]

## Tuning

In [10]:
def objective(trial):
    # Define range of values to be tested for the hyperparameters
    l = trial.suggest_int("l", 1, 25)
    n_mlp_layers = trial.suggest_int("n_mlp_layers", 3, 10)
    mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
    mlp_kwargs_dropout = trial.suggest_categorical("mlp_kwargs_dropout", [True, False])
    mlp_kwargs_batchnorm = trial.suggest_categorical(
        "mlp_kwargs_batchnorm", [True, False]
    )
    mlp_kwargs_dropout_rate = trial.suggest_float("mlp_kwargs_dropout_rate", 0.1, 0.9)

    lr = trial.suggest_float("lr", 1e-4, 5e-3)
    n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)

    # Generate the model
    seed_everything(RANDOM_STATE)
    model = DCNv2(
        num_users,
        num_items,
        num_numeric_feats,
        cat_feature_vocab,
        l=l,
        n_mlp_layers=n_mlp_layers,
        mlp_layers_dim=int(mlp_layers_dim),
        mlp_kwargs={
            "activation": True,
            "dropout": mlp_kwargs_dropout,
            "batchnorm": mlp_kwargs_batchnorm,
            "dropout_rate": mlp_kwargs_dropout_rate,
        },
    )

    # Generate the optimizers
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8)
    criterion = torch.nn.BCELoss()
    n_epochs = int(n_epochs)
    device = "cpu"

    # Generate data
    seed_everything(RANDOM_STATE)
    train_loader = DataLoader(
        trainDatasetWithNumCatFeatures(
            df_train,
            df_movies["movieId"].nunique(),
            user_cat,
            user_num,
            movie_cat,
            verbose=False,
        ),
        batch_size=2048,
        shuffle=True,
    )
    test_loader = DataLoader(
        TensorDataset(
            torch.tensor(df_test["userId"]),
            torch.tensor(df_test["movieId"]),
            user_num_test,
            test_cat,
        ),
        batch_size=4096,
        shuffle=False,
    )

    # Train model
    train(
        model,
        train_loader,
        optimizer,
        scheduler,
        criterion,
        n_epochs,
        verbose=False,
    )

    # Evaluate
    df_test["pred"] = predict(model, test_loader, verbose=False).numpy()
    pred, target, pred_items = split_test_df(
        df_test, "userId", "movieId", "pred", "action"
    )
    k = 15
    hit_rate = reccomendation_report(
        pred, target, pred_items, cos_dist, popularity, k=k
    )[f"Hit rate @ {k}"]

    return hit_rate.item()

In [11]:
# set up logging
directory = Path(f"optuna/{MODEL_NAME}")
if not directory.exists():
    directory.mkdir()
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

logger = logging.getLogger("optuna")
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
file_handler = logging.FileHandler(
    f"{directory.as_posix()}/optuna_logs_{current_time}.log"
)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

In [13]:
# launch tuning
study = optuna.create_study(direction="maximize")
study.optimize(objective, timeout=3600)

[I 2024-05-12 19:58:35,220] A new study created in memory with name: no-name-7f5b9e18-3a9a-4f9e-a3c5-cfd4852409ad
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
[I 2024-05-12 19:58:59,717] Trial 0 finished with value: 0.4333333373069763 and parameters: {'l': 25, 'n_mlp_layers': 4, 'mlp_layers_dim': 144.0, 'mlp_kwargs_dropout': True, 'mlp_kwargs_batchnorm': False, 'mlp_kwargs_dropout_rate': 0.5363591245821298, 'lr': 0.004757231135862841, 'n_epochs': 19.0}. Best is trial 0 with value: 0.4333333373069763.
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
[I 2024-05-12 19:59:17,876] Trial 1 finished with value: 0.3166666626930237 and parameters: {'l': 16, 'n_mlp_layers': 6, 'mlp_layers_dim': 128.0, 'mlp_kwargs_dropout': False, 'mlp_kwargs_batchnorm': False, 'mlp_kwargs_dropout_rate': 0.7655638706