In [13]:
## DO NOT ERASE THIS. IMPORTANT TO CORRECTLY IMPORT MODULES
import sys

sys.path.append("../")
sys.executable

'/Users/kristina/Desktop/University/COURSE_WORK/RecSys_thesis/recsysvenv/bin/python3.12'

In [14]:
import logging
from datetime import datetime
from pathlib import Path
import pandas as pd
import mmh3

pd.set_option("display.max_rows", 102)
from tqdm.auto import tqdm

import torch
from torch.utils.data import DataLoader, TensorDataset

import optuna

from src.utils import (
    load_MovieLens,
    train_test_val_split,
    seed_everything,
    trainDatasetWithCrossFeatures,
    split_test_df,
    train,
    predict,
)
from src.models import wideAndDeep
from src.metrics import reccomendation_report

## Constants

In [15]:
DATA_FOLDER = "../../data/ml-1m/"
RANDOM_STATE = 7
MODEL_NAME = "Wide_and_deep"

In [16]:
seed_everything(RANDOM_STATE)

## Data

In [17]:
# subsample 30% of data for tuning
df_users, df_movies, df_ratings = load_MovieLens(DATA_FOLDER, sample_frac=0.3)

### Train-test split
Methodology: Last user interaction is a test item. The rest is train. Validation part is 20% of test.

In [18]:
df_train, df_test, df_val = train_test_val_split(df_ratings, df_movies, RANDOM_STATE)
df_test = pd.concat([df_test, df_val], ignore_index=True)

Enriching test:   0%|          | 0/1449 [00:00<?, ?it/s]

Enriching val:   0%|          | 0/363 [00:00<?, ?it/s]

In [19]:
num_users = df_users["userId"].nunique()
num_items = df_movies["movieId"].nunique()

### Loading additional data

In [20]:
# cos_dist = computeCosineSimilarities(df_train, "userId", "movieId", df_movies["movieId"].nunique())

# with open("../../data/cos_dist.pt", "wb") as f:
#     torch.save(cos_dist, f)

with open("../../data/cos_dist.pt", "rb") as f:
    cos_dist = torch.load(f)
popularity = torch.bincount(
    torch.tensor(df_train.movieId.values), minlength=df_movies["movieId"].nunique()
)
popularity = popularity / torch.max(popularity)

In [21]:
# building cross features
cross_feats_dim = 30
df_users["combined_feat"] = (
    df_users["gender"].astype(str)
    + df_users["age"].astype(str)
    + df_users["occupation"].astype(str)
)
user_features_mapping = {
    i: df_users.loc[df_users.userId == i, "combined_feat"].values[0]
    for i in df_users.userId.unique()
}
movie_features_mapping = {
    i: df_movies.loc[df_movies.movieId == i, "genre"].values[0]
    for i in df_movies.movieId.unique()
}


# cross features for test
test_idx = (
    (
        pd.Series(list(map(lambda x: user_features_mapping[x], df_test["userId"])))
        + pd.Series(list(map(lambda x: movie_features_mapping[x], df_test["movieId"])))
    )
    .apply(lambda x: mmh3.hash(x) % cross_feats_dim)
    .values
)
cross_test = torch.zeros(df_test.shape[0], cross_feats_dim)
cross_test[torch.arange(cross_test.shape[0]), test_idx] = 1

## Tuning

In [22]:
def objective(trial):
    # Define range of values to be tested for the hyperparameters
    n_mlp_layers = trial.suggest_int("n_mlp_layers", 3, 10)
    mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
    mlp_kwargs_dropout = trial.suggest_categorical("mlp_kwargs_dropout", [True, False])
    mlp_kwargs_batchnorm = trial.suggest_categorical(
        "mlp_kwargs_batchnorm", [True, False]
    )
    mlp_kwargs_dropout_rate = trial.suggest_float("mlp_kwargs_dropout_rate", 0.1, 0.9)

    lr = trial.suggest_float("lr", 1e-4, 5e-3)
    n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)

    # Generate the model
    seed_everything(RANDOM_STATE)
    model = wideAndDeep(
        num_users,
        num_items,
        cross_feats_dim=cross_feats_dim,
        n_mlp_layers=n_mlp_layers,
        mlp_layers_dim=int(mlp_layers_dim),
        mlp_kwargs={
            "activation": True,
            "dropout": mlp_kwargs_dropout,
            "batchnorm": mlp_kwargs_batchnorm,
            "dropout_rate": mlp_kwargs_dropout_rate,
        },
    )

    # Generate the optimizers
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8)
    criterion = torch.nn.BCELoss()
    n_epochs = int(n_epochs)
    device = "cpu"

    # Generate data
    seed_everything(RANDOM_STATE)
    train_loader = DataLoader(
        trainDatasetWithCrossFeatures(
            df_train,
            df_movies["movieId"].nunique(),
            user_features_mapping,
            movie_features_mapping,
            hash_bucket_size=cross_feats_dim,
            verbose=False,
        ),
        batch_size=2048,
        shuffle=True,
    )
    test_loader = DataLoader(
        TensorDataset(
            torch.tensor(df_test["userId"]),
            torch.tensor(df_test["movieId"]),
            cross_test,
        ),
        batch_size=4096,
        shuffle=False,
    )
    # Train model
    train(
        model,
        train_loader,
        optimizer,
        scheduler,
        criterion,
        n_epochs,
        verbose=False,
    )

    # Evaluate
    df_test["pred"] = predict(model, test_loader, verbose=False).numpy()
    pred, target, pred_items = split_test_df(
        df_test, "userId", "movieId", "pred", "action"
    )
    k = 15
    hit_rate = reccomendation_report(
        pred, target, pred_items, cos_dist, popularity, k=k
    )[f"Hit rate @ {k}"]

    return hit_rate.item()

In [23]:
# set up logging
directory = Path(f"optuna/{MODEL_NAME}")
if not directory.exists():
    directory.mkdir()
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

logger = logging.getLogger("optuna")
logger.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
file_handler = logging.FileHandler(
    f"{directory.as_posix()}/optuna_logs_{current_time}.log"
)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

In [24]:
# launch tuning
study = optuna.create_study(direction="maximize")
study.optimize(objective, timeout=3600)

[I 2024-05-14 15:23:20,421] A new study created in memory with name: no-name-973a406d-efe9-41ab-961b-e881c9f33fb7
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
[I 2024-05-14 15:31:30,349] Trial 0 finished with value: 0.19260485470294952 and parameters: {'n_mlp_layers': 9, 'mlp_layers_dim': 448.0, 'mlp_kwargs_dropout': True, 'mlp_kwargs_batchnorm': True, 'mlp_kwargs_dropout_rate': 0.8180674440911959, 'lr': 0.0006128156928358027, 'n_epochs': 5.0}. Best is trial 0 with value: 0.19260485470294952.
  mlp_layers_dim = trial.suggest_discrete_uniform("mlp_layers_dim", 16, 512, 16)
  n_epochs = trial.suggest_discrete_uniform("n_epochs", 5, 20, 2)
[I 2024-05-14 15:38:38,872] Trial 1 finished with value: 0.567328929901123 and parameters: {'n_mlp_layers': 5, 'mlp_layers_dim': 48.0, 'mlp_kwargs_dropout': True, 'mlp_kwargs_batchnorm': True, 'mlp_kwargs_dropout_rate': 0.4643348562793439, 'lr': 0.00156