# Imports

In [2]:
import mlflow
import mlflow.pytorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import optuna
import pickle
import os
import sys
# Make the directory two levels above the current working directory importable.
# (The original note says this was added so a shared MODEL_CONFIG module could be
# imported; the config is currently defined inline below.)
# The membership guard keeps this cell idempotent: re-running it does not pile
# duplicate entries onto sys.path.
_PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "../../"))
if _PROJECT_ROOT not in sys.path:
    sys.path.append(_PROJECT_ROOT)

# Experiment configuration shared by every cell in this notebook.
MODEL_CONFIG = {
    # Pickle file holding the train/validation splits (read by load_data).
    "data_path": "artifacts/train_val_data.pkl",
    # Directory used as the MLflow file-based tracking store.
    "mlruns_dir": "artifacts/mlruns",
    # MLflow experiment that every Optuna trial run is logged under.
    "experiment_name": "two_tower_recommender",
    # Number of Optuna trials to run.
    "num_trials": 10,
    # Training epochs per trial.
    "num_epochs": 10,
    # Search space sampled inside objective().
    "hpo_params": {
        "embedding_dim": [32, 64, 96],      # categorical choices
        "dropout_range": (0.1, 0.3),        # (low, high) for a uniform float
        "lr_range": (1e-4, 1e-2),           # (low, high), sampled on a log scale
        "batch_size": [1024, 2048]          # categorical choices
    }
}

# Section 1: Define Two-Tower Regression Model

In [None]:
class TwoTowerModel(nn.Module):
    """Two-tower regressor over user features and content features.

    Each input is encoded by its own MLP tower; the two embeddings are
    concatenated and mapped to a single scalar by a linear head.

    Args:
        user_dim: Number of input features fed to the user tower.
        content_dim: Number of input features fed to the content tower.
        embedding_dim: Width of both hidden layers and of each tower's output.
        dropout: Dropout probability applied after the first hidden layer.
    """

    def __init__(self, user_dim, content_dim, embedding_dim, dropout=0.2):
        super().__init__()
        # Towers are built user-first, then content, then the head, so the
        # order in which parameters are initialised stays the same.
        self.user_tower = self._make_tower(user_dim, embedding_dim, dropout)
        self.content_tower = self._make_tower(content_dim, embedding_dim, dropout)
        # Regression head over the concatenated [user, content] embedding.
        self.output_layer = nn.Linear(2 * embedding_dim, 1)

    @staticmethod
    def _make_tower(in_features, width, drop_prob):
        """Build one tower: Linear-BN-ReLU-Dropout followed by Linear-BN-ReLU."""
        layers = [
            nn.Linear(in_features, width),
            nn.BatchNorm1d(width),
            nn.ReLU(),
            nn.Dropout(drop_prob),
            nn.Linear(width, width),
            nn.BatchNorm1d(width),
            nn.ReLU(),
        ]
        return nn.Sequential(*layers)

    def forward(self, u, c):
        """Encode both inputs and predict one score per row.

        Args:
            u: User feature batch.
            c: Content feature batch; concatenated with ``u``'s embedding
                along dim 1, so both are expected to be 2-D with matching
                batch size.

        Returns:
            Tuple ``(prediction, user_embedding, content_embedding)`` where the
            prediction has its trailing size-1 dimension squeezed away.
        """
        user_embedding = self.user_tower(u)
        content_embedding = self.content_tower(c)
        joint = torch.cat((user_embedding, content_embedding), dim=1)
        score = self.output_layer(joint)
        return score.squeeze(-1), user_embedding, content_embedding

# Section 2: Load Preprocessed Data

In [None]:
def load_data(path):
    """Deserialize and return the pickled object stored at ``path``.

    Args:
        path: Filesystem path of the pickle file to read.

    Returns:
        Whatever object the pickle file contains.
    """
    # NOTE(security): unpickling can execute arbitrary code, so only point this
    # at files produced by a trusted pipeline.
    with open(path, "rb") as handle:
        return pickle.load(handle)

def to_tensor(x):
    """Convert ``x`` to a ``torch.float32`` tensor.

    Args:
        x: An existing tensor, an object exposing a ``values`` attribute
            (presumably a pandas Series/DataFrame — confirm against the
            pickled data), or anything ``torch.tensor`` accepts directly.

    Returns:
        A float32 tensor holding the data of ``x``.
    """
    # Existing tensors only need a dtype cast.
    if torch.is_tensor(x):
        return x.float()
    # Unwrap containers that expose their raw data through `.values`.
    raw = x.values if hasattr(x, "values") else x
    return torch.tensor(raw, dtype=torch.float32)

def create_dataloaders(tX_user, tX_content, ty, vX_user, vX_content, vy, batch_size):
    """Wrap the train and validation tensors in ``DataLoader`` objects.

    Args:
        tX_user, tX_content, ty: Training user features, content features, targets.
        vX_user, vX_content, vy: Validation user features, content features, targets.
        batch_size: Batch size used by both loaders.

    Returns:
        Tuple ``(train_loader, val_loader)``. Only the training loader shuffles.
    """
    train_set = TensorDataset(tX_user, tX_content, ty)
    val_set = TensorDataset(vX_user, vX_content, vy)
    # Shuffle training batches every epoch; keep validation order fixed.
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader

# Section 3: Define Objective Function for Optuna HPO

In [None]:
def _train_one_epoch(model, loader, criterion, optimizer):
    """Run one optimisation pass over ``loader`` and return the per-sample mean loss."""
    model.train()
    loss_sum, sample_count = 0.0, 0
    for u, c, yb in loader:
        optimizer.zero_grad()
        preds, _, _ = model(u, c)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        # The criterion averages within a batch, so weight each batch by its
        # size; otherwise a short final batch counts as much as a full one.
        batch_len = yb.size(0)
        loss_sum += loss.item() * batch_len
        sample_count += batch_len
    return loss_sum / sample_count


def _evaluate(model, loader, criterion):
    """Return the per-sample mean loss over ``loader`` without updating the model."""
    model.eval()
    loss_sum, sample_count = 0.0, 0
    with torch.no_grad():
        for u, c, yb in loader:
            preds, _, _ = model(u, c)
            batch_len = yb.size(0)
            loss_sum += criterion(preds, yb).item() * batch_len
            sample_count += batch_len
    return loss_sum / sample_count


def objective(trial, user_dim, content_dim, tX_user, tX_content, ty, vX_user, vX_content, vy):
    """Optuna objective: train one TwoTowerModel and return its validation MSE.

    Hyperparameters are sampled from ``MODEL_CONFIG["hpo_params"]``. The trial
    is tracked as an MLflow run: sampled parameters, per-epoch train/validation
    losses and the final model are logged, and the MLflow run id is stored on
    the trial under the ``"run_id"`` user attribute.

    Args:
        trial: The Optuna trial used to sample hyperparameters.
        user_dim: Input feature count for the user tower.
        content_dim: Input feature count for the content tower.
        tX_user, tX_content, ty: Training features and targets (tensors).
        vX_user, vX_content, vy: Validation features and targets (tensors).

    Returns:
        Validation MSE after the final epoch, averaged over samples.

    Raises:
        ValueError: If ``MODEL_CONFIG["num_epochs"]`` is below 1 or either
            split is empty (no loss could be computed).
    """
    num_epochs = MODEL_CONFIG["num_epochs"]
    if num_epochs < 1:
        # Without at least one epoch there is no validation loss to return.
        raise ValueError(f"num_epochs must be >= 1, got {num_epochs}")

    # Sample hyperparameters
    hpo_space = MODEL_CONFIG["hpo_params"]
    embedding_dim = trial.suggest_categorical("embedding_dim", hpo_space["embedding_dim"])
    dropout = trial.suggest_float("dropout", *hpo_space["dropout_range"])
    lr = trial.suggest_float("lr", *hpo_space["lr_range"], log=True)
    batch_size = trial.suggest_categorical("batch_size", hpo_space["batch_size"])

    # Dataloaders
    train_loader, val_loader = create_dataloaders(tX_user, tX_content, ty, vX_user, vX_content, vy, batch_size)
    if len(train_loader.dataset) == 0 or len(val_loader.dataset) == 0:
        raise ValueError("training and validation splits must both be non-empty")

    # MLflow tracking (file-based store under MODEL_CONFIG["mlruns_dir"])
    os.makedirs(MODEL_CONFIG["mlruns_dir"], exist_ok=True)
    mlflow.set_tracking_uri(f"file:{MODEL_CONFIG['mlruns_dir']}")
    mlflow.set_experiment(MODEL_CONFIG["experiment_name"])

    with mlflow.start_run(run_name=f"optuna_trial_{trial.number}") as run:
        mlflow.log_params({
            "embedding_dim": embedding_dim,
            "dropout": dropout,
            "lr": lr,
            "batch_size": batch_size
        })

        model = TwoTowerModel(user_dim, content_dim, embedding_dim, dropout)
        optimizer = optim.Adam(model.parameters(), lr=lr)
        criterion = nn.MSELoss()

        # Training loop: one train pass and one validation pass per epoch.
        for epoch in range(num_epochs):
            avg_train_loss = _train_one_epoch(model, train_loader, criterion, optimizer)
            avg_val_loss = _evaluate(model, val_loader, criterion)

            # Log metrics
            mlflow.log_metric("train_loss", avg_train_loss, step=epoch)
            mlflow.log_metric("val_loss", avg_val_loss, step=epoch)

        # Log model
        mlflow.pytorch.log_model(model, "model")

        # Record which MLflow run belongs to this trial so the best model can
        # be looked up after the study finishes.
        trial.set_user_attr("run_id", run.info.run_id)
        return avg_val_loss

[I 2025-10-19 23:08:30,952] A new study created in memory with name: no-name-5d505082-1f8c-434c-88c8-d7d43b986fef
[I 2025-10-19 23:09:55,640] Trial 0 finished with value: 0.002391652960795909 and parameters: {'embedding_dim': 96, 'dropout': 0.15486829135126262, 'lr': 0.00010723596714808144, 'batch_size': 2048}. Best is trial 0 with value: 0.002391652960795909.
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x103a25cf0>>
Traceback (most recent call last):
  File "/Users/guptayas/.pyenv/versions/langchain-env/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


# Section 4: Run HPO Study

In [None]:
if __name__ == "__main__":
    # Unpickle the train/validation splits and cast each array to a float32 tensor.
    data = load_data(MODEL_CONFIG["data_path"])
    tX_user, tX_content, ty = (to_tensor(data[key]) for key in ("tX_user", "tX_content", "ty"))
    vX_user, vX_content, vy = (to_tensor(data[key]) for key in ("vX_user", "vX_content", "vy"))

    # Tower input widths are read from the second axis of the training features.
    user_dim = tX_user.shape[1]
    content_dim = tX_content.shape[1]

    def run_trial(trial):
        """Bind the loaded tensors and dimensions to the Optuna objective."""
        return objective(
            trial, user_dim, content_dim,
            tX_user, tX_content, ty,
            vX_user, vX_content, vy,
        )

    # Minimise the validation loss over the configured number of trials.
    study = optuna.create_study(direction="minimize")
    study.optimize(run_trial, n_trials=MODEL_CONFIG["num_trials"])

    # Report the winning trial and the MLflow run that holds its model.
    best_trial = study.best_trial
    print(f"Best Validation Loss: {best_trial.value:.6f}")
    print("Best Hyperparameters:")
    for k, v in best_trial.params.items():
        print(f"  {k}: {v}")
    print(f"Best MLflow run_id: {best_trial.user_attrs['run_id']}")

In [2]:
! mlflow ui

/Users/guptayas/.pyenv/versions/3.10.13/envs/langchain-env/lib/python3.10/site-packages/mlflow/gateway/config.py:454: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  class Route(ConfigModel):
[MLflow] Security middleware enabled with default settings (localhost-only). To allow connections from other hosts, use --host 0.0.0.0 and configure --allowed-hosts and --cors-allowed-origins.
[32mINFO[0m:     Uvicorn running on [1mhttp://127.0.0.1:5000[0m (Press CTRL+C to quit)
[32mINFO[0m:     Started parent process [[36m[1m6683[0m]
[32mINFO[0m:     Started server process [[36m6688[0m]
[32mINFO[0m:     Started server process [[36m6685[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Started server process [[36m6686[0m]
