In [1]:
from src.mdn import MDN, mdn_loss
from src.mdn_train import CSVDataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
import numpy as np
import torch

import mlflow
import mlflow.pytorch
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the raw training table from disk.
df = pd.read_csv('../data/train.csv')

# Column roles: no numeric inputs are used at the moment, only categoricals.
numeric_features = []
categorical_features = ['Professional', 'Country', 'WorkLang', 'CompanyType']
target_col = 'SalaryUSD'

# Replace every categorical column (in place) with integer codes and keep the
# fitted encoder per column so the codes can be mapped back to labels later.
label_encoders = {}
for col in categorical_features:
	le = LabelEncoder()
	df[col] = le.fit_transform(df[col])
	label_encoders[col] = le

# Feature matrix: numeric columns first, then the encoded categoricals.
all_features = numeric_features + categorical_features
X = df[all_features].values.astype('float32')

# Target is modelled as log(1 + salary). np.log1p is the dedicated routine for
# this transform and is more accurate than np.log(x + 1) for small x.
y = np.log1p(df[target_col].values).astype('float32')

In [3]:
# Model-shape settings consumed by later cells
num_mixtures = 1
input_dim = X.shape[-1]  # number of feature columns in X

# Wrap the feature matrix and the log-transformed target in a dataset object
dataset = CSVDataset(X, y)


In [4]:
# Training loop
def train_model(params):
	"""Train one MDN on the module-level ``dataset`` and log the loss per epoch.

	The model is built from the module-level ``input_dim`` and ``num_mixtures``
	and optimised with Adam. After every epoch the mean of the per-batch losses
	is logged with ``mlflow.log_metric`` under the key ``"loss"``.

	Args:
		params: Mapping with the keys ``'batch_size'``, ``'hidden_dim'``,
			``'dropout'`` and ``'lr'``. An optional ``'epochs'`` key sets the
			number of training epochs; it defaults to 2000 when absent.

	Returns:
		Tuple ``(model, epoch_loss)`` where ``epoch_loss`` is the mean batch
		loss of the last epoch, or ``nan`` if zero epochs were requested.
	"""
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	# Previously hard-coded; an optional key keeps existing callers unchanged
	# while allowing short runs for smoke tests or cheaper searches.
	num_epochs = params.get('epochs', 2000)

	train_loader = DataLoader(dataset, batch_size=params['batch_size'], shuffle=True)
	model = MDN(input_dim, num_mixtures, hidden_units=params['hidden_dim'], dropout=params['dropout']).to(device)
	optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])

	# Defined up front so the return statement is valid even when num_epochs == 0.
	epoch_loss = float('nan')

	for epoch in range(num_epochs):
		model.train()
		total_loss = 0.0
		for batch_X, batch_y in train_loader:
			batch_X, batch_y = batch_X.to(device), batch_y.to(device)
			optimizer.zero_grad()
			pi, mu, sigma = model(batch_X)
			loss = mdn_loss(pi, mu, sigma, batch_y)
			loss.backward()
			optimizer.step()
			total_loss += loss.item()

		# Average over batches, not samples: a smaller last batch weighs the same.
		epoch_loss = total_loss / len(train_loader)
		mlflow.log_metric("loss", epoch_loss, step=epoch)

	return model, epoch_loss


In [5]:
# Objective function for Optuna
def objective(trial):
	"""Optuna objective: sample hyperparameters, train a model, return its loss.

	Each call opens its own MLflow run, logs the sampled parameters, trains via
	``train_model``, then logs the final loss and the trained model.

	Args:
		trial: Optuna trial used to sample ``hidden_dim``, ``dropout``, ``lr``
			and ``batch_size``.

	Returns:
		The final-epoch loss reported by ``train_model``.
	"""
	params = {
		'hidden_dim': trial.suggest_int('hidden_dim', 16, 128),
		'dropout': trial.suggest_float('dropout', 0.1, 0.5),
		# suggest_loguniform is deprecated; suggest_float(..., log=True) is the
		# documented replacement and samples the same log-uniform range.
		'lr': trial.suggest_float('lr', 1e-5, 1e-2, log=True),
		'batch_size': trial.suggest_int('batch_size', 1000, 2000),
	}

	with mlflow.start_run():
		mlflow.log_params(params)
		model, final_loss = train_model(params)
		mlflow.log_metric("final_loss", final_loss)
		mlflow.pytorch.log_model(model, "model")

	# NOTE(review): this is a training loss, since train_model fits on the whole
	# dataset. Consider a held-out split before trusting the ranking of trials.
	return final_loss

In [6]:
# Group all MLflow runs started by the search under one named experiment.
mlflow.set_experiment("Hyperparam Optimization Example")

# The objective returns a loss, so the study searches for the smallest value.
study = optuna.create_study(direction="minimize")
study.optimize(func=objective, n_trials=100)

# Report the best-scoring trial found by the search.
print("Best trial:")
print(study.best_trial)

[I 2025-06-02 19:11:23,335] A new study created in memory with name: no-name-8f4e7ac7-cd32-45c6-9245-94acf27fd9b2
  'lr': trial.suggest_loguniform('lr', 1e-5, 1e-2),
[W 2025-06-02 19:11:29,658] Trial 0 failed with parameters: {'hidden_dim': 47, 'dropout': 0.4929817870349438, 'lr': 0.00019234696849774956, 'batch_size': 1651} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/filip/Developer/Python/DataMining/.venv/lib/python3.12/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/99/w3lm9dwd2q929kjsdm9dyy7c0000gn/T/ipykernel_80922/163613612.py", line 12, in objective
    model, final_loss = train_model(params)
                        ^^^^^^^^^^^^^^^^^^^
  File "/var/folders/99/w3lm9dwd2q929kjsdm9dyy7c0000gn/T/ipykernel_80922/3641264848.py", line 15, in train_model
    pi, mu, sigma = model(batch_X)
                    ^^^^^^^^^^^^^^
 

KeyboardInterrupt: 