# MLP with Optuna Tuning

This notebook replaces the LightGBM model with a PyTorch Multi-Layer Perceptron (MLP).
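It implements a strict Cross-Validation pipeline in which the feature transformers (numeric standardization and `CountVectorizer`) are fit ONLY on the training folds to prevent data leakage. A `TargetEncoder` step for the categorical columns is sketched in the code but is currently commented out.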

### Strategy for Efficiency in the Cross-Validation Loop:
1. **Pre-computation**: We generate the 5 CV splits *before* starting the Optuna study.
2. **Leakage Prevention**: For each split, we fit encoders on the Training set, transform the Validation set, and store the resulting Tensors.
3. **Optuna**: The objective function simply loads these pre-prepared tensors, making the tuning loop very fast.

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from category_encoders import TargetEncoder
from sklearn.metrics import roc_auc_score

In [None]:
# Same seed as in the replication notebook.
SEED = 3508706438

# Seed PyTorch's global generator first, then NumPy's global RNG.
torch.manual_seed(seed=SEED)
np.random.seed(seed=SEED)

## 1. Data Loading & Feature Engineering (Pandas)

In [None]:
# Read the raw CSVs: the train/test tables plus the diagnosis-code table
# (one row per ICD-9 code, aggregated per HADM_ID in the next cell).
train_hef = pd.read_csv('../data/MIMIC/MIMIC_III_dataset_death/mimic_train.csv')
test_hef = pd.read_csv('../data/MIMIC/MIMIC_III_dataset_death/mimic_test_death.csv')
extra_diag = pd.read_csv('../data/MIMIC/MIMIC_III_dataset_death/extra_data/MIMIC_diagnoses.csv')

In [None]:
# Stringify the ICD-9 codes but keep missing codes as NaN. A blanket
# astype(str) would turn them into the literal 'nan', which makes the
# dropna() in the aggregation a no-op and leaks a bogus 'nan' token into
# DIAG_STRING, NUM_DIAGNOSES and UNIQUE_CHAPTERS.
icd_codes = extra_diag['ICD9_CODE']
extra_diag['ICD9_CODE'] = icd_codes.where(icd_codes.isna(), icd_codes.astype(str))
extra_diag['ICD9_CHAPTER'] = extra_diag['ICD9_CODE'].str[:3]

# Binary per-row flags derived from ICD-9 code prefixes.
# na=False: a missing code matches no prefix (and keeps astype(int) valid).
condition_prefixes = {
    'IS_SEPSIS': ('9959', '7855'),
    'IS_HEART_FAIL': ('428',),
    'IS_CANCER': ('196', '197', '198', '199'),
    'IS_RENAL': ('584', '585'),
}
for flag_col, prefixes in condition_prefixes.items():
    extra_diag[flag_col] = (
        extra_diag['ICD9_CODE'].str.startswith(prefixes, na=False).astype(int)
    )

# One row per admission: code count, space-joined code string (the input of
# the CountVectorizer later on), number of distinct 3-character prefixes and
# "any row flagged" indicators.
diag_grouped = extra_diag.groupby('HADM_ID').agg(
    NUM_DIAGNOSES=('ICD9_CODE', 'count'),
    DIAG_STRING=('ICD9_CODE', lambda codes: ' '.join(codes.dropna().astype(str))),
    UNIQUE_CHAPTERS=('ICD9_CHAPTER', 'nunique'),
    HAS_SEPSIS=('IS_SEPSIS', 'max'),
    HAS_HEART_FAIL=('IS_HEART_FAIL', 'max'),
    HAS_CANCER=('IS_CANCER', 'max'),
    HAS_RENAL=('IS_RENAL', 'max'),
).reset_index()

# Merge features (left join: admissions without diagnosis rows get NaN)
train_hef = train_hef.merge(diag_grouped, left_on='hadm_id', right_on='HADM_ID', how='left')
test_hef = test_hef.merge(diag_grouped, left_on='hadm_id', right_on='HADM_ID', how='left')

# Drop HADM_ID as it's not needed anymore (duplicate of hadm_id after the merge)
train_hef.drop('HADM_ID', axis=1, inplace=True)
test_hef.drop('HADM_ID', axis=1, inplace=True)

In [None]:
def engineer_features(df_input):
    """Return a copy of *df_input* with age and admission-history features.

    Adds three columns:
      * AGE: admission year minus birth year; values above 89 are set to 90
        and negative values are replaced by the median of the column.
      * PREV_ICU_STAYS: number of earlier rows (by ADMITTIME) of the same
        subject_id within this frame.
      * DAYS_SINCE_LAST: days since that subject's previous ADMITTIME,
        or -1 when there is no previous row.

    Timestamp, outcome and identifier columns listed at the end are dropped
    when present. The input frame is not modified and the original row order
    is preserved.
    """
    frame = df_input.copy()

    for stamp_col in ('ADMITTIME', 'DOB'):
        frame[stamp_col] = pd.to_datetime(frame[stamp_col])

    frame['AGE'] = frame['ADMITTIME'].dt.year - frame['DOB'].dt.year
    frame.loc[frame['AGE'] > 89, 'AGE'] = 90
    frame.loc[frame['AGE'] < 0, 'AGE'] = frame['AGE'].median()

    # History features need chronological order per patient; remember the
    # incoming order so it can be restored afterwards.
    row_order = frame.index
    frame = frame.sort_values(by=['subject_id', 'ADMITTIME'])
    frame['PREV_ICU_STAYS'] = frame.groupby('subject_id').cumcount()
    frame['LAST_ADMIT'] = frame.groupby('subject_id')['ADMITTIME'].shift(1)
    gap_seconds = (frame['ADMITTIME'] - frame['LAST_ADMIT']).dt.total_seconds()
    frame['DAYS_SINCE_LAST'] = (gap_seconds / (24 * 3600)).fillna(-1)
    frame = frame.reindex(row_order)

    unwanted = (
        'ADMITTIME', 'DOB', 'LAST_ADMIT', 'DISCHTIME', 'DEATHTIME',
        'DOD', 'LOS', 'Diff', 'MeanBP_Min', 'MeanBP_Max',
        'MeanBP_Mean', 'hadm_id', 'subject_id',
    )
    present = [name for name in unwanted if name in frame.columns]
    return frame.drop(columns=present)

# Run the same feature-engineering routine on the train and the test table.
train_processed, test_processed = (
    engineer_features(split) for split in (train_hef, test_hef)
)

In [None]:
# Define Column Groups
# Each measurement below appears as three columns: <name>_Min, <name>_Max, <name>_Mean.
_measurements = ('HeartRate', 'SysBP', 'DiasBP', 'RespRate', 'TempC', 'SpO2', 'Glucose')
num_cols = [
    f'{measurement}_{stat}'
    for measurement in _measurements
    for stat in ('Min', 'Max', 'Mean')
]
# Engineered history/age features, then the diagnosis aggregates and flags.
num_cols += ['PREV_ICU_STAYS', 'AGE', 'DAYS_SINCE_LAST']
num_cols += ['NUM_DIAGNOSES', 'UNIQUE_CHAPTERS']
num_cols += ['HAS_SEPSIS', 'HAS_HEART_FAIL', 'HAS_CANCER', 'HAS_RENAL']

categorical_cols = [
    'ICD9_diagnosis',
    'DIAGNOSIS',
    'FIRST_CAREUNIT',
    'GENDER',
    'ADMISSION_TYPE',
    'INSURANCE',
    'RELIGION',
    'MARITAL_STATUS',
    'ETHNICITY',
]

# Column holding the space-joined ICD-9 codes fed to the CountVectorizer.
text_col = 'DIAG_STRING'

## 2. Pre-computing CV Splits (To Prevent Leakage)
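We iterate over 5 folds. In each fold, we fit the transformers (the numeric standardization statistics and the `CountVectorizer`; the `TargetEncoder` step is currently commented out) **only** on the training rows of that fold, and then apply them to the validation rows.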

In [None]:
# Prepare for Cross Validation
X = train_processed.drop('HOSPITAL_EXPIRE_FLAG', axis=1)
y = train_processed['HOSPITAL_EXPIRE_FLAG'].values

n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

# Preprocessing settings shared by every fold (hoisted out of the loop)
epsilon = 1e-7            # avoids division by zero for constant columns
vect_max_features = 800   # vocabulary cap for the diagnosis-code bag-of-words

# Store processed tensors for each fold
cv_datasets = []

print("Pre-processing CV folds...")

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    # Split DataFrames
    X_tr_df, X_val_df = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    # 1. Numerical: Standard Scale (Fit on Train, Transform Val)
    # pandas' mean/std skip NaN (std uses ddof=1, like torch.std), so a single
    # missing measurement no longer turns the statistics of its column, and
    # with them every scaled value and every model output, into NaN.
    mean = X_tr_df[num_cols].mean()
    std = X_tr_df[num_cols].std()

    # fillna(0) after scaling == imputing missing entries with the
    # training-fold mean of the column.
    X_tr_scaled = ((X_tr_df[num_cols] - mean) / (std + epsilon)).fillna(0.0)
    X_val_scaled = ((X_val_df[num_cols] - mean) / (std + epsilon)).fillna(0.0)

    X_tr_num = torch.tensor(X_tr_scaled.to_numpy(), dtype=torch.float32)
    X_val_num = torch.tensor(X_val_scaled.to_numpy(), dtype=torch.float32)

    # NOTE: target encoding of categorical_cols is currently disabled, so the
    # categorical columns are not part of the model input.
    # # 2. Categorical: Target Encode (Fit on Train, Transform Val)
    # encoder = TargetEncoder(cols=categorical_cols)
    # # Note: TargetEncoder needs y to fit
    # X_tr_cat = encoder.fit_transform(X_tr_df[categorical_cols], y_tr)
    # X_val_cat = encoder.transform(X_val_df[categorical_cols])

    # X_tr_cat = torch.tensor(X_tr_cat.values, dtype=torch.float32)
    # X_val_cat = torch.tensor(X_val_cat.values, dtype=torch.float32)

    # 3. Text: CountVectorizer (Fit on Train, Transform Val)
    vectorizer = CountVectorizer(binary=True, token_pattern=r'(?u)\b\w+\b', max_features=vect_max_features)

    # The diagnosis features come from a left merge, so DIAG_STRING can be NaN;
    # CountVectorizer rejects NaN documents, hence the empty-string fallback.
    X_tr_text_sparse = vectorizer.fit_transform(X_tr_df[text_col].fillna(''))
    X_val_text_sparse = vectorizer.transform(X_val_df[text_col].fillna(''))

    # toarray() yields a plain ndarray (todense() returns the legacy np.matrix)
    X_tr_text = torch.tensor(X_tr_text_sparse.toarray(), dtype=torch.float32)
    X_val_text = torch.tensor(X_val_text_sparse.toarray(), dtype=torch.float32)

    # 4. Concatenate numeric and bag-of-words features column-wise
    X_tr_final = torch.cat([X_tr_num, X_tr_text], dim=1)
    X_val_final = torch.cat([X_val_num, X_val_text], dim=1)

    # Targets as (n, 1) float tensors to match the model's (n, 1) output
    y_tr_tensor = torch.tensor(y_tr, dtype=torch.float32).unsqueeze(1)
    y_val_tensor = torch.tensor(y_val, dtype=torch.float32).unsqueeze(1)

    # Store in list
    cv_datasets.append({
        'train': TensorDataset(X_tr_final, y_tr_tensor),
        'val': TensorDataset(X_val_final, y_val_tensor),
        'input_dim': X_tr_final.shape[1]
    })

    print(f"Fold {fold+1} prepared. Input shape: {X_tr_final.shape}")

## 3. Define MLP Model

In [None]:
class SimpleMLP(nn.Module):
    """One-hidden-layer perceptron with a single sigmoid-activated output.

    Forward pipeline:
        Linear(input_dim -> hidden_dim) -> ReLU -> Dropout(dropout_rate)
        -> Linear(hidden_dim -> 1) -> Sigmoid
    """

    def __init__(self, input_dim, hidden_dim, dropout_rate):
        super().__init__()
        # Trainable layers and activations, registered in forward order.
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.layer2 = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of feature rows to one value in (0, 1) per row."""
        hidden = self.dropout(self.relu(self.layer1(x)))
        return self.sigmoid(self.layer2(hidden))

## 4. Optuna Hyperparameter Tuning
We optimize: Hidden Dimension, Dropout Rate, Learning Rate, and Batch Size.

In [None]:
def objective(trial):
    """Optuna objective: mean validation AUROC of a SimpleMLP over the CV folds.

    Samples hidden size, dropout rate, learning rate and batch size from
    *trial*, trains a fresh model on every pre-computed fold in
    ``cv_datasets`` and returns the average AUROC across the folds.

    If the AUROC of a fold cannot be computed (``roc_auc_score`` raises
    ``ValueError``, e.g. for NaN predictions or a single-class fold) the fold
    is scored as 0.5 and a ``RuntimeWarning`` reports the cause, so a broken
    run does not look like a legitimate chance-level result.
    """
    import warnings  # local import keeps this cell self-contained

    # Hyperparameters to tune
    hidden_dim = trial.suggest_int("hidden_dim", 32, 256)
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5)
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    epochs = 100  # fixed training budget per fold; dominates the tuning time

    fold_aurocs = []

    # Iterate through pre-prepared folds
    for fold_no, fold_data in enumerate(cv_datasets, start=1):
        train_loader = DataLoader(fold_data['train'], batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(fold_data['val'], batch_size=batch_size, shuffle=False)

        # Fresh model and optimizer per fold so no weights carry over
        model = SimpleMLP(fold_data['input_dim'], hidden_dim, dropout_rate)
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=lr)

        # Training Loop
        model.train()
        for epoch in range(epochs):
            for X_batch, y_batch in train_loader:
                optimizer.zero_grad()
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                loss.backward()
                optimizer.step()

        # Validation Loop (eval mode disables dropout)
        model.eval()
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                preds = model(X_batch)
                # ravel(): hand sklearn flat 1-D scores/labels instead of (n, 1) columns
                all_preds.extend(preds.cpu().numpy().ravel())
                all_labels.extend(y_batch.cpu().numpy().ravel())

        try:
            auroc = roc_auc_score(all_labels, all_preds)
        except ValueError as err:
            warnings.warn(
                f"Fold {fold_no}: AUROC could not be computed ({err}); "
                "scoring this fold as 0.5",
                RuntimeWarning,
            )
            auroc = 0.5

        fold_aurocs.append(auroc)

    return np.mean(fold_aurocs)

# Create Study
# Seed the sampler explicitly: torch, numpy and the CV splitter already use
# SEED, but an unseeded sampler would still propose different hyperparameters
# on every run.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=10)

# Report the best trial: its mean CV AUROC and the sampled hyperparameters
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

## 5. Final Training on Full Data
Now we retrain the best model on the entire training set and predict on the test set. As in the CV loop, the scaling statistics and the vectorizer are fit on the training data only and then applied to the test data.

In [None]:
# 1. Prepare Full Training Data
X_train_df = train_processed.drop('HOSPITAL_EXPIRE_FLAG', axis=1)
y_train_full = train_processed['HOSPITAL_EXPIRE_FLAG'].values
X_test_df = test_processed.copy()

# --- Process Full Train ---
# Numerical: statistics come from the training set only. pandas' mean/std
# skip NaN (std uses ddof=1, like torch.std), so missing measurements do not
# turn whole columns, and with them every prediction, into NaN.
mean = X_train_df[num_cols].mean()
std = X_train_df[num_cols].std()

# fillna(0) after scaling == imputing missing entries with the training mean.
X_tr_scaled = ((X_train_df[num_cols] - mean) / (std + 1e-7)).fillna(0.0)
X_te_scaled = ((X_test_df[num_cols] - mean) / (std + 1e-7)).fillna(0.0)

X_tr_num = torch.tensor(X_tr_scaled.to_numpy(), dtype=torch.float32)
X_te_num = torch.tensor(X_te_scaled.to_numpy(), dtype=torch.float32)

# NOTE: target encoding of categorical_cols is currently disabled.
# # Categorical
# encoder = TargetEncoder(cols=categorical_cols)
# X_tr_cat = encoder.fit_transform(X_train_df[categorical_cols], y_train_full)
# X_te_cat = encoder.transform(X_test_df[categorical_cols])
# X_tr_cat = torch.tensor(X_tr_cat.values, dtype=torch.float32)
# X_te_cat = torch.tensor(X_te_cat.values, dtype=torch.float32)

# Text: vocabulary is learned on the training set only.
# DIAG_STRING comes from a left merge and can be NaN; CountVectorizer rejects
# NaN documents, hence the empty-string fallback.
vectorizer = CountVectorizer(binary=True, token_pattern=r'(?u)\b\w+\b', max_features=800)
X_tr_text_sparse = vectorizer.fit_transform(X_train_df[text_col].fillna(''))
X_te_text_sparse = vectorizer.transform(X_test_df[text_col].fillna(''))
# toarray() yields a plain ndarray (todense() returns the legacy np.matrix)
X_tr_text = torch.tensor(X_tr_text_sparse.toarray(), dtype=torch.float32)
X_te_text = torch.tensor(X_te_text_sparse.toarray(), dtype=torch.float32)

# Final Tensors
X_train_final = torch.cat([X_tr_num, X_tr_text], dim=1)
X_test_final = torch.cat([X_te_num, X_te_text], dim=1)
y_train_tensor = torch.tensor(y_train_full, dtype=torch.float32).unsqueeze(1)

# 2. Train Best Model
best_params = study.best_params
final_model = SimpleMLP(X_train_final.shape[1], best_params['hidden_dim'], best_params['dropout_rate'])
optimizer = optim.Adam(final_model.parameters(), lr=best_params['lr'])
criterion = nn.BCELoss()

train_loader = DataLoader(TensorDataset(X_train_final, y_train_tensor),
                            batch_size=best_params['batch_size'], shuffle=True)

# Use the same epoch budget the hyperparameters were selected under
# (objective() trains every fold for 100 epochs). The previous value of 15
# trained the final model far shorter than the configuration that was scored.
final_epochs = 100

final_model.train()
for epoch in range(final_epochs):
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = final_model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

# 3. Predict (eval mode disables dropout; no gradients needed)
final_model.eval()
with torch.no_grad():
    y_proba = final_model(X_test_final).numpy().flatten()

# Save: ids are taken from test_hef, which has the same row order as
# test_processed because engineer_features restores the original index order.
submission = pd.DataFrame({'icustay_id': test_hef['icustay_id'], 'prediction': y_proba})
submission.to_csv('../data/MIMIC/pytorch_mlp_optuna_submission.csv', index=False)
print("Saved prediction to ../data/MIMIC/pytorch_mlp_optuna_submission.csv")