Claude Bot - Optuna Full Proposal

In [None]:
# MLflow + Optuna + K-Fold + Adaptive Learning Neural Network Training
# ====================================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import mlflow
import mlflow.pytorch
import optuna
import os
from pathlib import Path
from io import BytesIO
import warnings
warnings.filterwarnings('ignore')

# Seed the global NumPy and PyTorch generators with the same fixed value
np.random.seed(42)
torch.manual_seed(42)

# ====================================================================
# 1. NEURAL NETWORK MODEL DEFINITION
# ====================================================================

class FeedforwardNN(nn.Module):
    """Feed-forward network with one hidden layer and a single output unit.

    The stack is Linear(input_size, hidden_size) -> ReLU ->
    Dropout(dropout_rate) -> Linear(hidden_size, 1).
    """

    def __init__(self, input_size, hidden_size, dropout_rate=0.0):
        super().__init__()
        stack = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, 1),
        ]
        # Attribute name is kept as ``layers`` so state_dict keys are unchanged
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the output."""
        network = self.layers
        return network(x)

# ====================================================================
# 2. GRADIENT ADAPTIVE BATCHING LEARNING CLASS
# ====================================================================

class GradientAdaptiveBatchingLearning:
    """Plateau-driven schedule: lower the learning rate and grow the batch size.

    After each epoch the caller records a loss with ``update_history`` and then
    calls ``adapt_parameters``. When at least ``patience`` losses are recorded
    and the absolute least-squares slope of the most recent losses falls below
    ``grad_threshold``, the learning rate is multiplied by ``lr_multiplier``
    (floored at ``lr_min``) and the batch size by ``batch_multiplier`` (capped
    by the dataset size and, if given, ``max_batch_size``).

    There is no cool-down between adaptations: while the condition holds,
    ``adapt_parameters`` can fire on consecutive epochs until both values
    reach their limits.

    Attributes:
        current_lr: Learning rate currently in effect.
        current_batch_size: Batch size currently in effect.
        loss_history: Losses passed to ``update_history``, in order.
        lr_history: ``current_lr`` at each ``update_history`` call.
        batch_size_history: ``current_batch_size`` at each ``update_history`` call.
        best_loss: Lowest loss recorded so far.
        no_improvement_count: Consecutive updates without a new ``best_loss``;
            tracked for callers, not consulted by this class.
    """

    def __init__(self, initial_lr=0.005, lr_min=0.0005, initial_batch_size=16,
                 grad_threshold=1.0, batch_multiplier=2, lr_multiplier=0.5,
                 patience=10, max_batch_size=None):
        self.initial_lr = initial_lr
        self.lr_min = lr_min
        self.initial_batch_size = initial_batch_size
        self.grad_threshold = grad_threshold
        self.batch_multiplier = batch_multiplier
        self.lr_multiplier = lr_multiplier
        self.patience = patience
        self.max_batch_size = max_batch_size

        # All mutable tracking state is initialised in one place
        self.reset()

    def reset(self):
        """Restore the initial learning rate / batch size and clear all history."""
        self.current_lr = self.initial_lr
        self.current_batch_size = self.initial_batch_size
        self.loss_history = []
        self.lr_history = []
        self.batch_size_history = []
        self.no_improvement_count = 0
        self.best_loss = float('inf')

    def calculate_loss_slope(self, window_size=5):
        """Return the absolute linear-fit slope of the last ``window_size`` losses.

        Returns ``float('inf')`` when fewer than ``window_size`` losses are
        recorded (or the window is too short to fit a line). Infinity can never
        be below a threshold, so missing history never looks like a plateau.
        """
        if window_size < 2 or len(self.loss_history) < window_size:
            return float('inf')

        recent_losses = self.loss_history[-window_size:]
        slope = np.polyfit(np.arange(window_size), recent_losses, 1)[0]
        return abs(float(slope))

    def should_adapt(self):
        """Return True when enough history exists and the loss curve has flattened."""
        if len(self.loss_history) < self.patience:
            return False
        return bool(self.calculate_loss_slope() < self.grad_threshold)

    def adapt_parameters(self, dataset_size):
        """Lower the learning rate and raise the batch size if the loss plateaued.

        Args:
            dataset_size: Number of training samples; the batch size never
                exceeds it.

        Returns:
            True if the learning rate or batch size actually changed,
            otherwise False.
        """
        if not self.should_adapt():
            return False

        # Floor at lr_min, but never raise the rate (possible if lr_min > current_lr)
        new_lr = min(self.current_lr,
                     max(self.current_lr * self.lr_multiplier, self.lr_min))

        # A batch larger than the dataset is meaningless, so cap by both limits
        ceiling = dataset_size
        if self.max_batch_size is not None:
            ceiling = min(self.max_batch_size, dataset_size)
        new_batch_size = max(
            1, min(int(self.current_batch_size * self.batch_multiplier), ceiling)
        )

        if new_lr == self.current_lr and new_batch_size == self.current_batch_size:
            return False

        self.current_lr = new_lr
        self.current_batch_size = new_batch_size
        return True

    def update_history(self, epoch_loss):
        """Record ``epoch_loss`` together with the settings in effect for that epoch."""
        self.loss_history.append(epoch_loss)
        self.lr_history.append(self.current_lr)
        self.batch_size_history.append(self.current_batch_size)

        if epoch_loss < self.best_loss:
            self.best_loss = epoch_loss
            self.no_improvement_count = 0
        else:
            self.no_improvement_count += 1

# ====================================================================
# 3. DATA PREPARATION FUNCTIONS
# ====================================================================

def load_and_prepare_data(n_samples=238, n_features=9, random_state=42):
    """
    Return a synthetic regression dataset (placeholder for real data loading).

    Replace this function with your actual data loading logic.

    Args:
        n_samples: Number of rows to generate.
        n_features: Number of feature columns to generate.
        random_state: Seed for the private generator used to draw the data.

    Returns:
        Tuple ``(X, y)``: ``X`` has shape ``(n_samples, n_features)`` and ``y``
        has shape ``(n_samples,)``. Both are standard-normal draws, with ``y``
        scaled by 2 and shifted by 1.
    """
    # A private RandomState yields the same stream as np.random.seed(random_state)
    # followed by np.random.randn, without reseeding the global NumPy generator.
    rng = np.random.RandomState(random_state)

    X = rng.randn(n_samples, n_features)
    y = rng.randn(n_samples) * 2 + 1  # Some target variable

    return X, y

def split_and_scale_data(X, y, test_size=0.2, val_size=0.1, random_state=42):
    """
    Partition the data into train/validation/test sets and standardise features.

    ``test_size`` and ``val_size`` are both fractions of the full dataset. The
    scaler is fitted on the training features only and then applied to the
    validation and test features.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test, scaler)`` where the
        three feature arrays are scaled and ``scaler`` is the fitted
        StandardScaler.
    """
    # Hold out the test portion first
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # val_size refers to the full dataset, so rescale it to what is left
    remaining_fraction = 1 - test_size
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest,
        test_size=val_size / remaining_fraction,
        random_state=random_state,
    )

    # Fit on train only; reuse the fitted statistics for val and test
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(X_train)
    val_scaled = scaler.transform(X_val)
    test_scaled = scaler.transform(X_test)

    n_train, n_val, n_test = (part.shape[0] for part in (train_scaled, val_scaled, test_scaled))
    print(f"Data splits - Train: {n_train}, Val: {n_val}, Test: {n_test}")

    return (train_scaled, val_scaled, test_scaled,
            y_train, y_val, y_test, scaler)

# ====================================================================
# 4. TRAINING FUNCTIONS
# ====================================================================

def create_data_loader(X, y, batch_size, shuffle=True):
    """Build a DataLoader of float32 (features, target) pairs; targets become one column."""
    features = torch.FloatTensor(X)
    targets = torch.FloatTensor(y).view(-1, 1)
    return DataLoader(
        TensorDataset(features, targets),
        batch_size=batch_size,
        shuffle=shuffle,
    )

def train_one_epoch(model, optimizer, data_loader, criterion):
    """Run one optimisation pass over ``data_loader`` and return the mean loss.

    Each batch loss is weighted by the number of samples in that batch, so a
    short final batch does not skew the epoch average. This assumes
    ``criterion`` returns a per-batch mean (the default reduction of
    ``nn.MSELoss``).

    Args:
        model: Module to train; switched to training mode.
        optimizer: Optimiser stepped once per batch.
        data_loader: Iterable yielding ``(batch_X, batch_y)`` tensors.
        criterion: Loss callable applied to ``(outputs, batch_y)``.

    Returns:
        Sample-weighted mean loss over the epoch, or 0.0 if the loader
        yielded no samples.
    """
    model.train()
    weighted_loss = 0.0
    num_samples = 0

    for batch_X, batch_y in data_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        batch_len = batch_X.size(0)
        weighted_loss += loss.item() * batch_len
        num_samples += batch_len

    return weighted_loss / num_samples if num_samples > 0 else 0.0

def evaluate_model(model, data_loader, criterion):
    """Return the mean loss of ``model`` over ``data_loader`` without updating it.

    Each batch loss is weighted by the number of samples in that batch, so the
    result does not depend on how the data happens to be batched. This assumes
    ``criterion`` returns a per-batch mean (the default reduction of
    ``nn.MSELoss``).

    Args:
        model: Module to evaluate; switched to eval mode.
        data_loader: Iterable yielding ``(batch_X, batch_y)`` tensors.
        criterion: Loss callable applied to ``(outputs, batch_y)``.

    Returns:
        Sample-weighted mean loss, or 0.0 if the loader yielded no samples.
    """
    model.eval()
    weighted_loss = 0.0
    num_samples = 0

    with torch.no_grad():
        for batch_X, batch_y in data_loader:
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)

            batch_len = batch_X.size(0)
            weighted_loss += loss.item() * batch_len
            num_samples += batch_len

    return weighted_loss / num_samples if num_samples > 0 else 0.0

def train_with_adaptive_learning(model, X_train_fold, y_train_fold, X_val_fold, y_val_fold,
                               adaptive_trainer, max_epochs=100, patience_early_stop=20):
    """
    Train ``model`` with Adam while ``adaptive_trainer`` adjusts LR and batch size.

    Each epoch: train on the training fold, evaluate on the validation fold,
    record the validation loss in ``adaptive_trainer`` and let it adapt. When
    it adapts, the new learning rate is written into the optimiser and the
    training loader is rebuilt with the new batch size. Training stops early
    after ``patience_early_stop`` consecutive epochs without a new best
    validation loss.

    The model is trained in place and left in its last-epoch state; the best
    weights are not restored.

    Args:
        model: Network to train (modified in place).
        X_train_fold, y_train_fold: Training features / targets.
        X_val_fold, y_val_fold: Validation features / targets.
        adaptive_trainer: Object providing ``current_lr``, ``current_batch_size``,
            ``update_history``, ``adapt_parameters``, ``lr_history`` and
            ``batch_size_history``.
        max_epochs: Upper bound on the number of epochs.
        patience_early_stop: Epochs without improvement before stopping.

    Returns:
        Dict with ``train_losses`` and ``val_losses`` (per epoch),
        ``final_val_loss`` (the lowest validation loss seen, not the last one),
        and copies of the trainer's ``lr_history`` and ``batch_size_history``.
    """
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=adaptive_trainer.current_lr)

    # Validation runs as one full-size batch, built once, so the validation
    # loss is computed the same way every epoch no matter how the training
    # batch size adapts.
    val_loader = create_data_loader(
        X_val_fold, y_val_fold,
        batch_size=max(1, len(X_val_fold)),
        shuffle=False
    )

    train_loader = None
    loader_batch_size = None

    train_losses = []
    val_losses = []
    best_val_loss = float('inf')
    patience_count = 0

    for epoch in range(max_epochs):
        # Rebuild the training loader only when the batch size has changed;
        # a shuffling DataLoader already reshuffles on every iteration.
        if train_loader is None or loader_batch_size != adaptive_trainer.current_batch_size:
            loader_batch_size = adaptive_trainer.current_batch_size
            train_loader = create_data_loader(
                X_train_fold, y_train_fold,
                batch_size=loader_batch_size,
                shuffle=True
            )

        # Train one epoch
        train_loss = train_one_epoch(model, optimizer, train_loader, criterion)
        val_loss = evaluate_model(model, val_loader, criterion)

        train_losses.append(train_loss)
        val_losses.append(val_loss)

        # The adaptive schedule is driven by the validation loss
        adaptive_trainer.update_history(val_loss)

        if adaptive_trainer.adapt_parameters(len(X_train_fold)):
            # Push the new learning rate into the existing optimiser
            for param_group in optimizer.param_groups:
                param_group['lr'] = adaptive_trainer.current_lr

            print(f"  Epoch {epoch+1}: Adapted - LR: {adaptive_trainer.current_lr:.6f}, "
                  f"Batch Size: {adaptive_trainer.current_batch_size}")

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_count = 0
        else:
            patience_count += 1

        if patience_count >= patience_early_stop:
            print(f"  Early stopping at epoch {epoch+1}")
            break

    return {
        'train_losses': train_losses,
        'val_losses': val_losses,
        'final_val_loss': best_val_loss,
        'lr_history': adaptive_trainer.lr_history.copy(),
        'batch_size_history': adaptive_trainer.batch_size_history.copy()
    }

# ====================================================================
# 5. K-FOLD CROSS VALIDATION
# ====================================================================

def perform_kfold_cv(X_train, y_train, model_params, adaptive_params, k_folds=4,
                    max_epochs=100, random_state=42):
    """
    Perform K-Fold cross validation with adaptive learning.

    A fresh ``FeedforwardNN`` and a fresh ``GradientAdaptiveBatchingLearning``
    are created for every fold, so no state carries over between folds.

    Args:
        X_train: Feature matrix (array-like, 2-D).
        y_train: Targets (array-like, 1-D).
        model_params: Dict with keys ``'neurons'`` and ``'dropout'``.
        adaptive_params: Keyword arguments for ``GradientAdaptiveBatchingLearning``.
        k_folds: Number of folds.
        max_epochs: Maximum epochs per fold.
        random_state: Seed for the shuffled fold assignment.

    Returns:
        Dict with ``fold_results`` (one training-result dict per fold, each
        tagged with a 1-based ``'fold'``), ``avg_cv_loss`` and ``std_cv_loss``
        (mean and standard deviation of the folds' ``final_val_loss``).
    """
    # Convert once so fold indices are positional even for pandas inputs
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    kfold = KFold(n_splits=k_folds, shuffle=True, random_state=random_state)
    fold_results = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train)):
        print(f"Training Fold {fold + 1}/{k_folds}")

        # Split data for this fold
        X_train_fold = X_train[train_idx]
        X_val_fold = X_train[val_idx]
        y_train_fold = y_train[train_idx]
        y_val_fold = y_train[val_idx]

        # Create model for this fold
        model = FeedforwardNN(
            input_size=X_train.shape[1],
            hidden_size=model_params['neurons'],
            dropout_rate=model_params['dropout']
        )

        # Create adaptive trainer for this fold
        adaptive_trainer = GradientAdaptiveBatchingLearning(**adaptive_params)

        # Train model
        fold_result = train_with_adaptive_learning(
            model, X_train_fold, y_train_fold, X_val_fold, y_val_fold,
            adaptive_trainer, max_epochs=max_epochs
        )

        fold_result['fold'] = fold + 1
        fold_results.append(fold_result)

        print(f"  Fold {fold + 1} final validation loss: {fold_result['final_val_loss']:.6f}")

    # Calculate average performance
    final_losses = [result['final_val_loss'] for result in fold_results]
    avg_val_loss = np.mean(final_losses)
    std_val_loss = np.std(final_losses)

    # Escape sequence for the plus-minus sign keeps the source encoding-proof
    print(f"Average CV Loss: {avg_val_loss:.6f} \u00b1 {std_val_loss:.6f}")

    return {
        'fold_results': fold_results,
        'avg_cv_loss': avg_val_loss,
        'std_cv_loss': std_val_loss
    }

# ====================================================================
# 6. PLOTTING FUNCTIONS
# ====================================================================

def _plot_fold_series(ax, fold_results, key, title, ylabel, log_scale=False):
    """Draw one line per fold for the per-epoch series stored under ``key``."""
    ax.set_title(title)
    for i, result in enumerate(fold_results):
        epochs = range(1, len(result[key]) + 1)
        ax.plot(epochs, result[key], label=f'Fold {i+1}', alpha=0.7)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    if log_scale:
        ax.set_yscale('log')
    ax.legend()
    ax.grid(True)


def plot_cv_results(cv_results):
    """Plot cross-validation results on a 2x2 grid and return the figure.

    Panels: train/validation loss curves per fold, final validation loss per
    fold (with the average as a dashed line), learning-rate history per fold
    (log scale) and batch-size history per fold.

    Args:
        cv_results: Dict with ``'fold_results'`` and ``'avg_cv_loss'`` as
            returned by ``perform_kfold_cv``.

    Returns:
        The matplotlib Figure; the caller is responsible for closing it.
    """
    fold_results = cv_results['fold_results']

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('K-Fold Cross Validation Results', fontsize=16)

    # Plot 1: Loss curves for each fold (train solid, validation dashed)
    loss_ax = axes[0, 0]
    loss_ax.set_title('Loss Curves by Fold')
    for i, result in enumerate(fold_results):
        epochs = range(1, len(result['train_losses']) + 1)
        loss_ax.plot(epochs, result['train_losses'],
                     label=f'Fold {i+1} Train', alpha=0.7)
        loss_ax.plot(epochs, result['val_losses'],
                     label=f'Fold {i+1} Val', alpha=0.7, linestyle='--')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()
    loss_ax.grid(True)

    # Plot 2: Final validation losses
    bar_ax = axes[0, 1]
    bar_ax.set_title('Final Validation Loss by Fold')
    fold_nums = [r['fold'] for r in fold_results]
    final_losses = [r['final_val_loss'] for r in fold_results]
    bar_ax.bar(fold_nums, final_losses, alpha=0.7)
    bar_ax.axhline(y=cv_results['avg_cv_loss'], color='red',
                   linestyle='--', label=f"Average: {cv_results['avg_cv_loss']:.4f}")
    bar_ax.set_xlabel('Fold')
    bar_ax.set_ylabel('Final Validation Loss')
    bar_ax.legend()
    bar_ax.grid(True)

    # Plot 3: Learning rate evolution
    _plot_fold_series(axes[1, 0], fold_results, 'lr_history',
                      'Learning Rate Evolution', 'Learning Rate', log_scale=True)

    # Plot 4: Batch size evolution
    _plot_fold_series(axes[1, 1], fold_results, 'batch_size_history',
                      'Batch Size Evolution', 'Batch Size')

    plt.tight_layout()
    return fig

def plot_optuna_optimization_history(study):
    """Plot the objective value per trial and, when possible, parameter importances.

    The left panel plots ``value`` against ``number`` from
    ``study.trials_dataframe()``. The right panel shows
    ``optuna.importance.get_param_importances`` as horizontal bars when the
    study has more than 10 trials; otherwise, or if the importance computation
    fails, it shows an explanatory message instead.

    Args:
        study: Optuna study to visualise.

    Returns:
        The matplotlib Figure; the caller is responsible for closing it.
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # Plot 1: Optimization history
    trials = study.trials_dataframe()
    axes[0].plot(trials['number'], trials['value'], 'b-', alpha=0.7)
    axes[0].set_xlabel('Trial')
    axes[0].set_ylabel('Objective Value (CV Loss)')
    axes[0].set_title('Optimization History')
    axes[0].grid(True)

    # Plot 2: Parameter importance (if enough trials)
    if len(trials) > 10:
        try:
            importance = optuna.importance.get_param_importances(study)
            params = list(importance.keys())
            values = list(importance.values())

            axes[1].barh(params, values)
            axes[1].set_xlabel('Importance')
            axes[1].set_title('Parameter Importance')
        except Exception:
            # Best-effort panel: fall back to a message on any ordinary error,
            # but let KeyboardInterrupt / SystemExit propagate.
            axes[1].text(0.5, 0.5, 'Parameter importance\nnot available',
                        ha='center', va='center', transform=axes[1].transAxes)
    else:
        axes[1].text(0.5, 0.5, 'Not enough trials for\nparameter importance',
                    ha='center', va='center', transform=axes[1].transAxes)

    plt.tight_layout()
    return fig

# ====================================================================
# 7. MLFLOW INTEGRATION
# ====================================================================

def setup_mlflow(experiment_name="neural_network_optimization"):
    """Select ``experiment_name`` as the active MLflow experiment and announce it."""
    mlflow.set_experiment(experiment_name)
    announcement = f"MLflow experiment: {experiment_name}"
    print(announcement)

def log_cv_results_to_mlflow(cv_results, trial_number=None):
    """Log cross-validation metrics and the summary figure to the active MLflow run.

    Args:
        cv_results: Dict with ``'avg_cv_loss'``, ``'std_cv_loss'`` and
            ``'fold_results'`` as returned by ``perform_kfold_cv``.
        trial_number: Optional trial index used in the figure's artifact name.
            ``None`` means "not part of a trial"; 0 is a valid trial number.
    """
    # Log metrics
    mlflow.log_metric("avg_cv_loss", cv_results['avg_cv_loss'])
    mlflow.log_metric("std_cv_loss", cv_results['std_cv_loss'])

    # Log individual fold results
    for i, fold_result in enumerate(cv_results['fold_results']):
        mlflow.log_metric(f"fold_{i+1}_final_loss", fold_result['final_val_loss'])

    # Compare against None explicitly: a truthiness test would drop trial 0
    if trial_number is None:
        artifact_name = "cv_results.png"
    else:
        artifact_name = f"cv_results_trial_{trial_number}.png"

    # Create and log plots; always release the figure, even if logging fails
    fig = plot_cv_results(cv_results)
    try:
        mlflow.log_figure(fig, artifact_name)
    finally:
        plt.close(fig)

# ====================================================================
# 8. OPTUNA INTEGRATION
# ====================================================================

def create_optuna_objective(X_train, y_train, k_folds=4, max_epochs=100):
    """Create the Optuna objective: mean K-fold validation loss for sampled settings.

    Args:
        X_train: Training features passed to ``perform_kfold_cv``.
        y_train: Training targets passed to ``perform_kfold_cv``.
        k_folds: Number of cross-validation folds per trial.
        max_epochs: Maximum training epochs per fold.

    Returns:
        A function ``objective(trial)`` that samples hyperparameters, runs
        cross-validation inside a nested MLflow run named ``trial_<number>``,
        logs parameters and results, and returns the average CV loss.
    """

    def objective(trial):
        # Suggest hyperparameters
        model_params = {
            'neurons': trial.suggest_int('neurons', 8, 64),
            'dropout': trial.suggest_float('dropout', 0.0, 0.5)
        }

        adaptive_params = {
            'initial_lr': trial.suggest_float('initial_lr', 1e-4, 1e-1, log=True),
            'lr_min': trial.suggest_float('lr_min', 1e-6, 1e-3, log=True),
            'initial_batch_size': trial.suggest_categorical('initial_batch_size', [8, 16, 32]),
            'grad_threshold': trial.suggest_float('grad_threshold', 0.0001, 0.01, log=True),
            'batch_multiplier': trial.suggest_float('batch_multiplier', 1.5, 3.0),
            'lr_multiplier': trial.suggest_float('lr_multiplier', 0.3, 0.8),
            'patience': trial.suggest_int('patience', 5, 20)
        }

        # The two sampling ranges overlap (lr_min up to 1e-3, initial_lr down to
        # 1e-4), so the floor can come out above the starting rate. Clamp it so
        # the floor is never higher than the rate training starts from. The
        # search space itself stays static; MLflow receives the clamped value.
        adaptive_params['lr_min'] = min(adaptive_params['lr_min'],
                                        adaptive_params['initial_lr'])

        # Start MLflow run for this trial
        with mlflow.start_run(run_name=f"trial_{trial.number}", nested=True):
            # Log trial parameters
            mlflow.log_params({**model_params, **adaptive_params})
            mlflow.log_param("trial_number", trial.number)

            # Perform cross-validation
            cv_results = perform_kfold_cv(
                X_train, y_train,
                model_params, adaptive_params,
                k_folds=k_folds, max_epochs=max_epochs
            )

            # Log results to MLflow
            log_cv_results_to_mlflow(cv_results, trial.number)

            return cv_results['avg_cv_loss']

    return objective

def run_optuna_optimization(X_train, y_train, n_trials=50, seed=None):
    """Run an Optuna TPE search under a parent MLflow run and return the study.

    Args:
        X_train: Training features handed to the objective.
        y_train: Training targets handed to the objective.
        n_trials: Number of trials to run.
        seed: Optional seed for the TPE sampler. ``None`` leaves the sampler
            unseeded.

    Returns:
        The ``optuna.Study`` after optimisation. If no trial completed, the
        best-result logging and the history plot are skipped.
    """
    # Pin the sampler explicitly so the 'TPE' value logged below is accurate
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='minimize', sampler=sampler)

    # Create objective
    objective = create_optuna_objective(X_train, y_train)

    # Start parent MLflow run
    with mlflow.start_run(run_name="optuna_hyperparameter_search"):
        # Log study parameters
        mlflow.log_params({
            'n_trials': n_trials,
            'direction': 'minimize',
            'sampler': 'TPE'
        })

        # Optimize
        print(f"Starting Optuna optimization with {n_trials} trials...")
        study.optimize(objective, n_trials=n_trials)

        # study.best_* raise ValueError when nothing completed, so check first
        has_completed_trial = any(
            t.state == optuna.trial.TrialState.COMPLETE for t in study.trials
        )
        if not has_completed_trial:
            print("No trials completed; skipping best-result logging.")
            return study

        # Log best results
        mlflow.log_params({f"best_{k}": v for k, v in study.best_params.items()})
        mlflow.log_metric("best_cv_loss", study.best_value)

        # Create and log optimization plots; always release the figure
        fig = plot_optuna_optimization_history(study)
        try:
            mlflow.log_figure(fig, "optuna_optimization_history.png")
        finally:
            plt.close(fig)

        print(f"Best trial: {study.best_trial.number}")
        print(f"Best CV loss: {study.best_value:.6f}")
        print(f"Best parameters: {study.best_params}")

    return study

# ====================================================================
# 9. MAIN EXECUTION
# ====================================================================

def main():
    """Run the full pipeline: MLflow setup, data preparation, Optuna search.

    Returns:
        ``(study, X_train, X_val, X_test, y_train, y_val, y_test, scaler)``.
    """
    print("=== Neural Network Hyperparameter Optimization ===")

    # Select the MLflow experiment
    setup_mlflow("neural_network_adaptive_learning")

    # Build the scaled train/val/test splits
    print("\n1. Loading and preparing data...")
    features, targets = load_and_prepare_data()
    splits = split_and_scale_data(features, targets)
    train_features, train_targets = splits[0], splits[3]

    # Hyperparameter search uses the training split only
    print("\n2. Starting hyperparameter optimization...")
    study = run_optuna_optimization(train_features, train_targets, n_trials=20)  # Adjust n_trials as needed

    print("\n=== Optimization Complete ===")
    print(f"Best parameters: {study.best_params}")
    print(f"Best CV loss: {study.best_value:.6f}")

    # splits is (X_train, X_val, X_test, y_train, y_val, y_test, scaler)
    return (study, *splits)

# ====================================================================
# 10. EXAMPLE SINGLE TRIAL (FOR TESTING)
# ====================================================================

def test_single_trial():
    """Run one cross-validation with fixed hyperparameters as a smoke test.

    Logs parameters and results to the ``test_single_trial`` MLflow experiment
    and returns the cross-validation result dict.
    """
    print("=== Testing Single Trial ===")

    # Prepare data; only the training split is used here
    features, targets = load_and_prepare_data()
    splits = split_and_scale_data(features, targets)
    train_features, train_targets = splits[0], splits[3]

    # Fixed settings for the network ...
    model_params = dict(neurons=16, dropout=0.2)

    # ... and for the adaptive schedule
    adaptive_params = dict(
        initial_lr=0.01,
        lr_min=0.0001,
        initial_batch_size=16,
        grad_threshold=0.001,
        batch_multiplier=2.0,
        lr_multiplier=0.5,
        patience=10,
    )

    setup_mlflow("test_single_trial")

    with mlflow.start_run(run_name="single_test_trial"):
        # Record every hyperparameter in one call
        mlflow.log_params(dict(model_params, **adaptive_params))

        cv_results = perform_kfold_cv(
            train_features, train_targets,
            model_params, adaptive_params,
            k_folds=4, max_epochs=50
        )

        log_cv_results_to_mlflow(cv_results)

        print(f"Test trial CV loss: {cv_results['avg_cv_loss']:.6f}")

    return cv_results

# ====================================================================
# RUN THE CODE
# ====================================================================

if __name__ == "__main__":
    # Entry point: keep exactly one of the two calls below active.

    # Quick smoke test with fixed hyperparameters:
    # test_results = test_single_trial()

    # Full hyperparameter search; results are bound to module-level names:
    (study,
     X_train, X_val, X_test,
     y_train, y_val, y_test,
     scaler) = main()