# In [None]:  (notebook cell marker left over from the export)
##################################################################################################################################################
# Importing libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import gc
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold
import numpy as np
import os
import pandas as pd
import random
import time
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature
import sys
import pprint
import optuna
##################################################################################################################################################
# Logger
%load_ext dotenv
%dotenv
sys.path.append("./loggers/")
from utils.logger import get_logger
_logs = get_logger(__name__)
os.chdir('./experiments/neural_networks') if not load_dotenv() else None
load_dotenv()
_logs.info(f"load_dotenv() returned: {load_dotenv()}")
##################################################################################################################################################
# Load csv into df

# --- Ensure consistent working directory for data loading ---
# Walk upwards from the current working directory until a directory containing
# `.git` is found, then make that directory the working directory so the
# relative data path below resolves the same way for every collaborator.

current_dir = os.getcwd()
repo_root = current_dir
while not os.path.exists(os.path.join(repo_root, '.git')):
    # Move up one directory
    parent_dir = os.path.dirname(repo_root)
    if parent_dir == repo_root:  # dirname() stopped changing: filesystem root reached, .git not found
        raise FileNotFoundError(
            "Could not find the .git directory. "
            "Please ensure you are running this code from within a Git repository."
        )
    repo_root = parent_dir

# Change the current working directory if it's not already the repo root
if os.getcwd() != repo_root:
    os.chdir(repo_root)
    _logs.info(f"Working directory set to: {os.getcwd()}")


# --- Data Loading ---
# Path to the data file, relative to the repository root.
data_file_name = 'df_eng_customer_purchasing_features.csv'
data_file_path = os.path.join('data', 'processed', data_file_name)

# Every later cell needs `df`. A failed load is therefore logged as an error and
# re-raised instead of being swallowed: continuing would either hit a NameError
# further down or, in a live notebook session, silently reuse a stale `df`.
try:
    df = pd.read_csv(data_file_path)
except FileNotFoundError:
    _logs.error(f"Error: The file '{data_file_name}' was not found at '{data_file_path}'.")
    _logs.error("Please ensure it exists in the 'data/processed/' folder relative to the repository root.")
    raise
except Exception as e:
    _logs.error(f"An error occurred during data loading: {e}")
    raise
else:
    _logs.info(f"Successfully loaded '{data_file_name}' into the DataFrame named df.")
##################################################################################################################################################
# Change the working directory to the neural-network experiment folder.
# The path is interpolated into the message itself: passing it as a second
# positional argument would be treated as a %-format argument, and with no
# placeholder in the message the logging call fails to format.
_logs.info(f"Current directory: {os.getcwd()}")
# Change to new directory
os.chdir("./experiments/neural_networks")
# Verify the change
_logs.info(f"New directory: {os.getcwd()}")
##################################################################################################################################################
# Hold-out split: 80% of the rows stay in `df` for the K-fold work in this notebook;
# the remaining 20% (X_holdout / y_holdout) are set aside and not used here.
_target_columns = ['purchase_amount', 'log_purchase_amount']
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df.drop(columns=_target_columns),
    df[_target_columns],
    test_size=0.2,
    random_state=42,
)
# Rebuild `df` from the training portion only (features followed by both target columns).
df = pd.concat([X_train, y_train], axis=1)
##################################################################################################################################################
# Build model inputs: standardize the numeric features, one-hot encode the
# categorical feature, and convert features and targets to float32 tensors.

# Candidate columns and target names
numerical_cols = ['age', 'annual_income', 'loyalty_score', 'purchase_frequency', 'log_annual_income', 'log_purchase_frequency']
categorical_cols = ['region_grouped']
target_col = 'purchase_amount'
target_col_log = 'log_purchase_amount'

# Keep only the candidates that are present in df
existing_numerical_cols = [name for name in numerical_cols if name in df.columns]
existing_categorical_cols = [name for name in categorical_cols if name in df.columns]

# Numeric block: fit the scaler on the training rows, then transform them
scaler = StandardScaler()
scaler.fit(df[existing_numerical_cols])
X_num = scaler.transform(df[existing_numerical_cols])

# Categorical block: dense one-hot encoding
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(df[existing_categorical_cols])
X_cat = encoder.transform(df[existing_categorical_cols])

# Wrap both blocks in DataFrames so every column keeps a distinct name
cat_columns = encoder.get_feature_names_out(existing_categorical_cols)
num_df = pd.DataFrame(data=X_num, columns=existing_numerical_cols)
cat_df = pd.DataFrame(data=X_cat, columns=cat_columns)

# Both frames carry a fresh default index, so a column-wise concat lines rows up by position
combined_df = pd.concat([num_df, cat_df], axis=1)

# Feature tensor (replaces the X_train DataFrame produced by the hold-out split)
X_train = torch.tensor(combined_df.to_numpy(), dtype=torch.float32)

# Target tensors, reshaped to a single column
y_train = torch.tensor(df[target_col].to_numpy(), dtype=torch.float32).reshape(-1, 1)
y_train_log = torch.tensor(df[target_col_log].to_numpy(), dtype=torch.float32).reshape(-1, 1)

# Report dataset sizes
_logs.info("Dataset sizes:")
_logs.info(f"  Train (for KFold CV): {X_train.shape[0]}")
_logs.info(f"  Holdout (not used): {len(X_holdout)}")
##################################################################################################################################################
# Gradient-Based Adaptive Batch Size Class
class GradientAdaptiveBatchingLearning:
    """Adjust batch size and learning rate when the tracked loss flattens out.

    Every call to ``update_params`` appends one loss to ``loss_history``. Once
    at least ``history_window`` losses exist, the mean first difference of the
    most recent ``history_window`` losses is used as a slope estimate. When its
    absolute value is below ``grad_threshold`` and at least
    ``min_epochs_between_updates`` losses have been recorded since the previous
    adjustment, the schedule:

    * multiplies the batch size by ``batch_multiplier`` (truncated to int and
      capped at ``max_batch_size``),
    * multiplies the learning rate by ``lr_multiplier`` (floored at ``lr_min``),
    * multiplies ``grad_threshold`` by ``grad_threshold_multiplier``.

    Args:
        initial_batch_size: Starting batch size.
        initial_lr: Starting learning rate.
        grad_threshold: Slope magnitude below which an adjustment is triggered.
        batch_multiplier: Factor applied to the batch size on each adjustment.
        lr_multiplier: Factor applied to the learning rate on each adjustment.
        grad_threshold_multiplier: Factor applied to ``grad_threshold`` after
            each adjustment.
        lr_min: Lower bound for the learning rate.
        max_batch_size: Upper bound for the batch size.
        min_epochs_between_updates: Minimum number of recorded losses between
            two adjustments (default 100, the previously hard-coded value).
        history_window: Number of most recent losses used for the slope
            estimate (default 5, the previously hard-coded value).

    Raises:
        ValueError: If ``history_window`` is smaller than 2 (a first difference
            needs at least two points).
    """

    def __init__(self, initial_batch_size=16, initial_lr=0.001,
                 grad_threshold=0.01, batch_multiplier=1.4, lr_multiplier=0.8, grad_threshold_multiplier=0.25, lr_min=0.0005, max_batch_size=142,
                 min_epochs_between_updates=100, history_window=5):
        if history_window < 2:
            raise ValueError("history_window must be at least 2")
        self.current_batch_size = initial_batch_size
        self.current_lr = initial_lr
        self.grad_threshold = grad_threshold
        self.batch_multiplier = batch_multiplier
        self.lr_multiplier = lr_multiplier
        self.grad_threshold_multiplier = grad_threshold_multiplier
        self.lr_min = lr_min
        self.max_batch_size = max_batch_size
        self.min_epochs_between_updates = min_epochs_between_updates
        self.history_window = history_window
        self.loss_history = []
        # Length of loss_history at the time of the most recent adjustment.
        self.update_count = 0

    def update_params(self, current_loss, epoch):
        """Record ``current_loss`` and return the (possibly updated) schedule.

        Args:
            current_loss: Loss value for the epoch that just finished.
            epoch: Epoch index; used only in the log messages.

        Returns:
            Tuple ``(current_batch_size, current_lr)``.
        """
        self.loss_history.append(current_loss)
        n_recorded = len(self.loss_history)

        # Not enough history yet for a slope estimate.
        if n_recorded < self.history_window:
            return self.current_batch_size, self.current_lr

        # Mean first difference over the most recent losses.
        recent_losses = self.loss_history[-self.history_window:]
        gradient = np.mean(np.diff(recent_losses))

        # Stability check: don't adjust too frequently.
        epochs_since_update = n_recorded - self.update_count

        if abs(gradient) < self.grad_threshold and epochs_since_update >= self.min_epochs_between_updates:
            # Remember the real previous values for the log; dividing the new
            # values by the multipliers is wrong after truncation or clamping.
            previous_batch_size = self.current_batch_size
            previous_lr = self.current_lr

            self.current_batch_size = min(
                int(previous_batch_size * self.batch_multiplier),
                self.max_batch_size
            )
            # Decrease learning rate when increasing batch size
            self.current_lr = max(previous_lr * self.lr_multiplier, self.lr_min)
            self.update_count = n_recorded

            _logs.info(f"Epoch {epoch}: Gradient = {gradient:.6f} < {self.grad_threshold}")
            _logs.info(f"  → Increased batch size: {previous_batch_size} → {self.current_batch_size}")
            _logs.info(f"  → Decreased learning rate: {previous_lr:.6f} → {self.current_lr:.6f}")

            # Tighten the threshold so the next adjustment needs a flatter curve.
            self.grad_threshold = self.grad_threshold * self.grad_threshold_multiplier

        return self.current_batch_size, self.current_lr
##################################################################################################################################################
# Define the Neural Network Model
class FeedforwardNN(nn.Module):
    """Feed-forward network with one mandatory and up to three optional hidden layers.

    Each hidden layer is a ``Linear -> ReLU -> Dropout`` triple. The first
    hidden layer is always built; the 2nd, 3rd and 4th are built only when
    their width is greater than 0. A final ``Linear`` maps the last hidden
    width to ``width_output_layer``. All modules are stored in order in
    ``self.layers`` (an ``nn.Sequential``).

    Args:
        input_size: Number of input features.
        width_1st_layer: Width of the first hidden layer.
        width_2nd_layer, width_3rd_layer, width_4th_layer: Widths of the
            optional hidden layers; a value <= 0 skips that layer.
        width_output_layer: Number of output units.
        dropout1..dropout4: Dropout probability after the matching hidden layer.
    """

    def __init__(self, input_size, width_1st_layer, width_2nd_layer, width_3rd_layer, 
                 width_4th_layer, width_output_layer, dropout1, dropout2, dropout3, dropout4):
        super().__init__()

        # (width, dropout probability, build this layer?) in network order.
        hidden_specs = (
            (width_1st_layer, dropout1, True),
            (width_2nd_layer, dropout2, width_2nd_layer > 0),
            (width_3rd_layer, dropout3, width_3rd_layer > 0),
            (width_4th_layer, dropout4, width_4th_layer > 0),
        )

        modules = []
        in_features = input_size
        for width, drop_p, enabled in hidden_specs:
            if not enabled:
                continue
            modules += [nn.Linear(in_features, width), nn.ReLU(), nn.Dropout(drop_p)]
            in_features = width

        # Output projection from the last hidden layer that was built.
        modules.append(nn.Linear(in_features, width_output_layer))

        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the result."""
        return self.layers(x)
##################################################################################################################################################
def init_weights(m):
    '''Initialise one module in place; written to be passed to ``model.apply``.

    ``nn.Linear`` modules get Xavier-uniform weights and, when a bias exists,
    a constant bias of 0.01. Any other module type is left untouched. The
    Xavier draw uses torch's global RNG, so results repeat only when the
    caller seeds torch beforehand.
    '''
    if not isinstance(m, nn.Linear):
        return
    nn.init.xavier_uniform_(m.weight)
    if m.bias is None:
        return
    nn.init.constant_(m.bias, 0.01)
##################################################################################################################################################
# Metric computation functions
def compute_rmse(predictions, targets):
    """Return the root of the mean squared error between ``predictions`` and ``targets``."""
    mean_squared_error = F.mse_loss(predictions, targets)
    return mean_squared_error.sqrt()

def compute_mae(predictions, targets):
    """Return the mean absolute difference between ``predictions`` and ``targets``."""
    absolute_errors = (predictions - targets).abs()
    return absolute_errors.mean()

def compute_mape(predictions, targets, eps=1e-8):
    """Return the mean absolute percentage error, in percent.

    Args:
        predictions: Predicted values.
        targets: True values.
        eps: Lower bound applied to ``|targets|`` in the denominator. Without
            it a single zero target turns the whole mean into inf or NaN; with
            it such rows contribute a very large but finite percentage.
            Targets with magnitude >= ``eps`` are unaffected.

    Returns:
        Tensor holding ``mean(|predictions - targets| / max(|targets|, eps) * 100)``.
    """
    denominator = torch.clamp(torch.abs(targets), min=eps)
    return torch.mean(torch.abs(predictions - targets) / denominator * 100)

def compute_r2(predictions, targets):
    """Return the coefficient of determination: 1 - SS_res / SS_tot.

    SS_res is the sum of squared residuals; SS_tot is the sum of squared
    deviations of ``targets`` from their mean.
    """
    residual_ss = (targets - predictions).pow(2).sum()
    total_ss = (targets - targets.mean()).pow(2).sum()
    return 1 - residual_ss / total_ss
##################################################################################################################################################
def objective(trial, X_train, y_train, data_file_name="default_dataset"):
    """Optuna objective: mean K-fold validation MSE for one sampled configuration.

    Samples the network depth, layer widths, dropout rates and the adaptive
    batch-size / learning-rate schedule from ``trial``, then trains a fresh
    ``FeedforwardNN`` with Adam and MSE loss on each of 3 shuffled folds
    (at most 10000 epochs, early stopping after 5000 epochs without a
    validation improvement). After each fold the model is evaluated once on the
    whole validation fold; the mean of those per-fold losses is returned.

    The fully resolved configuration, including values that are fixed or
    skipped for this trial and therefore absent from ``trial.params``
    (``grad_threshold``, unused layer widths/dropouts), is stored on the trial
    under the user attribute ``"resolved_params"``.

    Args:
        trial: Optuna trial used for sampling, intermediate reporting and pruning.
        X_train: Feature tensor; rows are selected with the KFold index arrays.
        y_train: Target tensor, row-aligned with ``X_train``.
        data_file_name: Dataset label recorded in ``"resolved_params"``.

    Returns:
        Mean over folds of the final validation MSE.

    Raises:
        optuna.exceptions.TrialPruned: When ``trial.should_prune()`` is true
            after an epoch's validation loss has been reported.
    """
    # First decide the architecture depth
    n_hidden_layers = trial.suggest_categorical('n_hidden_layers', [1, 2, 3, 4])

    # The first hidden layer always exists
    width_1st_layer = trial.suggest_categorical('width_1st_layer', [8, 16, 32, 64])
    dropout1 = trial.suggest_float('dropout1', 0.0, 0.5, step=0.1)

    # Deeper layers are sampled only when the chosen depth includes them;
    # a width of 0 tells FeedforwardNN to skip the layer.
    if n_hidden_layers >= 2:
        width_2nd_layer = trial.suggest_categorical('width_2nd_layer', [4, 8, 16, 32])
        dropout2 = trial.suggest_float('dropout2', 0.0, 0.5, step=0.1)
    else:
        width_2nd_layer = 0
        dropout2 = 0.0

    if n_hidden_layers >= 3:
        width_3rd_layer = trial.suggest_categorical('width_3rd_layer', [2, 4, 8, 16])
        dropout3 = trial.suggest_float('dropout3', 0.0, 0.5, step=0.1)
    else:
        width_3rd_layer = 0
        dropout3 = 0.0

    if n_hidden_layers >= 4:
        width_4th_layer = trial.suggest_categorical('width_4th_layer', [2, 4, 8])
        dropout4 = trial.suggest_float('dropout4', 0.0, 0.5, step=0.1)
    else:
        width_4th_layer = 0
        dropout4 = 0.0

    width_output_layer = 1  # Keep fixed for regression

    # Adaptive batching parameters
    initial_lr = trial.suggest_float('initial_lr', 0.005, 0.01)
    initial_batch_size = trial.suggest_categorical('initial_batch_size', [8, 16, 32])
    # grad_threshold is held fixed rather than tuned, so it is not part of trial.params.
    grad_threshold = 1
    batch_multiplier = trial.suggest_float('batch_multiplier', 1.5, 2.5)
    lr_multiplier = trial.suggest_float('lr_multiplier', 0.4, 0.7)
    grad_threshold_multiplier = trial.suggest_categorical('grad_threshold_multiplier', [0.1, 0.25, 0.4, 0.55])
    lr_min = trial.suggest_float('lr_min', 0.0001, 0.001)

    # Fixed parameters
    n_folds = 3
    epoch_max = 10000
    patience = 5000

    params = {
        "dataset": data_file_name,
        "width_1st_layer": width_1st_layer,
        "dropout1": dropout1,
        "dropout2": dropout2,
        "dropout3": dropout3,
        "dropout4": dropout4,
        "width_2nd_layer": width_2nd_layer,
        "width_3rd_layer": width_3rd_layer,
        "width_4th_layer": width_4th_layer,
        "width_output_layer": width_output_layer,
        "activation": "relu",
        "criterion": "Mean Squared Error (MSE) loss",
        "optimizer": "adam",
        "adaptive_batching_learning_rate": True,
        "initial_lr": initial_lr,
        "initial_batch_size": initial_batch_size,
        # Size of the full training tensor; the cap actually applied inside
        # each fold is the size of that fold's training split.
        "max_batch_size": len(X_train),
        "grad_threshold": grad_threshold,
        "batch_multiplier": batch_multiplier,
        "lr_multiplier": lr_multiplier,
        "grad_threshold_multiplier": grad_threshold_multiplier,
        "lr_min": lr_min,
        "epoch_max": epoch_max,
        "trial_number": trial.number
    }
    # Keep the complete configuration with the trial so it can be recovered
    # later from the study.
    trial.set_user_attr("resolved_params", params)

    # KFold setup
    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    # Final validation loss of each fold
    fold_val_scores = []

    # KFold cross-validation loop
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train)):
        # Create fold-specific train/validation splits
        X_train_fold = X_train[train_idx]
        y_train_fold = y_train[train_idx]
        X_val_fold = X_train[val_idx]
        y_val_fold = y_train[val_idx]

        # Re-seed every RNG so each fold starts from the same random state
        SEED = 42
        random.seed(SEED)
        np.random.seed(SEED)
        torch.manual_seed(SEED)
        torch.cuda.manual_seed(SEED)
        torch.cuda.manual_seed_all(SEED)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        # Initialize model
        model = FeedforwardNN(
            input_size=X_train_fold.shape[1],
            width_1st_layer=width_1st_layer,
            width_2nd_layer=width_2nd_layer,
            width_3rd_layer=width_3rd_layer,
            width_4th_layer=width_4th_layer,
            width_output_layer=width_output_layer,
            dropout1=dropout1,
            dropout2=dropout2,
            dropout3=dropout3,
            dropout4=dropout4
        )
        model.apply(init_weights)

        # Setup device
        device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"

        # Move data to device
        X_train_fold = X_train_fold.to(device)
        y_train_fold = y_train_fold.to(device)
        X_val_fold = X_val_fold.to(device)
        y_val_fold = y_val_fold.to(device)
        model = model.to(device)

        # Initialize adaptive batch trainer
        adaptive_trainer = GradientAdaptiveBatchingLearning(
            initial_batch_size=initial_batch_size,
            initial_lr=initial_lr,
            grad_threshold=grad_threshold,
            batch_multiplier=batch_multiplier,
            lr_multiplier=lr_multiplier,
            grad_threshold_multiplier=grad_threshold_multiplier,
            lr_min=lr_min,
            max_batch_size=len(X_train_fold)
        )

        # Create DataLoaders
        train_dataset = TensorDataset(X_train_fold, y_train_fold)
        val_dataset = TensorDataset(X_val_fold, y_val_fold)

        train_loader = DataLoader(train_dataset, batch_size=initial_batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=initial_batch_size, shuffle=False)

        # Training setup
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=adaptive_trainer.current_lr)

        best_val_loss = float('inf')
        trigger_times = 0
        min_delta = 0

        # Training loop
        for epoch in range(epoch_max):
            # Training phase
            model.train()
            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                output = model(batch_X)
                loss = criterion(output, batch_y)
                loss.backward()
                optimizer.step()

            # Validation phase: mean of the per-batch validation losses
            model.eval()
            epoch_val_loss = 0.0
            num_val_batches = 0

            with torch.no_grad():
                for batch_X, batch_y in val_loader:
                    val_output = model(batch_X)
                    val_loss = criterion(val_output, batch_y)
                    epoch_val_loss += val_loss.item()
                    num_val_batches += 1

            avg_val_loss = epoch_val_loss / num_val_batches

            # Adaptive batch size and learning rate update
            new_batch_size, new_lr = adaptive_trainer.update_params(avg_val_loss, epoch)

            # Update optimizer learning rate if changed
            if new_lr != optimizer.param_groups[0]['lr']:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = new_lr

            # Create new dataloaders if batch size changed
            if new_batch_size != train_loader.batch_size:
                train_loader = DataLoader(train_dataset, batch_size=new_batch_size, shuffle=True)
                val_loader = DataLoader(val_dataset, batch_size=new_batch_size, shuffle=False)

            # Early stopping logic
            if avg_val_loss + min_delta < best_val_loss:
                best_val_loss = avg_val_loss
                trigger_times = 0
            else:
                trigger_times += 1
                if trigger_times >= patience:
                    break

            # Report intermediate results to Optuna for pruning. The step must
            # be unique within the trial: a value reported for a step that was
            # already used is ignored, so reusing `epoch` in every fold would
            # leave pruning inactive after the first fold.
            trial.report(avg_val_loss, fold * epoch_max + epoch)

            # Handle pruning
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()

        # Final validation score for this fold (whole validation fold in one pass)
        model.eval()
        with torch.no_grad():
            val_output = model(X_val_fold)
            final_val_loss = criterion(val_output, y_val_fold)
            fold_val_scores.append(final_val_loss.item())

        # Clean up
        del model
        gc.collect()
        torch.cuda.empty_cache()

    # Return mean validation score across all folds
    mean_val_score = np.mean(fold_val_scores)
    return mean_val_score
##################################################################################################################################################
def run_optuna_optimization(X_train, y_train, data_file_name="default_dataset", 
                           n_trials=50, study_name="neural_net_optimization"):
    """Run the Optuna search over ``objective`` and log the best result to MLflow.

    Creates an in-memory, minimizing study (TPE sampler seeded with 42, median
    pruner), runs ``n_trials`` trials, logs a summary, and records the best
    parameters plus study statistics in a single MLflow run.

    Args:
        X_train: Feature tensor forwarded to ``objective``.
        y_train: Target tensor forwarded to ``objective``.
        data_file_name: Dataset label forwarded to ``objective``.
        n_trials: Number of trials to run.
        study_name: Name of the study; also used in the MLflow run name.

    Returns:
        The finished ``optuna.Study``.

    NOTE(review): ``study.best_trial`` / ``study.best_value`` are read
    unconditionally; presumably Optuna raises if no trial completed - confirm
    whether that case needs explicit handling.
    """
    # Set up MLflow (tracking URI and experiment name come from the environment, with local defaults)
    mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI", "file:./mlruns"))
    mlflow.set_experiment(os.getenv("MLFLOW_EXPERIMENT_NAME", "optuna_neural_net_optimization"))

    # Create Optuna study
    study = optuna.create_study(
        direction='minimize',
        study_name=study_name,
        storage=None,  # Use in-memory storage, can be changed to persistent storage
        pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=100),
        sampler=optuna.samplers.TPESampler(seed=42)
    )

    def _log_finished_trial(_study, finished_trial):
        # A finished trial can have no value (e.g. it ended in a failed state);
        # formatting None with ":.4f" would raise and abort the optimization.
        if finished_trial.value is None:
            _logs.info(f"Trial {finished_trial.number} finished without a value (state: {finished_trial.state.name})")
        else:
            _logs.info(f"Trial {finished_trial.number} finished with value: {finished_trial.value:.4f}")

    _logs.info(f"Starting Optuna optimization with {n_trials} trials...")
    start_time = time.time()

    # Optimize
    study.optimize(
        lambda trial: objective(trial, X_train, y_train, data_file_name),
        n_trials=n_trials,
        callbacks=[_log_finished_trial]
    )

    end_time = time.time()
    optimization_time = end_time - start_time

    _logs.info(f"\nOptimization completed in {optimization_time:.2f} seconds")
    _logs.info(f"Best trial: {study.best_trial.number}")
    _logs.info(f"Best value: {study.best_value:.4f}")
    _logs.info(f"Best parameters:")
    for key, value in study.best_params.items():
        _logs.info(f"  {key}: {value}")

    # Count trial outcomes once, from a single snapshot of the trial list
    all_trials = study.trials
    n_completed = sum(1 for t in all_trials if t.state == optuna.trial.TrialState.COMPLETE)
    n_pruned = sum(1 for t in all_trials if t.state == optuna.trial.TrialState.PRUNED)

    # Log best results to MLflow
    with mlflow.start_run(run_name=f"OPTUNA_BEST_{study_name}"):
        # Log best parameters
        mlflow.log_params(study.best_params)

        # Log best score
        mlflow.log_metric("best_val_loss", study.best_value)
        mlflow.log_metric("n_trials", n_trials)
        mlflow.log_metric("optimization_time_seconds", optimization_time)

        # Log study statistics
        mlflow.log_metric("n_completed_trials", n_completed)
        mlflow.log_metric("n_pruned_trials", n_pruned)

    return study
##################################################################################################################################################
def train_final_model_with_best_params(X_train, y_train, best_params, data_file_name="default_dataset"):
    """Train final model with best parameters found by Optuna"""
    
    _logs.info("\nTraining final model with best parameters...")
    start_time = time.time()

    # Extended training parameters for final model
    n_folds = 4
    epoch_max = 25000
    patience = 10000
    
    # Update parameters with best found values
    params = {
        "dataset": data_file_name,
        "width_1st_layer": best_params['width_1st_layer'],
        "dropout1": best_params['dropout1'],
        "width_2nd_layer": best_params['width_2nd_layer'],
        "dropout2": best_params['dropout2'],
        "width_3rd_layer": best_params['width_3rd_layer'],
        "dropout3": best_params['dropout3'],
        "width_4th_layer": best_params['width_4th_layer'],
        "dropout4": best_params['dropout4'],
        "width_output_layer": 1,
        "activation": "relu",
        "criterion": "Mean Squared Error (MSE) loss",
        "optimizer": "adam",
        "adaptive_batching_learning_rate": True,
        "initial_lr": best_params['initial_lr'],
        "initial_batch_size": best_params['initial_batch_size'],
        "max_batch_size": len(X_train),
        "grad_threshold": best_params['grad_threshold'],
        "batch_multiplier": best_params['batch_multiplier'],
        "lr_multiplier": best_params['lr_multiplier'],
        "grad_threshold_multiplier": best_params['grad_threshold_multiplier'],
        "lr_min": best_params['lr_min'],
        "epoch_max": epoch_max,
        "final_model": True
    }
    
    _logs.info(f"Final model parameters:")
    _logs.info(pprint.pformat(params, indent=2, width=80))
    
    # KFold setup
    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    
    # Store results for each fold
    fold_results = {
        'val_mse': [], 'val_rmse': [], 'val_mae': [], 'val_mape': [], 'val_r2': [],
        'train_mse': [], 'train_rmse': [], 'train_mae': [], 'train_mape': [], 'train_r2': []
    }
    
    # KFold cross-validation loop
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train)):
        _logs.info(f"\n--- Fold {fold + 1}/{n_folds} ---")
        
        # Create fold-specific train/validation splits
        X_train_fold = X_train[train_idx]
        y_train_fold = y_train[train_idx]
        X_val_fold = X_train[val_idx]
        y_val_fold = y_train[val_idx]

        # Make it reproducible
        SEED = 42
        random.seed(SEED)
        np.random.seed(SEED)
        torch.manual_seed(SEED)
        torch.cuda.manual_seed(SEED)
        torch.cuda.manual_seed_all(SEED)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        # Initialize model with best parameters
        model = FeedforwardNN(
            input_size=X_train_fold.shape[1],
            width_1st_layer=best_params['width_1st_layer'],
            width_2nd_layer=best_params['width_2nd_layer'],
            width_3rd_layer=best_params['width_3rd_layer'],
            width_4th_layer=best_params['width_4th_layer'],
            width_output_layer=1,
            dropout1=best_params['dropout1'],
            dropout2=best_params['dropout2'],
            dropout3=best_params['dropout3'],
            dropout4=best_params['dropout4']
        )
        model.apply(init_weights)

        # Setup device
        device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
        
        # Move data to device
        X_train_fold = X_train_fold.to(device)
        y_train_fold = y_train_fold.to(device)
        X_val_fold = X_val_fold.to(device)
        y_val_fold = y_val_fold.to(device)
        model = model.to(device)
        
        # Initialize adaptive batch trainer with best parameters
        adaptive_trainer = GradientAdaptiveBatchingLearning(
            initial_batch_size=best_params['initial_batch_size'],
            initial_lr=best_params['initial_lr'],
            grad_threshold=best_params['grad_threshold'],
            batch_multiplier=best_params['batch_multiplier'],
            lr_multiplier=best_params['lr_multiplier'],
            lr_min=best_params['lr_min'],
            max_batch_size=len(X_train_fold)
        )
        
        # Create DataLoaders
        train_dataset = TensorDataset(X_train_fold, y_train_fold)
        val_dataset = TensorDataset(X_val_fold, y_val_fold)
        
        train_loader = DataLoader(train_dataset, batch_size=best_params['initial_batch_size'], shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=best_params['initial_batch_size'], shuffle=False)
        
        # Training setup
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=adaptive_trainer.current_lr)
        
        best_val_loss = float('inf')
        trigger_times = 0
        min_delta = 0
        train_losses = []
        val_losses = []
        batch_size_history = []
        lr_history = []
        
        # Training loop with progress bar
        progress = tqdm(range(epoch_max), desc=f"Fold {fold+1} Training")
        for epoch in progress:
            # Training phase
            model.train()
            epoch_train_loss = 0.0
            num_train_batches = 0
            
            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                output = model(batch_X)
                loss = criterion(output, batch_y)
                loss.backward()
                optimizer.step()
                
                epoch_train_loss += loss.item()
                num_train_batches += 1
            
            avg_train_loss = epoch_train_loss / num_train_batches

            # Validation phase
            model.eval()
            epoch_val_loss = 0.0
            num_val_batches = 0
            
            with torch.no_grad():
                for batch_X, batch_y in val_loader:
                    val_output = model(batch_X)
                    val_loss = criterion(val_output, batch_y)
                    epoch_val_loss += val_loss.item()
                    num_val_batches += 1
            
            avg_val_loss = epoch_val_loss / num_val_batches
            
            # Adaptive batch size and learning rate update
            new_batch_size, new_lr = adaptive_trainer.update_params(avg_val_loss, epoch)
            
            # Update optimizer learning rate if changed
            if new_lr != optimizer.param_groups[0]['lr']:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = new_lr
            
            # Create new dataloader if batch size changed
            if new_batch_size != train_loader.batch_size:
                train_loader = DataLoader(train_dataset, batch_size=new_batch_size, shuffle=True)
                val_loader = DataLoader(val_dataset, batch_size=new_batch_size, shuffle=False)
            
            # Track history
            batch_size_history.append(new_batch_size)
            lr_history.append(new_lr)
            train_losses.append(avg_train_loss)
            val_losses.append(avg_val_loss)
            
            # Update progress display
            with torch.no_grad():
                sample_output = model(X_train_fold[:new_batch_size])
                sample_rmse = compute_rmse(sample_output, y_train_fold[:new_batch_size])
                progress.set_postfix({
                    "Loss": avg_train_loss, 
                    "RMSE": sample_rmse.item(),
                    "BatchSize": new_batch_size,
                    "LR": f"{new_lr:.6f}"
                })
            
            # Early stopping logic
            if avg_val_loss + min_delta < best_val_loss:
                best_val_loss = avg_val_loss
                trigger_times = 0
            else:
                trigger_times += 1
                if trigger_times >= patience:
                    _logs.info(f"\n⏹️ Early stopping at epoch {epoch} — no validation improvement after {patience} epochs.")
                    break
            
            # Periodic logging
            if epoch % 10000 == 0:
                model.eval()
                with torch.no_grad():
                    train_output = model(X_train_fold)
                    val_output = model(X_val_fold)
                    train_loss_full = criterion(train_output, y_train_fold)
                    val_loss_full = criterion(val_output, y_val_fold)
                    
                    rmse = compute_rmse(train_output, y_train_fold)
                    mae = compute_mae(train_output, y_train_fold)
                    r2 = compute_r2(train_output, y_train_fold)
                    val_rmse = compute_rmse(val_output, y_val_fold)
                    val_mae = compute_mae(val_output, y_val_fold)
                    val_r2 = compute_r2(val_output, y_val_fold)
                    
                _logs.info(f"\nEpoch {epoch}:")
                _logs.info(f"Train → MSE = {train_loss_full.item():.4f}, RMSE = {rmse.item():.4f}, MAE = {mae.item():.4f}, R² = {r2.item():.4f}")
                _logs.info(f"Val   → MSE = {val_loss_full.item():.4f}, RMSE = {val_rmse.item():.4f}, MAE = {val_mae.item():.4f}, R² = {val_r2.item():.4f}")
        
        # Final evaluation
        model.eval()
        with torch.no_grad():
            train_output = model(X_train_fold)
            val_output = model(X_val_fold)
            
            # Training metrics
            training_mse = criterion(train_output, y_train_fold)
            training_rmse = compute_rmse(train_output, y_train_fold)
            training_mae = compute_mae(train_output, y_train_fold)
            training_mape = compute_mape(train_output, y_train_fold)
            training_r2_score = compute_r2(train_output, y_train_fold)
        
            # Validation metrics
            val_mse = criterion(val_output, y_val_fold)
            val_rmse = compute_rmse(val_output, y_val_fold)
            val_mae = compute_mae(val_output, y_val_fold)
            val_mape = compute_mape(val_output, y_val_fold)
            val_r2_score = compute_r2(val_output, y_val_fold)
            
            _logs.info(f"\nFold {fold+1} Final Results:")
            _logs.info(f"Train → MSE = {training_mse.item():.4f}, RMSE = {training_rmse.item():.4f}, MAE = {training_mae.item():.4f}, R² Score = {training_r2_score.item():.4f}")
            _logs.info(f"Val   → MSE = {val_mse.item():.4f}, RMSE = {val_rmse.item():.4f}, MAE = {val_mae.item():.4f}, R² Score = {val_r2_score.item():.4f}")

        # Store fold results
        fold_results['train_mse'].append(training_mse.item())
        fold_results['train_rmse'].append(training_rmse.item())
        fold_results['train_mae'].append(training_mae.item())
        fold_results['train_mape'].append(training_mape.item())
        fold_results['train_r2'].append(training_r2_score.item())
        
        fold_results['val_mse'].append(val_mse.item())
        fold_results['val_rmse'].append(val_rmse.item())
        fold_results['val_mae'].append(val_mae.item())
        fold_results['val_mape'].append(val_mape.item())
        fold_results['val_r2'].append(val_r2_score.item())

        # Create visualization
        start = 100 
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
        
        # Loss curves
        ax1.plot(range(start, len(train_losses)), train_losses[start:], label='Training Loss', color='blue')
        ax1.plot(range(start, len(val_losses)), val_losses[start:], label='Validation Loss', color='orange')
        ax1.set_xlabel("Epoch")
        ax1.set_ylabel("Loss")
        ax1.set_title(f"Loss Curves - Fold {fold+1}")
        ax1.legend()
        ax1.grid(True)
        
        # Batch size evolution
        ax2.plot(batch_size_history, color='green')
        ax2.set_xlabel("Epoch")
        ax2.set_ylabel("Batch Size")
        ax2.set_title(f"Batch Size Evolution - Fold {fold+1}")
        ax2.grid(True)
        
        # Learning rate evolution
        ax3.plot(lr_history, color='red')
        ax3.set_xlabel("Epoch")
        ax3.set_ylabel("Learning Rate")
        ax3.set_title(f"Learning Rate Evolution - Fold {fold+1}")
        ax3.grid(True)
        
        # Validation loss gradient
       if len(val_losses) > 50:
            val_gradients = np.diff(val_losses)
            ax4.plot(val_gradients[-min(10000, len(val_gradients)):], color='orange', alpha=0.7)
            ax4.axhline(y=best_params['grad_threshold'], color='red', linestyle='--', 
                        label=f'Threshold: {best_params["grad_threshold"]}')
            ax4.axhline(y=-best_params['grad_threshold'], color='red', linestyle='--')
            ax4.set_xlabel("Epoch")
            ax4.set_ylabel("Validation Loss Gradient")
            ax4.set_title(f"Recent Validation Loss Gradient - Fold {fold+1}")
            ax4.legend()
            ax4.grid(True)
    
        plt.tight_layout()   
        # Log this fold's plot to MLflow immediately
        plot_filename = f"final_fold_{fold+1}_training_plots.png"
        plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
        plt.show()
        
        # Log to MLflow in a separate run for this fold
        fold_run_name = f"FINAL_Fold_{fold+1}_{params['width_1st_layer']}x{params['width_2nd_layer']}x{params['width_3rd_layer']}x{params['width_4th_layer']}"
        with mlflow.start_run(run_name=fold_run_name):
            # Log fold-specific parameters
            fold_params = params.copy()
            fold_params["fold"] = fold + 1
            fold_params["max_batch_size"] = len(X_train_fold)
            mlflow.log_params(fold_params)
            
            # Log the plot artifact
            mlflow.log_artifact(plot_filename)
            
        plt.close()  # Close to free memory
        os.remove(plot_filename)  # Clean up the file
        
        # Print summary of adaptive batching
        _logs.info(f"\n📊 Adaptive Batching Summary - Fold {fold+1}:")
        _logs.info(f"  Initial batch size: {best_params['initial_batch_size']}")
        _logs.info(f"  Final batch size: {batch_size_history[-1]}")
        _logs.info(f"  Initial learning rate: {best_params['initial_lr']:.6f}")
        _logs.info(f"  Final learning rate: {lr_history[-1]:.6f}")
        _logs.info(f"  Number of batch size increases: {len(set(batch_size_history)) - 1}")

        # Clean up
        del model
        gc.collect()
        torch.cuda.empty_cache()

    # Compute and log cross-validation summary statistics
    end_time = time.time()
    total_time = end_time - start_time
    
    _logs.info(f"\n{'='*60}")
    _logs.info(f"Final Model Cross-Validation Summary")
    _logs.info(f"{'='*60}")

    cv_summary = {}
    for metric_name, values in fold_results.items():
        mean_val = np.mean(values)
        std_val = np.std(values)
        cv_summary[f"{metric_name}_mean"] = mean_val
        cv_summary[f"{metric_name}_std"] = std_val
        _logs.info(f"{metric_name}: {mean_val:.4f} ± {std_val:.4f}")

    # Log cross-validation summary to MLflow
    run_name = f"FINAL_MODEL_CV_SUMMARY_{params['width_1st_layer']}x{params['width_2nd_layer']}x{params['width_3rd_layer']}x{params['width_4th_layer']}"
    with mlflow.start_run(run_name=run_name):
        # Log best parameters used
        mlflow.log_params(params)
        
        # Log all CV summary metrics
        for metric_name, metric_value in cv_summary.items():
            mlflow.log_metric(metric_name, round(metric_value, 4))
        
        # Log training time
        mlflow.log_metric("total_training_time_seconds", total_time)

    _logs.info(f"\n⏳ Final model training completed in {total_time:.2f} seconds.")
    
    # Clean memory
    gc.collect()
    torch.cuda.empty_cache()
    
    return cv_summary
##################################################################################################################################################
# Driver for the end-to-end experiment: hyperparameter search first, then a
# final cross-validated training run using the best parameters found.

# Both stages consume the same training data and data-file label, so the
# common keyword arguments are gathered once and reused.
_shared_inputs = {
    "X_train": X_train,
    "y_train": y_train,
    "data_file_name": data_file_name,
}

# Step 1: Find best hyperparameters
# The returned object is kept at module level as `study`; its `best_params`
# attribute feeds the final training stage below.
_logs.info("Starting hyperparameter optimization...")
study = run_optuna_optimization(
    n_trials=50,
    study_name="neural_net_optimization",
    **_shared_inputs,
)

# Step 2: Train final model with best parameters
# `cv_results` is the summary dict returned by the training function, keyed as
# "<metric>_mean" / "<metric>_std" for every metric collected per fold.
_logs.info("Training final model with optimized parameters...")
cv_results = train_final_model_with_best_params(
    best_params=study.best_params,
    **_shared_inputs,
)

# Access the results: report the cross-validated validation RMSE (mean ± std).
_val_rmse_mean = cv_results['val_rmse_mean']
_val_rmse_std = cv_results['val_rmse_std']
_logs.info(f"Final validation RMSE: {_val_rmse_mean:.4f} ± {_val_rmse_std:.4f}")

2025-08-03 02:11:02,900, 2692577433.py, 34, INFO, load_dotenv() returned: True
2025-08-03 02:11:02,900, 2692577433.py, 57, INFO, Working directory set to: c:\Users\The Winner\DSI\customer_purchasing_behaviour
2025-08-03 02:11:02,900, 2692577433.py, 67, INFO, Successfully loaded 'df_eng_customer_purchasing_features.csv' into the DataFrame named df.
--- Logging error ---
Traceback (most recent call last):
  File "c:\miniconda3\envs\gtx1060_tf\lib\logging\__init__.py", line 1100, in emit
    msg = self.format(record)
  File "c:\miniconda3\envs\gtx1060_tf\lib\logging\__init__.py", line 943, in format
    return fmt.format(record)
  File "c:\miniconda3\envs\gtx1060_tf\lib\logging\__init__.py", line 678, in format
    record.message = record.getMessage()
  File "c:\miniconda3\envs\gtx1060_tf\lib\logging\__init__.py", line 368, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "c:\miniconda3\envs\gtx1060_tf\lib\runpy.py"

TypeError: Trial.suggest_categorical() takes 3 positional arguments but 4 were given