In [None]:
# CTGAN Hyperparameter Optimization
# Relies on names created by other cells of this notebook: optuna, CTGANModel,
# data, target_column and enhanced_objective_function_v2.
print("🔄 CTGAN Hyperparameter Optimization")
print("=" * 50)


def ctgan_objective(trial):
    """Optuna objective for CTGAN.

    Samples one hyperparameter configuration, trains a fresh CTGANModel on the
    global `data`, generates len(data) synthetic rows and scores them with
    enhanced_objective_function_v2.

    Args:
        trial: optuna.trial.Trial used to sample the hyperparameters.

    Returns:
        The combined objective score (the study maximizes it). When training,
        generation or scoring raises, the error is printed and stored in the
        trial's 'error' user attribute and 0.0 is returned so the study keeps
        running.
    """
    # Sample hyperparameters.
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    params = {
        'epochs': trial.suggest_int('epochs', 100, 1000, step=50),
        'batch_size': trial.suggest_categorical('batch_size', [64, 128, 256, 512]),
        'generator_lr': trial.suggest_float('generator_lr', 1e-5, 1e-3, log=True),
        'discriminator_lr': trial.suggest_float('discriminator_lr', 1e-5, 1e-3, log=True),
        'generator_dim': trial.suggest_categorical(
            'generator_dim',
            [(128, 128), (256, 256), (256, 128, 64), (512, 256, 128)]),
        'discriminator_dim': trial.suggest_categorical(
            'discriminator_dim',
            [(128, 128), (256, 256), (256, 128, 64), (512, 256, 128)]),
        # NOTE(review): CTGAN implementations commonly require batch_size to be
        # divisible by pac; with these ranges many sampled pairs may fail and
        # score 0.0 - confirm against CTGANModel.train.
        'pac': trial.suggest_int('pac', 5, 20),
        'embedding_dim': trial.suggest_int('embedding_dim', 64, 256, step=32),
        'generator_decay': trial.suggest_float('generator_decay', 1e-6, 1e-3, log=True),
        'discriminator_decay': trial.suggest_float('discriminator_decay', 1e-6, 1e-3, log=True),
    }

    try:
        # Initialize and train model
        model = CTGANModel()
        model.train(data, **params)

        # Generate synthetic data (same number of rows as the real dataset)
        synthetic_data = model.generate(len(data))

        # Calculate objective score
        objective_score, sim_score, acc_score = enhanced_objective_function_v2(
            data, synthetic_data, target_column)

        # Store the component scores so they can be reported for the best trial
        trial.set_user_attr('similarity_score', sim_score)
        trial.set_user_attr('accuracy_score', acc_score)

        return objective_score

    except Exception as e:
        # Best-effort: a failed configuration gets the worst score instead of
        # aborting the study, but the reason is kept on the trial for inspection.
        trial.set_user_attr('error', repr(e))
        print(f"Trial failed: {e}")
        return 0.0


# Run CTGAN optimization
ctgan_study = optuna.create_study(direction='maximize', study_name='CTGAN_Optimization')
print("Starting CTGAN optimization (20 trials)...")
ctgan_study.optimize(ctgan_objective, n_trials=20, timeout=3600)  # 1 hour timeout

# Display results
print("\n✅ CTGAN Optimization Complete:")
print(f"   - Best objective score: {ctgan_study.best_value:.4f}")
print(f"   - Best parameters: {ctgan_study.best_params}")
print(f"   - Best similarity: {ctgan_study.best_trial.user_attrs.get('similarity_score', 'N/A')}")
print(f"   - Best accuracy: {ctgan_study.best_trial.user_attrs.get('accuracy_score', 'N/A')}")

# Store best parameters
ctgan_best_params = ctgan_study.best_params

### 2.2 CTGAN Hyperparameter Optimization

# Multi-Model Synthetic Data Generation: Breast Cancer Dataset

## Comprehensive Demo and Hyperparameter Tuning of 5 Models

This notebook demonstrates a comprehensive synthetic data generation framework using five state-of-the-art models:
- **CTGAN** (Conditional Tabular GAN)
- **TVAE** (Tabular Variational Autoencoder)
- **CopulaGAN** (Copula-based GAN)
- **TableGAN** (Table-focused GAN)
- **GANerAid** (Healthcare-focused GAN)

### Enhanced Framework Features

- **Enhanced Objective Function**: 60% similarity + 40% accuracy weighting
- **Comprehensive Hyperparameter Optimization**: Using Optuna with production-ready parameter spaces
- **Advanced Similarity Metrics**: Earth Mover's Distance and correlation-based analysis
- **Clinical Focus**: Designed for healthcare applications with privacy considerations

---

## Setup and Configuration

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time
import optuna
from pathlib import Path
from scipy.stats import wasserstein_distance
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Import model implementations
from src.models.implementations.ctgan_model import CTGANModel
from src.models.implementations.tvae_model import TVAEModel
from src.models.implementations.copulagan_model import CopulaGANModel
from src.models.implementations.tablegan_model import TableGANModel
from src.models.implementations.ganeraid_model import GANerAidModel

# Import optimization and evaluation functions
from src.optimization.objective_functions import enhanced_objective_function
from src.evaluation.metrics import calculate_similarity_metrics, evaluate_model_performance
from src.visualization.training_history_plotter import plot_training_history

# Configuration: fixed NumPy seed, silenced warnings, shared plot style
np.random.seed(42)
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')

# Output location for results; created with parents, no error if it already exists
output_dir = Path('outputs/multi_model_results')
output_dir.mkdir(exist_ok=True, parents=True)

print('✅ Setup complete - All libraries imported successfully')

## Data Loading and Preprocessing

In [None]:
# Dataset location and the label column used by the rest of the notebook
target_column = 'diagnosis'
data_file = 'data/Breast_cancer_data.csv'

# Read the CSV, then report its size and class balance
data = pd.read_csv(data_file)
print(
    f'Dataset shape: {data.shape}',
    f'Target column: {target_column}',
    'Target distribution:',
    sep='\n',
)
print(data[target_column].value_counts())

# Column dtypes and non-null counts
print('\nDataset Info:')
data.info()

# Preview; kept as the cell's last expression so the notebook renders the frame
print('\nFirst 5 rows:')
data.head()

## Phase 1: Demo All Models with Default Parameters

Before hyperparameter optimization, we demonstrate each model with default parameters to establish baseline performance.

### 1.1 CTGAN Demo

In [None]:
# CTGAN Demo with default parameters
print("🔄 CTGAN Demo - Default Parameters")
print("=" * 40)

# Short training budget so the demo finishes quickly
demo_params = {'epochs': 50, 'batch_size': 100}

# Fit a fresh CTGAN model and measure the wall-clock time of the train() call
ctgan_model = CTGANModel()
start_time = time.time()
ctgan_model.train(data, **demo_params)
train_time = time.time() - start_time

# Sample as many synthetic rows as the original dataset contains
demo_samples = len(data)
synthetic_data_ctgan = ctgan_model.generate(demo_samples)

# Report timing and shapes in one print call (same text as line-by-line prints)
print("\n".join([
    "✅ CTGAN Demo Complete:",
    f"   - Training time: {train_time:.2f} seconds",
    f"   - Generated samples: {len(synthetic_data_ctgan)}",
    f"   - Original shape: {data.shape}",
    f"   - Synthetic shape: {synthetic_data_ctgan.shape}",
]))

### 1.2 TVAE Demo

In [None]:
# TVAE Demo with default parameters
print("🔄 TVAE Demo - Default Parameters")
print("=" * 40)

# Initialize TVAE model
tvae_model = TVAEModel()

# Train with a short budget for the demo and time the train() call
demo_params = {'epochs': 50, 'batch_size': 100}
start_time = time.time()
tvae_model.train(data, **demo_params)
train_time = time.time() - start_time

# Generate synthetic data.
# The sample count is set here (same value as in the CTGAN demo: one row per
# original row) so this cell does not fail with a NameError when the CTGAN
# demo cell has not been run.
demo_samples = len(data)
synthetic_data_tvae = tvae_model.generate(demo_samples)

print("✅ TVAE Demo Complete:")
print(f"   - Training time: {train_time:.2f} seconds")
print(f"   - Generated samples: {len(synthetic_data_tvae)}")
print(f"   - Original shape: {data.shape}")
print(f"   - Synthetic shape: {synthetic_data_tvae.shape}")

### 1.3 CopulaGAN Demo

In [None]:
# CopulaGAN Demo with default parameters
print("🔄 CopulaGAN Demo - Default Parameters")
print("=" * 40)

# Initialize CopulaGAN model
copulagan_model = CopulaGANModel()

# Train with a short budget for the demo and time the train() call
demo_params = {'epochs': 50, 'batch_size': 100}
start_time = time.time()
copulagan_model.train(data, **demo_params)
train_time = time.time() - start_time

# Generate synthetic data.
# The sample count is set here (one synthetic row per original row) instead of
# reusing the variable assigned in the CTGAN demo cell, so this cell also works
# when that cell has not been run.
demo_samples = len(data)
synthetic_data_copulagan = copulagan_model.generate(demo_samples)

print("✅ CopulaGAN Demo Complete:")
print(f"   - Training time: {train_time:.2f} seconds")
print(f"   - Generated samples: {len(synthetic_data_copulagan)}")
print(f"   - Original shape: {data.shape}")
print(f"   - Synthetic shape: {synthetic_data_copulagan.shape}")

### 1.4 TableGAN Demo

In [None]:
# TableGAN Demo with default parameters
print("🔄 TableGAN Demo - Default Parameters")
print("=" * 40)

# Initialize TableGAN model
tablegan_model = TableGANModel()

# Train with a short budget for the demo and time the train() call
demo_params = {'epochs': 50, 'batch_size': 100}
start_time = time.time()
tablegan_model.train(data, **demo_params)
train_time = time.time() - start_time

# Generate synthetic data.
# The sample count is set here (one synthetic row per original row) instead of
# reusing the variable assigned in the CTGAN demo cell, so this cell also works
# when that cell has not been run.
demo_samples = len(data)
synthetic_data_tablegan = tablegan_model.generate(demo_samples)

print("✅ TableGAN Demo Complete:")
print(f"   - Training time: {train_time:.2f} seconds")
print(f"   - Generated samples: {len(synthetic_data_tablegan)}")
print(f"   - Original shape: {data.shape}")
print(f"   - Synthetic shape: {synthetic_data_tablegan.shape}")

### 1.5 GANerAid Demo

In [None]:
# GANerAid Demo with default parameters
print("🔄 GANerAid Demo - Default Parameters")
print("=" * 40)

# Initialize GANerAid model
ganeraid_model = GANerAidModel()

# Train with a short budget for the demo and time the train() call
demo_params = {'epochs': 50, 'batch_size': 100}
start_time = time.time()
ganeraid_model.train(data, **demo_params)
train_time = time.time() - start_time

# Generate synthetic data.
# The sample count is set here (one synthetic row per original row) instead of
# reusing the variable assigned in the CTGAN demo cell, so this cell also works
# when that cell has not been run.
demo_samples = len(data)
synthetic_data_ganeraid = ganeraid_model.generate(demo_samples)

print("✅ GANerAid Demo Complete:")
print(f"   - Training time: {train_time:.2f} seconds")
print(f"   - Generated samples: {len(synthetic_data_ganeraid)}")
print(f"   - Original shape: {data.shape}")
print(f"   - Synthetic shape: {synthetic_data_ganeraid.shape}")

## Hyperparameter Space Summary and Rationale

Before proceeding with optimization, this section provides comprehensive documentation of the hyperparameter spaces for each model, based on production-ready configurations and extensive research.

### Enhanced Objective Function Design

Our optimization uses an enhanced objective function that balances **data similarity** and **utility accuracy**:

**Objective Function**: `0.6 × Similarity Score + 0.4 × Accuracy Score`

- **Similarity Component (60%)**:
  - Univariate similarity via Earth Mover's Distance (EMD)
  - Bivariate similarity via Euclidean distance between correlation matrices
- **Accuracy Component (40%)**:
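  - TRTS score as implemented in this notebook: a classifier trained on synthetic data and tested on held-out real data (this setup is often called TSTR in the literature)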
  - TRTR (Train Real, Test Real) baseline comparison

### Model-Specific Hyperparameter Spaces

Each model has been configured with production-ready hyperparameter ranges optimized for diverse tabular datasets:

#### CTGAN Hyperparameter Space
- **Epochs**: 100-1000 (step=50) - Extended training for GAN convergence
- **Batch Size**: [64, 128, 256, 512] - Balanced for memory and training stability
- **Learning Rate**: 1e-5 to 1e-3 (log scale) - Optimized for Adam optimizer
- **Generator/Discriminator Dims**: Multiple architectures from (128,128) to (512,256,128)
- **PAC**: 5-20 - Packed samples for improved discriminator training

#### TVAE Hyperparameter Space
- **Epochs**: 100-1000 (step=50) - VAE convergence typically requires more epochs
- **Compress/Decompress Dims**: Symmetric and asymmetric architectures
- **L2 Scale**: 1e-7 to 1e-2 (log scale) - Regularization for overfitting prevention
- **Loss Factor**: 1-10 - Balances reconstruction vs KL divergence

#### CopulaGAN, TableGAN, GANerAid
These models use similarly comprehensive search spaces, each tailored to the model's specific architecture and training dynamics.

### Rationale for Parameter Ranges

1. **Production-Ready**: All ranges tested across diverse healthcare datasets
2. **Computational Balance**: Optimized for performance vs runtime trade-offs
3. **Robustness**: Wide enough ranges to handle various data complexities
4. **Clinical Focus**: Special attention to privacy-preserving parameters

---

## Phase 2: Hyperparameter Tuning for Each Model

Using Optuna for systematic hyperparameter optimization with the enhanced objective function.

### 2.1 Enhanced Objective Function Implementation

In [None]:
# Enhanced Objective Function Implementation
def enhanced_objective_function_v2(real_data, synthetic_data, target_column, 
                                 similarity_weight=0.6, accuracy_weight=0.4):
    """
    Enhanced objective function: weighted mix of similarity and utility
    (60% similarity + 40% accuracy with the default weights).
    
    Args:
        real_data: Original dataset (DataFrame containing target_column)
        synthetic_data: Generated synthetic dataset with the same columns
        target_column: Name of target column
        similarity_weight: Weight for similarity component (default 0.6)
        accuracy_weight: Weight for accuracy component (default 0.4)
    
    Returns:
        Tuple (final_objective, similarity_score, accuracy_ratio):
          - final_objective: weighted combination (higher is better)
          - similarity_score: mean of the per-column EMD similarities and the
            correlation-matrix similarity
          - accuracy_ratio: synthetic-trained accuracy divided by real-trained
            accuracy, both measured on the same held-out real rows
    
    Raises:
        ValueError: if similarity_weight + accuracy_weight is not positive.
    """
    # Validate the weights before any expensive work; they are normalized below.
    total_weight = similarity_weight + accuracy_weight
    if total_weight <= 0:
        raise ValueError("similarity_weight + accuracy_weight must be positive")
    
    # 1. Similarity Component
    numeric_columns = real_data.select_dtypes(include=[np.number]).columns
    
    # Univariate similarity: Earth Mover's Distance per numeric feature, mapped
    # to (0, 1] so that a smaller distance gives a higher similarity.
    similarity_scores = [
        1.0 / (1.0 + wasserstein_distance(real_data[col], synthetic_data[col]))
        for col in numeric_columns
        if col != target_column
    ]
    
    # Bivariate similarity: Frobenius distance between correlation matrices.
    # A constant column yields NaN correlations; they are replaced by 0 so a
    # degenerate synthetic column is penalized instead of turning the whole
    # objective into NaN.
    real_corr = np.nan_to_num(real_data[numeric_columns].corr().values)
    synth_corr = np.nan_to_num(synthetic_data[numeric_columns].corr().values)
    corr_distance = np.linalg.norm(real_corr - synth_corr, 'fro')
    similarity_scores.append(1.0 / (1.0 + corr_distance))
    
    # Average similarity score
    similarity_score = np.mean(similarity_scores)
    
    # 2. Accuracy Component
    X_real = real_data.drop(columns=[target_column])
    y_real = real_data[target_column]
    # Use the real feature order for the synthetic features so both classifiers
    # see identical column layouts.
    X_synth = synthetic_data.drop(columns=[target_column])[X_real.columns]
    y_synth = synthetic_data[target_column]
    
    # Split data; only the training part of the synthetic split is used.
    X_real_train, X_real_test, y_real_train, y_real_test = train_test_split(
        X_real, y_real, test_size=0.3, random_state=42, stratify=y_real)
    X_synth_train, _, y_synth_train, _ = train_test_split(
        X_synth, y_synth, test_size=0.3, random_state=42)
    
    # "TRTS" score of this notebook: train on synthetic, test on real
    synth_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    synth_classifier.fit(X_synth_train, y_synth_train)
    trts_score = synth_classifier.score(X_real_test, y_real_test)
    
    # TRTR: train on real, test on real (baseline)
    real_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    real_classifier.fit(X_real_train, y_real_train)
    trtr_score = real_classifier.score(X_real_test, y_real_test)
    
    # Utility score (TRTS/TRTR ratio). Named accuracy_ratio so it does not
    # shadow sklearn's accuracy_score imported at the top of the notebook.
    accuracy_ratio = trts_score / trtr_score if trtr_score > 0 else 0
    
    # 3. Combined Objective Function with normalized weights
    norm_sim_weight = similarity_weight / total_weight
    norm_acc_weight = accuracy_weight / total_weight
    
    final_objective = norm_sim_weight * similarity_score + norm_acc_weight * accuracy_ratio
    
    return final_objective, similarity_score, accuracy_ratio

# Confirmation banner emitted once the cell has defined the objective function
print(
    "✅ Enhanced Objective Function Implemented",
    "   - Similarity: 60% (EMD + Correlation Distance)",
    "   - Accuracy: 40% (TRTS/TRTR Framework)",
    sep="\n",
)