<a href="https://colab.research.google.com/github/codeslayr/Neural_Network_Optimization/blob/main/Advance_Opt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Optimizing CNN hyperparameters using SGD**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split
import numpy as np
import random

# Seed all three RNGs used below: torch (weight init, random_split, shuffling),
# numpy (continuous hyperparameter sampling) and random (batch-size choice).
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Convert images to tensors, then shift/scale the single channel with
# mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# Load FashionMNIST and carve a 5,000-sample validation split out of the
# training set; the remaining 55,000 samples are used for the search itself.
full_train_dataset = torchvision.datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)
train_dataset, val_dataset = random_split(full_train_dataset, [55000, 5000])
# The test loader is fixed (no shuffling); train/val loaders are built per
# trial because their batch size is a searched hyperparameter.
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# CNN Model
class FashionCNN(nn.Module):
    """Small CNN: two conv/ReLU/max-pool stages followed by a two-layer MLP head.

    The first linear layer expects ``64 * 7 * 7`` features, i.e. single-channel
    28x28 inputs after two 2x2 poolings. The output is 10 raw class scores.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in network order so parameter initialisation
        # consumes the RNG in the same sequence as a flat Sequential literal.
        first_stage = [
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        ]
        second_stage = [
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        ]
        classifier = [
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 128),
            nn.ReLU(),
            nn.Linear(128, 10),
        ]
        self.net = nn.Sequential(*first_stage, *second_stage, *classifier)

    def forward(self, x):
        """Return the class scores produced by the sequential stack for ``x``."""
        return self.net(x)

# Training and evaluation functions
def train(model, dataloader, criterion, optimizer, epochs=3):
    """Train ``model`` in place for ``epochs`` full passes over ``dataloader``.

    Each batch is moved to the module-level ``device``, the loss is computed
    with ``criterion`` and one ``optimizer`` step is taken. Nothing is returned.
    """
    model.train()
    for _ in range(epochs):
        for batch_x, batch_y in dataloader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            batch_loss = criterion(model(batch_x), batch_y)
            # Clear stale gradients right before back-propagating this batch.
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

def evaluate(model, dataloader, criterion):
    """Score ``model`` on ``dataloader`` without updating it.

    Returns:
        ``(avg_loss, accuracy)`` where ``avg_loss`` is the sum of the per-batch
        criterion values divided by the number of batches, and ``accuracy`` is
        the fraction of samples whose highest-scoring class equals the label.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch_x, batch_y in dataloader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            logits = model(batch_x)
            loss_sum += criterion(logits, batch_y).item()
            top_class = torch.max(logits, 1).indices
            n_seen += batch_y.size(0)
            n_correct += (top_class == batch_y).sum().item()

    return loss_sum / len(dataloader), n_correct / n_seen

# Random Search Config
# Each entry is a zero-argument sampler. The samplers are called in dict order
# once per trial, so reordering the keys changes which random draws each
# hyperparameter receives under the fixed seeds.
search_space = {
    # Learning rate: log-uniform between 1e-4 and 1e-1.
    'lr': lambda: 10 ** np.random.uniform(-4, -1),
    # SGD momentum: uniform in [0.5, 0.99).
    'momentum': lambda: np.random.uniform(0.5, 0.99),
    # Mini-batch size: one of three fixed options.
    'batch_size': lambda: random.choice([32, 64, 128]),
    # L2 weight decay: uniform in [0, 1e-2).
    'weight_decay': lambda: np.random.uniform(0.0, 1e-2)
}

# Search bookkeeping.
results = []          # one (config, val_loss, val_acc) tuple per trial
best_config = None    # config of the best trial so far
best_acc = 0          # best validation accuracy so far
best_epoch = -1       # 1-based index of the best *trial* (name is historical)
num_trials = 20

# Run random search
for trial in range(num_trials):
    # Draw one value per hyperparameter; samplers run in search_space order.
    config = {key: sampler() for key, sampler in search_space.items()}
    batch_size = int(config['batch_size'])
    # Loaders are rebuilt every trial because batch size is being searched.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Fresh model and optimizer so trials do not share learned weights.
    model = FashionCNN().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=config['lr'], momentum=config['momentum'], weight_decay=config['weight_decay'])

    train(model, train_loader, criterion, optimizer, epochs=3)
    val_loss, val_acc = evaluate(model, val_loader, criterion)

    results.append((config, val_loss, val_acc))
    # Each iteration is a search trial (3 training epochs), not an epoch,
    # so the progress line is labelled accordingly.
    print(f"Trial {trial+1}/{num_trials} - Acc: {val_acc:.4f}, Loss: {val_loss:.4f}, Config: {config}")

    # Keep the configuration with the highest validation accuracy.
    if val_acc > best_acc:
        best_acc = val_acc
        best_config = config
        best_epoch = trial + 1  # 1-based trial index

# Final training and evaluation
# best_config stays None only if no trial beat the initial best_acc of 0;
# fail with a clear message instead of a subscript error further down.
if best_config is None:
    raise RuntimeError("Random search did not produce a usable configuration.")

# best_epoch holds the 1-based index of the winning search trial.
print(f"\n🔍 Best Hyperparameter Found at trial {best_epoch}: {best_config}")
print(f"Best hyperparameters from SGD: {best_config}")

# Retrain a fresh model with the winning hyperparameters.
final_model = FashionCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(final_model.parameters(), lr=best_config['lr'], momentum=best_config['momentum'], weight_decay=best_config['weight_decay'])

# The final fit uses the full training set (search-train + validation splits).
combined_loader = DataLoader(full_train_dataset, batch_size=int(best_config['batch_size']), shuffle=True)
print()
# Train one epoch at a time so progress can be reported after each pass.
for ep in range(5):
    train(final_model, combined_loader, criterion, optimizer, epochs=1)
    print(f"Epoch {ep+1} completed.")

test_loss, test_acc = evaluate(final_model, test_loader, criterion)
print(f"\nTest Loss: {test_loss:.6f}, Test Accuracy: {test_acc:.3f}")

Plotting and analysis of the SGD random-search results

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
import time
import psutil
from matplotlib.ticker import MaxNLocator


# 1. Convergence Progression
def plot_convergence(results):
    """Plot validation accuracy and loss progression across trials.

    Args:
        results: sequence of per-trial records where index 1 is the validation
            loss and index 2 is the validation accuracy.

    Draws the per-trial values together with the running best (max accuracy,
    min loss) on twin y-axes, saves the figure to
    ``convergence_progression.png`` and then displays it.
    """
    from itertools import accumulate

    trials = range(1, len(results) + 1)
    val_accs = [res[2] for res in results]
    val_losses = [res[1] for res in results]

    # Running best-so-far values in a single pass.
    best_accs = list(accumulate(val_accs, max))
    best_losses = list(accumulate(val_losses, min))

    fig, ax1 = plt.subplots(figsize=(10, 6))

    # Accuracy plot
    color = 'tab:blue'
    ax1.set_xlabel('Trial')
    ax1.set_ylabel('Accuracy', color=color)
    ax1.plot(trials, val_accs, 'o-', color='lightblue', label='Trial Accuracy')
    ax1.plot(trials, best_accs, 's-', color=color, label='Best Accuracy')
    ax1.tick_params(axis='y', labelcolor=color)
    ax1.set_ylim(0.7, 0.95)
    ax1.grid(alpha=0.3)
    ax1.set_title('Convergence Progression')

    # Loss plot on a second y-axis sharing the trial axis
    ax2 = ax1.twinx()
    color = 'tab:red'
    ax2.set_ylabel('Loss', color=color)
    ax2.plot(trials, val_losses, 'o-', color='salmon', label='Trial Loss')
    ax2.plot(trials, best_losses, 's-', color=color, label='Best Loss')
    ax2.tick_params(axis='y', labelcolor=color)
    ax2.set_ylim(0.1, 1.0)

    # Combined legend for both axes
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax1.legend(lines1 + lines2, labels1 + labels2, loc='lower right')

    plt.tight_layout()
    # Save before show(): after show() returns the figure may already be
    # released, in which case savefig would write an empty canvas.
    plt.savefig('convergence_progression.png')
    plt.show()
    plt.close()

# 2. Hyperparameter Evolution
def plot_hyperparameter_evolution(results):
    """Visualize hyperparameter trajectories and correlation with accuracy.

    Args:
        results: sequence of per-trial records where index 0 is the config
            dict (keys ``lr``, ``momentum``, ``batch_size``, ``weight_decay``)
            and index 2 is the validation accuracy.

    Produces a parallel-coordinates plot (``hyperparameter_evolution.png``) and
    a lower-triangle correlation heatmap (``hyperparameter_correlation.png``);
    each figure is saved and then displayed.
    """
    # Prepare data
    df = pd.DataFrame({
        'trial': range(1, len(results)+1),
        'lr': [res[0]['lr'] for res in results],
        'momentum': [res[0]['momentum'] for res in results],
        'batch_size': [res[0]['batch_size'] for res in results],
        'weight_decay': [res[0]['weight_decay'] for res in results],
        'accuracy': [res[2] for res in results]
    })

    # Parallel coordinates plot: one line per trial, coloured by trial order
    plt.figure(figsize=(12, 8))
    pd.plotting.parallel_coordinates(
        df, 'trial',
        cols=['lr', 'momentum', 'batch_size', 'weight_decay', 'accuracy'],
        color=plt.cm.viridis(np.linspace(0, 1, len(df)))
    )
    plt.xticks(rotation=45)
    plt.title('Hyperparameter Evolution Across Trials')
    plt.grid(alpha=0.3)
    plt.tight_layout()
    # Save before show(); saving afterwards can write an empty canvas.
    plt.savefig('hyperparameter_evolution.png')
    plt.show()
    plt.close()

    # Correlation matrix; the mask hides the redundant upper triangle
    corr = df.corr()
    mask = np.triu(np.ones_like(corr, dtype=bool))
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr, mask=mask, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
    plt.title('Hyperparameter-Accuracy Correlation')
    plt.tight_layout()
    plt.savefig('hyperparameter_correlation.png')
    plt.show()
    plt.close()

# 3. Search Space Exploration
def plot_search_space_exploration(results):
    """Visualize hyperparameter distribution and density.

    Args:
        results: sequence of per-trial records where index 0 is the config
            dict (keys ``lr``, ``momentum``, ``batch_size``, ``weight_decay``)
            and index 2 is the validation accuracy.

    Produces a pairwise grid (``search_space_pairplot.png``) and a 2-D PCA
    projection of the sampled configurations coloured by accuracy
    (``search_space_pca.png``); each figure is saved and then displayed.
    """
    # Prepare data
    df = pd.DataFrame({
        'lr': [res[0]['lr'] for res in results],
        'momentum': [res[0]['momentum'] for res in results],
        'batch_size': [res[0]['batch_size'] for res in results],
        'weight_decay': [res[0]['weight_decay'] for res in results],
        'accuracy': [res[2] for res in results]
    })

    # Pairwise relationships
    g = sns.PairGrid(df, diag_sharey=False)
    g.map_upper(sns.scatterplot, s=50)
    g.map_lower(sns.kdeplot, fill=True)
    g.map_diag(sns.histplot, kde=True)
    g.fig.suptitle('Search Space Exploration', y=1.02)
    # Save before show(); saving afterwards can write an empty canvas.
    plt.savefig('search_space_pairplot.png')
    plt.show()
    plt.close()

    # PCA projection. The hyperparameters live on very different scales
    # (batch sizes in the tens, learning rates below 1), so standardise each
    # column first; otherwise the largest-valued column dominates the components.
    pca = PCA(n_components=2)
    params = df[['lr', 'momentum', 'batch_size', 'weight_decay']].astype(float)
    spread = params.std(ddof=0).replace(0.0, 1.0)  # constant columns stay at 0
    standardized = (params - params.mean()) / spread
    pca_result = pca.fit_transform(standardized)

    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(
        pca_result[:, 0], pca_result[:, 1],
        c=df['accuracy'], cmap='viridis', s=100, alpha=0.8
    )
    plt.colorbar(scatter, label='Accuracy')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.title('Hyperparameter Space Projection (PCA)')
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig('search_space_pca.png')
    plt.show()
    plt.close()

# 4. Training Dynamics (For Best Run)
def plot_training_dynamics(model, train_loader, val_loader, epochs=5):
    """Train ``model`` and plot per-epoch training and validation metrics.

    The optimizer is SGD configured from the module-level ``best_config``
    (``lr``, ``momentum``, ``weight_decay``); batches are moved to the
    module-level ``device``.

    Args:
        model: network to train in place.
        train_loader: batches used for the gradient updates.
        val_loader: batches scored after every epoch via ``evaluate``.
        epochs: number of training epochs to run and plot (default 5).

    Returns:
        ``(train_losses, train_accs, val_losses, val_accs)``, one value per
        epoch in each list. The figure is saved to ``training_dynamics.png``.
    """
    # Track metrics during training
    train_losses, val_losses = [], []
    train_accs, val_accs = [], []

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=best_config['lr'],
                          momentum=best_config['momentum'],
                          weight_decay=best_config['weight_decay'])

    for epoch in range(epochs):
        # Training
        model.train()
        epoch_train_loss = 0.0
        correct_train = 0
        total_train = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Training metrics are accumulated on the fly, i.e. measured while
            # the weights are still changing within the epoch.
            epoch_train_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total_train += labels.size(0)
            correct_train += (predicted == labels).sum().item()

        train_loss = epoch_train_loss / len(train_loader)
        train_acc = correct_train / total_train
        train_losses.append(train_loss)
        train_accs.append(train_acc)

        # Validation
        val_loss, val_acc = evaluate(model, val_loader, criterion)
        val_losses.append(val_loss)
        val_accs.append(val_acc)

        print(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f} | "
              f"Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}")

    # Create plots
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 12))

    # Loss plot; the x-axis follows the number of epochs actually run
    epoch_axis = range(1, epochs + 1)
    ax1.plot(epoch_axis, train_losses, 'b-o', label='Training Loss')
    ax1.plot(epoch_axis, val_losses, 'r--s', label='Validation Loss')
    ax1.set_title('Training and Validation Loss')
    ax1.set_xlabel('Epochs')
    ax1.set_ylabel('Loss')
    ax1.legend()
    ax1.grid(alpha=0.3)
    ax1.xaxis.set_major_locator(MaxNLocator(integer=True))

    # Accuracy plot
    ax2.plot(epoch_axis, train_accs, 'g-o', label='Training Accuracy')
    ax2.plot(epoch_axis, val_accs, 'm--s', label='Validation Accuracy')
    ax2.set_title('Training and Validation Accuracy')
    ax2.set_xlabel('Epochs')
    ax2.set_ylabel('Accuracy')
    ax2.legend()
    ax2.grid(alpha=0.3)
    ax2.xaxis.set_major_locator(MaxNLocator(integer=True))

    plt.tight_layout()
    # Save before show(); saving afterwards can write an empty canvas.
    plt.savefig('training_dynamics.png')
    plt.show()
    plt.close()

    return train_losses, train_accs, val_losses, val_accs

# 5. Computation Profile (Modified Training Loop)
def track_computation_profile():
    """Re-run the random search while recording resource usage per trial.

    For each of ``num_trials`` freshly sampled configurations this trains a new
    ``FashionCNN`` for 3 epochs and records wall-clock training time, system
    CPU and memory utilisation (percent, via psutil) and peak GPU memory in MB
    (0 when CUDA is unavailable). Every trial's ``(config, val_loss, val_acc)``
    is appended to the module-level ``results`` list, so callers that want to
    keep earlier results must swap that list out beforehand.

    Returns:
        ``(train_times, cpu_usages, mem_usages, gpu_usages)``, one entry per
        trial. The figure is saved to ``computation_profile.png``.
    """
    train_times = []
    cpu_usages = []
    mem_usages = []
    gpu_usages = []

    # cpu_percent() without an interval reports usage since the previous call;
    # the very first reading has no reference point, so take and discard it.
    psutil.cpu_percent()

    for trial in range(num_trials):
        config = {key: sampler() for key, sampler in search_space.items()}
        batch_size = int(config['batch_size'])
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        model = FashionCNN().to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(model.parameters(), lr=config['lr'],
                              momentum=config['momentum'],
                              weight_decay=config['weight_decay'])

        # Start tracking
        start_time = time.time()
        cpu_before = psutil.cpu_percent()
        mem_before = psutil.virtual_memory().percent

        if torch.cuda.is_available():
            # Reset the peak counter so the reading below covers this trial only.
            torch.cuda.reset_peak_memory_stats(device)

        # Training
        model.train()
        for epoch in range(3):
            for images, labels in train_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        # End tracking
        train_time = time.time() - start_time
        cpu_after = psutil.cpu_percent()
        mem_after = psutil.virtual_memory().percent

        if torch.cuda.is_available():
            gpu_after = torch.cuda.max_memory_allocated(device) / (1024 ** 2)  # MB
        else:
            gpu_after = 0

        train_times.append(train_time)
        cpu_usages.append((cpu_before + cpu_after) / 2)
        mem_usages.append((mem_before + mem_after) / 2)
        gpu_usages.append(gpu_after)

        # Evaluation (not included in the timed section)
        val_loss, val_acc = evaluate(model, val_loader, criterion)
        results.append((config, val_loss, val_acc))

    # Plot computation profile
    trials = range(1, num_trials+1)
    fig, ax1 = plt.subplots(figsize=(10, 6))

    # Time and Memory
    ax1.set_xlabel('Trial')
    ax1.set_ylabel('Time (s) / Memory (%)')
    ax1.plot(trials, train_times, 'b-o', label='Training Time')
    ax1.plot(trials, mem_usages, 'g--s', label='Memory Usage')
    ax1.tick_params(axis='y')
    ax1.legend(loc='upper left')

    # GPU Usage on a secondary axis, only when a GPU was actually used
    if torch.cuda.is_available():
        ax2 = ax1.twinx()
        ax2.set_ylabel('GPU Memory (MB)', color='r')
        ax2.plot(trials, gpu_usages, 'r-^', label='GPU Memory')
        ax2.tick_params(axis='y', labelcolor='r')
        ax2.legend(loc='upper right')

    plt.title('Computation Profile Across Trials')
    plt.grid(alpha=0.3)
    plt.tight_layout()
    # Save before show(); saving afterwards can write an empty canvas.
    plt.savefig('computation_profile.png')
    plt.show()
    plt.close()

    return train_times, cpu_usages, mem_usages, gpu_usages

# 6. Error Analysis
def plot_error_analysis(model, test_loader):
    """Analyze model errors with confusion matrix and class metrics.

    Args:
        model: trained classifier; switched to eval mode here.
        test_loader: batches of ``(images, labels)`` to analyse.

    Returns:
        ``(cm, class_acc, losses)``: the confusion matrix, the per-class
        accuracy (matrix diagonal divided by row sums) and the list of
        per-sample cross-entropy losses.

    Saves ``confusion_matrix.png``, ``class_accuracy.png`` and
    ``loss_distribution.png``; each figure is saved and then displayed.
    """
    model.eval()
    all_preds = []
    all_labels = []
    losses = []

    # reduction='none' keeps one loss value per sample instead of a batch mean.
    per_sample_loss = nn.CrossEntropyLoss(reduction='none')

    # Single inference pass: collect predictions, labels and per-sample losses
    # together instead of running the model over the test set twice.
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            losses.extend(per_sample_loss(outputs, labels).cpu().numpy())

    # Confusion matrix
    cm = confusion_matrix(all_labels, all_preds)
    class_names = ['T-shirt', 'Trouser', 'Pullover', 'Dress', 'Coat',
                   'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    # Save before show(); saving afterwards can write an empty canvas.
    plt.savefig('confusion_matrix.png')
    plt.show()
    plt.close()

    # Class-wise accuracy: correct predictions per class over samples per class
    class_acc = cm.diagonal() / cm.sum(axis=1)
    plt.figure(figsize=(10, 6))
    plt.bar(class_names, class_acc, color='skyblue')
    plt.axhline(np.mean(class_acc), color='r', linestyle='--',
                label=f'Mean Accuracy: {np.mean(class_acc):.4f}')
    plt.ylim(0.7, 1.0)
    plt.title('Class-wise Accuracy')
    plt.xlabel('Class')
    plt.ylabel('Accuracy')
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.savefig('class_accuracy.png')
    plt.show()
    plt.close()

    # Loss distribution (sample level)
    plt.figure(figsize=(10, 6))
    plt.hist(losses, bins=50, color='purple', alpha=0.7)
    plt.axvline(np.mean(losses), color='r', linestyle='--',
                label=f'Mean Loss: {np.mean(losses):.4f}')
    plt.title('Loss Distribution Across Test Samples')
    plt.xlabel('Loss')
    plt.ylabel('Frequency')
    plt.legend()
    plt.tight_layout()
    plt.savefig('loss_distribution.png')
    plt.show()
    plt.close()

    return cm, class_acc, losses

# Main Execution
if __name__ == "__main__":
    # Run after the optimization code: relies on results, best_config,
    # final_model and the datasets/loaders it defines.
    print("\nGenerating visualizations...")

    # 1. Convergence Progression
    plot_convergence(results)
    print("Generated Convergence Progression plots")

    # 2. Hyperparameter Evolution
    plot_hyperparameter_evolution(results)
    print("Generated Hyperparameter Evolution plots")

    # 3. Search Space Exploration
    plot_search_space_exploration(results)
    print("Generated Search Space Exploration plots")

    # 4. Training Dynamics (for best model)
    # Prepare dataloaders using the best batch size found by the search
    batch_size = int(best_config['batch_size'])
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Train a fresh model so the curves start from initialisation
    model_dynamics = FashionCNN().to(device)
    plot_training_dynamics(model_dynamics, train_loader, val_loader)
    print("Generated Training Dynamics plot")

    # 5. Computation Profile (requires modified training loop)
    # track_computation_profile() re-runs the trials and appends to the
    # module-level `results`, so point that name at a scratch list meanwhile.
    print("Tracking computation profile (this will re-run trials)...")
    original_results = results.copy()
    results = []
    try:
        track_computation_profile()
    finally:
        # Restore the search results even if the re-run fails or is interrupted.
        results = original_results
    print("Generated Computation Profile plot")

    # 6. Error Analysis (on final model)
    plot_error_analysis(final_model, test_loader)
    print("Generated Error Analysis plots")

    print("\nAll visualizations saved to current directory!")

### **Optimizing CNN hyperparameters using LBFGS**



In [None]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split, Subset
import numpy as np
import random

# plotting libs
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pandas.plotting import parallel_coordinates
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import plotly.express as px

# ————————— Set seeds & device —————————
# Seed torch, numpy and random; all three are used below (splitting/shuffling
# and weight init, continuous hyperparameter draws, batch-size choice).
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ————————— Data Prep —————————
# Convert to tensors, then normalise the single channel with mean 0.5 / std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# 55,000 / 5,000 train/validation split of the FashionMNIST training set.
full_train = torchvision.datasets.FashionMNIST('./data', train=True, download=True, transform=transform)
train_ds, val_ds = random_split(full_train, [55000,5000])
test_ds = torchvision.datasets.FashionMNIST('./data', train=False, download=True, transform=transform)

# subset for search: only the first `subset_size` entries of the training
# split are used during the hyperparameter search.
subset_size=5000
train_sub = Subset(train_ds, list(range(subset_size)))
test_loader = DataLoader(test_ds, batch_size=64, shuffle=False)

# ————————— Model —————————
class FashionCNN(nn.Module):
    """Compact CNN classifier producing 10 class scores.

    Two 3x3 convolution blocks (each followed by ReLU and 2x2 max pooling)
    feed a flattened ``64*7*7`` feature vector into a 128-unit hidden layer.
    """

    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            # Feature extractor
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            # Classifier head
            nn.Flatten(),
            nn.Linear(in_features=64 * 7 * 7, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=10),
        )

    def forward(self, x):
        """Apply the sequential stack to ``x`` and return the class scores."""
        scores = self.net(x)
        return scores

# Shared loss object used by the search trials, the final retrain and the
# final test evaluation below.
criterion = nn.CrossEntropyLoss()

# ————————— Train/Eval Helpers —————————
def train_lbfgs(model, loader, criterion, optimizer, epochs=3, weight_decay=0.0):
    """Train ``model`` with a closure-based optimizer and record per-epoch stats.

    Args:
        model: network updated in place.
        loader: yields ``(images, labels)`` batches; moved to the module-level
            ``device``.
        criterion: loss applied to the model output and labels.
        optimizer: optimizer whose ``step`` accepts a closure.
        epochs: number of passes over ``loader``.
        weight_decay: when positive, this factor times the sum of squared
            parameter values is added to the loss inside the closure.

    Returns:
        dict with ``'train_loss'`` (mean of the values returned by
        ``optimizer.step`` per batch) and ``'train_acc'`` (fraction correct,
        measured on each batch right after its update), one entry per epoch.
    """
    history = {'train_loss': [], 'train_acc': []}
    model.train()
    for ep in range(epochs):
        running_loss = 0.0
        n_correct = 0
        n_seen = 0
        for batch_x, batch_y in loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            def closure():
                # The optimizer may call this several times per step, so every
                # call clears the gradients and recomputes the objective.
                optimizer.zero_grad()
                objective = criterion(model(batch_x), batch_y)
                if weight_decay > 0:
                    penalty = sum(p.pow(2).sum() for p in model.parameters())
                    objective = objective + weight_decay * penalty
                objective.backward()
                return objective

            step_loss = optimizer.step(closure)
            running_loss += step_loss.item()

            # Accuracy is taken with the freshly updated weights.
            with torch.no_grad():
                hits = model(batch_x).argmax(1) == batch_y
                n_correct += hits.sum().item()
                n_seen += batch_y.size(0)

        history['train_loss'].append(running_loss / len(loader))
        history['train_acc'].append(n_correct / n_seen)
        print(f"Epoch {ep+1} completed.")
    return history

def evaluate(model, loader, criterion):
    """Return ``(mean batch loss, accuracy)`` of ``model`` over ``loader``.

    The model is put in eval mode and gradients are disabled. The loss is the
    sum of per-batch criterion values divided by the number of batches;
    accuracy is the fraction of samples whose arg-max score equals the label.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for batch_x, batch_y in loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            logits = model(batch_x)
            loss_sum += criterion(logits, batch_y).item()
            n_correct += (logits.argmax(1) == batch_y).sum().item()
            n_seen += batch_y.size(0)
    mean_loss = loss_sum / len(loader)
    accuracy = n_correct / n_seen
    return mean_loss, accuracy

# ————————— Hyperparameter Search —————————
# Zero-argument samplers, called in dict order once per trial (the order
# determines which random draws each hyperparameter receives).
search_space = {
    # Learning rate: log-uniform between 1e-2 and 1.
    'lr':          lambda: 10**np.random.uniform(-2,0),
    # Mini-batch size: one of three fixed options.
    'batch_size':  lambda: random.choice([16,32,64]),
    # L2 penalty factor passed to train_lbfgs: uniform in [0, 1e-2).
    'weight_decay':lambda: np.random.uniform(0,1e-2)
}

# One (cfg, val_loss, val_acc, elapsed_seconds) tuple per trial.
results = []
# Best validation accuracy so far, its config and its 1-based trial number.
best_acc,best_cfg,best_trial=0, None, -1
num_trials=20

for t in range(num_trials):
    # Sample one configuration for this trial.
    cfg = {k:fn() for k,fn in search_space.items()}
    bs,wd,lr = int(cfg['batch_size']), cfg['weight_decay'], cfg['lr']
    # Train on the reduced subset; validate on the full validation split.
    train_loader = DataLoader(train_sub, batch_size=bs, shuffle=True)
    val_loader   = DataLoader(val_ds,   batch_size=bs, shuffle=False)

    # Fresh model and optimizer per trial so trials are independent.
    model = FashionCNN().to(device)
    optimizer = optim.LBFGS(
        model.parameters(), lr=lr, max_iter=5,
        history_size=10, line_search_fn='strong_wolfe'
    )

    # The timed section covers both training and validation.
    start=time.time()
    _ = train_lbfgs(model, train_loader, criterion, optimizer, epochs=3, weight_decay=wd)
    val_loss,val_acc = evaluate(model, val_loader, criterion)
    elapsed = time.time() - start

    results.append((cfg, val_loss, val_acc, elapsed))
    print(f"Trial {t+1}/{num_trials} — Val Acc: {val_acc:.4f}, Val Loss: {val_loss:.4f}, Time: {elapsed:.1f}s, CFG: {cfg}")

    # Track the configuration with the highest validation accuracy.
    if val_acc > best_acc:
        best_acc,best_cfg,best_trial = val_acc,cfg,t+1

print(f"\n🔍 Best Hyperparameter Found at trial {best_trial}: {best_cfg}")

# ————————— Final Retrain & Dynamics —————————
# Retrain from scratch on the full training set with the best configuration.
# Note: test_loader is rebuilt here with the best batch size, replacing the
# batch_size=64 loader created during data preparation.
full_loader = DataLoader(full_train, batch_size=int(best_cfg['batch_size']), shuffle=True)
test_loader = DataLoader(test_ds, batch_size=int(best_cfg['batch_size']), shuffle=False)

model = FashionCNN().to(device)
optimizer = optim.LBFGS(
    model.parameters(), lr=best_cfg['lr'],
    max_iter=5, history_size=10, line_search_fn='strong_wolfe'
)
# train_dyn holds the per-epoch 'train_loss' / 'train_acc' history used by the
# training-dynamics plot further down.
train_dyn = train_lbfgs(model, full_loader, criterion, optimizer, epochs=5, weight_decay=best_cfg['weight_decay'])
test_loss,test_acc = evaluate(model, test_loader, criterion)
print(f"\nTest Loss: {test_loss:.6f}, Test Acc: {test_acc:.3f}")

# ————————— 1. Convergence Progression —————————
# One row per trial; the config (first tuple element) is ignored here.
df_conv = pd.DataFrame([
    {'trial': i+1, 'val_acc': acc, 'val_loss': loss, 'time': tm}
    for i, (_, loss, acc, tm) in enumerate(results)
])
# Side-by-side panels: validation accuracy (left) and validation loss (right).
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
sns.lineplot(data=df_conv, x='trial', y='val_acc', marker='o')
plt.title("Val Acc per Trial")
plt.subplot(1,2,2)
sns.lineplot(data=df_conv, x='trial', y='val_loss', marker='o', color='r')
plt.title("Val Loss per Trial")
plt.tight_layout()
plt.show()

# ————————— 2. Hyperparameter Evolution —————————
# Extend the per-trial table with the sampled hyperparameter values.
df_hp = df_conv.copy()
for i,(cfg,_,_,_) in enumerate(results):
    for k in ['lr','batch_size','weight_decay']:
        df_hp.loc[i,k] = cfg[k]
df_pc = df_hp[['lr','batch_size','weight_decay','val_acc']].copy()
# parallel_coordinates needs a class column; use the row index (as a string)
# so every trial is drawn as its own line.
df_pc['idx'] = df_pc.index.astype(str)
plt.figure(figsize=(8,4))
parallel_coordinates(df_pc.rename(columns={'idx':'trial'}), 'trial',
                     cols=['lr','batch_size','weight_decay','val_acc'],
                     color=plt.cm.viridis(np.linspace(0,1,len(df_pc))))
plt.xticks(rotation=45); plt.title("Hyperparameter Evolution"); plt.show()

# ————————— 3. Search Space Exploration —————————
# Standardise the three hyperparameters before PCA so none of them dominates
# because of its numeric scale.
X = df_hp[['lr','batch_size','weight_decay']].astype(float)
Xp = StandardScaler().fit_transform(X)
# Three components from three features: a rotation of the standardised space.
pcs = PCA(3).fit_transform(Xp)
df_pca = pd.DataFrame(pcs,columns=['PC1','PC2','PC3'])
df_pca['val_acc'] = df_hp['val_acc']

# Interactive 3-D scatter; colour and marker size both encode validation accuracy.
fig = px.scatter_3d(df_pca, x='PC1', y='PC2', z='PC3',
                    color='val_acc', size='val_acc',
                    title="Search Space Exploration (PCA)")
fig.show()

# ————————— 4. Training Dynamics —————————
# Loss on the left axis, accuracy on the right axis, one point per epoch of the
# final retrain. Validation curves are drawn only when the history actually
# contains them, so the legend never advertises series that have no data.
epochs = list(range(1,len(train_dyn['train_loss'])+1))
fig, ax1 = plt.subplots(figsize=(8,4))
ax1.plot(epochs, train_dyn['train_loss'], 'b-o', label='Train Loss')
if 'val_loss' in train_dyn:
    ax1.plot(epochs, train_dyn['val_loss'], 'b--s', label='Val Loss')
ax1.set_xlabel("Epoch"); ax1.set_ylabel("Loss", color='b')
ax2 = ax1.twinx()
ax2.plot(epochs, train_dyn['train_acc'], 'g-o', label='Train Acc')
if 'val_acc' in train_dyn:
    ax2.plot(epochs, train_dyn['val_acc'], 'g--s', label='Val Acc')
ax2.set_ylabel("Accuracy", color='g')
plt.title("Train vs Val Dynamics"); fig.legend(loc='upper right'); plt.show()

# ————————— 5. Computation Profile —————————
# 'time' is the per-trial wall-clock duration recorded in the search loop,
# which spans both training and validation of that trial.
plt.figure(figsize=(6,3))
sns.barplot(x='trial',y='time',data=df_conv,color='skyblue')
plt.title("Evaluation Time per Trial"); plt.show()

# ————————— 6. Error Analysis —————————
# Confusion Matrix
# Collect predictions and labels of the retrained model over the test set.
all_preds,all_lbls=[],[]
model.eval()
with torch.no_grad():
    for ims,lbs in test_loader:
        ims,lbs=ims.to(device),lbs.to(device)
        out=model(ims).argmax(1)
        all_preds+=out.cpu().tolist()
        all_lbls+=lbs.cpu().tolist()

# Rows are true labels, columns are predictions; tick labels come from the dataset.
cm = confusion_matrix(all_lbls,all_preds)
disp = ConfusionMatrixDisplay(cm, display_labels=test_ds.classes)
fig,ax=plt.subplots(figsize=(5,5))
disp.plot(ax=ax,cmap='Blues')
plt.title("Confusion Matrix"); plt.show()
# Class-wise accuracy: correct predictions per class divided by class support.
accs = cm.diagonal()/cm.sum(axis=1)
plt.figure(figsize=(6,3))
# The x-axis shows class indices 0-9 rather than class names.
sns.barplot(x=list(range(10)), y=accs, palette='viridis')
plt.title("Per-Class Accuracy"); plt.ylim(0,1); plt.show()

### **Optimizing CNN hyperparameters using GA and PSO**

Baseline CNN

In [None]:
# Step 1: Imports & Setup
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

# Use the GPU when torch can see one, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Step 2: Data Loading
transform = transforms.Compose([
    transforms.ToTensor(),  # Converts to [0, 1]
    transforms.Normalize((0.5,), (0.5,))  # Normalize to [-1, 1]
])

# The baseline uses the full training set (no validation split).
train_dataset = torchvision.datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)

# Shuffle only the training batches; keep the test order fixed.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

Using device: cuda


In [None]:
# Step 3: Define CNN Model
class FashionCNN(nn.Module):
    """Baseline CNN: two conv/pool stages and a 128-unit hidden layer, 10 outputs."""

    def __init__(self):
        super().__init__()
        layers = []

        # Stage 1: (1, 28, 28) -> (32, 28, 28) -> pooled to (32, 14, 14)
        layers.append(nn.Conv2d(1, 32, kernel_size=3, padding=1))
        layers.append(nn.ReLU())
        layers.append(nn.MaxPool2d(2, 2))

        # Stage 2: (32, 14, 14) -> (64, 14, 14) -> pooled to (64, 7, 7)
        layers.append(nn.Conv2d(32, 64, kernel_size=3, padding=1))
        layers.append(nn.ReLU())
        layers.append(nn.MaxPool2d(2, 2))

        # Head: flatten to 64*7*7 features, then 128 hidden units, then 10 scores
        layers.append(nn.Flatten())
        layers.append(nn.Linear(64 * 7 * 7, 128))
        layers.append(nn.ReLU())
        layers.append(nn.Linear(128, 10))

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the 10 class scores for each input in ``x``."""
        return self.net(x)

# Baseline objects: the model on the selected device, cross-entropy loss and
# Adam with a fixed learning rate of 0.001 (no hyperparameter search here).
model = FashionCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
def train_model(model, dataloader, criterion, optimizer, epochs=5):
    """Train ``model`` in place and print one progress line per epoch.

    The printed loss is the *sum* of the per-batch losses over the epoch (it is
    not divided by the number of batches); the accuracy is the percentage of
    training samples classified correctly during that epoch. Batches are moved
    to the module-level ``device``.
    """
    model.train()
    for epoch in range(epochs):
        total_loss, correct, total = 0, 0, 0

        for batch_x, batch_y in dataloader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            logits = model(batch_x)
            batch_loss = criterion(logits, batch_y)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            # Running statistics for the epoch summary.
            total_loss += batch_loss.item()
            top_class = torch.max(logits, 1).indices
            correct += (top_class == batch_y).sum().item()
            total += batch_y.size(0)

        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss:.4f}, Accuracy: {100 * correct / total:.2f}%")

def evaluate_model(model, dataloader):
    """Evaluate ``model`` on ``dataloader``, print and return the metrics.

    Uses the module-level ``criterion`` and ``device``.

    Returns:
        ``(avg_loss, acc)``: the per-batch loss averaged over the number of
        batches, and the accuracy as a percentage (0-100).
    """
    model.eval()
    n_correct = 0
    n_seen = 0
    loss_sum = 0

    with torch.no_grad():
        for batch_x, batch_y in dataloader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            logits = model(batch_x)
            batch_loss = criterion(logits, batch_y)

            top_class = torch.max(logits.data, 1).indices
            n_seen += batch_y.size(0)
            n_correct += (top_class == batch_y).sum().item()
            loss_sum += batch_loss.item()

    acc = 100 * n_correct / n_seen
    avg_loss = loss_sum / len(dataloader)
    print(f"Test Loss: {avg_loss:.4f}, Test Accuracy: {acc:.2f}%")
    return avg_loss, acc

In [None]:
# Train the baseline for 5 epochs, then report loss/accuracy on the test set.
# As the last expression of the cell, evaluate_model's (avg_loss, acc) return
# value is also echoed by the notebook.
train_model(model, train_loader, criterion, optimizer, epochs=5)
evaluate_model(model, test_loader)

Epoch [1/5], Loss: 408.0778, Accuracy: 84.32%
Epoch [2/5], Loss: 263.7032, Accuracy: 89.76%
Epoch [3/5], Loss: 218.7325, Accuracy: 91.42%
Epoch [4/5], Loss: 189.3577, Accuracy: 92.58%
Epoch [5/5], Loss: 167.2148, Accuracy: 93.36%
Test Loss: 0.2429, Test Accuracy: 91.36%


(0.24288778877846753, 91.36)

GA on CNN  

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import random
import numpy as np
from copy import deepcopy

# GPU Config: use CUDA when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data Preparation: tensor conversion followed by mean-0.5 / std-0.5 normalisation.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# Hold out `val_size` training samples for validation; the rest is used for
# training.
train_dataset = datasets.FashionMNIST('./data', train=True, download=True, transform=transform)
val_size = 5000
train_subset, val_subset = torch.utils.data.random_split(train_dataset, [len(train_dataset)-val_size, val_size])
# Fixed batch size of 64: batch size is not part of the GA search space below.
train_loader = torch.utils.data.DataLoader(train_subset, batch_size=64, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_subset, batch_size=64, shuffle=False)

# CNN Model Builder
class CNN(nn.Module):
    """Two-conv-layer classifier with configurable layer widths.

    Each 3x3 convolution (padding 1) is followed by ReLU and a 2x2 max-pool,
    then a hidden fully connected layer feeds a 10-way output layer. The
    flattened feature size is fixed at ``conv2_out * 7 * 7``.

    Args:
        conv1_out: number of output channels of the first convolution.
        conv2_out: number of output channels of the second convolution.
        fc_units: width of the hidden fully connected layer.
    """

    def __init__(self, conv1_out, conv2_out, fc_units):
        super().__init__()
        self.conv1 = nn.Conv2d(1, conv1_out, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(conv1_out, conv2_out, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        # Number of features per sample after the second pooling step.
        self.flatten_size = conv2_out * 7 * 7
        self.fc1 = nn.Linear(self.flatten_size, fc_units)
        self.fc2 = nn.Linear(fc_units, 10)

    def forward(self, x):
        """Return the 10 raw class scores (logits) for each input sample."""
        stage1 = self.pool(F.relu(self.conv1(x)))
        stage2 = self.pool(F.relu(self.conv2(stage1)))
        flat = stage2.view(-1, self.flatten_size)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

# GA Hyperparameters
POP_SIZE = 20      # individuals evaluated per generation
GENERATIONS = 10   # number of generations run by run_ga()
TOP_K = 4          # best individuals carried over unchanged and used as parents

# Search space: [lower, upper] per hyperparameter. random_gene() samples
# 'lr' log-uniformly and the integer sizes with random.randint (both ends
# inclusive).
param_bounds = {
    "lr": [1e-4, 1e-2],
    "conv1_out": [16, 64],
    "conv2_out": [32, 128],
    "fc_units": [64, 256],
}

def random_gene():
    """Draw one random hyperparameter set from ``param_bounds``.

    The learning rate is sampled uniformly in log10 space and rounded to
    5 decimals; the layer sizes are integers drawn with both bounds inclusive.

    Returns:
        dict with keys 'lr', 'conv1_out', 'conv2_out' and 'fc_units'.
    """
    lr_bounds = param_bounds['lr']
    # Draw order (lr, conv1_out, conv2_out, fc_units) is kept so the random
    # stream is consumed in the same sequence.
    lr_exponent = random.uniform(np.log10(lr_bounds[0]), np.log10(lr_bounds[1]))
    gene = {"lr": round(10 ** lr_exponent, 5)}
    for size_key in ("conv1_out", "conv2_out", "fc_units"):
        gene[size_key] = random.randint(*param_bounds[size_key])
    return gene

def mutate(gene, rate=0.3):
    """Return a copy of ``gene`` with, at most, one hyperparameter resampled.

    With probability ``rate`` a single randomly chosen hyperparameter is
    replaced by a fresh draw from the search space; otherwise the copy is
    returned unchanged. The previous implementation computed a copy and a
    key but ignored both, and instead discarded the whole gene (i.e. the
    crossover result) in favour of an unrelated random individual.

    Args:
        gene: dict of hyperparameters keyed like ``param_bounds``.
        rate: probability of applying a mutation (default 0.3).

    Returns:
        A new dict; the input ``gene`` is never modified or aliased.
    """
    mutated = deepcopy(gene)
    if random.random() < rate:
        key = random.choice(list(param_bounds))
        # Reuse random_gene() so the per-parameter sampling rules (log-uniform
        # lr, inclusive integer ranges) stay defined in one place.
        mutated[key] = random_gene()[key]
    return mutated

def crossover(parent1, parent2):
    """Uniform crossover: build a child by picking each gene from one parent.

    For every key in ``param_bounds`` the value is taken from ``parent1`` or
    ``parent2`` with equal probability.

    Returns:
        A new dict containing one value per key of ``param_bounds``.
    """
    return {
        name: (parent1[name] if random.random() < 0.5 else parent2[name])
        for name in param_bounds
    }

# Fitness Evaluation
def evaluate(gene):
    """Fitness of one hyperparameter set: train briefly, then validate.

    A fresh CNN is built from ``gene``, trained with Adam for a single pass
    over ``train_loader`` and scored on ``val_loader``.

    Args:
        gene: dict with keys 'conv1_out', 'conv2_out', 'fc_units' and 'lr'.

    Returns:
        ``(val_loss, accuracy)``: mean per-batch cross-entropy on the
        validation loader and the fraction of correctly classified samples.
    """
    net = CNN(gene['conv1_out'], gene['conv2_out'], gene['fc_units']).to(device)
    opt = optim.Adam(net.parameters(), lr=gene['lr'])
    loss_fn = nn.CrossEntropyLoss()

    # One epoch only: keeps each fitness evaluation cheap.
    net.train()
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        opt.zero_grad()
        batch_loss = loss_fn(net(inputs), labels)
        batch_loss.backward()
        opt.step()

    net.eval()
    loss_sum = 0.0
    hits = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            logits = net(inputs)
            loss_sum += loss_fn(logits, labels).item()
            hits += (logits.argmax(dim=1) == labels).sum().item()

    # Loss is averaged over batches, accuracy over samples.
    return loss_sum / len(val_loader), hits / len(val_loader.dataset)

# Genetic Algorithm Core
def run_ga():
    """Run the genetic algorithm over the CNN hyperparameter space.

    Each generation evaluates every individual, ranks them by validation
    loss, records a history snapshot, keeps the ``TOP_K`` best unchanged and
    refills the population with mutated crossovers of those elites.

    Returns:
        ``(best_overall, ga_history)``: the gene with the lowest validation
        loss seen in any generation, and one dict per generation with keys
        'generation', 'population', 'fitness', 'best_gene', 'best_loss' and
        'best_acc' ('population' and 'fitness' are sorted by ascending loss).
    """
    population = [random_gene() for _ in range(POP_SIZE)]
    ga_history = []

    best_overall, best_gen = None, -1
    best_loss, best_acc = float('inf'), 0.0

    for gen in range(GENERATIONS):
        # Score everyone, then rank by validation loss (lowest first).
        scored = sorted(
            [(gene, *evaluate(gene)) for gene in population],
            key=lambda entry: entry[1],
        )
        best_gene, loss, acc = scored[0]

        # Keep the best individual seen across all generations.
        if loss < best_loss:
            best_overall, best_gen = best_gene, gen
            best_loss, best_acc = loss, acc

        print(f"Gen {gen}: Best Loss = {loss:.4f}, Acc = {acc:.4f}")
        print(f"         Best Hyperparams @ Gen {gen}: {best_gene}")

        # Snapshot of this generation for the plotting cells.
        ga_history.append({
            'generation': gen,
            'population': [entry[0] for entry in scored],
            'fitness': [(entry[1], entry[2]) for entry in scored],
            'best_gene': best_gene,
            'best_loss': loss,
            'best_acc': acc
        })

        # Elitism: the top genes survive as-is and act as the only parents.
        top_genes = [entry[0] for entry in scored[:TOP_K]]
        new_pop = list(top_genes)
        while len(new_pop) < POP_SIZE:
            p1, p2 = random.sample(top_genes, 2)
            new_pop.append(mutate(crossover(p1, p2)))

        population = new_pop

    print(f"\n✅ Best hyperparameters found at generation {best_gen}: {best_overall}")
    print(f"   Final Best Loss = {best_loss:.4f}, Accuracy = {best_acc:.4f}")
    return best_overall, ga_history

# Run GA
# best_hyperparams / ga_history are consumed by the later training and
# plotting cells. NOTE: the recorded output of this cell is a
# KeyboardInterrupt; both names only exist if run_ga() runs to completion.
best_hyperparams, ga_history = run_ga()
print("\nBest hyperparameters found:", best_hyperparams)

KeyboardInterrupt: 

In [None]:
# Load Official Test Dataset (FashionMNIST train=False split), with the same
# transform as the training data and a fixed, unshuffled batch order.
test_dataset = datasets.FashionMNIST('./data', train=False, download=True, transform=transform)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

# Evaluate Best Hyperparams on Test Set
def train_and_test(hparams, epochs=5):
    """Train a fresh CNN with ``hparams`` and print its test-set metrics.

    Args:
        hparams: dict with keys 'conv1_out', 'conv2_out', 'fc_units', 'lr'.
        epochs: number of full passes over ``train_loader`` (default 5).

    Returns:
        None. Test loss (mean over batches) and accuracy (fraction over
        samples) are printed, not returned.
    """
    net = CNN(hparams['conv1_out'], hparams['conv2_out'], hparams['fc_units']).to(device)
    opt = optim.Adam(net.parameters(), lr=hparams['lr'])
    loss_fn = nn.CrossEntropyLoss()

    print(f"\n🔁 Training final model for {epochs} epochs...")
    for epoch in range(epochs):
        net.train()
        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            opt.zero_grad()
            loss_fn(net(inputs), labels).backward()
            opt.step()
        print(f"Epoch {epoch+1} completed.")

    # Test
    net.eval()
    loss_sum = 0.0
    hits = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            logits = net(inputs)
            loss_sum += loss_fn(logits, labels).item()
            hits += (logits.argmax(dim=1) == labels).sum().item()

    test_loss = loss_sum / len(test_loader)
    accuracy = hits / len(test_loader.dataset)

    print(f"\n📊 Final Evaluation on Test Set:")
    print(f"   Test Loss     = {test_loss:.4f}")
    print(f"   Test Accuracy = {accuracy:.4f}")

# Evaluate on test set: retrain from scratch with the GA's best
# hyperparameters (default 5 epochs) and print test loss/accuracy.
train_and_test(best_hyperparams)

### **Optimizing CNN hyperparameters using Particle Swarm Optimization (PSO)**

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import random
import numpy as np
from copy import deepcopy

# ------------------------------
# Device Setup
# ------------------------------
# Use CUDA when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ------------------------------
# Dataset (train/validation split)
# ------------------------------
# The full FashionMNIST training split is used: `val_size` samples are held
# out for validation and the remainder is kept for training. (The speed-up in
# this cell comes from evaluate() training on only a few mini-batches, not
# from subsetting the data here.)
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
train_dataset = datasets.FashionMNIST('./data', train=True, download=True, transform=transform)
val_size = 5000
train_subset, val_subset = torch.utils.data.random_split(train_dataset, [len(train_dataset) - val_size, val_size])
train_loader = torch.utils.data.DataLoader(train_subset, batch_size=64, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_subset, batch_size=64, shuffle=False)

# Test Set (Official Evaluation)
test_dataset = datasets.FashionMNIST('./data', train=False, download=True, transform=transform)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

# ------------------------------
# CNN Definition
# ------------------------------
class CNN(nn.Module):
    """Small convolutional classifier whose layer widths are hyperparameters.

    Structure: conv(3x3, pad 1) -> ReLU -> 2x2 max-pool, twice, followed by a
    hidden linear layer with ReLU and a final 10-unit linear layer. The
    flattened feature size is fixed at ``conv2_out * 7 * 7``.

    Args:
        conv1_out: output channels of the first convolution.
        conv2_out: output channels of the second convolution.
        fc_units: number of units in the hidden linear layer.
    """

    def __init__(self, conv1_out, conv2_out, fc_units):
        super().__init__()
        self.conv1 = nn.Conv2d(1, conv1_out, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(conv1_out, conv2_out, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        # Per-sample feature count fed into the first linear layer.
        self.flatten_size = conv2_out * 7 * 7
        self.fc1 = nn.Linear(self.flatten_size, fc_units)
        self.fc2 = nn.Linear(fc_units, 10)

    def forward(self, x):
        """Compute the 10 class logits for a batch of inputs."""
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))
        features = x.view(-1, self.flatten_size)
        return self.fc2(F.relu(self.fc1(features)))

# ------------------------------
# Hyperparameter Bounds
# ------------------------------
# [lower, upper] limits per hyperparameter. run_pso() initialises particles
# inside these ranges and clips every updated position back into them.
param_bounds = {
    'lr': [1e-4, 1e-2],
    'conv1_out': [16, 64],
    'conv2_out': [32, 128],
    'fc_units': [64, 256]
}

# ------------------------------
# Fitness Function
# ------------------------------
def evaluate(hparams):
    """Cheap fitness estimate: train on 10 mini-batches, then validate.

    Args:
        hparams: dict with keys 'conv1_out', 'conv2_out', 'fc_units' (cast to
            int before building the model) and 'lr'.

    Returns:
        ``(val_loss, accuracy)``: mean per-batch cross-entropy over
        ``val_loader`` and the fraction of correctly classified samples.
    """
    net = CNN(
        int(hparams['conv1_out']),
        int(hparams['conv2_out']),
        int(hparams['fc_units']),
    ).to(device)
    opt = optim.Adam(net.parameters(), lr=hparams['lr'])
    loss_fn = nn.CrossEntropyLoss()

    # Only the first 10 mini-batches are used; zip() stops as soon as the
    # range is exhausted, so no further batch is fetched from the loader.
    net.train()
    for _, (inputs, labels) in zip(range(10), train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        opt.zero_grad()
        batch_loss = loss_fn(net(inputs), labels)
        batch_loss.backward()
        opt.step()

    net.eval()
    loss_sum = 0.0
    hits = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            logits = net(inputs)
            loss_sum += loss_fn(logits, labels).item()
            hits += (logits.argmax(dim=1) == labels).sum().item()

    return loss_sum / len(val_loader), hits / len(val_loader.dataset)

# ------------------------------
# PSO Core
# ------------------------------
def run_pso(particles=10, generations=20, w=0.5, c1=1.5, c2=1.5):
    """Particle swarm optimisation over the CNN hyperparameter space.

    Args:
        particles: number of particles in the swarm.
        generations: number of swarm update iterations.
        w: inertia weight applied to the previous velocity.
        c1: cognitive coefficient (pull towards the particle's own best).
        c2: social coefficient (pull towards the swarm's best).

    Returns:
        ``(global_best, pso_history)``. ``global_best`` is the hyperparameter
        dict with the lowest validation loss found. ``pso_history`` holds one
        dict per iteration with keys:

        * 'iteration': iteration index.
        * 'swarm': copy of the particle positions after the update.
        * 'fitness': ``(loss, acc)`` of each particle AT THAT POSITION, in
          the same order as 'swarm'. (Previously this held the personal-best
          scores, so consumers zipping 'swarm' with 'fitness' paired
          positions with scores that belonged to different positions.)
        * 'personal_best_fitness': ``(loss, acc)`` of each particle's best
          position so far.
        * 'global_best', 'global_loss', 'global_acc': swarm-wide best.

    Note:
        Particles are initialised log-uniformly in 'lr', but velocities and
        positions (including 'lr') are updated in linear space.
    """
    def init_particle():
        # Log-uniform learning rate, uniform integer layer sizes.
        return {
            'lr': 10 ** random.uniform(np.log10(param_bounds['lr'][0]), np.log10(param_bounds['lr'][1])),
            'conv1_out': random.randint(*param_bounds['conv1_out']),
            'conv2_out': random.randint(*param_bounds['conv2_out']),
            'fc_units': random.randint(*param_bounds['fc_units'])
        }

    def clip(hp):
        # Force a position back into the search space; sizes become ints.
        hp['lr'] = float(np.clip(hp['lr'], *param_bounds['lr']))
        hp['conv1_out'] = int(np.clip(round(hp['conv1_out']), *param_bounds['conv1_out']))
        hp['conv2_out'] = int(np.clip(round(hp['conv2_out']), *param_bounds['conv2_out']))
        hp['fc_units'] = int(np.clip(round(hp['fc_units']), *param_bounds['fc_units']))
        return hp

    population = [init_particle() for _ in range(particles)]
    velocity = [{k: 0.0 for k in p} for p in population]

    # Fitness of each particle at its CURRENT position.
    current_scores = []
    current_accs = []
    for p in population:
        loss, acc = evaluate(p)
        current_scores.append(loss)
        current_accs.append(acc)

    # Initially every particle's best position is its starting position.
    personal_best = deepcopy(population)
    personal_best_scores = list(current_scores)
    personal_best_accs = list(current_accs)

    global_best_idx = int(np.argmin(personal_best_scores))
    global_best = deepcopy(personal_best[global_best_idx])
    global_best_loss = personal_best_scores[global_best_idx]
    global_best_acc = personal_best_accs[global_best_idx]

    pso_history = []

    for gen in range(generations):
        for i in range(particles):
            position = population[i]
            new_vel = {}
            new_pos = {}
            for key in position:
                r1, r2 = random.random(), random.random()
                vel = (w * velocity[i][key]
                       + c1 * r1 * (personal_best[i][key] - position[key])
                       + c2 * r2 * (global_best[key] - position[key]))
                new_vel[key] = vel
                new_pos[key] = position[key] + vel
            velocity[i] = new_vel
            new_pos = clip(new_pos)
            population[i] = new_pos

            loss, acc = evaluate(new_pos)
            current_scores[i] = loss
            current_accs[i] = acc
            if loss < personal_best_scores[i]:
                personal_best[i] = deepcopy(new_pos)
                personal_best_scores[i] = loss
                personal_best_accs[i] = acc

        # The global best is refreshed once per iteration, after all
        # particles have moved.
        global_best_idx = int(np.argmin(personal_best_scores))
        global_best = deepcopy(personal_best[global_best_idx])
        global_best_loss = personal_best_scores[global_best_idx]
        global_best_acc = personal_best_accs[global_best_idx]

        print(f"Gen {gen}: Best Loss = {global_best_loss:.4f}, Acc = {global_best_acc:.4f}")

        # Snapshot for the plotting cells; 'fitness' lines up with 'swarm'.
        pso_history.append({
            'iteration': gen,
            'swarm': deepcopy(population),
            'fitness': list(zip(current_scores, current_accs)),
            'personal_best_fitness': list(zip(personal_best_scores, personal_best_accs)),
            'global_best': deepcopy(global_best),
            'global_loss': global_best_loss,
            'global_acc': global_best_acc
        })

    return global_best, pso_history

# ------------------------------
# Run PSO
# ------------------------------
# Uses run_pso()'s defaults (10 particles, 20 iterations). pso_history is
# consumed by the plotting cells further down.
best_pso_hyperparams, pso_history = run_pso()
print("\nBest hyperparameters from PSO:", best_pso_hyperparams)

# ------------------------------
# Final Full Training & Test Evaluation
# ------------------------------
def train_and_test(hparams, epochs=5):
    """Fully train a CNN with ``hparams`` and print test loss and accuracy.

    Args:
        hparams: dict with keys 'conv1_out', 'conv2_out', 'fc_units', 'lr'.
        epochs: number of full passes over ``train_loader`` (default 5).

    Returns:
        None; the test loss (mean over batches) and test accuracy (fraction
        over samples) are printed.
    """
    net = CNN(hparams['conv1_out'], hparams['conv2_out'], hparams['fc_units']).to(device)
    adam = optim.Adam(net.parameters(), lr=hparams['lr'])
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        net.train()
        for batch, labels in train_loader:
            batch, labels = batch.to(device), labels.to(device)
            adam.zero_grad()
            batch_loss = loss_fn(net(batch), labels)
            batch_loss.backward()
            adam.step()
        print(f"Epoch {epoch+1} completed.")

    net.eval()
    total_loss = 0.0
    n_correct = 0
    with torch.no_grad():
        for batch, labels in test_loader:
            batch, labels = batch.to(device), labels.to(device)
            scores = net(batch)
            total_loss += loss_fn(scores, labels).item()
            n_correct += (scores.argmax(dim=1) == labels).sum().item()

    test_loss = total_loss / len(test_loader)
    test_acc = n_correct / len(test_loader.dataset)
    print(f"\nTest Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")

# Retrain from scratch with the PSO result (default 5 epochs) and print the
# test-set loss and accuracy.
train_and_test(best_pso_hyperparams)

### **Plotting of GA vs PSO**

In [None]:
# ==================================================================
# PLOTTING FUNCTIONS
# ==================================================================

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from mpl_toolkits.mplot3d import Axes3D
from pandas.plotting import parallel_coordinates
import pandas as pd
import seaborn as sns
import plotly.graph_objects as go

# Normalization helper
def normalize_param(value, param_name):
    """Rescale a hyperparameter value relative to its ``param_bounds`` range.

    Values at the lower/upper bound map to 0/1. The learning rate ('lr') is
    rescaled in log10 space; every other parameter is rescaled linearly.

    Args:
        value: raw hyperparameter value.
        param_name: key into ``param_bounds``.

    Returns:
        The normalised value (0..1 for values inside the bounds).
    """
    limits = param_bounds[param_name]
    lower, upper = limits[0], limits[1]
    if param_name != 'lr':
        return (value - lower) / (upper - lower)

    log_lower = np.log10(lower)
    log_upper = np.log10(upper)
    return (np.log10(value) - log_lower) / (log_upper - log_lower)

In [None]:
# 1. CONVERGENCE PLOT
def plot_convergence(ga_history, pso_history):
    """Plot best validation loss and accuracy per generation/iteration.

    GA curves use each generation's 'best_loss'/'best_acc'; PSO curves use
    each iteration's 'global_loss'/'global_acc'. The figure is written to
    'convergence.png' and then shown.
    """
    plt.figure(figsize=(12, 6))

    # (title, y-label, GA series, PSO series) for the two side-by-side panels.
    panels = [
        (
            'Loss Convergence',
            'Validation Loss',
            [snap['best_loss'] for snap in ga_history],
            [snap['global_loss'] for snap in pso_history],
        ),
        (
            'Accuracy Convergence',
            'Validation Accuracy',
            [snap['best_acc'] for snap in ga_history],
            [snap['global_acc'] for snap in pso_history],
        ),
    ]

    for position, (title, y_label, ga_series, pso_series) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(ga_series, 'b-o', label='GA')
        plt.plot(pso_series, 'r-s', label='PSO')
        plt.xlabel('Generation/Iteration')
        plt.ylabel(y_label)
        plt.title(title)
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.savefig('convergence.png', dpi=300)
    plt.show()

# Needs ga_history (from run_ga) and pso_history (from run_pso).
plot_convergence(ga_history, pso_history)

In [None]:
# 2. 3D TRAJECTORY ANIMATION
def create_3d_animation(ga_history, pso_history):
    """Animate GA and PSO exploration in normalised (lr, conv1, conv2) space.

    Frame ``k`` shows every GA individual and PSO particle recorded up to and
    including generation/iteration ``k``, plus the trajectory of each
    method's best solution. The animation is saved to
    'optimization_trajectory.mp4' using the ffmpeg writer.

    Args:
        ga_history: list of per-generation dicts with 'population' and
            'best_gene' entries.
        pso_history: list of per-iteration dicts with 'swarm' and
            'global_best' entries.
    """
    fig = plt.figure(figsize=(14, 10))
    ax = fig.add_subplot(111, projection='3d')

    axes_params = ('lr', 'conv1_out', 'conv2_out')

    def to_point(hparams):
        # Map a hyperparameter dict to normalised (x, y, z) coordinates.
        return tuple(normalize_param(hparams[name], name) for name in axes_params)

    def update(frame):
        ax.clear()

        ga_seen = ga_history[:frame + 1]
        pso_seen = pso_history[:frame + 1]

        # The point clouds are rebuilt from scratch for every frame. They
        # used to live outside this function and were appended to on each
        # call, so earlier generations were re-added (and over-plotted)
        # every frame and the lists grew without bound.
        ga_points = [to_point(gene)
                     for snapshot in ga_seen
                     for gene in snapshot['population']]
        pso_points = [to_point(particle)
                      for snapshot in pso_seen
                      for particle in snapshot['swarm']]

        if ga_points:
            ga_arr = np.array(ga_points)
            ax.scatter(ga_arr[:, 0], ga_arr[:, 1], ga_arr[:, 2],
                       c='blue', alpha=0.3, s=10, label='GA')

        if pso_points:
            pso_arr = np.array(pso_points)
            ax.scatter(pso_arr[:, 0], pso_arr[:, 1], pso_arr[:, 2],
                       c='red', alpha=0.3, s=10, marker='s', label='PSO')

        # Plot best trajectories
        if ga_seen:
            ga_best_arr = np.array([to_point(snapshot['best_gene'])
                                    for snapshot in ga_seen])
            ax.plot(ga_best_arr[:, 0], ga_best_arr[:, 1], ga_best_arr[:, 2],
                    'b-', linewidth=2, label='GA Best')
            # Highlight the most recent GA best.
            ax.scatter(ga_best_arr[-1, 0], ga_best_arr[-1, 1], ga_best_arr[-1, 2],
                       c='gold', s=100, edgecolor='black', label='Current Best')

        if pso_seen:
            pso_best_arr = np.array([to_point(snapshot['global_best'])
                                     for snapshot in pso_seen])
            ax.plot(pso_best_arr[:, 0], pso_best_arr[:, 1], pso_best_arr[:, 2],
                    'r-', linewidth=2, label='PSO Best')

        # All three coordinates are normalised by normalize_param, so the
        # labels say so instead of implying raw units.
        ax.set_xlabel('log10(Learning Rate) (normalized)')
        ax.set_ylabel('conv1_out (normalized)')
        ax.set_zlabel('conv2_out (normalized)')
        ax.set_title(f'Hyperparameter Optimization Trajectories\nGeneration/Iteration: {frame}')
        ax.legend()

        # Slowly rotate the camera as the animation progresses.
        ax.view_init(30, frame * 2)

    # One frame per generation/iteration of the longer of the two histories.
    ani = FuncAnimation(fig, update, frames=max(len(ga_history), len(pso_history)),
                        interval=500, blit=False)

    # Save animation
    ani.save('optimization_trajectory.mp4', writer='ffmpeg', fps=2, dpi=150)
    plt.close(fig)

# Writes optimization_trajectory.mp4; the save call uses the 'ffmpeg' writer.
create_3d_animation(ga_history, pso_history)

In [None]:
# 3. PARALLEL COORDINATES PLOT
def plot_parallel_coordinates(ga_history, pso_history):
    """Draw a parallel-coordinates view of every evaluated configuration.

    Each GA individual and PSO particle from every history snapshot becomes
    one line across the normalised hyperparameters and its accuracy, coloured
    by method. The figure is saved to 'parallel_coordinates.png' and shown.
    """
    def tagged_rows(history, members_key, method):
        # One row per member: its hyperparameters plus fitness and method tag.
        return [
            {**member, 'loss': loss, 'accuracy': acc, 'method': method}
            for snapshot in history
            for member, (loss, acc) in zip(snapshot[members_key], snapshot['fitness'])
        ]

    df = pd.DataFrame(
        tagged_rows(ga_history, 'population', 'GA')
        + tagged_rows(pso_history, 'swarm', 'PSO')
    )

    # Rescale each hyperparameter relative to its bounds; 'lr' in log10 space.
    for param in param_bounds:
        if param != 'lr':
            low = param_bounds[param][0]
            high = param_bounds[param][1]
            df[f'norm_{param}'] = (df[param] - low) / (high - low)
            continue
        df['log_lr'] = np.log10(df['lr'])
        low = np.log10(param_bounds['lr'][0])
        high = np.log10(param_bounds['lr'][1])
        df['norm_lr'] = (df['log_lr'] - low) / (high - low)

    plotted_columns = ['method', 'norm_lr', 'norm_conv1_out', 'norm_conv2_out', 'norm_fc_units', 'accuracy']

    plt.figure(figsize=(14, 8))
    parallel_coordinates(
        df[plotted_columns],
        'method',
        color=['blue', 'red'],
        alpha=0.1
    )
    plt.title('Hyperparameter Space Exploration')
    plt.xlabel('Parameters (Normalized)')
    plt.ylabel('Value')
    plt.xticks(rotation=45)
    plt.grid(alpha=0.3)
    plt.savefig('parallel_coordinates.png', dpi=300, bbox_inches='tight')
    plt.show()

# Saves parallel_coordinates.png and displays the figure.
plot_parallel_coordinates(ga_history, pso_history)

In [None]:
# 4. HYPERPARAMETER EVOLUTION
def plot_hyperparameter_evolution(ga_history, pso_history):
    """Plot how each hyperparameter's values change over the search.

    One panel per hyperparameter in a 2x2 grid. Faint lines connect the j-th
    stored entry of consecutive snapshots; the bold line is the per-snapshot
    median. The figure is saved to 'hyperparameter_evolution.png' and shown.
    """
    fig, axs = plt.subplots(2, 2, figsize=(16, 12))

    def draw_method(ax, history, members_key, param, faint_fmt, median_fmt, label):
        # Rows are snapshots, columns are positions within the snapshot.
        values = np.array(
            [[member[param] for member in snapshot[members_key]] for snapshot in history]
        )
        for column in range(values.shape[1]):
            ax.plot(values[:, column], faint_fmt, alpha=0.1)
        ax.plot(np.median(values, axis=1), median_fmt, linewidth=2, label=label)

    for index, param in enumerate(['lr', 'conv1_out', 'conv2_out', 'fc_units']):
        row, col = divmod(index, 2)
        ax = axs[row, col]

        draw_method(ax, ga_history, 'population', param, 'b-', 'b-o', 'GA Median')
        draw_method(ax, pso_history, 'swarm', param, 'r-', 'r-s', 'PSO Median')

        # Formatting
        ax.set_xlabel('Generation/Iteration')
        ax.set_ylabel(param)
        if param == 'lr':
            # The learning-rate range spans orders of magnitude.
            ax.set_yscale('log')
        ax.set_title(f'{param} Evolution')
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    plt.savefig('hyperparameter_evolution.png', dpi=300)
    plt.show()

# Saves hyperparameter_evolution.png and displays the figure.
plot_hyperparameter_evolution(ga_history, pso_history)

In [None]:
# 5. PERFORMANCE HEATMAP
def plot_performance_heatmap(ga_history, pso_history):
    """Show mean accuracy per (conv1_out, conv2_out) pair for GA and PSO.

    Every recorded configuration contributes its accuracy; configurations
    sharing the same pair of channel counts are averaged. The two heatmaps
    are drawn side by side, saved to 'performance_heatmap.png' and shown.
    """
    def records(history, members_key, method):
        # Only the two conv widths and the accuracy are needed here.
        return [
            {
                'method': method,
                'conv1_out': member['conv1_out'],
                'conv2_out': member['conv2_out'],
                'accuracy': acc
            }
            for snapshot in history
            for member, (_loss, acc) in zip(snapshot[members_key], snapshot['fitness'])
        ]

    df = pd.DataFrame(
        records(ga_history, 'population', 'GA') + records(pso_history, 'swarm', 'PSO')
    )

    def accuracy_grid(method):
        # Rows: conv1_out, columns: conv2_out, cells: mean accuracy.
        subset = df[df['method'] == method]
        return subset.pivot_table(
            index='conv1_out', columns='conv2_out', values='accuracy', aggfunc='mean'
        )

    ga_pivot = accuracy_grid('GA')
    pso_pivot = accuracy_grid('PSO')

    fig, (ga_ax, pso_ax) = plt.subplots(1, 2, figsize=(18, 7), sharey=True)

    # GA heatmap
    sns.heatmap(ga_pivot, annot=True, fmt=".3f", cmap="Blues", ax=ga_ax)
    ga_ax.set_title('GA Accuracy Heatmap')
    ga_ax.set_xlabel('conv2_out')
    ga_ax.set_ylabel('conv1_out')

    # PSO heatmap
    sns.heatmap(pso_pivot, annot=True, fmt=".3f", cmap="Reds", ax=pso_ax)
    pso_ax.set_title('PSO Accuracy Heatmap')
    pso_ax.set_xlabel('conv2_out')

    plt.tight_layout()
    plt.savefig('performance_heatmap.png', dpi=300)
    plt.show()

# Saves performance_heatmap.png and displays the figure.
plot_performance_heatmap(ga_history, pso_history)

In [None]:
# 6. INTERACTIVE 3D PLOT (PLOTLY)
def create_interactive_3d(ga_history, pso_history):
    """Build an interactive Plotly 3D scatter of all evaluated configurations.

    Axes are log10(lr), conv1_out and conv2_out. Marker size scales with
    fc_units and marker colour with accuracy (blue scale for GA, red scale
    for PSO). The figure is written to 'interactive_optimization.html' and
    then shown.
    """
    def collect(history, members_key, step_key):
        # One record per member, tagged with the snapshot's step counter.
        return pd.DataFrame([
            {
                'x': np.log10(member['lr']),
                'y': member['conv1_out'],
                'z': member['conv2_out'],
                'accuracy': acc,
                'fc_units': member['fc_units'],
                step_key: snapshot[step_key]
            }
            for snapshot in history
            for member, (_loss, acc) in zip(snapshot[members_key], snapshot['fitness'])
        ])

    def marker_for(frame, colorscale):
        return dict(
            size=frame['fc_units']/50,
            color=frame['accuracy'],
            colorscale=colorscale,
            opacity=0.7,
            colorbar=dict(title='Accuracy')
        )

    def hover_common(row):
        # Hover text shared by both methods (everything but the step line).
        return (
            f"Accuracy: {row['accuracy']:.4f}<br>"
            f"LR: {10**row['x']:.5f}<br>"
            f"conv1: {row['y']}<br>"
            f"conv2: {row['z']}<br>"
            f"fc: {row['fc_units']}<br>"
        )

    # Create GA trace
    ga_df = collect(ga_history, 'population', 'generation')
    ga_trace = go.Scatter3d(
        x=ga_df['x'],
        y=ga_df['y'],
        z=ga_df['z'],
        mode='markers',
        marker=marker_for(ga_df, 'Blues'),
        name='GA',
        hovertext=ga_df.apply(
            lambda row: hover_common(row) + f"Gen: {row['generation']}", axis=1)
    )

    # Create PSO trace
    pso_df = collect(pso_history, 'swarm', 'iteration')
    pso_trace = go.Scatter3d(
        x=pso_df['x'],
        y=pso_df['y'],
        z=pso_df['z'],
        mode='markers',
        marker=marker_for(pso_df, 'Reds'),
        name='PSO',
        hovertext=pso_df.apply(
            lambda row: hover_common(row) + f"Iter: {row['iteration']}", axis=1)
    )

    fig = go.Figure(data=[ga_trace, pso_trace])

    scene = dict(
        xaxis_title='log10(Learning Rate)',
        yaxis_title='conv1_out',
        zaxis_title='conv2_out',
        camera=dict(
            eye=dict(x=1.5, y=1.5, z=1.5)
        )
    )
    fig.update_layout(
        title='Hyperparameter Optimization Space',
        scene=scene,
        width=1200,
        height=800
    )

    # Save and show
    fig.write_html("interactive_optimization.html")
    fig.show()

# Writes interactive_optimization.html and displays the figure.
create_interactive_3d(ga_history, pso_history)