<a href="https://colab.research.google.com/github/foxtrotmike/CS909/blob/master/nn_optimization_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# XOR dataset
X = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32).to(device)
y = torch.tensor([[0], [1], [1], [0]], dtype=torch.float32).to(device)

dataset = TensorDataset(X, y)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Different strategies implemented in model variants
class XORNet(nn.Module):
    def __init__(self, use_batch_norm=False, use_skip_connections=False, init_method=None, activation=F.relu, deeper=False):
        super(XORNet, self).__init__()
        self.fc1 = nn.Linear(2, 4)
        self.fc2 = nn.Linear(4, 8)
        self.fc3 = nn.Linear(8, 4)
        self.fc4 = nn.Linear(4, 1)

        self.use_batch_norm = use_batch_norm
        self.use_skip_connections = use_skip_connections
        self.activation = activation
        self.deeper = deeper

        # Batch Normalization
        if use_batch_norm:
            self.bn1 = nn.BatchNorm1d(4)
            self.bn2 = nn.BatchNorm1d(8)
            self.bn3 = nn.BatchNorm1d(4)

        # Skip connection
        if use_skip_connections:
            self.skip = nn.Linear(2, 4)

        # Weight initialization
        if init_method:
            init_method(self.fc1.weight)
            init_method(self.fc2.weight)
            init_method(self.fc3.weight)
            init_method(self.fc4.weight)

    def forward(self, x):
        if self.use_skip_connections:
            x_skip = self.skip(x)
        x = self.fc1(x)
        if self.use_batch_norm:
            x = self.bn1(x)
        x = self.activation(x)
        x = self.fc2(x)
        if self.use_batch_norm:
            x = self.bn2(x)
        x = self.activation(x)
        x = self.fc3(x)
        if self.use_batch_norm:
            x = self.bn3(x)
        if self.use_skip_connections:
            x = x + x_skip
        x = self.activation(x)
        x = torch.sigmoid(self.fc4(x))
        return x

# Function to train and return the epoch reaching the target error
def train_model(model, optimizer_class=optim.SGD, lr_scheduler=None, max_norm=None, epochs=500, target_error=0.00001):
    model = model.to(device)
    criterion = nn.BCELoss()
    optimizer = optimizer_class(model.parameters(), lr=0.01)
    if lr_scheduler:
        scheduler = lr_scheduler(optimizer)
    for epoch in range(epochs):
        for batch_X, batch_y in dataloader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            if max_norm:
                nn.utils.clip_grad_norm_(model.parameters(), max_norm)
            optimizer.step()
        if lr_scheduler:
            scheduler.step()
        if loss.item() <= target_error:
            return epoch
    return epochs

# Define strategies with Adam optimizer for all except the baseline
strategies = {
    'Base Model (SGD)': (XORNet(deeper=True), optim.SGD),
    'Adam Optimizer': (XORNet(deeper=True), optim.Adam),
    'Add Weight Initialization': (XORNet(deeper=True, init_method=nn.init.xavier_uniform_), optim.Adam),
    'Gradient Clipping': (XORNet(deeper=True), optim.Adam),
    'Skip Connections': (XORNet(deeper=True, use_skip_connections=True), optim.Adam),
    'Batch Normalization': (XORNet(deeper=True, use_batch_norm=True), optim.Adam),
    'Leaky ReLU': (XORNet(deeper=True, activation=F.leaky_relu), optim.Adam),
    'One Cycle LR': (XORNet(deeper=True), optim.Adam)
}

# Run each model multiple times and record average epochs to target error
results = []
num_runs = 30
target_error = 0.00001

for name, (model, optimizer_class) in strategies.items():
    epochs_to_target = []
    for _ in range(num_runs):
        if name == 'Gradient Clipping':
            epochs = train_model(model, optimizer_class, max_norm=1.0, target_error=target_error)
        elif name == 'One Cycle LR':
            epochs = train_model(model, optimizer_class, lr_scheduler=lambda opt: optim.lr_scheduler.OneCycleLR(opt, max_lr=0.1, steps_per_epoch=len(dataloader), epochs=500), target_error=target_error)
        else:
            epochs = train_model(model, optimizer_class, target_error=target_error)
        epochs_to_target.append(epochs)
    avg_epochs = np.mean(epochs_to_target)
    # Bootstrap confidence interval
    bootstraps = [np.mean(resample(epochs_to_target)) for _ in range(1000)]
    ci_lower, ci_upper = np.percentile(bootstraps, [2.5, 97.5])
    results.append({'Strategy': name, 'Mean Epochs': avg_epochs, 'CI Lower': ci_lower, 'CI Upper': ci_upper})

# Display results as a table
results_df = pd.DataFrame(results)
print(results_df)


Using device: cuda
                    Strategy  Mean Epochs    CI Lower    CI Upper
0           Base Model (SGD)   500.000000  500.000000  500.000000
1             Adam Optimizer    65.700000   15.700000  131.400000
2  Add Weight Initialization   500.000000  500.000000  500.000000
3          Gradient Clipping    66.933333   16.666667  133.600000
4           Skip Connections   134.500000   51.166667  216.666667
5        Batch Normalization    29.433333    0.000000   75.567500
6                 Leaky ReLU    22.866667    0.000000   62.400000
7               One Cycle LR     4.800000    0.000000   14.400000
