# Adam Optimizer Experiments
# Matching the SGD experimental structure with Adam-specific hyperparameters

In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torch.optim.lr_scheduler import StepLR, CosineAnnealingLR

In [None]:

# MNIST preprocessing: tensor conversion followed by normalisation with a
# single-channel mean/std pair.
transform_mnist = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# Train split first, then test split (same order as the downloads are issued).
trainsetMNIST, testsetMNIST = (
    torchvision.datasets.MNIST(
        root='./data', train=is_train, download=True, transform=transform_mnist
    )
    for is_train in (True, False)
)

# Shared preprocessing for both CIFAR datasets.
# NOTE(review): a one-element mean/std is used here although the CIFAR entries
# are configured with 3 input channels; presumably it is broadcast over the
# channels -- confirm.
transform_cifar = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

trainsetCIFAR10, testsetCIFAR10 = (
    torchvision.datasets.CIFAR10(
        root='./data', train=is_train, download=True, transform=transform_cifar
    )
    for is_train in (True, False)
)

trainsetCIFAR100, testsetCIFAR100 = (
    torchvision.datasets.CIFAR100(
        root='./data', train=is_train, download=True, transform=transform_cifar
    )
    for is_train in (True, False)
)

Files already downloaded and verified
Files already downloaded and verified


# Model Architecture

In [None]:

class Block(nn.Module):
    """Two-convolution block whose skip connection can be switched off.

    Main path: 3x3 conv -> BatchNorm -> ReLU -> 3x3 conv -> BatchNorm.
    When ``use_residual`` is true the output of ``shortcut(x)`` is added to
    the main path before the final ReLU; otherwise the final ReLU is applied
    to the main path alone.

    Args:
        in_channels: number of channels of the input tensor.
        out_channels: number of channels produced by the block.
        stride: stride of the first convolution (and of the projection
            shortcut, when one is built).
        use_residual: whether ``forward`` adds the shortcut branch.
    """

    def __init__(self, in_channels, out_channels, stride=1, use_residual=True):
        super().__init__()
        self.use_residual = use_residual

        # Main path; only the first convolution uses the requested stride.
        self.conv1 = nn.Conv2d(
            in_channels, out_channels,
            kernel_size=3, stride=stride, padding=1, bias=False,
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(
            out_channels, out_channels,
            kernel_size=3, stride=1, padding=1, bias=False,
        )
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Skip path: an empty Sequential (identity) when input and output
        # shapes agree, otherwise a 1x1 conv + BatchNorm projection.
        # It is constructed regardless of ``use_residual``.
        needs_projection = stride != 1 or in_channels != out_channels
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Apply the block to ``x`` and return the activated result."""
        main = F.relu(self.bn1(self.conv1(x)))
        main = self.bn2(self.conv2(main))

        if self.use_residual:
            main += self.shortcut(x)

        return F.relu(main)

class StressTestNet(nn.Module):
    """Small convolutional classifier built from three stages of ``Block``s.

    Layout: 3x3 stem conv (16 channels) + BatchNorm + ReLU, then three stages
    of three blocks each with 16, 32 and 64 channels (the second and third
    stage start with a stride-2 block), global average pooling, and a linear
    classifier.

    Args:
        use_residual: forwarded to every ``Block``; toggles the skip
            connections for the whole network.
        num_classes: size of the classifier output.
        in_channels: number of channels of the input images.
    """

    def __init__(self, use_residual=True, num_classes=10, in_channels=3):
        super().__init__()
        # Running channel count: read and then updated by _make_layer.
        self.in_channels = 16

        # Stem.
        self.conv1 = nn.Conv2d(in_channels, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(16)

        # Three stages of three blocks each.
        self.layer1 = self._make_layer(16, 3, stride=1, use_residual=use_residual)
        self.layer2 = self._make_layer(32, 3, stride=2, use_residual=use_residual)
        self.layer3 = self._make_layer(64, 3, stride=2, use_residual=use_residual)

        self.fc = nn.Linear(64, num_classes)

    def _make_layer(self, out_channels, blocks, stride, use_residual):
        """Build one stage: a leading block with ``stride`` followed by
        ``blocks - 1`` stride-1 blocks, all producing ``out_channels``."""
        head = Block(self.in_channels, out_channels, stride, use_residual)
        self.in_channels = out_channels
        tail = [
            Block(out_channels, out_channels, stride=1, use_residual=use_residual)
            for _ in range(1, blocks)
        ]
        return nn.Sequential(head, *tail)

    def forward(self, x):
        """Return the classifier outputs (one row per input sample)."""
        features = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            features = stage(features)
        pooled = F.adaptive_avg_pool2d(features, (1, 1))
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(flat)

# Gradient Norm Tracking

In [None]:
def get_gradient_norms(model):
    """
    Calculate layer-wise gradient norms for detailed analysis.

    Args:
        model: module whose ``named_parameters()`` are inspected.

    Returns:
        dict mapping the name of every parameter that currently has a
        gradient to the L2 norm of that gradient (as a Python float), plus a
        ``'global_norm'`` entry holding the L2 norm over all of those
        gradients combined (0.0 when no parameter has a gradient).
    """
    layer_norms = {}
    squared_total = 0.0

    for name, param in model.named_parameters():
        if param.grad is None:
            # Parameters that took no part in the last backward pass are skipped.
            continue
        # Convert to a Python float exactly once per parameter: every .item()
        # call copies the value back to the host, so the result is reused for
        # both the per-layer entry and the global accumulation.
        norm_value = param.grad.detach().norm(2).item()
        layer_norms[name] = norm_value
        squared_total += norm_value ** 2

    layer_norms['global_norm'] = squared_total ** 0.5
    return layer_norms

def evaluate(model, dataloader, device, criterion):
    """
    Evaluate model accuracy and loss on the given dataloader.

    The loss is averaged per sample, so a final batch that is smaller than
    the others does not get extra weight.

    Args:
        model: network to evaluate; it is switched to eval mode and left there.
        dataloader: iterable yielding ``(inputs, labels)`` batches.
        device: device the batches are moved to before the forward pass.
        criterion: loss callable. If it exposes ``reduction == 'sum'`` its
            value is taken as the batch total; otherwise it is treated as the
            mean over the batch.

    Returns:
        (accuracy, avg_loss): accuracy as a percentage and the mean loss per
        sample.

    Raises:
        ValueError: if the dataloader yields no samples.
    """
    model.eval()
    loss_is_batch_sum = getattr(criterion, 'reduction', 'mean') == 'sum'
    correct = 0
    total = 0
    loss_sum = 0.0

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            batch_loss = criterion(outputs, labels).item()
            batch_n = labels.size(0)

            # Accumulate the summed loss of the batch so the final division by
            # the sample count yields a true per-sample mean.
            loss_sum += batch_loss if loss_is_batch_sum else batch_loss * batch_n

            _, predicted = torch.max(outputs, 1)
            total += batch_n
            correct += (predicted == labels).sum().item()

    if total == 0:
        raise ValueError("evaluate() received a dataloader with no samples")

    accuracy = 100 * correct / total
    avg_loss = loss_sum / total
    return accuracy, avg_loss

In [None]:
# Number of training epochs used for every configuration in the sweep.
NUM_EPOCHS = 15

# Hyperparameter grid; run_experiment iterates over the full cross product.
learning_rates = [0.1, 0.01, 0.001, 0.0001, 1e-5]
batch_sizes = [32, 128, 512, 1024]
architectures = [True, False]  # True -> skip connections enabled, False -> disabled
schedulers = ["None", "StepLR", "Cosine"]

# Adam settings shared by every run.
betas = (0.9, 0.999)
eps = 1e-8

# Prefer CUDA, then Apple's MPS backend, then fall back to the CPU.
# The getattr guard keeps this working on PyTorch builds without an MPS module.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Running on: {device}")

# Per-dataset settings consumed by run_experiment: the two splits plus the
# model dimensions that depend on the data.
datasets_config = {
    'MNIST': {
        'trainset': trainsetMNIST,
        'testset': testsetMNIST,
        'num_classes': 10,
        'in_channels': 1,
    },
    'CIFAR10': {
        'trainset': trainsetCIFAR10,
        'testset': testsetCIFAR10,
        'num_classes': 10,
        'in_channels': 3,
    },
    'CIFAR100': {
        'trainset': trainsetCIFAR100,
        'testset': testsetCIFAR100,
        'num_classes': 100,
        'in_channels': 3,
    },
}

Running on: mps


In [None]:
def _make_scheduler(optimizer, sched):
    """Return the LR scheduler named by ``sched``, or None for any other name."""
    if sched == "StepLR":
        # Decay by 10x every third of the run. The step size must be smaller
        # than NUM_EPOCHS, otherwise the decay never triggers and the run is
        # indistinguishable from the "None" schedule.
        return StepLR(optimizer, step_size=max(1, NUM_EPOCHS // 3), gamma=0.1)
    if sched == "Cosine":
        return CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS)
    return None


def _train_one_epoch(model, trainloader, optimizer, criterion, writer, step):
    """Train for one pass over ``trainloader``, logging loss and gradient
    norms at every step; returns the updated global step counter."""
    model.train()
    for inputs, labels in trainloader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()

        # Gradients are read after backward() and before optimizer.step().
        grad_norms = get_gradient_norms(model)
        writer.add_scalar('GradientNorms/Global', grad_norms['global_norm'], step)
        layer_grad_norms = {k: v for k, v in grad_norms.items() if k != 'global_norm'}
        writer.add_scalars('GradientNorms/Layers', layer_grad_norms, step)
        writer.add_scalar('Training/Loss', loss.item(), step)

        optimizer.step()
        step += 1
    return step


def _run_single_config(dataset_name, config, trainloader, testloader,
                       use_res, batch_size, lr, sched):
    """Train and evaluate one (architecture, batch size, lr, scheduler)
    configuration and write its TensorBoard logs."""
    model_type = "ResNet" if use_res else "PlainNet"

    model = StressTestNet(
        use_residual=use_res,
        num_classes=config['num_classes'],
        in_channels=config['in_channels'],
    ).to(device)
    optimizer = optim.Adam(
        model.parameters(), lr=lr, betas=betas, eps=eps, weight_decay=5e-4
    )
    criterion = nn.CrossEntropyLoss()
    scheduler = _make_scheduler(optimizer, sched)

    run_name = f"runs/{dataset_name}{model_type}_BS{batch_size}_LR{lr}_SCHDL{sched}_Adam"
    writer = SummaryWriter(run_name)
    print(f"Starting Run: {run_name}")

    # Defined up front so the summary below is well-formed even if
    # NUM_EPOCHS is 0 and the loop body never runs.
    train_acc = test_acc = test_loss = generalization_gap = float('nan')

    try:
        step = 0
        for epoch in range(NUM_EPOCHS):
            step = _train_one_epoch(model, trainloader, optimizer, criterion, writer, step)

            # The schedulers are stepped once per epoch.
            if scheduler is not None:
                scheduler.step()

            train_acc, train_loss = evaluate(model, trainloader, device, criterion)
            test_acc, test_loss = evaluate(model, testloader, device, criterion)
            generalization_gap = train_acc - test_acc

            writer.add_scalar('Accuracy/Train', train_acc, epoch)
            writer.add_scalar('Accuracy/Test', test_acc, epoch)
            writer.add_scalar('Accuracy/GeneralizationGap', generalization_gap, epoch)
            writer.add_scalar('Loss/Train_Epoch', train_loss, epoch)
            writer.add_scalar('Loss/Test_Epoch', test_loss, epoch)

            print(f"  Epoch {epoch+1}/{NUM_EPOCHS}: "
                  f"Train Acc: {train_acc:.2f}%, Test Acc: {test_acc:.2f}%, "
                  f"Gap: {generalization_gap:.2f}%")

        writer.add_hparams(
            {
                'lr': lr, 'bsize': batch_size, 'residual': use_res,
                'beta1': betas[0], 'beta2': betas[1], 'eps': eps,
                'dataset': dataset_name, 'scheduler': sched,
            },
            {
                'hparam/final_train_acc': train_acc,
                'hparam/final_test_acc': test_acc,
                'hparam/final_gen_gap': generalization_gap,
                'hparam/final_loss': test_loss,
            },
        )
    finally:
        # Close the event file even when a run fails part-way through the sweep.
        writer.close()

    print(f"--> Completed. Final Test Acc: {test_acc:.2f}%, Gen Gap: {generalization_gap:.2f}%")


def run_experiment(dataset_name, config):
    """
    Generic experiment runner for any dataset configuration.

    Trains one fresh model for every combination of the module-level
    ``architectures``, ``batch_sizes``, ``learning_rates`` and ``schedulers``
    lists, logging each run to its own directory under ``runs/``.

    Args:
        dataset_name: label used in the run directory name and in the logged
            hyperparameters.
        config: dict with the keys ``'trainset'``, ``'testset'``,
            ``'num_classes'`` and ``'in_channels'``.
    """
    print(f"\n=== Starting {dataset_name} Experiments ===")

    for use_res in architectures:
        for batch_size in batch_sizes:
            # Loaders depend only on the batch size, so they are shared by all
            # learning-rate / scheduler combinations below.
            trainloader = torch.utils.data.DataLoader(
                config['trainset'], batch_size=batch_size, shuffle=True, num_workers=2
            )
            testloader = torch.utils.data.DataLoader(
                config['testset'], batch_size=batch_size, shuffle=False, num_workers=2
            )

            for lr in learning_rates:
                for sched in schedulers:
                    _run_single_config(
                        dataset_name, config, trainloader, testloader,
                        use_res, batch_size, lr, sched,
                    )

Running on: cuda
Starting Run: runs/MNISTResNet_BS32_LR0.01_SCHDLNone_Adam
--> Done. Final Loss: 0.0455
Starting Run: runs/MNISTResNet_BS32_LR0.01_SCHDLStepLR_Adam
--> Done. Final Loss: 0.3300
Starting Run: runs/MNISTResNet_BS32_LR0.01_SCHDLCosine_Adam
--> Done. Final Loss: 0.2130
Starting Run: runs/MNISTResNet_BS32_LR0.005_SCHDLNone_Adam
--> Done. Final Loss: 0.2036
Starting Run: runs/MNISTResNet_BS32_LR0.005_SCHDLStepLR_Adam
--> Done. Final Loss: 0.1426
Starting Run: runs/MNISTResNet_BS32_LR0.005_SCHDLCosine_Adam
--> Done. Final Loss: 0.0835
Starting Run: runs/MNISTResNet_BS32_LR0.001_SCHDLNone_Adam
--> Done. Final Loss: 0.0451
Starting Run: runs/MNISTResNet_BS32_LR0.001_SCHDLStepLR_Adam
--> Done. Final Loss: 0.0067
Starting Run: runs/MNISTResNet_BS32_LR0.001_SCHDLCosine_Adam
--> Done. Final Loss: 0.0192
Starting Run: runs/MNISTResNet_BS64_LR0.01_SCHDLNone_Adam
--> Done. Final Loss: 0.0062
Starting Run: runs/MNISTResNet_BS64_LR0.01_SCHDLStepLR_Adam
--> Done. Final Loss: 0.2449
Starti

# MNIST Experiments

In [None]:
# Run the full Adam sweep on MNIST using its entry in datasets_config.
run_experiment('MNIST', datasets_config['MNIST'])
print("MNIST experiments complete!")

# CIFAR-10 Experiments

In [None]:
# Run the full Adam sweep on CIFAR-10 using its entry in datasets_config.
run_experiment('CIFAR10', datasets_config['CIFAR10'])
print("CIFAR-10 experiments complete!")

# CIFAR-100 Experiments

In [None]:
# Run the full Adam sweep on CIFAR-100 using its entry in datasets_config.
run_experiment('CIFAR100', datasets_config['CIFAR100'])
print("CIFAR-100 experiments complete!")

# Experiment Summary

In [None]:
# Size of the per-dataset grid: one run for every combination of
# architecture, batch size, learning rate and scheduler.
configs_per_dataset = 1
for grid_axis in (architectures, batch_sizes, learning_rates, schedulers):
    configs_per_dataset *= len(grid_axis)
num_datasets = len(datasets_config)

print("\n=== All Adam Experiments Complete ===")
print(f"Total configurations per dataset: {configs_per_dataset}")
print(f"Total datasets tested: {num_datasets}")
print(f"Grand total configurations: {configs_per_dataset * num_datasets}")

# Pointers to where the logged quantities can be found in TensorBoard.
for summary_line in (
    "\nResults saved to TensorBoard logs:",
    "- General gradient norms: GradientNorms/Global",
    "- Layer-wise gradient norms: GradientNorms/Layers",
    "- Training/Test accuracy and generalization gap tracked per epoch",
    "\nRun 'tensorboard --logdir=runs' to analyze results.",
):
    print(summary_line)