# Week 12 — Regularization at Scale

This notebook covers regularization techniques for deep models. You'll:
- Implement and compare dropout, weight decay, and data augmentation
- Design rigorous ablation experiments
- Build ensemble models
- Measure generalization improvements

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Fix the seed of PyTorch's global random number generator for this session.
# Only torch is seeded here; NumPy's generator is left untouched.
torch.manual_seed(42)
# Print the installed PyTorch version alongside the results.
print(f"PyTorch version: {torch.__version__}")

## 1. Dropout Regularization

In [None]:
class MLPWithDropout(nn.Module):
    """Fully connected classifier with two hidden layers, each followed by dropout.

    Args:
        input_size: Number of features per sample after flattening.
        hidden_size: Width of both hidden layers.
        num_classes: Number of output logits.
        dropout_p: Drop probability given to both ``nn.Dropout`` layers.
    """

    def __init__(self, input_size=784, hidden_size=256, num_classes=10, dropout_p=0.5):
        super().__init__()
        # Layers are created in the order they are applied in ``forward``.
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.dropout1 = nn.Dropout(p=dropout_p)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.dropout2 = nn.Dropout(p=dropout_p)
        self.fc3 = nn.Linear(in_features=hidden_size, out_features=num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Flatten each sample and return the raw (un-normalised) class logits."""
        batch_size = x.size(0)
        flat = x.view(batch_size, -1)
        hidden = self.dropout1(self.relu(self.fc1(flat)))
        hidden = self.dropout2(self.relu(self.fc2(hidden)))
        return self.fc3(hidden)

# Plain MNIST pipeline shared by the experiments below: images are only
# converted to tensors (no augmentation).
transform = transforms.Compose([transforms.ToTensor()])

# Both splits live under ./data; only the training split is created with
# download=True.
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, transform=transform)

# Training batches are reshuffled on every pass; test batches keep a fixed order.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

def train_and_evaluate(model, train_loader, test_loader, n_epochs=5, lr=0.001, weight_decay=0.0):
    """Train ``model`` with Adam and measure test accuracy after every epoch.

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_loader: Iterable of ``(data, target)`` training batches.
        test_loader: Iterable of ``(data, target)`` evaluation batches.
        n_epochs: Number of passes over ``train_loader``.
        lr: Learning rate handed to ``optim.Adam``.
        weight_decay: ``weight_decay`` coefficient handed to ``optim.Adam``
            (0.0 disables it, matching the previous behaviour).

    Returns:
        ``(train_losses, test_accs)``: two lists with one entry per epoch,
        holding the mean training loss per batch and the test accuracy in
        percent.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    train_losses, test_accs = [], []
    for epoch in range(n_epochs):
        model.train()
        epoch_loss = 0
        for data, target in train_loader:
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        train_losses.append(epoch_loss / len(train_loader))

        # Evaluate
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for data, target in test_loader:
                output = model(data)
                pred = output.argmax(dim=1)
                correct += pred.eq(target).sum().item()
                # Count what was actually evaluated instead of relying on a
                # notebook-level dataset variable, so the accuracy is right
                # for whichever loader the caller passes in.
                total += target.size(0)
        # An empty test loader has no defined accuracy; report NaN, not a crash.
        test_accs.append(100. * correct / total if total else float('nan'))

        print(f"Epoch {epoch+1}: Train Loss={train_losses[-1]:.4f}, Test Acc={test_accs[-1]:.2f}%")

    return train_losses, test_accs

# Baseline run: dropout_p=0.0, so the dropout layers drop nothing.
# The baseline is trained first and the dropout model second.
print("Training WITHOUT dropout...")
model_no_dropout = MLPWithDropout(dropout_p=0.0)
losses_no_drop, accs_no_drop = train_and_evaluate(
    model=model_no_dropout,
    train_loader=train_loader,
    test_loader=test_loader,
)

# Regularised run: same architecture and data, dropout probability 0.5.
print("\nTraining WITH dropout (p=0.5)...")
model_dropout = MLPWithDropout(dropout_p=0.5)
losses_drop, accs_drop = train_and_evaluate(
    model=model_dropout,
    train_loader=train_loader,
    test_loader=test_loader,
)

# Side-by-side comparison: training loss on the left, test accuracy on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = [
    (axes[0], losses_no_drop, losses_drop, 'Training Loss', 'Training Loss'),
    (axes[1], accs_no_drop, accs_drop, 'Test Accuracy (%)', 'Test Accuracy'),
]
for ax, baseline_curve, dropout_curve, y_label, panel_title in panels:
    # Each panel overlays the two runs with identical styling.
    ax.plot(baseline_curve, label='No dropout', linewidth=2)
    ax.plot(dropout_curve, label='Dropout p=0.5', linewidth=2)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.legend()
    ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

## 2. Data Augmentation

In [None]:
# Augmentation pipeline applied to training images only: a random rotation
# (degrees=10), then a random translation (up to 0.1 of the image size per
# axis, no extra rotation), then conversion to a tensor.
augmentation_steps = [
    transforms.RandomRotation(10),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    transforms.ToTensor(),
]
transform_augmented = transforms.Compose(augmentation_steps)

# Same MNIST training split as before, but read through the augmenting transform.
train_dataset_aug = datasets.MNIST(
    root='./data', train=True, download=True, transform=transform_augmented
)
train_loader_aug = DataLoader(dataset=train_dataset_aug, batch_size=64, shuffle=True)

# Dropout is disabled so augmentation is the only regulariser in this run;
# evaluation still uses the un-augmented test loader.
print("Training WITH data augmentation...")
model_aug = MLPWithDropout(dropout_p=0.0)
losses_aug, accs_aug = train_and_evaluate(model_aug, train_loader_aug, test_loader)

# The reference number is the no-dropout model trained in the dropout section.
final_acc_plain = accs_no_drop[-1]
final_acc_aug = accs_aug[-1]
print(f"\nFinal test accuracy (no augmentation): {final_acc_plain:.2f}%")
print(f"Final test accuracy (with augmentation): {final_acc_aug:.2f}%")

## 3. Weight Decay (L2 Regularization)

In [None]:
# Compare different weight decay values (0.0 is the unregularised baseline).
weight_decays = [0.0, 1e-4, 1e-3, 1e-2]
results = {}  # weight decay -> final test accuracy in percent

for wd in weight_decays:
    print(f"\nTraining with weight_decay={wd}...")
    # Fresh model and optimizer per setting; dropout is off so weight decay is
    # the only regulariser being varied.
    model = MLPWithDropout(dropout_p=0.0)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=wd)

    # Train for 5 epochs
    for epoch in range(5):
        model.train()
        for data, target in train_loader:
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

    # Evaluate
    model.eval()
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            pred = output.argmax(dim=1)
            correct += pred.eq(target).sum().item()
    # Normalise by the dataset the loader actually iterates over.
    acc = 100. * correct / len(test_loader.dataset)
    results[wd] = acc
    print(f"  Test accuracy: {acc:.2f}%")

# Plot. A logarithmic x-axis has no position for weight_decay=0.0, so the
# baseline would be lost from the figure. The settings are instead drawn at
# evenly spaced positions and labelled with their weight-decay value.
positions = list(range(len(results)))
plt.figure(figsize=(8, 5))
plt.plot(positions, list(results.values()), 'o-', linewidth=2, markersize=8)
plt.xticks(positions, [f"{wd:g}" for wd in results])
plt.xlabel('Weight Decay')
plt.ylabel('Test Accuracy (%)')
plt.title('Effect of Weight Decay')
plt.grid(alpha=0.3)
plt.show()

## 4. Ensemble Methods

In [None]:
# Train multiple models with different seeds.
# Each member uses the same architecture (dropout_p=0.3), optimizer settings
# and train_loader; only the seed set at the top of the loop body differs.
n_models = 5
ensemble_models = []

print(f"Training ensemble of {n_models} models...")
for i in range(n_models):
    # Re-seed torch's global RNG with the member index before the model is
    # built, so every member starts from its own seed.
    torch.manual_seed(i)
    model = MLPWithDropout(dropout_p=0.3)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    # Quick training: 3 epochs per member, no per-epoch evaluation.
    for epoch in range(3):
        model.train()
        for data, target in train_loader:
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
    
    # Keep the trained member; the list is consumed by the evaluation below.
    ensemble_models.append(model)
    print(f"  Model {i+1} trained")

# Ensemble prediction (average logits)
def ensemble_predict(models, data):
    """Predict class indices from the mean output of several models.

    Each model is switched to eval mode (and left in it) right before its
    forward pass, which runs with gradient tracking disabled.

    Args:
        models: Sequence of modules whose outputs for ``data`` share one shape.
        data: Input batch passed unchanged to every model.

    Returns:
        Tensor holding, per sample, the argmax over dim 1 of the averaged
        model outputs.
    """
    with torch.no_grad():
        # ``eval()`` returns the module itself, so it can be called in one step.
        per_model_logits = [member.eval()(data) for member in models]
    mean_logits = torch.stack(per_model_logits).mean(dim=0)
    return mean_logits.argmax(dim=1)

# Score the full ensemble and its first member on the same test batches.
correct_ensemble = 0
correct_single = 0
for data, target in test_loader:
    # Prediction from the averaged outputs of all members
    pred_ensemble = ensemble_predict(ensemble_models, data)
    correct_ensemble += (pred_ensemble == target).sum().item()

    # Reference prediction from the first member alone
    first_member = ensemble_models[0]
    first_member.eval()
    with torch.no_grad():
        single_logits = first_member(data)
    pred_single = single_logits.argmax(dim=1)
    correct_single += (pred_single == target).sum().item()

# Both counts are turned into percentages of the whole test set.
n_test_samples = len(test_dataset)
acc_ensemble = 100. * correct_ensemble / n_test_samples
acc_single = 100. * correct_single / n_test_samples
accuracy_gain = acc_ensemble - acc_single

print(f"\nSingle model accuracy: {acc_single:.2f}%")
print(f"Ensemble accuracy: {acc_ensemble:.2f}%")
print(f"Improvement: {accuracy_gain:.2f}%")

## Exercises

1. **BatchNorm + Dropout**: Add BatchNorm to the MLP, combine it with Dropout, and compare test accuracy against each technique used on its own
2. **Ablation Tables**: Create comprehensive ablation tables for all regularization methods
3. **Augmentation Study**: Try different augmentation strategies and measure effects
4. **Real Dataset**: Apply to CIFAR-10 or similar

## Deliverables

- [ ] Dropout experiments with validation curves
- [ ] Data augmentation study
- [ ] Weight decay comparison
- [ ] Ensemble model implementation
- [ ] Ablation table with clear recommendations