# Week 08 — Training Pathologies

This notebook explores common failure modes in training deep neural networks and their fixes. You'll:
- Diagnose vanishing/exploding gradients
- Compare activation functions and their effects
- Apply fixes: initialization, normalization, gradient clipping
- Track and visualize gradient flow

In [None]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
%matplotlib inline

# Seed both the NumPy and PyTorch random number generators with a fixed
# value so the random data and weight initializations drawn below start
# from the same RNG state on every run.
np.random.seed(42)
torch.manual_seed(42)
print("Libraries imported!")

## 1. Track Gradient Norms Across Layers

Build a deep network and monitor gradient magnitudes to detect vanishing/exploding gradients.

In [None]:
# Deep network for gradient tracking
class DeepNet(nn.Module):
    """Plain fully connected network used to study gradient flow.

    The network is a stack of ``nn.Linear`` layers stored in ``self.layers``:
    one input layer (``input_size -> hidden_size``), ``n_layers - 2`` hidden
    layers (``hidden_size -> hidden_size``) and one output layer
    (``hidden_size -> 1``). The same activation module is applied after every
    layer except the last.

    Args:
        input_size: Number of input features.
        hidden_size: Width of every hidden layer.
        n_layers: Total number of linear layers. Values below 2 still yield
            the input and output layers (two layers in total).
        activation: One of ``'sigmoid'``, ``'tanh'`` or ``'relu'``.

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
    """

    def __init__(self, input_size=10, hidden_size=50, n_layers=10, activation='sigmoid'):
        super().__init__()

        # Input width of every layer that maps into the hidden size: the
        # first layer reads the raw features, the rest read hidden units.
        in_widths = [input_size] + [hidden_size] * (n_layers - 2)
        stack = [nn.Linear(width, hidden_size) for width in in_widths]
        stack.append(nn.Linear(hidden_size, 1))  # output layer
        self.layers = nn.ModuleList(stack)

        # Resolve the activation name only after the layers exist, so layer
        # construction always happens first.
        known_activations = (
            ('sigmoid', nn.Sigmoid),
            ('tanh', nn.Tanh),
            ('relu', nn.ReLU),
        )
        factory = next(
            (cls for name, cls in known_activations if activation == name),
            None,
        )
        if factory is None:
            raise ValueError(f"Unknown activation: {activation}")
        self.activation = factory()

    def forward(self, x):
        """Run ``x`` through all layers; the output layer has no activation."""
        *hidden, head = self.layers
        for linear in hidden:
            x = self.activation(linear(x))
        return head(x)

# Function to compute gradient norms
def compute_gradient_norms(model, loss):
    """Return the norm of each layer's weight gradient.

    Args:
        model: Object exposing an iterable ``layers`` attribute.
        loss: Unused; kept so existing call sites keep working.

    Returns:
        A list with one float per entry of ``model.layers``: the value of
        ``weight.grad.norm()`` for that layer, or ``0.0`` when the layer has
        no ``weight`` attribute or its gradient has not been populated.
    """
    def _weight_grad_norm(module):
        # Layers without weights, or without a gradient yet, count as zero.
        if not hasattr(module, 'weight'):
            return 0.0
        grad = module.weight.grad
        return 0.0 if grad is None else grad.norm().item()

    return [_weight_grad_norm(module) for module in model.layers]

# Create toy data: 100 samples with 10 features each and one random target
# per sample. The targets are pure noise; this cell only inspects gradient
# magnitudes after a single backward pass.
X = torch.randn(100, 10)
y = torch.randn(100, 1)

# Test different activations
activations = ['sigmoid', 'tanh', 'relu']
gradient_norms = {}  # activation name -> list of per-layer gradient norms

for act in activations:
    # A fresh model per activation, so every run starts with empty gradients.
    model = DeepNet(input_size=10, hidden_size=50, n_layers=10, activation=act)

    # Forward and backward (one pass is enough to populate .grad)
    outputs = model(X)
    loss = nn.MSELoss()(outputs, y)
    loss.backward()

    # Get gradient norms (one value per layer, input layer first)
    norms = compute_gradient_norms(model, loss)
    gradient_norms[act] = norms

    # Show the first five and last two layers to keep the line short.
    print(f"{act:8s} - Gradient norms: {norms[:5]} ... {norms[-2:]}")

# Plot gradient norms
plt.figure(figsize=(12, 5))
for act, norms in gradient_norms.items():
    plt.plot(range(len(norms)), norms, 'o-', label=act, linewidth=2, markersize=6)

plt.xlabel('Layer Index')
plt.ylabel('Gradient L2 Norm')
plt.title('Gradient Norms Across Layers (10-layer deep network)')
plt.legend()
plt.grid(alpha=0.3)
# Log scale because the norms can differ by orders of magnitude across layers.
plt.yscale('log')
plt.show()

# NOTE: these takeaways are printed unconditionally; check them against the
# plotted curves for the current run.
print("\n→ Sigmoid/Tanh show vanishing gradients in early layers!")
print("→ ReLU maintains better gradient flow.")

## 2. Activation Function Comparison

Train networks with different activations and compare learning dynamics.

In [None]:
# Training function
def train_model(model, X, y, n_epochs=100, lr=0.01):
    """Fit ``model`` to ``(X, y)`` with full-batch SGD on an MSE loss.

    Args:
        model: Module to train in place.
        X: Input tensor passed to ``model`` as a single batch.
        y: Target tensor compared against the model output.
        n_epochs: Number of gradient steps to take.
        lr: Learning rate for ``optim.SGD``.

    Returns:
        A list of ``n_epochs`` floats: the loss measured at each epoch
        before that epoch's parameter update is applied.
    """
    sgd = optim.SGD(model.parameters(), lr=lr)
    mse = nn.MSELoss()
    history = []

    for _ in range(n_epochs):
        sgd.zero_grad()
        batch_loss = mse(model(X), y)
        # The loss value is already fixed here, so it can be recorded
        # before the backward pass and the update.
        history.append(batch_loss.item())
        batch_loss.backward()
        sgd.step()

    return history

# Generate larger dataset. The seed is reset first so this cell draws the
# same data regardless of how much randomness earlier cells consumed.
torch.manual_seed(42)
X_train = torch.randn(500, 10)
y_train = torch.randn(500, 1)

# Compare activations: same depth, width, epochs and learning rate for each,
# so only the activation function differs between runs.
results = {}  # activation name -> per-epoch loss history
for act in ['sigmoid', 'tanh', 'relu']:
    print(f"Training with {act}...")
    model = DeepNet(input_size=10, hidden_size=50, n_layers=6, activation=act)
    losses = train_model(model, X_train, y_train, n_epochs=200, lr=0.01)
    results[act] = losses
    print(f"  Final loss: {losses[-1]:.6f}")

# Plot training curves (one line per activation, log-scaled loss axis)
plt.figure(figsize=(10, 5))
for act, losses in results.items():
    plt.plot(losses, label=act, linewidth=2)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Dynamics: Activation Function Comparison')
plt.legend()
plt.grid(alpha=0.3)
plt.yscale('log')
plt.show()

## 3. Initialization Strategies

Compare a small-random baseline with Xavier/Glorot and He initialization to see how the initial weight scale affects gradient problems.

In [None]:
# Apply different initializations
def initialize_model(model, init_type='xavier'):
    """Re-initialize the weights and biases of every layer in ``model.layers``.

    Layers without a ``weight`` attribute are skipped. For every other layer
    the weight is re-drawn according to ``init_type`` and the bias, if
    present, is set to zero.

    Args:
        model: Object exposing an iterable ``layers`` attribute.
        init_type: Weight scheme to apply:
            ``'xavier'`` - ``nn.init.xavier_uniform_``;
            ``'he'`` - ``nn.init.kaiming_uniform_`` with ``nonlinearity='relu'``;
            ``'small'`` - normal distribution with mean 0.0 and std 0.01.

    Raises:
        ValueError: If ``init_type`` is not one of the names above. The check
            runs before any layer is modified, so the model is left untouched.
    """
    weight_initializers = {
        'xavier': nn.init.xavier_uniform_,
        'he': lambda weight: nn.init.kaiming_uniform_(weight, nonlinearity='relu'),
        'small': lambda weight: nn.init.normal_(weight, mean=0.0, std=0.01),
    }
    # Fail loudly on a misspelled scheme instead of silently keeping the
    # default weights while still zeroing the biases.
    if init_type not in weight_initializers:
        raise ValueError(f"Unknown init_type: {init_type}")
    init_weight = weight_initializers[init_type]

    for layer in model.layers:
        if not hasattr(layer, 'weight'):
            continue
        init_weight(layer.weight)
        if getattr(layer, 'bias', None) is not None:
            nn.init.zeros_(layer.bias)

# Test different initializations with ReLU. Every run uses the same
# architecture, data, epoch count and learning rate, so only the
# initialization scheme differs.
init_types = ['small', 'xavier', 'he']
init_results = {}  # init scheme name -> per-epoch loss history

for init_type in init_types:
    print(f"Training with {init_type} initialization...")
    model = DeepNet(input_size=10, hidden_size=50, n_layers=8, activation='relu')
    # Overwrite the weights the layers were constructed with.
    initialize_model(model, init_type)
    losses = train_model(model, X_train, y_train, n_epochs=200, lr=0.01)
    init_results[init_type] = losses
    print(f"  Final loss: {losses[-1]:.6f}")

# Plot (one loss curve per scheme, log-scaled loss axis)
plt.figure(figsize=(10, 5))
for init_type, losses in init_results.items():
    plt.plot(losses, label=init_type, linewidth=2)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Initialization Comparison (ReLU activation)')
plt.legend()
plt.grid(alpha=0.3)
plt.yscale('log')
plt.show()

# NOTE: these takeaways are printed unconditionally; only ReLU is trained in
# this cell, so the tanh/sigmoid statement is not measured here.
print("\n→ He initialization works best with ReLU!")
print("→ Xavier works better with tanh/sigmoid.")

## 4. Batch Normalization

Add BatchNorm to stabilize training and improve gradient flow.

In [None]:
# Network with BatchNorm
class DeepNetWithBatchNorm(nn.Module):
    """Fully connected ReLU network with BatchNorm after each hidden layer.

    ``self.layers`` holds the linear layers (input, ``n_layers - 2`` hidden,
    output) and ``self.batch_norms`` holds one ``nn.BatchNorm1d`` for every
    linear layer except the output layer. Each non-output layer is applied
    as linear -> batch norm -> ReLU.

    Args:
        input_size: Number of input features.
        hidden_size: Width of every hidden layer.
        n_layers: Total number of linear layers. Values below 2 still yield
            the input and output layers (two layers in total).
    """

    def __init__(self, input_size=10, hidden_size=50, n_layers=10):
        super().__init__()
        self.layers = nn.ModuleList()
        self.batch_norms = nn.ModuleList()

        # Input width of each normalized layer: raw features first, then
        # hidden units for every remaining hidden layer.
        in_widths = [input_size] + [hidden_size] * (n_layers - 2)
        for width in in_widths:
            self.layers.append(nn.Linear(width, hidden_size))
            self.batch_norms.append(nn.BatchNorm1d(hidden_size))

        # The output layer gets neither normalization nor activation.
        self.layers.append(nn.Linear(hidden_size, 1))
        self.activation = nn.ReLU()

    def forward(self, x):
        """Apply linear -> BatchNorm -> ReLU blocks, then the output layer."""
        # batch_norms is one shorter than layers, so zip stops right
        # before the output layer.
        for linear, norm in zip(self.layers, self.batch_norms):
            x = self.activation(norm(linear(x)))
        return self.layers[-1](x)

# Compare with and without BatchNorm. Both networks have 10 layers, width 50
# and ReLU activations, and are trained on the same data with the same epoch
# count and learning rate.
print("Training WITHOUT BatchNorm...")
model_no_bn = DeepNet(input_size=10, hidden_size=50, n_layers=10, activation='relu')
losses_no_bn = train_model(model_no_bn, X_train, y_train, n_epochs=200, lr=0.01)

print("Training WITH BatchNorm...")
model_bn = DeepNetWithBatchNorm(input_size=10, hidden_size=50, n_layers=10)
losses_bn = train_model(model_bn, X_train, y_train, n_epochs=200, lr=0.01)

# Plot comparison (log-scaled loss axis)
plt.figure(figsize=(10, 5))
plt.plot(losses_no_bn, label='Without BatchNorm', linewidth=2)
plt.plot(losses_bn, label='With BatchNorm', linewidth=2)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Effect of Batch Normalization (10-layer network)')
plt.legend()
plt.grid(alpha=0.3)
plt.yscale('log')
plt.show()

print(f"\nFinal loss WITHOUT BatchNorm: {losses_no_bn[-1]:.6f}")
print(f"Final loss WITH BatchNorm: {losses_bn[-1]:.6f}")
# NOTE: this takeaway is printed unconditionally; compare it with the two
# final losses printed just above.
print("\n→ BatchNorm significantly stabilizes deep network training!")

## 5. Gradient Clipping

Use gradient clipping to prevent exploding gradients.

In [None]:
# Training with gradient clipping
def train_with_clipping(model, X, y, n_epochs=100, lr=0.01, clip_value=None):
    """Train with full-batch SGD on MSE, optionally clipping the gradient norm.

    Args:
        model: Module to train in place.
        X: Input tensor passed to ``model`` as a single batch.
        y: Target tensor compared against the model output.
        n_epochs: Number of gradient steps to take.
        lr: Learning rate for ``optim.SGD``.
        clip_value: Maximum total gradient norm passed to
            ``torch.nn.utils.clip_grad_norm_``; ``None`` disables clipping.

    Returns:
        A ``(losses, grad_norms)`` tuple of lists with one float per epoch.
        ``grad_norms`` holds the total L2 norm over all parameter gradients,
        measured before any clipping is applied.
    """
    sgd = optim.SGD(model.parameters(), lr=lr)
    mse = nn.MSELoss()
    loss_history = []
    norm_history = []
    clipping_enabled = clip_value is not None

    for _ in range(n_epochs):
        sgd.zero_grad()
        batch_loss = mse(model(X), y)
        batch_loss.backward()

        # Total norm = sqrt of the summed squared per-parameter norms.
        squared_total = sum(
            (
                param.grad.data.norm(2).item() ** 2
                for param in model.parameters()
                if param.grad is not None
            ),
            0.0,
        )
        norm_history.append(squared_total ** 0.5)

        if clipping_enabled:
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)

        sgd.step()
        loss_history.append(batch_loss.item())

    return loss_history, norm_history

# Create a problem that might have exploding gradients (high LR).
# Both models are built up front and given identical starting weights, so any
# difference between the two runs comes from clipping alone rather than from
# two different random initializations.
model_no_clip = DeepNet(input_size=10, hidden_size=50, n_layers=8, activation='relu')
model_clip = DeepNet(input_size=10, hidden_size=50, n_layers=8, activation='relu')
# load_state_dict copies the values, so the two models stay independent.
model_clip.load_state_dict(model_no_clip.state_dict())

print("Training WITHOUT gradient clipping...")
losses_no_clip, grads_no_clip = train_with_clipping(model_no_clip, X_train, y_train,
                                                     n_epochs=100, lr=0.1, clip_value=None)

print("Training WITH gradient clipping...")
losses_clip, grads_clip = train_with_clipping(model_clip, X_train, y_train,
                                              n_epochs=100, lr=0.1, clip_value=1.0)

# Plot: loss on the left, gradient magnitude on the right (both log-scaled)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].plot(losses_no_clip, label='No clipping', linewidth=2)
axes[0].plot(losses_clip, label='With clipping (max=1.0)', linewidth=2)
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].set_title('Training Loss')
axes[0].legend()
axes[0].grid(alpha=0.3)
axes[0].set_yscale('log')

# train_with_clipping records the norm before clipping is applied, so the
# "With clipping" curve shows raw gradient magnitudes, not the clipped ones.
axes[1].plot(grads_no_clip, label='No clipping', linewidth=2)
axes[1].plot(grads_clip, label='With clipping (max=1.0)', linewidth=2)
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Gradient Norm')
axes[1].set_title('Gradient Magnitude')
axes[1].legend()
axes[1].grid(alpha=0.3)
axes[1].set_yscale('log')

plt.tight_layout()
plt.show()

# NOTE: this takeaway is printed unconditionally; check it against the plots.
print("\n→ Gradient clipping prevents explosions and stabilizes training!")

## Exercises for Further Practice

1. **Layer Normalization**: Implement and compare LayerNorm to BatchNorm
2. **Residual Connections**: Add skip connections to improve gradient flow
3. **Different Depths**: Test very deep networks (20+ layers) with and without fixes
4. **Learning Rate Analysis**: Study how LR interacts with initialization and normalization
5. **Real Data**: Apply these techniques to MNIST or CIFAR-10

## Deliverables Checklist

- [ ] Gradient norm tracking across layers with visualizations
- [ ] Activation function comparison experiments
- [ ] Initialization and BatchNorm experiments
- [ ] Gradient clipping demonstration
- [ ] Short write-up on which fixes work best for which pathologies

## Recommended Resources

- Glorot & Bengio (2010): "Understanding the difficulty of training deep feedforward neural networks"
- Ioffe & Szegedy (2015): "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift"
- He et al. (2015): "Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification" (He initialization)
- PyTorch documentation on initialization and normalization