# PyTorch Optimizer Comparison

## Simple 2-Parameter Linear Regression

This notebook compares different PyTorch optimizers on a minimal regression task:

$$y = w \cdot x + b + \epsilon$$

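Where:
- `w` (weight) and `b` (bias) are the 2 learnable parameters
- $\epsilon$ is zero-mean Gaussian noise added to the targets
- Because there are only two parameters, we can visualize the optimization trajectory in 2D parameter space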

### Optimizers Compared
1. **SGD** - Vanilla Stochastic Gradient Descent
2. **SGD + Momentum** - SGD with momentum
3. **Adam** - Adaptive Moment Estimation
4. **AdamW** - Adam with decoupled weight decay
5. **RMSprop** - Root Mean Square Propagation
6. **Adagrad** - Adaptive Gradient

---

In [None]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
import copy

# Seed both random number generators (PyTorch and NumPy) with the same
# value so that repeated runs of the notebook produce identical results.
for seed_rng in (torch.manual_seed, np.random.seed):
    seed_rng(42)

print(f"PyTorch version: {torch.__version__}")

## 1. Generate Synthetic Data

Create a simple linear dataset with known ground truth parameters.

In [None]:
# Slope and intercept of the line the optimizers are expected to recover
TRUE_W, TRUE_B = 2.5, -1.0

# Dataset size and standard deviation of the additive noise
N_SAMPLES, NOISE_STD = 100, 0.3

# Evenly spaced inputs as a column vector; targets are the true line plus
# scaled standard-normal noise.
x_data = torch.linspace(-2, 2, N_SAMPLES).reshape(-1, 1)
y_data = TRUE_W * x_data + TRUE_B + NOISE_STD * torch.randn(N_SAMPLES, 1)

print(f"Data shape: x={x_data.shape}, y={y_data.shape}")
print(f"Ground truth: w={TRUE_W}, b={TRUE_B}")

# Scatter the noisy samples and overlay the noise-free line
x_np = x_data.numpy()
plt.figure(figsize=(8, 5))
plt.scatter(x_np, y_data.numpy(), alpha=0.6, label='Data points')
plt.plot(
    x_np,
    TRUE_W * x_np + TRUE_B,
    'r-',
    linewidth=2,
    label=f'True: y = {TRUE_W}x + {TRUE_B}',
)
plt.xlabel('x')
plt.ylabel('y')
plt.title('Synthetic Linear Regression Data')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## 2. Define Simple 2-Parameter Model

A minimal linear model with just weight `w` and bias `b`.

In [None]:
class SimpleLinearModel(nn.Module):
    """Minimal 2-parameter linear model: y = w*x + b.

    Args:
        init_w: Initial value of the weight ``w`` (int or float).
        init_b: Initial value of the bias ``b`` (int or float).
    """

    def __init__(self, init_w=0.0, init_b=0.0):
        super().__init__()
        # Use an explicit floating dtype: torch.tensor([0]) would infer int64,
        # and nn.Parameter rejects integer tensors because they cannot
        # require gradients. The default dtype matches what float inputs
        # would have produced anyway.
        dtype = torch.get_default_dtype()
        self.w = nn.Parameter(torch.tensor([init_w], dtype=dtype))
        self.b = nn.Parameter(torch.tensor([init_b], dtype=dtype))

    def forward(self, x):
        """Apply the line to ``x``; ``w`` and ``b`` broadcast over its shape."""
        return self.w * x + self.b

    def get_params(self):
        """Return the current (w, b) as a tuple of Python floats."""
        return self.w.item(), self.b.item()

# Smoke test: build a model at a known starting point, then report its
# parameter values and the total parameter count.
test_model = SimpleLinearModel(init_w=0.5, init_b=0.5)
n_params = sum(p.numel() for p in test_model.parameters())
print(f"Model parameters: w={test_model.w.item():.3f}, b={test_model.b.item():.3f}")
print(f"Total parameters: {n_params}")

## 3. Visualize Loss Landscape

Plot the MSE loss surface in (w, b) parameter space.

In [None]:
def compute_loss_landscape(x_data, y_data, w_range, b_range, resolution=100):
    """Compute MSE loss for a grid of (w, b) values.

    Args:
        x_data, y_data: Inputs and targets, as torch tensors or array-likes.
            Both are flattened, so they must hold the same number of elements.
        w_range, b_range: (min, max) bounds of the grid along each axis.
        resolution: Number of grid points per axis.

    Returns:
        Tuple ``(W, B, loss_surface)`` of float64 arrays, each of shape
        ``(resolution, resolution)``. ``W`` and ``B`` come from
        ``np.meshgrid`` and ``loss_surface[i, j]`` is the MSE of the line
        ``W[i, j] * x + B[i, j]`` against the targets.

    Raises:
        ValueError: If x_data and y_data hold different numbers of elements.
    """
    def _as_flat_array(data):
        # Convert once, up front, instead of once per grid cell.
        if isinstance(data, torch.Tensor):
            data = data.detach().cpu().numpy()
        return np.asarray(data, dtype=np.float64).ravel()

    x = _as_flat_array(x_data)
    y = _as_flat_array(y_data)
    if x.shape != y.shape:
        raise ValueError(
            f"x_data and y_data must have the same number of elements, "
            f"got {x.size} and {y.size}"
        )

    w_vals = np.linspace(w_range[0], w_range[1], resolution)
    b_vals = np.linspace(b_range[0], b_range[1], resolution)
    W, B = np.meshgrid(w_vals, b_vals)

    # Broadcast the grid against the samples: residuals has shape
    # (resolution, resolution, n_samples); averaging the squares over the
    # last axis gives the MSE at every grid point in one pass.
    residuals = W[..., np.newaxis] * x + B[..., np.newaxis] - y
    loss_surface = np.mean(residuals ** 2, axis=-1)

    return W, B, loss_surface

# Parameter ranges covered by the loss-landscape grid
W_RANGE = (-1, 5)
B_RANGE = (-4, 2)
W, B, loss_surface = compute_loss_landscape(x_data, y_data, W_RANGE, B_RANGE)

# Create each axes exactly once. Calling plt.subplots(1, 2) and then adding a
# 3D subplot at position 121 would leave an empty 2D axes underneath the 3D
# plot.
fig = plt.figure(figsize=(14, 5))
ax1 = fig.add_subplot(1, 2, 1, projection='3d')
ax2 = fig.add_subplot(1, 2, 2)

# 3D surface
ax1.plot_surface(W, B, loss_surface, cmap='viridis', alpha=0.8)
# Draw the marker at the height of the lowest loss on the grid rather than at
# z=0: with noisy targets the minimum MSE is above zero, so a marker at z=0
# would float below the surface.
ax1.scatter([TRUE_W], [TRUE_B], [loss_surface.min()], color='red', s=100, marker='*', label='Optimum')
ax1.set_xlabel('w (weight)')
ax1.set_ylabel('b (bias)')
ax1.set_zlabel('MSE Loss')
ax1.set_title('Loss Landscape (3D)')
ax1.legend()

# Contour plot
contour = ax2.contour(W, B, loss_surface, levels=30, cmap='viridis')
ax2.clabel(contour, inline=True, fontsize=8)
ax2.scatter([TRUE_W], [TRUE_B], color='red', s=100, marker='*', zorder=5, label=f'Optimum ({TRUE_W}, {TRUE_B})')
ax2.set_xlabel('w (weight)')
ax2.set_ylabel('b (bias)')
ax2.set_title('Loss Landscape (Contour)')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Training Function with Trajectory Tracking

Train the model and record parameter trajectory for visualization.

In [None]:
def train_model(optimizer_class, optimizer_kwargs, x_data, y_data,
                init_w=0.0, init_b=0.0, n_epochs=100, batch_size=None):
    """
    Train a simple linear model and track the optimization trajectory.

    Args:
        optimizer_class: PyTorch optimizer class
        optimizer_kwargs: Dict of optimizer arguments (lr, momentum, etc.)
        x_data, y_data: Training data
        init_w, init_b: Initial parameter values
        n_epochs: Number of training epochs
        batch_size: If None, use full batch; otherwise a positive mini-batch
            size (the data is reshuffled every epoch)

    Returns:
        Dict with:
            'losses': one value per epoch. Each loss is measured before the
                corresponding update; in mini-batch mode the epoch value is
                the sample-weighted mean of the batch losses.
            'trajectory': list of (w, b) tuples, starting with the initial
                point followed by one entry per optimizer step.
            'final_w', 'final_b': parameter values after training.
            'model': the trained model.

    Raises:
        ValueError: If batch_size is given but is not positive.
    """
    # A zero step makes range() fail and a negative one yields no batches at
    # all, so reject both up front with a clear message.
    if batch_size is not None and batch_size <= 0:
        raise ValueError(
            f"batch_size must be a positive integer or None, got {batch_size}"
        )

    # Create fresh model
    model = SimpleLinearModel(init_w=init_w, init_b=init_b)
    criterion = nn.MSELoss()
    optimizer = optimizer_class(model.parameters(), **optimizer_kwargs)

    # Track metrics
    losses = []
    trajectory = [model.get_params()]  # Starting point

    def _step(x, y):
        """Run one optimizer update on (x, y) and return the pre-update loss."""
        optimizer.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        optimizer.step()
        trajectory.append(model.get_params())
        return loss.item()

    n_samples = len(x_data)

    for _ in range(n_epochs):
        if batch_size is None:
            # Full batch gradient descent: one step per epoch, no shuffling
            losses.append(_step(x_data, y_data))
            continue

        # Mini-batch SGD over a fresh permutation of the samples
        indices = torch.randperm(n_samples)
        weighted_loss = 0.0
        for start in range(0, n_samples, batch_size):
            batch_idx = indices[start:start + batch_size]
            batch_loss = _step(x_data[batch_idx], y_data[batch_idx])
            # Weight by batch length so a short final batch does not skew
            # the epoch average.
            weighted_loss += batch_loss * len(batch_idx)
        losses.append(weighted_loss / n_samples)

    return {
        'losses': losses,
        'trajectory': trajectory,
        'final_w': model.w.item(),
        'final_b': model.b.item(),
        'model': model
    }

## 5. Compare Optimizers

Train with different optimizers and compare their behavior.

In [None]:
# Display label -> (optimizer class, constructor keyword arguments)
OPTIMIZERS = {
    'SGD (lr=0.01)': (torch.optim.SGD, dict(lr=0.01)),
    'SGD (lr=0.1)': (torch.optim.SGD, dict(lr=0.1)),
    'SGD + Momentum': (torch.optim.SGD, dict(lr=0.01, momentum=0.9)),
    'Adam (lr=0.1)': (torch.optim.Adam, dict(lr=0.1)),
    'Adam (lr=0.01)': (torch.optim.Adam, dict(lr=0.01)),
    'AdamW': (torch.optim.AdamW, dict(lr=0.1)),
    'RMSprop': (torch.optim.RMSprop, dict(lr=0.01)),
    'Adagrad': (torch.optim.Adagrad, dict(lr=0.1)),
}

# Every run starts from the same point, away from the optimum
INIT_W, INIT_B = 0.0, 0.0
N_EPOCHS = 50

# Run each configuration with full-batch updates and keep its result dict
results = {}
for name, (opt_class, opt_kwargs) in OPTIMIZERS.items():
    print(f"Training with {name}...")
    run = train_model(
        opt_class,
        opt_kwargs,
        x_data,
        y_data,
        init_w=INIT_W,
        init_b=INIT_B,
        n_epochs=N_EPOCHS,
    )
    results[name] = run
    print(f"  Final: w={run['final_w']:.4f}, b={run['final_b']:.4f}, "
          f"loss={run['losses'][-1]:.6f}")

print(f"\nGround truth: w={TRUE_W}, b={TRUE_B}")

## 6. Visualize Loss Curves

In [None]:
# Draw the same loss curves twice: linear y-axis on the left, log on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes[0], axes[1]

# (axes, y label, title, y scale or None to keep the default linear scale)
loss_panels = [
    (ax1, 'MSE Loss', 'Loss Curves (Linear Scale)', None),
    (ax2, 'MSE Loss (log scale)', 'Loss Curves (Log Scale)', 'log'),
]

for panel, y_label, panel_title, y_scale in loss_panels:
    for name, res in results.items():
        panel.plot(res['losses'], label=name, linewidth=2)
    panel.set_xlabel('Epoch')
    panel.set_ylabel(y_label)
    if y_scale is not None:
        panel.set_yscale(y_scale)
    panel.set_title(panel_title)
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Visualize Optimization Trajectories

Show how each optimizer navigates the loss landscape.

In [None]:
# Overlay every optimizer's path on the loss contours
fig, ax = plt.subplots(figsize=(12, 10))

# Background: faint gray contour lines over a translucent filled contour
contour = ax.contour(W, B, loss_surface, levels=30, cmap='gray', alpha=0.5)
ax.contourf(W, B, loss_surface, levels=30, cmap='viridis', alpha=0.3)

# One color per optimizer, sampled from the tab10 colormap
colors = plt.cm.tab10(np.linspace(0, 1, len(results)))

# Line for the whole path, circle at its first point, square at its last
for (name, res), color in zip(results.items(), colors):
    traj = np.array(res['trajectory'])
    w_path = traj[:, 0]
    b_path = traj[:, 1]
    ax.plot(w_path, b_path, '-', color=color, linewidth=2, label=name, alpha=0.8)
    for point, marker in ((0, 'o'), (-1, 's')):
        ax.scatter(traj[point, 0], traj[point, 1], color=color, s=100,
                   marker=marker, edgecolor='black', zorder=5)

# Reference markers: ground-truth parameters and the shared starting point
ax.scatter([TRUE_W], [TRUE_B], color='red', s=200, marker='*', zorder=10,
           label=f'Optimum ({TRUE_W}, {TRUE_B})')
ax.scatter([INIT_W], [INIT_B], color='black', s=150, marker='X', zorder=10,
           label=f'Start ({INIT_W}, {INIT_B})')

ax.set_xlabel('w (weight)', fontsize=12)
ax.set_ylabel('b (bias)', fontsize=12)
ax.set_title('Optimization Trajectories in Parameter Space', fontsize=14)
ax.legend(loc='upper left', fontsize=9)
ax.grid(True, alpha=0.3)
ax.set_xlim(W_RANGE)
ax.set_ylim(B_RANGE)

plt.tight_layout()
plt.show()

## 8. Detailed Trajectory Comparison (Zoomed)

In [None]:
# One small panel per optimizer, zoomed in around the ground-truth parameters
fig, axes = plt.subplots(2, 4, figsize=(16, 8))
axes = axes.flatten()

# Window shown in every panel
zoom_w = (TRUE_W - 1.5, TRUE_W + 1.0)
zoom_b = (TRUE_B - 1.5, TRUE_B + 1.5)

# Loss grid restricted to the zoom window (coarser than the full landscape)
W_zoom, B_zoom, loss_zoom = compute_loss_landscape(
    x_data, y_data, zoom_w, zoom_b, resolution=50
)

for idx, ((name, res), color) in enumerate(zip(results.items(), colors)):
    ax = axes[idx]
    traj = np.array(res['trajectory'])
    final_loss = res["losses"][-1]

    # Filled contours first, contour lines on top
    ax.contourf(W_zoom, B_zoom, loss_zoom, levels=20, cmap='viridis', alpha=0.4)
    ax.contour(W_zoom, B_zoom, loss_zoom, levels=20, cmap='gray', alpha=0.3)

    # Path with a dot at every recorded step, plus start and end markers
    ax.plot(traj[:, 0], traj[:, 1], 'o-', color=color, linewidth=2, markersize=3, alpha=0.8)
    ax.scatter(traj[0, 0], traj[0, 1], color='black', s=80, marker='o',
               zorder=5, label='Start')
    ax.scatter(traj[-1, 0], traj[-1, 1], color=color, s=80, marker='s',
               edgecolor='black', zorder=5, label='End')

    # Ground-truth parameters
    ax.scatter([TRUE_W], [TRUE_B], color='red', s=100, marker='*', zorder=10)

    ax.set_title(f'{name}\nFinal loss: {final_loss:.6f}', fontsize=10)
    ax.set_xlabel('w')
    ax.set_ylabel('b')
    ax.set_xlim(zoom_w)
    ax.set_ylim(zoom_b)
    ax.grid(True, alpha=0.3)

plt.suptitle('Zoomed Trajectories Near Optimum', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 9. Parameter Evolution Over Time

In [None]:
# Plot each parameter against the step index, one panel per parameter
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes[0], axes[1]

# (axes, trajectory column, true value, reference label, y label, title, legend position)
evolution_panels = [
    (ax1, 0, TRUE_W, f'True w={TRUE_W}', 'w (weight)', 'Weight (w) Evolution', 'lower right'),
    (ax2, 1, TRUE_B, f'True b={TRUE_B}', 'b (bias)', 'Bias (b) Evolution', 'upper right'),
]

for panel, column, true_value, ref_label, y_label, panel_title, legend_loc in evolution_panels:
    for (name, res), color in zip(results.items(), colors):
        traj = np.array(res['trajectory'])
        panel.plot(traj[:, column], label=name, color=color, linewidth=2)
    # Dashed horizontal line at the ground-truth value
    panel.axhline(y=true_value, color='red', linestyle='--', linewidth=2, label=ref_label)
    panel.set_xlabel('Step')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.legend(loc=legend_loc, fontsize=8)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 10. Summary Statistics

In [None]:
import pandas as pd

# Build one summary row per optimizer
summary_data = []
for name, res in results.items():
    w_error = abs(res['final_w'] - TRUE_W)
    b_error = abs(res['final_b'] - TRUE_B)
    # Euclidean distance from the final (w, b) to the ground-truth parameters
    param_dist = np.sqrt(w_error**2 + b_error**2)

    summary_data.append({
        'Optimizer': name,
        'Final Loss': f"{res['losses'][-1]:.6f}",
        'Final w': f"{res['final_w']:.4f}",
        'Final b': f"{res['final_b']:.4f}",
        'w Error': f"{w_error:.4f}",
        'b Error': f"{b_error:.4f}",
        'Distance to Optimum': f"{param_dist:.4f}",
        # The trajectory begins with the starting point, so the number of
        # optimizer steps taken is one less than its length.
        'Steps': len(res['trajectory']) - 1
    })

df = pd.DataFrame(summary_data)
print("=" * 100)
print("OPTIMIZER COMPARISON SUMMARY")
print("=" * 100)
print(f"\nGround Truth: w = {TRUE_W}, b = {TRUE_B}")
print(f"Starting Point: w = {INIT_W}, b = {INIT_B}")
print(f"Epochs: {N_EPOCHS}\n")
print(df.to_string(index=False))
print("=" * 100)

## 11. Convergence Speed Analysis

In [None]:
def epochs_to_threshold(losses, threshold):
    """Find the first epoch (counting from 1) whose loss is <= threshold.

    Returns None when no entry of ``losses`` reaches the threshold.
    """
    reached = (
        epoch
        for epoch, value in enumerate(losses, start=1)
        if value <= threshold
    )
    return next(reached, None)

# Loss thresholds, from loosest to tightest, so the columns read left to
# right in the order a converging run reaches them.
thresholds = [1.0, 0.5, 0.2, 0.15, 0.1]

# Header and rows share the same column widths so the table stays aligned
name_width, col_width = 25, 12
rule = "-" * (name_width + col_width * len(thresholds))

print("Epochs to reach loss threshold:")
print(rule)
# epochs_to_threshold tests `loss <= threshold`, so label the columns "<="
header = "Optimizer".ljust(name_width) + "".join(
    f"Loss<={t}".ljust(col_width) for t in thresholds
)
print(header)
print(rule)

for name, res in results.items():
    row = name.ljust(name_width)
    for thresh in thresholds:
        epochs = epochs_to_threshold(res['losses'], thresh)
        # None means the threshold was never reached
        cell = 'N/A' if epochs is None else str(epochs)
        row += cell.ljust(col_width)
    print(row)

## 12. Mini-Batch vs Full-Batch Comparison

In [None]:
# Train Adam at several batch sizes; None selects full-batch updates
batch_sizes = [None, 50, 20, 10, 5]
batch_results = {
    f"Batch={bs if bs else 'Full'}": train_model(
        torch.optim.Adam,
        {'lr': 0.1},
        x_data,
        y_data,
        init_w=INIT_W,
        init_b=INIT_B,
        n_epochs=20,
        batch_size=bs,
    )
    for bs in batch_sizes
}

# Left panel: per-epoch loss. Right panel: paths through parameter space.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes[0], axes[1]

for name, res in batch_results.items():
    ax1.plot(res['losses'], label=name, linewidth=2)
ax1.set_xlabel('Epoch')
ax1.set_ylabel('MSE Loss')
ax1.set_title('Adam: Batch Size Comparison (Loss)')
ax1.legend()
ax1.grid(True, alpha=0.3)

# One viridis color per batch-size setting
colors_batch = plt.cm.viridis(np.linspace(0, 1, len(batch_results)))
for (name, res), color in zip(batch_results.items(), colors_batch):
    traj = np.array(res['trajectory'])
    w_path, b_path = traj[:, 0], traj[:, 1]
    ax2.plot(w_path, b_path, '-', color=color, linewidth=1.5, label=name, alpha=0.7)

# Loss contours plus markers for the ground truth and the starting point
ax2.contour(W, B, loss_surface, levels=20, cmap='gray', alpha=0.3)
ax2.scatter([TRUE_W], [TRUE_B], color='red', s=100, marker='*', zorder=10)
ax2.scatter([INIT_W], [INIT_B], color='black', s=100, marker='X', zorder=10)
ax2.set_xlabel('w (weight)')
ax2.set_ylabel('b (bias)')
ax2.set_title('Adam: Batch Size Comparison (Trajectory)')
ax2.legend()
ax2.grid(True, alpha=0.3)
ax2.set_xlim(W_RANGE)
ax2.set_ylim(B_RANGE)

plt.tight_layout()
plt.show()

## 13. Key Observations

### Summary of Optimizer Behaviors:

| Optimizer | Characteristics |
|-----------|----------------|
| **SGD (low lr)** | Slow, steady convergence; may not reach optimum in limited epochs |
| **SGD (high lr)** | Faster but can overshoot; may oscillate |
| **SGD + Momentum** | Accelerates convergence; smooths trajectory |
| **Adam** | Adaptive learning rate; fast convergence; good default choice |
| **AdamW** | Adam with proper weight decay; similar to Adam for this simple case |
| **RMSprop** | Adaptive; predecessor to Adam; good for non-stationary objectives |
| **Adagrad** | Accumulates gradients; learning rate decreases over time |

### For SOEN Training:
- **Adam** is typically a good default choice
- **Learning rate** is crucial - too high causes instability, too low is slow
- **Momentum** helps escape local minima and smooth noisy gradients
- The loss landscape shape affects which optimizer works best

In [None]:
# Final cell: signals that execution reached the end of the notebook.
print("Notebook complete!")