# Week 01 — Optimization Intuition (Loss as Energy)

This notebook guides you through building physical intuition for loss landscapes and optimization dynamics. You'll:
- Visualize loss landscapes and understand gradient-based optimization
- Implement gradient descent, SGD, and momentum
- Experiment with learning rates and hyperparameters

In [None]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

# Seed NumPy's global random number generator so any random draws are repeatable
np.random.seed(42)

## 1. Visualize Simple Loss Landscapes

Create 2D grids and plot loss contours for quadratic and multimodal functions.

In [None]:
# Define loss functions
def quadratic(x, y):
    """Evaluate the quadratic loss ``x**2 + 3*y**2``.

    The y term is weighted three times as heavily as the x term. Only
    ``**``, ``*`` and ``+`` are applied to the inputs.
    """
    x_term = x**2
    y_term = 3*y**2
    return x_term + y_term

def multimodal(x, y):
    """Evaluate a multimodal loss: a sin/cos ripple added to a shallow quadratic bowl."""
    ripple = np.sin(x) * np.cos(y)
    bowl = 0.1*(x**2 + y**2)
    return ripple + bowl

# Build a 100x100 evaluation grid covering [-3, 3] along both coordinates
x_range = np.linspace(-3, 3, 100)
y_range = np.linspace(-3, 3, 100)
X, Y = np.meshgrid(x_range, y_range)

# Evaluate each loss at every grid point
Z_quad = quadratic(X, Y)
Z_multi = multimodal(X, Y)


def _contour_panel(ax, Z, title):
    """Draw a 20-level viridis contour map of Z on ax with axis labels and a colorbar."""
    contours = ax.contour(X, Y, Z, levels=20, cmap='viridis')
    ax.set_title(title)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    plt.colorbar(contours, ax=ax)
    return contours


# One panel per loss, side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
contour1 = _contour_panel(axes[0], Z_quad, 'Quadratic Loss Landscape')
contour2 = _contour_panel(axes[1], Z_multi, 'Multimodal Loss Landscape')

plt.tight_layout()
plt.show()

## 2. Gradient Descent Dynamics

Implement vanilla gradient descent and plot parameter trajectories.

In [None]:
# Gradient descent implementation
def gradient_descent(x0, y0, grad_fn, lr=0.1, n_steps=50):
    """
    Simple gradient descent optimizer for a two-parameter loss

    Each step moves both parameters against their gradient:
    ``x <- x - lr * grad_x`` and ``y <- y - lr * grad_y``.

    Args:
        x0, y0: Initial parameters (scalars or NumPy arrays); never modified
        grad_fn: Function returning (grad_x, grad_y) when called as grad_fn(x, y)
        lr: Learning rate
        n_steps: Number of optimization steps

    Returns:
        trajectory: List of (x, y) positions, starting with (x0, y0) and
            followed by one entry per optimization step
    """
    trajectory = [(x0, y0)]
    x, y = x0, y0

    for _ in range(n_steps):
        gx, gy = grad_fn(x, y)
        # Rebind rather than use ``-=``: an in-place update on a NumPy array
        # would overwrite the caller's x0/y0 and leave every stored trajectory
        # entry pointing at the same (final) array.
        x = x - lr * gx
        y = y - lr * gy
        trajectory.append((x, y))

    return trajectory

# Define gradients for quadratic loss
def quad_gradient(x, y):
    """Return the gradient ``(2*x, 6*y)`` of the loss ``x**2 + 3*y**2``."""
    grad_x = 2*x
    grad_y = 6*y
    return grad_x, grad_y

# Optimize the quadratic loss from the start point (2, -1)
trajectory_gd = gradient_descent(2.0, -1.0, quad_gradient, lr=0.1, n_steps=50)

# Split the list of (x, y) pairs into one coordinate list per axis
traj_x, traj_y = map(list, zip(*trajectory_gd))

# Overlay the optimizer path on the loss contours
plt.figure(figsize=(8, 6))
plt.contour(X, Y, Z_quad, levels=20, cmap='viridis', alpha=0.6)
plt.plot(traj_x, traj_y, 'r-o', markersize=4, linewidth=2, label='GD trajectory')

# Mark the first and last points of the path with larger dots
for idx, fmt, name in ((0, 'go', 'Start'), (-1, 'ro', 'End')):
    plt.plot(traj_x[idx], traj_y[idx], fmt, markersize=10, label=name)

plt.title('Gradient Descent on Quadratic Loss')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.grid(alpha=0.3)
plt.show()

# Report where the optimizer ended up and the loss at that point
print(f"Final position: ({traj_x[-1]:.4f}, {traj_y[-1]:.4f})")
print(f"Final loss: {quadratic(traj_x[-1], traj_y[-1]):.6f}")

## 3. Momentum and Learning Rate Sweeps

Implement momentum and experiment with different learning rates.

In [None]:
# Gradient descent with momentum
def gradient_descent_momentum(x0, y0, grad_fn, lr=0.1, momentum=0.9, n_steps=50):
    """
    GD with momentum for a two-parameter loss

    A velocity is kept per parameter and accumulates past gradient steps:
    ``v <- momentum * v - lr * grad`` followed by ``param <- param + v``.
    With ``momentum=0`` the update is the same as plain gradient descent.

    Args:
        x0, y0: Initial parameters (scalars or NumPy arrays); never modified
        grad_fn: Function returning (grad_x, grad_y) when called as grad_fn(x, y)
        lr: Learning rate
        momentum: Fraction of the previous velocity carried into the next step
        n_steps: Number of optimization steps

    Returns:
        trajectory: List of (x, y) positions, starting with (x0, y0) and
            followed by one entry per optimization step
    """
    trajectory = [(x0, y0)]
    x, y = x0, y0
    vx, vy = 0.0, 0.0  # velocities start at rest

    for _ in range(n_steps):
        gx, gy = grad_fn(x, y)
        vx = momentum * vx - lr * gx
        vy = momentum * vy - lr * gy
        # Rebind rather than use ``+=``: an in-place update on a NumPy array
        # would overwrite the caller's x0/y0 and leave every stored trajectory
        # entry pointing at the same (final) array.
        x = x + vx
        y = y + vy
        trajectory.append((x, y))

    return trajectory

# Run the momentum optimizer from the same start point and settings as vanilla GD
trajectory_momentum = gradient_descent_momentum(2.0, -1.0, quad_gradient, lr=0.1, momentum=0.9, n_steps=50)

# Split the list of (x, y) pairs into one coordinate list per axis
mom_x, mom_y = map(list, zip(*trajectory_momentum))

# Line styling shared by both optimizer paths
line_style = dict(markersize=3, linewidth=1.5, alpha=0.7)

# Draw both paths over the same contour map
plt.figure(figsize=(8, 6))
plt.contour(X, Y, Z_quad, levels=20, cmap='viridis', alpha=0.6)
plt.plot(traj_x, traj_y, 'b-o', label='Vanilla GD', **line_style)
plt.plot(mom_x, mom_y, 'r-o', label='GD + Momentum', **line_style)
plt.plot(2.0, -1.0, 'go', markersize=10, label='Start')

plt.title('Gradient Descent: Vanilla vs Momentum')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.grid(alpha=0.3)
plt.show()

# Compare the loss each optimizer reached at its last step
print(f"GD final loss: {quadratic(traj_x[-1], traj_y[-1]):.6f}")
print(f"Momentum final loss: {quadratic(mom_x[-1], mom_y[-1]):.6f}")

In [None]:
# Learning rate sweep: run plain GD from the same start with several step
# sizes and compare both the paths taken and the loss reached after 30 steps.
learning_rates = [0.01, 0.05, 0.1, 0.2, 0.5]
final_losses = []

plt.figure(figsize=(12, 5))
plt.contour(X, Y, Z_quad, levels=20, cmap='viridis', alpha=0.3)

for lr in learning_rates:
    traj = gradient_descent(2.0, -1.0, quad_gradient, lr=lr, n_steps=30)
    tx = [p[0] for p in traj]
    ty = [p[1] for p in traj]
    final_loss = quadratic(tx[-1], ty[-1])
    final_losses.append(final_loss)
    plt.plot(tx, ty, '-o', markersize=2, label=f'LR={lr}', alpha=0.7)

# Keep the view on the plotted landscape. With lr=0.5 each step multiplies y
# by (1 - 6*lr) = -2, so that run diverges to |y| ~ 1e9; without fixed limits
# autoscaling would shrink the contours and every other path to a single dot.
plt.xlim(X.min(), X.max())
plt.ylim(Y.min(), Y.max())

plt.title('Learning Rate Sweep on Quadratic Loss')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.grid(alpha=0.3)
plt.show()

# Plot final loss vs learning rate
plt.figure(figsize=(8, 5))
plt.plot(learning_rates, final_losses, 'o-', linewidth=2, markersize=8)
# The final losses span many orders of magnitude (tiny for converging runs,
# ~1e18 for the diverging one); a linear axis would show only the largest value.
plt.yscale('log')
plt.xlabel('Learning Rate')
plt.ylabel('Final Loss')
plt.title('Final Loss vs Learning Rate')
plt.grid(alpha=0.3)
plt.show()

## Exercises for Further Practice

1. **Explore SGD**: Implement stochastic gradient descent with mini-batches on a simple regression dataset
2. **Saddle Points**: Create a saddle-point loss function and visualize optimizer behavior
3. **Cyclical Learning Rates**: Implement Leslie Smith's cyclical LR and compare to constant LR
4. **3D Visualization**: Create 3D surface plots of the loss landscapes
5. **Divergence Analysis**: Find learning rates that cause divergence and explain why

## Deliverables Checklist

- [ ] Loss landscape visualizations (quadratic + multimodal)
- [ ] GD/SGD/momentum implementations with trajectory plots
- [ ] Learning rate sweep analysis
- [ ] Short conclusions about convergence behavior and hyperparameter sensitivity

## Recommended Next Steps

- Review optimization sections in Goodfellow et al. "Deep Learning"
- Explore Leslie Smith's LR range test and cyclical LR papers
- Try implementing Adam optimizer (Week 02 preview)