In [None]:
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import sys
import time

# Put the directory three levels above the current working directory on the import path
# so that the `modules` package below can be found.
# NOTE(review): assumes the kernel is started from the notebook's own folder — confirm.
sys.path.insert(0, str(Path.cwd().parent.parent.parent))

from modules._import_helper import safe_import_from

# The project packages have names that start with digits ('00_...', '01_...'), which a
# regular `import` statement cannot express; they are resolved by dotted-path string instead.
set_seed = safe_import_from('00_repo_standards.src.mlphys_core.seeding', 'set_seed')
GradientDescent = safe_import_from('01_numerical_toolbox.src.optimizers_from_scratch', 
                                   'GradientDescent')
Momentum = safe_import_from('01_numerical_toolbox.src.optimizers_from_scratch', 'Momentum')
Adam = safe_import_from('01_numerical_toolbox.src.optimizers_from_scratch', 'Adam')

# Fix the seed once, before any random draw in the notebook.
# NOTE(review): presumably this seeds NumPy's global RNG used by np.random.randn later — verify in mlphys_core.seeding.
set_seed(42)
# All figures and the text report are written here (relative to the working directory).
REPORTS_DIR = Path('../reports')
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

print("✓ Imports successful")

---

## 1. Intuition: Why Momentum and Adaptivity?

**Problems with Vanilla GD:**
- **Oscillates** in narrow valleys (ill-conditioned problems)
- **Single learning rate** doesn't adapt to problem geometry
- **Slow** in flat regions

**Solutions:**
- **Momentum**: Accumulates velocity → smooths oscillations, accelerates in consistent directions
- **Adam**: Per-parameter adaptive learning rates → handles different scales automatically

**Analogy**: 
- GD = walking carefully step-by-step
- Momentum = rolling a ball down hill (builds speed)
- Adam = smart walker who adjusts stride per terrain

---

## 2. Minimal Math: Update Rules

### Vanilla Gradient Descent
$$
\mathbf{x}_{t+1} = \mathbf{x}_t - \alpha \nabla f(\mathbf{x}_t)
$$

### Momentum (Polyak)
$$
\begin{align}
\mathbf{v}_{t+1} &= \beta \mathbf{v}_t + \nabla f(\mathbf{x}_t) \\
\mathbf{x}_{t+1} &= \mathbf{x}_t - \alpha \mathbf{v}_{t+1}
\end{align}
$$
- $\beta$ (typically 0.9): momentum coefficient
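- Velocity $\mathbf{v}$ accumulates an exponentially decaying sum of past gradients (it is not normalized by $1-\beta$, so under a constant gradient it grows to $1/(1-\beta)$ times that gradient)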

### Adam (Adaptive Moment Estimation)
$$
\begin{align}
\mathbf{m}_t &= \beta_1 \mathbf{m}_{t-1} + (1-\beta_1) \nabla f(\mathbf{x}_{t-1}) \\
\mathbf{v}_t &= \beta_2 \mathbf{v}_{t-1} + (1-\beta_2) [\nabla f(\mathbf{x}_{t-1})]^2 \\
\hat{\mathbf{m}}_t &= \mathbf{m}_t / (1-\beta_1^t) \quad \text{(bias correction)} \\
\hat{\mathbf{v}}_t &= \mathbf{v}_t / (1-\beta_2^t) \\
\mathbf{x}_t &= \mathbf{x}_{t-1} - \alpha \frac{\hat{\mathbf{m}}_t}{\sqrt{\hat{\mathbf{v}}_t} + \epsilon}
\end{align}
$$
- $\mathbf{m}$: first moment (mean of gradients)
- $\mathbf{v}$: second moment (uncentered variance)
- Defaults: $\beta_1=0.9, \beta_2=0.999, \epsilon=10^{-8}$

---

## 3. Experiment 1: Controlled Quadratic Bowl

In [None]:
def create_ill_conditioned_quadratic(kappa=20, dim=2):
    """Create a random convex quadratic f(x) = 0.5 * x^T A x with condition number ``kappa``.

    ``A`` is assembled as ``Q @ diag(eigenvalues) @ Q.T`` where the eigenvalues are
    evenly spaced from 1 to ``kappa`` and ``Q`` is the orthogonal factor of a random
    Gaussian matrix drawn from NumPy's global RNG.

    Args:
        kappa: Desired condition number (largest / smallest eigenvalue), must be >= 1.
        dim: Number of variables, must be >= 2. The default keeps the original 2-D problem.

    Returns:
        Tuple ``(A, f, grad_f)``: the ``(dim, dim)`` matrix, the objective ``f(x)``
        and its gradient ``grad_f(x) = A @ x``.

    Raises:
        ValueError: If ``kappa < 1`` or ``dim < 2``.
    """
    if kappa < 1:
        raise ValueError(f"kappa must be >= 1, got {kappa}")
    if dim < 2:
        # With a single eigenvalue the spectrum cannot span [1, kappa]
        raise ValueError(f"dim must be >= 2, got {dim}")

    # Spectrum spans exactly [1, kappa], so cond(A) == kappa
    eigenvalues = np.linspace(1, kappa, dim)
    # QR of a Gaussian matrix yields a random orthogonal basis (a random rotation of the axes)
    Q, _ = np.linalg.qr(np.random.randn(dim, dim))
    A = Q @ np.diag(eigenvalues) @ Q.T

    def f(x):
        return 0.5 * x @ A @ x

    def grad_f(x):
        return A @ x

    return A, f, grad_f

# Problem definition: random 2-D quadratic with eigenvalues 1 and 20, one shared start point
A, f, grad_f = create_ill_conditioned_quadratic(kappa=20)
x0 = np.array([1.5, 1.0])
# NOTE(review): 0.1 equals 2/λ_max for λ_max = 20. If GradientDescent implements the plain
# update from Section 2, the stiff eigen-direction is multiplied by (1 - 0.1*20) = -1 each
# step, i.e. it oscillates without shrinking — confirm this boundary case is intended.
learning_rate = 0.1

# Settings common to all three runs
shared_settings = dict(max_iter=200, verbose=0)

print("Running optimizers on ill-conditioned quadratic (κ=20)...\n")

# Vanilla gradient descent
gd = GradientDescent(learning_rate=learning_rate, **shared_settings)
result_gd = gd.minimize(f, grad_f, x0.copy())
print(f"GD:       {result_gd.n_iterations:3d} iterations, loss={result_gd.f_final:.2e}")

# Polyak momentum with the usual coefficient of 0.9
momentum_opt = Momentum(learning_rate=learning_rate, momentum=0.9, **shared_settings)
result_mom = momentum_opt.minimize(f, grad_f, x0.copy())
print(f"Momentum: {result_mom.n_iterations:3d} iterations, loss={result_mom.f_final:.2e}")

# Adam with the optimizer's own default betas/epsilon
adam_opt = Adam(learning_rate=learning_rate, **shared_settings)
result_adam = adam_opt.minimize(f, grad_f, x0.copy())
print(f"Adam:     {result_adam.n_iterations:3d} iterations, loss={result_adam.f_final:.2e}")

**Observation**: Momentum and Adam converge faster than vanilla GD on this ill-conditioned problem.

---

## 4. Visualization: Optimization Paths

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))

# Background: evaluate the objective point by point on a regular grid
x_range = np.linspace(-2, 2, 100)
y_range = np.linspace(-1.5, 1.5, 100)
X, Y = np.meshgrid(x_range, y_range)
Z = np.array([
    [f(np.array([x_val, y_val])) for x_val, y_val in zip(x_row, y_row)]
    for x_row, y_row in zip(X, Y)
])

# Log-spaced levels keep the contours readable close to the minimum
levels = np.logspace(-2, 1, 20)
contour = ax.contour(X, Y, Z, levels=levels, cmap='gray', alpha=0.4)
ax.clabel(contour, inline=True, fontsize=8)

# Trajectories: each history['x'] is stacked into an array whose columns are the coordinates
path_gd = np.array(result_gd.history['x'])
path_mom = np.array(result_mom.history['x'])
path_adam = np.array(result_adam.history['x'])

trajectories = (
    (path_gd, 'b.-', 'GD', result_gd),
    (path_mom, 'r.-', 'Momentum', result_mom),
    (path_adam, 'g.-', 'Adam', result_adam),
)
for path, line_format, optimizer_name, run in trajectories:
    ax.plot(path[:, 0], path[:, 1], line_format, linewidth=2, markersize=6,
            label=f'{optimizer_name} ({run.n_iterations} iter)', alpha=0.8)

# Reference markers: common start point and the known minimizer at the origin
ax.plot(x0[0], x0[1], 'ko', markersize=12, label='Start')
ax.plot(0, 0, 'k*', markersize=15, label='Optimum')

ax.set_xlabel('x₁', fontsize=13)
ax.set_ylabel('x₂', fontsize=13)
ax.set_title('Optimizer Comparison: Paths to Minimum (κ=20)',
             fontsize=14, fontweight='bold')
ax.legend(fontsize=11, loc='upper right')
ax.grid(True, alpha=0.3)
ax.set_aspect('equal')

plt.tight_layout()
paths_figure = REPORTS_DIR / '02_optimizer_paths_quadratic.png'
plt.savefig(paths_figure, dpi=150, bbox_inches='tight')
print(f"✓ Saved: {paths_figure}")
plt.show()

**Key Observations:**
1. **GD**: Zigzags due to oscillations in narrow valley
2. **Momentum**: Smoother path, less oscillation
3. **Adam**: Most direct path, adapts to geometry

---

## 5. Convergence Curves: Loss Over Time

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# One entry per optimizer: (result, line format, legend label)
runs = (
    (result_gd, 'b-', 'GD'),
    (result_mom, 'r-', 'Momentum'),
    (result_adam, 'g-', 'Adam'),
)

# One entry per panel: (axes, history key, y-axis label, title)
panels = (
    (ax1, 'f', 'Loss f(x)', 'Convergence Speed Comparison'),
    (ax2, 'grad_norm', '||∇f(x)||', 'Gradient Norm Decay'),
)

for axis, history_key, y_label, panel_title in panels:
    # Log-scale y so that geometric decay shows up as a straight line
    for run, line_format, legend_label in runs:
        axis.semilogy(run.history[history_key], line_format, linewidth=2, label=legend_label)
    axis.set_xlabel('Iteration', fontsize=12)
    axis.set_ylabel(y_label, fontsize=12)
    axis.set_title(panel_title, fontsize=13, fontweight='bold')
    axis.legend(fontsize=11)
    axis.grid(True, alpha=0.3)

plt.tight_layout()
convergence_figure = REPORTS_DIR / '02_optimizer_convergence.png'
plt.savefig(convergence_figure, dpi=150, bbox_inches='tight')
print(f"✓ Saved: {convergence_figure}")
plt.show()

---

## 6. Experiment 2: Logistic Regression (Real ML Task)

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Generate synthetic binary classification data
X, y = make_classification(
    n_samples=300,
    n_features=10,
    n_informative=8,
    n_redundant=1,
    n_clusters_per_class=2,
    class_sep=0.8,
    random_state=42
)

# Split BEFORE scaling: the split is index-based, so the same rows land in each set as
# before, but the test rows no longer influence the standardization statistics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize every split with the training mean/std only (avoids test-set leakage)
train_mean = X_train.mean(axis=0)
train_std = X_train.std(axis=0)
X_train = (X_train - train_mean) / train_std
X_test = (X_test - train_mean) / train_std
# Keep the full matrix available on the same scale as the splits
X = (X - train_mean) / train_std

print(f"Training set: {X_train.shape[0]} samples, {X_train.shape[1]} features")
print(f"Test set: {X_test.shape[0]} samples")

In [None]:
def sigmoid(z):
    """Numerically stable logistic sigmoid 1 / (1 + exp(-z)), applied elementwise.

    ``np.where`` evaluates both of its branches for every element, so exponentiating
    ``z`` or ``-z`` directly overflows (with a RuntimeWarning) for large ``|z|`` even
    though the offending value is discarded afterwards. Exponentiating ``-|z|``
    instead keeps the argument non-positive, so the exponential never overflows.

    Args:
        z: Scalar or array of logits.

    Returns:
        NumPy array with the same shape as ``z`` holding values in [0, 1].
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))  # exp(-z) where z >= 0, exp(z) where z < 0
    return np.where(z >= 0,
                    1 / (1 + decay),
                    decay / (1 + decay))

def logistic_loss(w, X, y, reg=0.01):
    """Mean binary cross-entropy of the linear model ``X @ w`` plus an L2 penalty.

    Args:
        w: Weight vector.
        X: Feature matrix; ``X @ w`` gives one logit per sample.
        y: Binary targets (0 or 1), one per sample.
        reg: L2 strength; the penalty added is ``0.5 * reg * sum(w**2)``.

    Returns:
        Scalar loss value.
    """
    # Keep probabilities strictly inside (0, 1) so neither logarithm sees zero
    p = np.clip(sigmoid(X @ w), 1e-15, 1 - 1e-15)
    cross_entropy = -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))
    l2_penalty = 0.5 * reg * np.sum(w**2)
    return cross_entropy + l2_penalty

def logistic_grad(w, X, y, reg=0.01):
    """Gradient of ``logistic_loss`` with respect to ``w``.

    Computes ``X.T @ (sigmoid(X @ w) - y) / len(y)`` and adds the L2 term ``reg * w``.
    """
    residual = sigmoid(X @ w) - y  # predicted probability minus target, per sample
    data_term = X.T @ residual / len(y)
    return data_term + reg * w

def accuracy(w, X, y):
    """Fraction of samples whose thresholded prediction equals the label.

    A sample is predicted as class 1 when ``sigmoid(X @ w) >= 0.5`` and as class 0 otherwise.
    """
    predicted_labels = (sigmoid(X @ w) >= 0.5).astype(int)
    return np.mean(predicted_labels == y)

print("✓ Logistic regression functions defined")

In [None]:
# Shared starting point and hyperparameters for the logistic-regression comparison
w0 = np.zeros(X_train.shape[1])
lr_logreg = 0.5
max_iter_logreg = 300


def fit_and_score(optimizer):
    """Minimize the training loss with ``optimizer`` and score the fitted weights.

    Args:
        optimizer: Object exposing ``minimize(f, grad, x0)`` whose result carries
            ``x_final`` (as the notebook's optimizers do).

    Returns:
        Tuple ``(result, seconds, train_accuracy, test_accuracy)`` — the layout
        stored in ``results_logreg`` and consumed by the benchmark cells.
    """
    # perf_counter is monotonic and high-resolution, which suits timing short runs
    start = time.perf_counter()
    result = optimizer.minimize(
        lambda w: logistic_loss(w, X_train, y_train),
        lambda w: logistic_grad(w, X_train, y_train),
        w0.copy(),  # every run starts from its own copy of the zero vector
    )
    elapsed = time.perf_counter() - start
    return (
        result,
        elapsed,
        accuracy(result.x_final, X_train, y_train),
        accuracy(result.x_final, X_test, y_test),
    )


print("Training logistic regression with different optimizers...\n")

# Insertion order (GD, Momentum, Adam) is the order of the printed lines and table rows
optimizers_logreg = {
    'GD': GradientDescent(learning_rate=lr_logreg, max_iter=max_iter_logreg, tol=1e-6),
    'Momentum': Momentum(learning_rate=lr_logreg, momentum=0.9, max_iter=max_iter_logreg, tol=1e-6),
    # Adam is run at 0.2x the learning rate used for GD and Momentum
    'Adam': Adam(learning_rate=lr_logreg*0.2, max_iter=max_iter_logreg, tol=1e-6),
}

results_logreg = {}
for name, optimizer in optimizers_logreg.items():
    results_logreg[name] = fit_and_score(optimizer)
    result, elapsed, acc_train, acc_test = results_logreg[name]
    print(f"{name + ':':<10}{result.n_iterations:3d} iter | {elapsed:.3f}s | "
          f"Train: {acc_train:.3f} | Test: {acc_test:.3f}")

---

## 7. Benchmark Table

In [None]:
# Fixed-width text table: one row per optimizer, in the order the runs were stored
heavy_rule = "="*70

print("\n" + heavy_rule)
print("LOGISTIC REGRESSION BENCHMARK")
print(heavy_rule)
print(f"{'Optimizer':<12} {'Iterations':<12} {'Time (s)':<10} {'Train Acc':<10} {'Test Acc':<10} {'Final Loss':<12}")
print("-"*70)

for name, row in results_logreg.items():
    # Each row is (result, seconds, train accuracy, test accuracy)
    result, time_elapsed, acc_train, acc_test = row
    print(f"{name:<12} {result.n_iterations:<12} {time_elapsed:<10.3f} "
          f"{acc_train:<10.3f} {acc_test:<10.3f} {result.f_final:<12.4e}")

print(heavy_rule)

In [None]:
# Training-loss curves of the logistic-regression runs, one line per optimizer
fig, ax = plt.subplots(figsize=(10, 6))

for name, entry in results_logreg.items():
    run = entry[0]  # first tuple element is the optimizer result; timing/accuracy are unused here
    ax.semilogy(run.history['f'], linewidth=2.5, label=name)

ax.set_xlabel('Iteration', fontsize=13)
ax.set_ylabel('Training Loss', fontsize=13)
ax.set_title('Logistic Regression: Optimizer Comparison', fontsize=14, fontweight='bold')
ax.legend(fontsize=12)
ax.grid(True, alpha=0.3)

plt.tight_layout()
logreg_figure = REPORTS_DIR / '02_logreg_convergence.png'
plt.savefig(logreg_figure, dpi=150, bbox_inches='tight')
print(f"✓ Saved: {logreg_figure}")
plt.show()

---

## 8. Effect of Learning Rate (GD vs Momentum)

In [None]:
# Sweep settings: step sizes to try, plus the iteration budget and tolerance shared by all runs
learning_rates = [0.01, 0.05, 0.1, 0.3, 0.5]
max_iter_lr = 500
tol_lr = 1e-8

# A fresh random quadratic under sweep-specific names, so the objective `f` and matrix `A`
# from Experiment 1 stay intact and the earlier plotting cells remain re-runnable.
A_lr, f_lr, grad_lr = create_ill_conditioned_quadratic(kappa=20)
x0_lr = np.array([1.5, 1.0])


def iterations_to_converge(optimizer):
    """Iterations ``optimizer`` needs on the sweep problem.

    Runs that do not report convergence are counted as the full budget
    ``max_iter_lr``, so they show up as maximal bars in the chart.
    """
    result = optimizer.minimize(f_lr, grad_lr, x0_lr.copy())
    return result.n_iterations if result.converged else max_iter_lr


iters_gd = []
iters_mom = []

for lr in learning_rates:
    iters_gd.append(iterations_to_converge(
        GradientDescent(learning_rate=lr, max_iter=max_iter_lr, tol=tol_lr)))
    iters_mom.append(iterations_to_converge(
        Momentum(learning_rate=lr, momentum=0.9, max_iter=max_iter_lr, tol=tol_lr)))

# Grouped bar chart: GD and Momentum side by side for each learning rate
fig, ax = plt.subplots(figsize=(10, 6))
x_pos = np.arange(len(learning_rates))
width = 0.35

ax.bar(x_pos - width/2, iters_gd, width, label='GD', alpha=0.8)
ax.bar(x_pos + width/2, iters_mom, width, label='Momentum', alpha=0.8)

ax.set_xlabel('Learning Rate', fontsize=13)
ax.set_ylabel('Iterations to Converge', fontsize=13)
ax.set_title('Learning Rate Sensitivity (κ=20)', fontsize=14, fontweight='bold')
ax.set_xticks(x_pos)
ax.set_xticklabels([str(lr) for lr in learning_rates])
ax.legend(fontsize=12)
ax.grid(True, axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(REPORTS_DIR / '02_learning_rate_sensitivity.png', dpi=150, bbox_inches='tight')
print(f"✓ Saved: {REPORTS_DIR / '02_learning_rate_sensitivity.png'}")
plt.show()

**Key Finding**: Momentum is less sensitive to learning rate choice!

---

## 9. Key Takeaways

✅ **Momentum smooths oscillations**:
   - Accumulates velocity in consistent directions
   - Dampens zigzag behavior in narrow valleys
   - Typical β = 0.9 works well in practice

✅ **Adam adapts per-parameter**:
   - Automatically scales learning rates by gradient history
   - Robust to ill-conditioning
   - Default choice for deep learning (unless you have reasons not to)

✅ **Convergence speed**:
   - Quadratic: Adam ≈ Momentum > GD
   - Logistic Regression: Similar ordering
   - Wall-clock time: All similar (per-iteration cost is low)

✅ **When to use what**:
   - **GD**: Simple, interpretable, good for well-conditioned problems
   - **Momentum**: Standard for convex optimization, less tuning than GD
   - **Adam**: Default for neural networks, handles varying scales

---

## 10. Common Pitfalls

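❌ **Reusing one optimizer's learning rate for another**: Adam normalizes each step by the gradient scale, so its useful LR range differs from GD's — often smaller (this notebook uses 0.2× for logistic regression), sometimes larger (10× in the Rosenbrock solution)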

❌ **High momentum on noisy gradients**: Can overshoot; reduce β or use gradient clipping

❌ **Not tuning β parameters in Adam**: Defaults (0.9, 0.999) work for most cases, but sometimes β₂=0.99 is better

❌ **Comparing optimizers without proper LR tuning**: Each optimizer has different optimal LR ranges

❌ **Forgetting bias correction in Adam**: Early iterations are biased without it (our implementation includes this)

---

## 11. Exercises

**Exercise 1**: Implement Nesterov Accelerated Gradient (NAG) and compare to standard momentum.

In [None]:
# Your code here

**Exercise 2**: Test Adam with different β₁ values (0.5, 0.7, 0.9, 0.95) on the quadratic problem. Which works best?

In [None]:
# Your code here

**Exercise 3**: Implement the Rosenbrock function $f(x_1, x_2) = (a - x_1)^2 + b\,(x_2 - x_1^2)^2$ (commonly $a=1$, $b=100$; minimum at $(1, 1)$) and compare all three optimizers. Which handles this non-quadratic, non-convex function best?

In [None]:
# Your code here

**Exercise 4**: Add "learning rate warmup" to Adam: start with small LR and linearly increase for first 100 steps. Does this help on the logistic regression task?

In [None]:
# Your code here

**Exercise 5**: Implement learning rate decay: multiply LR by 0.9 every 50 iterations. Compare final loss for GD with/without decay.

In [None]:
# Your code here

---

## Solutions

*Solutions for Exercises 1–3 are provided below (Exercises 4 and 5 are left open) — try the exercises first!*

<details>
<summary><b>Exercise 1 Solution: Nesterov Momentum</b></summary>

```python
class NesterovMomentum:
    """Nesterov accelerated gradient (NAG) in the velocity convention of Section 2.

    Per iteration:
        g = grad(x - lr * beta * v)   # gradient at the look-ahead point
        v = beta * v + g
        x = x - lr * v

    Args:
        learning_rate: Step size ``lr``.
        momentum: Momentum coefficient ``beta``.
        max_iter: Maximum number of iterations.
        tol: Stop once the norm of the look-ahead gradient drops below this value.
    """

    def __init__(self, learning_rate=0.01, momentum=0.9, max_iter=1000, tol=1e-6):
        self.lr = learning_rate
        self.beta = momentum
        self.max_iter = max_iter
        self.tol = tol

    def minimize(self, f, grad, x0):
        """Minimize ``f`` from ``x0`` and return an ``OptimizationResult``.

        ``history`` records, per iteration, the objective value and iterate after
        the step, and the norm of the look-ahead gradient used for that step.
        """
        x = np.array(x0, dtype=float)  # float copy; the caller's array is never modified
        v = np.zeros_like(x)
        history = {'f': [], 'x': [], 'grad_norm': []}
        converged = False
        n_iterations = 0  # stays 0 (and defined) when max_iter == 0

        for k in range(self.max_iter):
            # x moves by -lr * v, so the momentum part of the upcoming step is
            # -lr * beta * v; the look-ahead point must therefore include the learning rate.
            x_lookahead = x - self.lr * self.beta * v
            g = grad(x_lookahead)

            v = self.beta * v + g
            x = x - self.lr * v
            n_iterations = k + 1

            grad_norm = np.linalg.norm(g)
            history['f'].append(f(x))
            history['x'].append(x.copy())
            history['grad_norm'].append(grad_norm)

            if grad_norm < self.tol:
                converged = True
                break

        from modules._import_helper import safe_import_from
        OptimizationResult = safe_import_from('01_numerical_toolbox.src.optimizers_from_scratch',
                                               'OptimizationResult')
        # NOTE(review): positional order (x, f, history, converged, n_iterations) is kept
        # from the original call — confirm against the OptimizationResult definition.
        return OptimizationResult(x, f(x), history, converged, n_iterations)

# Test
A, f, grad = create_ill_conditioned_quadratic(kappa=20)
nag = NesterovMomentum(learning_rate=0.1, momentum=0.9)
res_nag = nag.minimize(f, grad, np.array([1.5, 1.0]))
print(f"NAG: {res_nag.n_iterations} iterations")
```
</details>

<details>
<summary><b>Exercise 2 Solution: Adam β₁ sweep</b></summary>

```python
beta1_values = [0.5, 0.7, 0.9, 0.95]
A, f, grad = create_ill_conditioned_quadratic(kappa=20)
x0 = np.array([1.5, 1.0])

for b1 in beta1_values:
    adam = Adam(learning_rate=0.1, beta1=b1, beta2=0.999, max_iter=200)
    res = adam.minimize(f, grad, x0.copy())
    print(f"β₁={b1:.2f}: {res.n_iterations} iterations, loss={res.f_final:.2e}")

# Typical result: β₁=0.9 is best for this problem
```
</details>

<details>
<summary><b>Exercise 3 Solution: Rosenbrock function</b></summary>

```python
def rosenbrock(x, a=1, b=100):
    """Rosenbrock function (a - x[0])^2 + b * (x[1] - x[0]^2)^2."""
    offset = a - x[0]            # distance from x[0] to a
    valley = x[1] - x[0]**2      # deviation from the parabola x[1] = x[0]^2
    return offset**2 + b * valley**2

def rosenbrock_grad(x, a=1, b=100):
    """Gradient of the Rosenbrock function, returned as the array [d/dx0, d/dx1]."""
    valley = x[1] - x[0]**2  # shared factor of both partial derivatives
    return np.array([
        -2*(a - x[0]) - 4*b*x[0]*valley,
        2*b*valley,
    ])

x0_ros = np.array([-1.0, 2.0])
lr = 0.001

gd = GradientDescent(learning_rate=lr, max_iter=5000)
res_gd_ros = gd.minimize(rosenbrock, rosenbrock_grad, x0_ros.copy())
print(f"GD: {res_gd_ros.n_iterations} iter, final={res_gd_ros.x_final}")

mom = Momentum(learning_rate=lr, momentum=0.9, max_iter=5000)
res_mom_ros = mom.minimize(rosenbrock, rosenbrock_grad, x0_ros.copy())
print(f"Momentum: {res_mom_ros.n_iterations} iter, final={res_mom_ros.x_final}")

adam = Adam(learning_rate=lr*10, max_iter=5000)  # Adam can use higher LR
res_adam_ros = adam.minimize(rosenbrock, rosenbrock_grad, x0_ros.copy())
print(f"Adam: {res_adam_ros.n_iterations} iter, final={res_adam_ros.x_final}")

# Optimum is at [1, 1]
```
</details>

---

## Summary Report

In [None]:
# Generate summary

# Iteration savings relative to vanilla GD on the quadratic bowl, computed once and
# reused wherever the report quotes them.
gd_iterations = result_gd.n_iterations
momentum_saving = 100*(1 - result_mom.n_iterations/gd_iterations)
adam_saving = 100*(1 - result_adam.n_iterations/gd_iterations)

report = f"""
GRADIENT DESCENT DYNAMICS: OPTIMIZER COMPARISON
{'='*70}

QUADRATIC BOWL (κ=20):
  GD:       {result_gd.n_iterations} iterations
  Momentum: {result_mom.n_iterations} iterations ({momentum_saving:.0f}% faster)
  Adam:     {result_adam.n_iterations} iterations ({adam_saving:.0f}% faster)

LOGISTIC REGRESSION:
"""

# One line per optimizer, in the order the runs were stored
for name, (res, time_elapsed, acc_train, acc_test) in results_logreg.items():
    report += f"\n  {name:<10}: {res.n_iterations:3d} iter | {time_elapsed:.3f}s | Test Acc: {acc_test:.3f}"

report += f"""

KEY INSIGHTS:
  1. Momentum reduces iterations by ~{momentum_saving:.0f}% on ill-conditioned problems
  2. Adam adapts automatically to problem geometry
  3. All methods achieve similar final accuracy on logistic regression
  4. Momentum is more robust to learning rate choice than GD

RECOMMENDATIONS:
  - Use GD for well-conditioned, simple problems
  - Use Momentum (β=0.9) for convex optimization
  - Use Adam as default for deep learning
  - Always tune learning rate per optimizer!

Plots saved in: {REPORTS_DIR}
"""

print(report)

# The report contains non-ASCII characters (κ, β), so the encoding is pinned to UTF-8
# instead of relying on the platform default. The handle is not named `f`, which would
# overwrite the objective function defined earlier in the notebook.
with open(REPORTS_DIR / '02_optimizer_comparison_report.txt', 'w', encoding='utf-8') as report_file:
    report_file.write(report)

print(f"\n✓ Report saved: {REPORTS_DIR / '02_optimizer_comparison_report.txt'}")