In [None]:
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import sys

# Add module to path
sys.path.insert(0, str(Path.cwd().parent.parent.parent))

from modules._import_helper import safe_import_from

# Import from repo standards
# NOTE(review): safe_import_from presumably resolves a dotted module path plus
# an attribute name; the package directories start with digits, so a plain
# `import` statement could not name them — confirm against modules/_import_helper.
set_seed = safe_import_from('00_repo_standards.src.mlphys_core.seeding', 'set_seed')

# Import from this module
GradientDescent = safe_import_from('01_numerical_toolbox.src.optimizers_from_scratch', 'GradientDescent')
condition_number = safe_import_from('01_numerical_toolbox.src.linear_algebra', 'condition_number')

# Setup
# Seed once, up front (presumably seeds NumPy's global RNG — TODO confirm in
# mlphys_core.seeding).
set_seed(42)
# Output directory for figures and the text report. The path is relative to the
# current working directory, so the notebook must be launched from its own folder.
REPORTS_DIR = Path('../reports')
# parents/exist_ok make this cell safe to re-run on a fresh or existing checkout.
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

print("✓ Imports successful")

---

## 1. Intuition: What is Conditioning?

**Key Ideas:**
- The **condition number** κ(A) measures how "stretched" or "squashed" a matrix A is
- For optimization, it determines the *shape* of the loss landscape
- **Well-conditioned** (κ ≈ 1): Loss surface is spherical → easy optimization
- **Ill-conditioned** (κ ≫ 1): Loss surface is elongated (like a valley) → slow optimization
- **Definition**: κ(A) = σ_max(A) / σ_min(A); for a symmetric positive-definite A this equals λ_max / λ_min (largest eigenvalue / smallest eigenvalue)

**Why it matters**: With a single fixed step size, gradient descent *overshoots* (zigzags) across the steep walls of a narrow valley while making only *tiny* progress along its flat floor.

---

## 2. Minimal Math: Quadratic Loss Landscape

Consider a quadratic loss function (like in linear regression):

$$
f(\mathbf{x}) = \frac{1}{2} \mathbf{x}^T A \mathbf{x} - \mathbf{b}^T \mathbf{x}
$$

The gradient is:
$$
\nabla f(\mathbf{x}) = A\mathbf{x} - \mathbf{b}
$$

**Key insight**: The Hessian (second derivative) is $\nabla^2 f = A$. The condition number $\kappa(A)$ determines:
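- **Convergence rate**: the error shrinks like $\sim \left(1 - \frac{1}{\kappa}\right)^k$ after $k$ iterations (for step size $\alpha = 1/\lambda_{\text{max}}$)
- **Optimal fixed step size**: $\alpha_{\text{opt}} = \frac{2}{\lambda_{\text{max}} + \lambda_{\text{min}}}$; any fixed step must satisfy $\alpha < \frac{2}{\lambda_{\text{max}}}$ to remain stable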

**Example**: The number of iterations needed to reach a fixed accuracy grows roughly in proportion to κ, so a problem with κ = 100 needs ~10× more iterations than one with κ = 10.

---

## 3. Implementation: Quadratic Bowls with Different Conditioning

In [None]:
def create_quadratic(kappa: float, dim: int = 2, seed: "int | None" = 0) -> tuple:
    """
    Create a quadratic function f(x) = 0.5 * x^T A x with condition number kappa.

    The eigenvalues of A are spaced evenly between 1 and kappa, and its
    eigenvectors are a random rotation. The rotation is drawn from a generator
    seeded with ``seed``, so calling the function again with the same arguments
    rebuilds exactly the same problem. Cells that "recreate" a problem (e.g. to
    draw its contours) therefore get the matrix that was actually optimized.

    Args:
        kappa: Target condition number (ratio of largest to smallest
            eigenvalue). Must be >= 1.
        dim: Dimension of the problem. Must be >= 1. With dim == 1 the single
            eigenvalue is 1, so the condition number is 1 regardless of kappa.
        seed: Seed for the rotation. Pass ``None`` to draw from NumPy's global
            random state instead (a different rotation on every call).

    Returns:
        A: Symmetric positive definite matrix
        f: Objective function
        grad_f: Gradient function

    Raises:
        ValueError: If ``dim < 1`` or ``kappa`` is not >= 1.
    """
    if dim < 1:
        raise ValueError(f"dim must be >= 1, got {dim}")
    # `not kappa >= 1` (rather than `kappa < 1`) also rejects NaN.
    if not kappa >= 1:
        raise ValueError(f"kappa must be >= 1, got {kappa}")

    # Create eigenvalues with desired condition number
    eigenvalues = np.linspace(1, kappa, dim)

    # Random rotation (orthogonal matrix) from the QR factorization of a
    # Gaussian matrix.
    if seed is None:
        gaussian = np.random.randn(dim, dim)
    else:
        gaussian = np.random.default_rng(seed).standard_normal((dim, dim))
    Q, _ = np.linalg.qr(gaussian)

    # A = Q * diag(λ) * Q^T
    A = Q @ np.diag(eigenvalues) @ Q.T

    # Define objective and gradient
    def f(x):
        return 0.5 * x @ A @ x

    def grad_f(x):
        return A @ x

    return A, f, grad_f

# Quick sanity check: build one problem and confirm its spectrum.
print("Example: Creating quadratic with κ = 10")
A, f, grad = create_quadratic(kappa=10.0)
example_kappa = condition_number(A)
example_spectrum = np.linalg.eigvalsh(A)
print(f"Condition number: {example_kappa:.2f}")
print(f"Eigenvalues: {example_spectrum}")

---

## 4. Experiment: Comparing Convergence Across Condition Numbers

In [None]:
# Test different condition numbers
kappas = [1, 5, 20, 100]
x0 = np.array([1.0, 1.0])

results = {}

for kappa in kappas:
    A, f, grad_f = create_quadratic(kappa, dim=2)

    # eigvalsh returns the eigenvalues in ascending order.
    eigenvalues = np.linalg.eigvalsh(A)
    min_eigenvalue, max_eigenvalue = eigenvalues[0], eigenvalues[-1]

    # Use the optimal fixed step 2 / (λ_min + λ_max). Every eigen-direction then
    # contracts by at most (κ - 1) / (κ + 1) per iteration, so the iteration
    # count reflects κ. A step close to the stability limit 2 / λ_max (such as
    # 1.9 / λ_max) would contract the λ_max component by only 0.9 per step for
    # *every* κ and hide the effect this experiment is meant to show.
    safe_lr = 2.0 / (min_eigenvalue + max_eigenvalue)

    optimizer = GradientDescent(
        learning_rate=safe_lr,
        # κ = 100 contracts by 99/101 per step and needs on the order of a
        # thousand iterations, so leave enough headroom that the reported count
        # is not simply the cap.
        # NOTE(review): assumes `tol` is a gradient-norm style stopping
        # threshold — confirm against GradientDescent.
        max_iter=2000,
        tol=1e-8
    )

    result = optimizer.minimize(f, grad_f, x0.copy())
    results[kappa] = result

    print(f"κ = {kappa:3d} | Iterations: {result.n_iterations:3d} | "
          f"Final loss: {result.f_final:.2e} | LR: {safe_lr:.4f}")

**Observation**: Notice how the number of iterations scales roughly with κ!

---

## 5. Visualization: Loss Contours and Optimization Paths

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.flatten()

# The plotting window is the same for every panel, so build the grid once.
x_range = np.linspace(-1.5, 1.5, 100)
y_range = np.linspace(-1.5, 1.5, 100)
X, Y = np.meshgrid(x_range, y_range)
grid_points = np.stack([X, Y], axis=-1)  # shape (100, 100, 2)

for idx, kappa in enumerate(kappas):
    ax = axes[idx]

    # Recreate problem
    # NOTE(review): the contours only match the plotted path if this call
    # returns the same A that the optimizer minimized in the experiment cell —
    # confirm create_quadratic is deterministic for a given kappa.
    A, f, grad_f = create_quadratic(kappa, dim=2)

    # Evaluate f(p) = 0.5 * p^T A p at every grid point in one vectorized call
    # instead of 10,000 Python-level calls to f.
    Z = 0.5 * np.einsum('...i,ij,...j->...', grid_points, A, grid_points)

    # Plot contours (log-spaced levels resolve the region near the minimum)
    levels = np.logspace(-2, 1, 20)
    contour = ax.contour(X, Y, Z, levels=levels, cmap='viridis', alpha=0.6)
    ax.clabel(contour, inline=True, fontsize=8, fmt='%.2f')

    # Plot optimization path
    path = np.array(results[kappa].history['x'])
    ax.plot(path[:, 0], path[:, 1], 'r.-', linewidth=2, markersize=8,
            label='GD path', alpha=0.8)
    ax.plot(path[0, 0], path[0, 1], 'go', markersize=12, label='Start')
    ax.plot(path[-1, 0], path[-1, 1], 'r*', markersize=15, label='End')

    ax.set_xlabel('x₁', fontsize=12)
    ax.set_ylabel('x₂', fontsize=12)
    ax.set_title(f'κ = {kappa} | {results[kappa].n_iterations} iterations',
                 fontsize=13, fontweight='bold')
    ax.grid(True, alpha=0.3)
    ax.legend(loc='upper right', fontsize=9)
    # Equal aspect so circular contours look circular and elongation is honest.
    ax.set_aspect('equal')

plt.tight_layout()
plt.savefig(REPORTS_DIR / '01_conditioning_paths.png', dpi=150, bbox_inches='tight')
print(f"✓ Saved: {REPORTS_DIR / '01_conditioning_paths.png'}")
plt.show()

**Key Observations:**
1. **κ = 1**: Circular contours → direct path to minimum
2. **κ = 100**: Elongated ellipses → zigzag path in narrow valley
3. Path shows "bouncing" behavior in ill-conditioned problems

---

## 6. Convergence Curves: Loss and Gradient Norm

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# One row per panel: (axes, history key, y-axis label, title).
# Left panel shows loss vs iterations, right panel gradient norm vs iterations.
panel_specs = [
    (ax1, 'f', 'Loss f(x)', 'Convergence Speed vs Condition Number'),
    (ax2, 'grad_norm', '||∇f(x)||', 'Gradient Norm Decay'),
]

for panel_ax, history_key, y_label, panel_title in panel_specs:
    # One curve per condition number, on a log-scaled y-axis.
    for kappa in kappas:
        series = results[kappa].history[history_key]
        panel_ax.semilogy(series, linewidth=2, label=f'κ = {kappa}')

    panel_ax.set_xlabel('Iteration', fontsize=12)
    panel_ax.set_ylabel(y_label, fontsize=12)
    panel_ax.set_title(panel_title, fontsize=13, fontweight='bold')
    panel_ax.legend(fontsize=11)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
convergence_png = REPORTS_DIR / '01_conditioning_convergence.png'
plt.savefig(convergence_png, dpi=150, bbox_inches='tight')
print(f"✓ Saved: {convergence_png}")
plt.show()

**Analysis**: The linear decay rate in log-scale confirms the theoretical prediction:
- Slope per iteration (on the log scale) ≈ log(1 − 1/κ) ≈ −1/κ for large κ
- Higher κ → slower decay → more iterations needed

---

## 7. Feature Scaling Demonstration

In [None]:
# Build two standard-normal features, then stretch the first by 100 and shrink
# the second by 0.1 so that their scales differ by three orders of magnitude.
np.random.seed(42)
n_samples = 100
X_unscaled = np.random.randn(n_samples, 2) * np.array([100, 0.1])

std_feature_1 = X_unscaled[:, 0].std()
std_feature_2 = X_unscaled[:, 1].std()

print("Unscaled data:")
print(f"  Feature 1: std = {std_feature_1:.2f}")
print(f"  Feature 2: std = {std_feature_2:.2f}")
print(f"  Scale ratio: {std_feature_1 / std_feature_2:.0f}:1")

# Standardize each column to zero mean and unit standard deviation.
column_means = X_unscaled.mean(axis=0)
column_stds = X_unscaled.std(axis=0)
X_scaled = (X_unscaled - column_means) / column_stds

# Condition number of the (1/n) X^T X matrix before and after scaling.
A_unscaled = X_unscaled.T @ X_unscaled / n_samples
A_scaled = X_scaled.T @ X_scaled / n_samples
kappa_unscaled = condition_number(A_unscaled)
kappa_scaled = condition_number(A_scaled)

print("\nCondition numbers:")
print(f"  Unscaled: κ = {kappa_unscaled:.1f}")
print(f"  Scaled:   κ = {kappa_scaled:.1f}")
print(f"  Improvement: {kappa_unscaled/kappa_scaled:.1f}x")

---

## 8. Key Takeaways

✅ **Condition number** κ determines optimization difficulty:
   - κ = 1: spherical loss → fast convergence
   - κ ≫ 1: elongated loss → slow, zigzag path

✅ **Feature scaling** is essential:
   - Standardization (mean=0, std=1) often sufficient
   - Reduces κ → speeds up gradient descent

✅ **Learning rate** must be tuned to problem scale:
   - Safe choice: α < 2/λ_max (where λ_max is largest eigenvalue)
   - Ill-conditioned problems need smaller α

✅ **Real ML impact**:
   - Neural networks: BatchNorm reduces internal covariate shift
   - Linear models: Always scale features before training
   - Regularization (Ridge, L2) improves conditioning

---

## 9. Common Pitfalls

❌ **Using same learning rate across problems**: α=0.01 might work for κ=10 but diverge for κ=1000

❌ **Forgetting to scale features**: Mixing features with different units (e.g., meters vs millimeters) creates high κ

❌ **Ignoring numerical precision**: Very high κ (>10⁸) hits floating-point limits

❌ **Confusing conditioning with convexity**: Well-conditioned doesn't mean globally convex (non-convex problems can still have local ill-conditioning)

---

## 10. Exercises

Try these on your own! Solutions are at the end.

**Exercise 1**: Create a 3D quadratic with κ = 50 and optimize it with GD. Plot the loss curve.

In [None]:
# Your code here

**Exercise 2**: What happens if you use learning rate α = 2.5/λ_max (above the stability limit)? Try it with κ = 10.

In [None]:
# Your code here

**Exercise 3**: Generate random data with 5 features where one feature has 100× larger scale than others. Compute κ before and after standardization.

In [None]:
# Your code here

**Exercise 4**: Theoretical - Prove that for A = I (the identity matrix, so κ = 1), gradient descent converges in 1 step with α = 1.

*Your answer here (math or code verification)*

**Exercise 5**: Implement a simple line search that adapts learning rate automatically. Compare to fixed α on a κ=50 problem.

In [None]:
# Your code here

---

## Solutions (Expand to view)

<details>
<summary><b>Click to show Exercise 1 solution</b></summary>

```python
A_3d, f_3d, grad_3d = create_quadratic(kappa=50, dim=3)
x0_3d = np.array([1.0, 1.0, 1.0])
max_eig = np.max(np.linalg.eigvalsh(A_3d))
opt = GradientDescent(learning_rate=1.8/max_eig, max_iter=1000)
result = opt.minimize(f_3d, grad_3d, x0_3d)

plt.figure(figsize=(8, 5))
plt.semilogy(result.history['f'], linewidth=2)
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('3D Quadratic with κ=50')
plt.grid(True, alpha=0.3)
plt.show()
print(f"Converged in {result.n_iterations} iterations")
```
</details>

<details>
<summary><b>Click to show Exercise 2 solution</b></summary>

```python
A, f, grad = create_quadratic(kappa=10, dim=2)
max_eig = np.max(np.linalg.eigvalsh(A))
unstable_lr = 2.5 / max_eig  # Above stability limit!

opt = GradientDescent(learning_rate=unstable_lr, max_iter=100)
result = opt.minimize(f, grad, np.array([1.0, 1.0]))

plt.figure(figsize=(8, 5))
plt.plot(result.history['f'], linewidth=2)
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Unstable: α too large → divergence!')
plt.grid(True, alpha=0.3)
plt.show()
print("Notice: Loss increases (diverges) instead of decreasing!")
```
</details>

<details>
<summary><b>Click to show Exercise 3 solution</b></summary>

```python
np.random.seed(123)
X = np.random.randn(50, 5)
X[:, 0] *= 100  # Scale first feature

A_before = X.T @ X / X.shape[0]
kappa_before = condition_number(A_before)

X_standardized = (X - X.mean(axis=0)) / X.std(axis=0)
A_after = X_standardized.T @ X_standardized / X.shape[0]
kappa_after = condition_number(A_after)

print(f"Before standardization: κ = {kappa_before:.1f}")
print(f"After standardization:  κ = {kappa_after:.1f}")
print(f"Improvement: {kappa_before/kappa_after:.0f}x")
```
</details>

<details>
<summary><b>Click to show Exercise 4 solution</b></summary>

**Proof**: For $f(\mathbf{x}) = \frac{1}{2}\mathbf{x}^T I \mathbf{x}$ (identity matrix), we have:
- Gradient: $\nabla f = I \mathbf{x} = \mathbf{x}$
- GD update: $\mathbf{x}_{k+1} = \mathbf{x}_k - \alpha \mathbf{x}_k = (1-\alpha)\mathbf{x}_k$
- With $\alpha=1$: $\mathbf{x}_1 = (1-1)\mathbf{x}_0 = \mathbf{0}$ ✓

Verification:
```python
A_identity = np.eye(2)
f_id = lambda x: 0.5 * x @ A_identity @ x
grad_id = lambda x: A_identity @ x
opt = GradientDescent(learning_rate=1.0, max_iter=10)
result = opt.minimize(f_id, grad_id, np.array([1.0, 2.0]))
print(f"Iterations: {result.n_iterations}")  # Should be 1
print(f"Final x: {result.x_final}")  # Should be ~[0, 0]
```
</details>

<details>
<summary><b>Click to show Exercise 5 solution</b></summary>

```python
def gd_with_backtracking(f, grad, x0, max_iter=500, c=0.5, rho=0.8, tol=1e-8, alpha0=1.0):
    """GD with backtracking line search (Armijo rule).

    Args:
        f: Objective function mapping a point to a scalar loss.
        grad: Gradient function of ``f``.
        x0: Starting point (copied, never modified in place).
        max_iter: Maximum number of gradient steps.
        c: Armijo sufficient-decrease constant.
        rho: Factor by which the trial step is shrunk on each rejection.
        tol: Stop once the gradient norm falls below this value.
        alpha0: Trial step size that every line search starts from.

    Returns:
        x: Final iterate.
        losses: Loss at the start of every iteration that was entered.
    """
    x = x0.copy()
    losses = []

    for _ in range(max_iter):
        g = grad(x)
        fx = f(x)
        losses.append(fx)

        if np.linalg.norm(g) < tol:
            break

        # f(x) and ||g||^2 do not change while the step is being shrunk, so
        # evaluate them once rather than on every backtracking trial.
        g_sq = g @ g

        # Backtracking line search: shrink the step until the Armijo
        # sufficient-decrease condition holds.
        alpha_k = alpha0
        while f(x - alpha_k * g) > fx - c * alpha_k * g_sq:
            alpha_k *= rho

        x = x - alpha_k * g

    return x, losses

# Compare
A, f, grad = create_quadratic(kappa=50, dim=2)
x0 = np.array([1.0, 1.0])

# Fixed LR
opt_fixed = GradientDescent(learning_rate=0.01, max_iter=500)
res_fixed = opt_fixed.minimize(f, grad, x0.copy())

# Adaptive LR
x_adapt, losses_adapt = gd_with_backtracking(f, grad, x0.copy())

plt.figure(figsize=(8, 5))
plt.semilogy(res_fixed.history['f'], label='Fixed α=0.01', linewidth=2)
plt.semilogy(losses_adapt, label='Backtracking', linewidth=2)
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Fixed vs Adaptive Learning Rate (κ=50)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
print(f"Fixed: {len(res_fixed.history['f'])} iterations")
print(f"Adaptive: {len(losses_adapt)} iterations")
```
</details>

---

## Summary Report

In [None]:
# Generate text report
# Derive the best/worst conditioned cases from `kappas` so the report stays
# valid (no KeyError, no stale numbers) if the list of tested values changes.
best_kappa = min(kappas)
worst_kappa = max(kappas)

report_text = f"""
CONDITIONING AND SCALING ANALYSIS
{'='*60}

Condition Numbers Tested: {kappas}

Convergence Summary:
"""

for kappa in kappas:
    res = results[kappa]
    report_text += f"\n  κ = {kappa:3d} → {res.n_iterations:3d} iterations, "
    report_text += f"final loss = {res.f_final:.2e}"

report_text += f"""

Key Finding:
  - Iteration count scales approximately linearly with κ
  - Best-conditioned problem (κ={best_kappa}) took {results[best_kappa].n_iterations} iterations
  - Worst-conditioned problem (κ={worst_kappa}) took {results[worst_kappa].n_iterations} iterations

Practical Advice:
  1. Always standardize features before training
  2. Monitor condition number of data covariance X^T X
  3. Use adaptive optimizers (Adam) for problems with unknown κ
  4. Consider regularization to improve conditioning

Plots saved:
  - {REPORTS_DIR / '01_conditioning_paths.png'}
  - {REPORTS_DIR / '01_conditioning_convergence.png'}
"""

print(report_text)

# Save report
# The text contains non-ASCII characters (κ, →), so write it as UTF-8
# explicitly instead of relying on the platform default encoding. Using
# Path.write_text also avoids binding a file handle to the name `f`, which the
# notebook uses for the objective function.
report_path = REPORTS_DIR / '01_conditioning_report.txt'
report_path.write_text(report_text, encoding='utf-8')

print(f"\n✓ Report saved: {report_path}")