In [None]:
import sys
from pathlib import Path
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
import json
from torch.utils.data import DataLoader, TensorDataset

# Put the repository root on sys.path so project packages can be imported.
# NOTE(review): assumes the kernel's working directory sits exactly three
# levels below the repo root — confirm for your launch directory.
repo_root = Path.cwd().parent.parent.parent
sys.path.insert(0, str(repo_root))

from modules._import_helper import safe_import_from

# Project symbols are resolved through the helper from dotted module paths given
# as strings. The package names start with a digit, so they are not valid
# identifiers for a plain `import` statement (presumably why the helper exists).
SimpleMLP = safe_import_from('06_deep_learning_systems.src.models', 'SimpleMLP')
CNNMnist = safe_import_from('06_deep_learning_systems.src.models', 'CNNMnist')
train_on_tiny_batch = safe_import_from('06_deep_learning_systems.src.trainer', 'train_on_tiny_batch')
create_tiny_dataset = safe_import_from('06_deep_learning_systems.src.datasets', 'create_tiny_dataset')
get_mnist_loaders = safe_import_from('06_deep_learning_systems.src.datasets', 'get_mnist_loaders')
set_seed = safe_import_from('00_repo_standards.src.mlphys_core.seeding', 'set_seed')

print("‚úì Imports successful")

# Setup
# Every figure and report produced by this notebook is written under reports_dir;
# the directory (and any missing parents) is created if it does not exist yet.
reports_dir = Path("../reports/notebook_02")
reports_dir.mkdir(parents=True, exist_ok=True)
# Seed before any model or data is created.
# NOTE(review): presumably seeds the torch/numpy RNGs — verify in mlphys_core.seeding.
set_seed(42)

## Part 1: The Overfit Test - What It Means

**Question**: Before debugging anything complex, can your model memorize 32 samples?

**Why this matters**:
- Tests that **gradients flow** (not zero/NaN)
- Tests that **loss function** is correct
- Tests that **data pipeline** works
- Tests that **model has capacity**
- Tests that **labels match inputs**

**Expected outcome**: Loss → 0.001 within 100-200 steps, 100% accuracy

If this fails, **don't waste time tuning hyperparameters** - there's a bug!

## Part 2: Healthy Overfit - Reference Example

In [None]:
# Build the toy classification problem (32 points, 10 features, 2 classes)
# that every experiment in this notebook tries to memorise.
print("Creating tiny dataset (32 samples)...")
X, y = create_tiny_dataset(n_samples=32, n_features=10, n_classes=2, seed=42)

# Report shapes first, then which classes occur and how many samples each has.
print(f"X shape: {X.shape}", f"y shape: {y.shape}", sep="\n")
print(f"Classes: {y.unique()}")
per_class_counts = [y.eq(cls).sum().item() for cls in y.unique()]
print(f"Class distribution: {per_class_counts}")

In [None]:
# Reference run: an MLP with two 32-unit hidden layers trained on the tiny batch.
model = SimpleMLP(input_dim=10, hidden_dims=[32, 32], output_dim=2)
n_params = sum(weights.numel() for weights in model.parameters())
print(f"\nModel: {type(model).__name__}")
print(f"Parameters: {n_params:,}")

# Overfit
print("\nOverfitting tiny batch...")
model, losses = train_on_tiny_batch(model, X, y, num_steps=200, lr=1e-2, device="cpu")

first_loss, last_loss = losses[0], losses[-1]
print(f"\nInitial loss: {first_loss:.4f}")
print(f"Final loss: {last_loss:.6f}")
print(f"Reduction: {(1 - last_loss/first_loss)*100:.1f}%")

# Check accuracy on the very samples the model was trained on
model.eval()
with torch.no_grad():
    preds = torch.argmax(model(X), dim=1)
    accuracy = preds.eq(y).float().mean().item()
print(f"Final accuracy: {accuracy:.2%}")

# The test passes only if the loss collapsed AND every sample is classified correctly.
print(
    "\n‚úÖ HEALTHY: Model successfully overfitted!"
    if last_loss < 0.01 and accuracy == 1.0
    else "\n‚ö†Ô∏è WARNING: Model didn't overfit perfectly"
)

In [None]:
# Plot healthy overfit curve on a log axis, with the 0.01 pass threshold marked
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(losses, linewidth=2, color='steelblue')
ax.axhline(0.01, color='red', linestyle='--', linewidth=2, label='Target (<0.01)')
ax.set_xlabel('Step', fontsize=12)
ax.set_ylabel('Loss', fontsize=12)
ax.set_title('Healthy Overfit: Rapid Loss Collapse', fontsize=14, fontweight='bold')
ax.set_yscale('log')
ax.grid(True, which='both', alpha=0.3)
ax.legend(fontsize=11)
fig.tight_layout()

healthy_plot_path = reports_dir / "healthy_overfit.png"
fig.savefig(healthy_plot_path, dpi=150)
plt.show()

print(f"‚úì Saved to {healthy_plot_path}")

## Part 3: Debugging - When Overfit Fails

Let's intentionally break things and learn how to diagnose each problem.

### Bug 1: Model Too Small (Insufficient Capacity)

In [None]:
print("Bug 1: Tiny model (1 hidden unit)\n")
# Same data and training settings as the healthy run; only the width is crippled.
tiny_model = SimpleMLP(input_dim=10, hidden_dims=[1], output_dim=2)  # Only 1 unit!
tiny_param_count = sum(map(torch.numel, tiny_model.parameters()))
print(f"Parameters: {tiny_param_count}")

tiny_model, tiny_losses = train_on_tiny_batch(
    tiny_model, X, y, num_steps=200, lr=1e-2, device="cpu"
)

print(f"\nFinal loss: {tiny_losses[-1]:.4f}")
print(
    "‚Üí Loss plateaus high - model lacks capacity to memorize",
    "Fix: Increase hidden_dims",
    sep="\n",
)

### Bug 2: Learning Rate Too Low

In [None]:
print("\nBug 2: Learning rate too low\n")
model_lowlr = SimpleMLP(input_dim=10, hidden_dims=[32, 32], output_dim=2)

# Same architecture and step budget as the healthy run; only the step size differs.
crippled_lr = 1e-6  # Too low!
model_lowlr, lowlr_losses = train_on_tiny_batch(
    model_lowlr, X, y, num_steps=200, lr=crippled_lr, device="cpu"
)

print(f"Final loss: {lowlr_losses[-1]:.4f}")
print(
    "‚Üí Loss decreases very slowly",
    "Fix: Increase learning rate (try 1e-3 to 1e-2)",
    sep="\n",
)

### Bug 3: Wrong Labels

In [None]:
print("\nBug 3: Randomly shuffled labels (no signal)\n")
# Draw binary labels that are independent of X. The number of labels is taken
# from the dataset itself instead of a hard-coded 32, so this cell stays valid
# if the tiny dataset above is resized.
y_random = torch.randint(0, 2, (len(y),))  # Random labels!

model_random = SimpleMLP(input_dim=10, hidden_dims=[32, 32], output_dim=2)
model_random, random_losses = train_on_tiny_batch(
    model_random, X, y_random,
    num_steps=200,
    lr=1e-2,
    device="cpu"
)

print(f"Final loss: {random_losses[-1]:.4f}")
print("‚Üí Can still memorize, but takes longer")
print("Fix: Verify labels match inputs (inspect samples)")

In [None]:
# Compare all failure modes in one 2x2 grid of log-scale loss curves
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# (grid position, loss curve, line colour, panel title)
panels = [
    ((0, 0), losses, 'green', '‚úÖ Healthy: Rapid Collapse'),
    ((0, 1), tiny_losses, 'orange', '‚ùå Bug 1: Model Too Small'),
    ((1, 0), lowlr_losses, 'blue', '‚ùå Bug 2: LR Too Low'),
    ((1, 1), random_losses, 'purple', '‚ö†Ô∏è Bug 3: Random Labels'),
]

for (row, col), curve, colour, title in panels:
    ax = axes[row, col]
    ax.plot(curve, linewidth=2, color=colour)
    ax.axhline(0.01, color='red', linestyle='--', alpha=0.5)  # pass threshold
    ax.set_title(title, fontweight='bold')
    # Only the outer edges of the grid carry axis labels.
    if row == 1:
        ax.set_xlabel('Step')
    if col == 0:
        ax.set_ylabel('Loss')
    ax.set_yscale('log')
    ax.grid(True, alpha=0.3)

fig.tight_layout()
failure_plot_path = reports_dir / "failure_modes.png"
fig.savefig(failure_plot_path, dpi=150)
plt.show()

print(f"‚úì Saved to {failure_plot_path}")

## Part 4: Gradient Diagnostics

When overfitting fails, check gradients first.

In [None]:
def check_gradients(model, X, y):
    """Print a per-parameter gradient health report for one forward/backward pass.

    The model is switched to training mode, any existing gradients are cleared,
    and a single cross-entropy loss on ``(X, y)`` is back-propagated. For every
    named parameter the gradient norm, mean and standard deviation are printed,
    and NaN, Inf, all-zero or missing gradients are flagged.

    Args:
        model: ``nn.Module`` whose output for ``X`` is accepted by
            ``nn.CrossEntropyLoss`` together with ``y``.
        X: Input batch passed to ``model``.
        y: Targets passed to ``nn.CrossEntropyLoss``.

    Returns:
        None. The report is printed. The model is left in training mode and its
        ``.grad`` fields hold the gradients of this single pass.
    """
    model.train()
    # backward() accumulates into .grad, so gradients left over from earlier
    # training or from a previous call would be mixed into the report.
    model.zero_grad()
    output = model(X)
    loss = nn.CrossEntropyLoss()(output, y)
    loss.backward()

    print("Gradient Check:")
    print("-" * 60)

    for name, param in model.named_parameters():
        if param.grad is not None:
            grad_norm = param.grad.norm().item()
            grad_mean = param.grad.mean().item()
            # std() of a single-element tensor is NaN; report zero spread instead.
            grad_std = param.grad.std().item() if param.grad.numel() > 1 else 0.0
            has_nan = torch.isnan(param.grad).any().item()
            has_inf = torch.isinf(param.grad).any().item()

            status = "‚úì" if not (has_nan or has_inf) and grad_norm > 0 else "‚úó"

            print(f"{status} {name:30s} norm={grad_norm:.6f} mean={grad_mean:.6f} std={grad_std:.6f}")

            if has_nan:
                print(f"   ‚ö†Ô∏è NaN detected in {name}")
            if has_inf:
                print(f"   ‚ö†Ô∏è Inf detected in {name}")
            if grad_norm == 0:
                print(f"   ‚ö†Ô∏è Zero gradient in {name} (dead neuron?)")
        else:
            print(f"‚úó {name:30s} NO GRADIENT")

    print("-" * 60)

# Run the diagnostic on a freshly initialised model with the healthy architecture.
model_test = SimpleMLP(input_dim=10, output_dim=2, hidden_dims=[32, 32])
check_gradients(model=model_test, X=X, y=y)

## Part 5: Data Inspection

Always inspect your data before blaming the model.

In [None]:
# Load real MNIST for inspection (only the training loader is needed here)
print("Loading MNIST for inspection...")
mnist_root = Path("../../../data")
train_loader, _, _ = get_mnist_loaders(
    data_dir=mnist_root, batch_size=16, val_split=0.1, num_workers=0, seed=42,
)

# Get one batch
batch_iter = iter(train_loader)
images, labels = next(batch_iter)

pixel_lo, pixel_hi = images.min(), images.max()
print("\nBatch info:")
print(f"  Images shape: {images.shape}")
print(f"  Labels shape: {labels.shape}")
print(f"  Image range: [{pixel_lo:.3f}, {pixel_hi:.3f}]")
print(f"  Labels: {labels.tolist()}")
print(f"  Unique labels: {labels.unique().tolist()}")

In [None]:
# Visualize samples: one grayscale panel per image, titled with its label
fig, axes = plt.subplots(2, 8, figsize=(16, 4))
axes = axes.ravel()

for idx, ax in enumerate(axes):
    ax.imshow(images[idx].squeeze(), cmap='gray')  # squeeze removes the channel dim
    ax.set_title(f'Label: {labels[idx].item()}', fontsize=10)
    ax.axis('off')

fig.suptitle('Data Inspection: First 16 Samples', fontsize=14, fontweight='bold')
fig.tight_layout()
data_plot_path = reports_dir / "data_inspection.png"
fig.savefig(data_plot_path, dpi=150)
plt.show()

print(f"‚úì Saved to {data_plot_path}")
print("\nChecklist:")
for item in (
    "Images look correct",
    "Labels match visual content",
    "Data is normalized",
    "No obvious corruption",
):
    print(f"  [‚úì] {item}")

## Part 6: Create Debugging Checklist

In [None]:
# Markdown template for the debugging checklist report. `{date}` is its only
# str.format() placeholder; the text contains no other curly braces.
checklist = """# Deep Learning Debugging Checklist

## When Training Fails (High Loss / No Learning)

### 1. Can you overfit a tiny batch? (32-128 samples)
- [ ] Loss goes to <0.01 in 100-200 steps?
- [ ] 100% accuracy on the tiny batch?
- If NO ‚Üí systematic debugging below

### 2. Check Gradients
- [ ] Gradients exist for all parameters?
- [ ] No NaN or Inf in gradients?
- [ ] Gradients are non-zero (not all dead neurons)?
- [ ] Gradient magnitudes are reasonable (1e-5 to 1e-2)?

### 3. Inspect Data
- [ ] Visualize 10-20 samples - do they look correct?
- [ ] Labels match the visual content?
- [ ] Input range is reasonable (normalized)?
- [ ] No obvious corruption or NaN values?
- [ ] Class balance reasonable (not 99%/1%)?

### 4. Verify Model & Loss
- [ ] Model has sufficient capacity (not 1-2 neurons)?
- [ ] Output shape matches expected (batch_size, num_classes)?
- [ ] Loss function matches task (CrossEntropy for classification)?
- [ ] No gradient blocking (e.g., detach(), .data)?

### 5. Check Learning Rate
- [ ] Not too high (loss explodes ‚Üí Inf)?
- [ ] Not too low (loss barely decreases)?
- [ ] Typical range: 1e-4 to 1e-2 for Adam

### 6. Verify Optimizer
- [ ] Optimizer has model.parameters()?
- [ ] optimizer.zero_grad() called before backward()?
- [ ] optimizer.step() called after backward()?

### 7. Check Data Pipeline
- [ ] DataLoader shuffles training data?
- [ ] Transforms applied correctly?
- [ ] Batch size not too small (<8) or too large?

## Common Bugs & Fixes

| Symptom | Likely Cause | Fix |
|---------|--------------|-----|
| Loss = NaN | Exploding gradients or bad data | Lower LR, check for Inf/NaN in data |
| Loss plateaus high | Model too small or LR too low | Increase capacity or LR |
| Loss very slow | LR too low | Increase LR by 10x |
| Loss explodes | LR too high | Decrease LR by 10x |
| Zero gradients | Dead neurons (e.g., ReLU) | Check activations, try LeakyReLU |
| Can't overfit 32 samples | Fundamental bug | Systematic debugging above |

## When to Use This Checklist

1. **Starting a new model** ‚Üí Run overfit test first
2. **Training fails** ‚Üí Work through checklist top to bottom
3. **Before tuning hyperparameters** ‚Üí Verify overfit test passes
4. **When reproducing papers** ‚Üí Overfit on their data first

## Remember

> "If your model can't overfit 32 samples, don't waste time tuning hyperparameters. 
> There's a bug in your code."

Generated: {date}
"""

from datetime import datetime

# Fill the template's `{date}` placeholder with today's date (YYYY-MM-DD).
checklist = checklist.format(date=datetime.now().strftime("%Y-%m-%d"))

# Write explicitly as UTF-8. The checklist contains non-ASCII symbols, and
# without an explicit encoding open() uses the platform default (e.g. cp1252
# on Windows), which cannot encode them and raises UnicodeEncodeError.
checklist_path = reports_dir / "debugging_checklist.md"
with open(checklist_path, 'w', encoding="utf-8") as f:
    f.write(checklist)

print(f"‚úì Saved checklist to {checklist_path}")
print("\n" + checklist)

## Part 7: Save Summary

In [None]:
# Loss curves of the three deliberately broken runs, keyed by their report name.
failure_curves = {
    "tiny_model": tiny_losses,
    "low_lr": lowlr_losses,
    "random_labels": random_losses,
}

healthy_first, healthy_last = losses[0], losses[-1]
summary = {
    "experiment": "overfit_sanity_checks",
    "healthy_overfit": {
        "initial_loss": float(healthy_first),
        "final_loss": float(healthy_last),
        "reduction_percent": float((1 - healthy_last/healthy_first) * 100),
        "steps": len(losses),
        "final_accuracy": float(accuracy),
        "passed": bool(healthy_last < 0.01 and accuracy == 1.0),
    },
    # A failure-mode run counts as converged under the same 0.01 loss threshold.
    "failure_modes": {
        mode: {"final_loss": float(curve[-1]), "converged": bool(curve[-1] < 0.01)}
        for mode, curve in failure_curves.items()
    },
    "key_insight": "Always test overfit on 32 samples before debugging anything else",
}

# Serialise once; the same text goes to disk and to the notebook output.
summary_json = json.dumps(summary, indent=2)
summary_path = reports_dir / "summary.json"
summary_path.write_text(summary_json)

print(f"‚úì Saved summary to {summary_path}")
print("\nSummary:")
print(summary_json)

## Key Takeaways

### ✅ The Golden Rule

**"Can your model overfit 32 samples in <200 steps?"**
- If YES → Model is fundamentally working, tune hyperparameters
- If NO → There's a bug, don't waste time tuning

### 🔧 Debugging Priority

1. **Overfit test** (this notebook)
2. **Check gradients** (are they non-zero, finite?)
3. **Inspect data** (does it look correct?)
4. **Verify loss & model** (correct for task?)
5. **Then and only then** → tune hyperparameters

### ⚠️ Common Mistakes

1. **Skipping the overfit test** → waste hours debugging wrong things
2. **Tuning hyperparameters when fundamentals are broken**
3. **Not inspecting data visually**
4. **Assuming gradients are fine without checking**

### 📋 Your Debugging Workflow

```python
# Step 1: Overfit test (ALWAYS FIRST)
X_tiny, y_tiny = get_tiny_batch(32)
model, losses = train_on_tiny_batch(model, X_tiny, y_tiny, num_steps=200)
assert losses[-1] < 0.01, "Overfit test failed - debug before continuing!"

# Step 2: If overfit fails → check gradients
check_gradients(model, X_tiny, y_tiny)

# Step 3: Inspect data
visualize_samples(X_tiny, y_tiny)

# Step 4: Only after passing overfit → train full dataset
history = trainer.train(train_loader, val_loader)
```

## Exercises

### Exercise 1: Introduce and Fix a Bug

Create a model that **fails** the overfit test due to a bug of your choice. Then diagnose and fix it using the checklist.

In [None]:
# Your code here
# Ideas: wrong loss function, zero learning rate, broken model, etc.

### Exercise 2: Minimum Model Capacity

Find the **smallest** model (fewest parameters) that can still overfit the 32-sample batch. Start with `[1]` hidden units and increase.

In [None]:
# Your code here

### Exercise 3: Gradient Explosion

Create a scenario where gradients **explode** (become Inf). Then fix it with gradient clipping.

In [None]:
# Your code here
# Hint: Use very high learning rate or deep network

---

## Solutions

<details>
<summary>Click to reveal solutions</summary>

### Solution 1: Wrong Loss Function Bug

In [None]:
# Bug: Using MSE instead of CrossEntropy for classification
print("Creating bug: Wrong loss function (MSE for classification)\n")

model_bug = SimpleMLP(10, [32, 32], 2)
optimizer = torch.optim.Adam(model_bug.parameters(), lr=1e-2)
criterion = nn.MSELoss()  # WRONG! Should be CrossEntropyLoss

# MSELoss needs a float target that broadcasts against the model output. The
# raw label vector does not broadcast against a two-column output and raises a
# size-mismatch RuntimeError, so add a trailing axis and expand it per step.
# The buggy objective then runs and regresses BOTH logits onto the class index.
# NOTE(review): assumes y is 1-D (N,) and the output is (N, 2) — confirm.
mse_target = y.float().unsqueeze(1)

losses_bug = []
for step in range(200):
    optimizer.zero_grad()
    output = model_bug(X)
    loss = criterion(output, mse_target.expand_as(output))  # MSE needs float
    loss.backward()
    optimizer.step()
    losses_bug.append(loss.item())

# Loss alone cannot expose this bug, so also measure what classification cares about.
model_bug.eval()
with torch.no_grad():
    accuracy_bug = (model_bug(X).argmax(dim=1) == y).float().mean().item()

print(f"Final loss with MSE: {losses_bug[-1]:.4f}")
print(f"Accuracy with MSE: {accuracy_bug:.2%}")
print("‚Üí MSE on class indices is the wrong objective: a low loss says nothing about argmax accuracy!\n")

# Fix: Use correct loss
print("Fix: Using CrossEntropyLoss\n")
model_fix = SimpleMLP(10, [32, 32], 2)
model_fix, losses_fix = train_on_tiny_batch(model_fix, X, y, 200, 1e-2, "cpu")

print(f"Final loss with CrossEntropy: {losses_fix[-1]:.6f}")
print("‚úì Fixed!")

### Solution 2: Minimum Capacity

In [None]:
# Find minimum capacity: sweep single-hidden-layer widths and record which ones
# drive the loss on the tiny batch below 0.01.
hidden_sizes = [1, 2, 4, 8, 16, 32]
results = []  # (hidden_dim, final_loss, passed) per width

for hidden_dim in hidden_sizes:
    # Sweep-local names, so `model_test` from the gradient check is not overwritten.
    sweep_model = SimpleMLP(10, [hidden_dim], 2)
    sweep_model, test_losses = train_on_tiny_batch(sweep_model, X, y, 200, 1e-2, "cpu")
    final_loss = test_losses[-1]
    passed = final_loss < 0.01
    results.append((hidden_dim, final_loss, passed))
    print(f"Hidden={hidden_dim:2d}: final_loss={final_loss:.6f} {'‚úì' if passed else '‚úó'}")

# Find minimum that works. `default=None` covers the case where no width passes,
# where a bare min() over an empty sequence would raise ValueError.
min_working = min((width for width, _, ok in results if ok), default=None)
if min_working is None:
    print("\nNo tested width reached loss < 0.01 - try more steps or larger hidden sizes")
else:
    print(f"\n‚Üí Minimum capacity: {min_working} hidden units")

### Solution 3: Gradient Explosion & Clipping

In [None]:
# Cause gradient explosion with very high LR
print("Creating gradient explosion (LR=1.0)\n")
model_explode = SimpleMLP(10, [32, 32], 2)
optimizer = torch.optim.SGD(model_explode.parameters(), lr=1.0)  # Too high!
criterion = nn.CrossEntropyLoss()

explosion_losses = []
for step in range(10):
    optimizer.zero_grad()
    loss = criterion(model_explode(X), y)
    loss.backward()

    # Global L2 norm over all parameter gradients, measured before the update
    squared_norms = [p.grad.norm().item() ** 2 for p in model_explode.parameters()]
    grad_norm = sum(squared_norms) ** 0.5

    optimizer.step()
    step_loss = loss.item()
    explosion_losses.append(step_loss)

    print(f"Step {step}: loss={step_loss:.4f}, grad_norm={grad_norm:.2f}")

    # Stop as soon as the gradient norm passes the explosion threshold
    if grad_norm > 1000:
        print("‚Üí Gradients exploded!\n")
        break

# Fix with gradient clipping: same optimiser and learning rate, but the global
# gradient norm is capped at 1.0 before every update.
print("Fix: Gradient clipping (max_norm=1.0)\n")
model_clip = SimpleMLP(10, [32, 32], 2)
optimizer = torch.optim.SGD(model_clip.parameters(), lr=1.0)

clip_losses = []
for step in range(100):
    optimizer.zero_grad()
    loss = criterion(model_clip(X), y)
    loss.backward()
    nn.utils.clip_grad_norm_(model_clip.parameters(), max_norm=1.0)
    optimizer.step()
    clip_losses.append(loss.item())

print(f"Final loss with clipping: {clip_losses[-1]:.4f}")
print("‚úì Stable training with gradient clipping!")

</details>

---

## Next Steps

- **Notebook 03**: Optimization dynamics - compare optimizers, LR schedules, regularization
- **Notebook 04**: Monitoring & error analysis - confusion matrices, worst errors

**Remember**: The overfit test is your **first line of defense**. Run it before debugging anything else!