In [None]:
import sys
from pathlib import Path
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
import json

# Add repo to path.
# The repository root is taken to be three levels above the working directory,
# so this cell assumes the notebook is run from its own folder.
repo_root = Path.cwd().parent.parent.parent
# Guard the insert: without it every re-run of this cell would prepend another
# duplicate entry to sys.path.
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))

from modules._import_helper import safe_import_from

# Import module components.
# The package names start with digits, so they cannot be written in a regular
# `import` statement; the helper receives the dotted path as a string instead.
TrainingConfig = safe_import_from('06_deep_learning_systems.src.config', 'TrainingConfig')
SimpleMLP = safe_import_from('06_deep_learning_systems.src.models', 'SimpleMLP')
get_mnist_loaders = safe_import_from('06_deep_learning_systems.src.datasets', 'get_mnist_loaders')
Trainer = safe_import_from('06_deep_learning_systems.src.trainer', 'Trainer')
set_seed = safe_import_from('00_repo_standards.src.mlphys_core.seeding', 'set_seed')

# Environment report: useful when comparing results across machines.
print("✓ All imports successful")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

## Part 1: Training Loop Anatomy

A minimal training loop has these components:

```python
# 1. Data
train_loader = DataLoader(train_dataset, ...)

# 2. Model
model = SimpleMLP(...)

# 3. Loss function
criterion = nn.CrossEntropyLoss()

# 4. Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# 5. Training loop
for epoch in range(num_epochs):
    for batch_x, batch_y in train_loader:
        # Forward
        output = model(batch_x)
        loss = criterion(output, batch_y)
        
        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # 6. Evaluation
    val_loss, val_acc = evaluate(model, val_loader)
```

Our `Trainer` class encapsulates this pattern.

In [None]:
# Folder that receives the artifacts written further down (figures and the
# JSON summary). The path is relative to the directory the notebook runs from.
reports_dir = Path("../reports/notebook_01")

# Create it together with any missing parents; does nothing if it already exists.
reports_dir.mkdir(exist_ok=True, parents=True)
print(f"Outputs will be saved to: {reports_dir}")

## Part 2: Reproducibility - The Foundation

**Question**: If I train the same model twice with the same code, will I get identical results?

**Answer**: Only if you control all sources of randomness.

### Sources of Randomness in PyTorch

1. **Python** random module
2. **NumPy** RNG
3. **PyTorch CPU** RNG
4. **PyTorch CUDA** RNG (if using GPU)
5. **DataLoader** workers (multi-process)
6. **Non-deterministic operations** (some GPU ops)

Our `set_seed()` function handles the first 4.

In [None]:
def demonstrate_randomness():
    """Contrast unseeded and seeded draws from PyTorch's global RNG.

    Prints three consecutive ``torch.randn(3)`` draws taken without touching
    the seed, followed by three draws that are each preceded by
    ``torch.manual_seed(42)``.

    Side effect: the global PyTorch RNG is re-seeded with 42, so its state
    after the call is the one that follows seeding and a single draw.
    """
    print("Without setting seed:")
    for run in range(1, 4):
        # Each draw advances the generator, so consecutive draws differ.
        print(f"Run {run}:", torch.randn(3))
    print("→ Different every time!\n")

    print("With seed=42:")
    for run in range(1, 4):
        # Re-seeding right before the draw rewinds the generator to the same
        # state, which is why all three tensors come out equal.
        torch.manual_seed(42)
        print(f"Run {run}:", torch.randn(3))
    print("→ Identical when seed is reset!")

demonstrate_randomness()

## Part 3: Reproducible Training - Experiment 1

Let's train a small model twice with the same seed and verify bit-identical results.

In [None]:
def run_training_experiment(seed: int, name: str):
    """Train and evaluate a small MLP on MNIST for a given seed.

    Args:
        seed: Handed to ``set_seed``, recorded in the config, and forwarded to
            ``get_mnist_loaders``.
        name: Label stored in the config and used as the prefix of the
            progress messages.

    Returns:
        ``(history, test_metrics)``: the value ``Trainer.train`` returns for
        the train/validation loaders, and the value ``Trainer.evaluate``
        returns for the test loader (indexed with ``'accuracy'`` below).
    """
    # Seed first so that every later step starts from the same RNG state.
    set_seed(seed)

    # Small network and only three epochs keep the run short.
    hyperparams = dict(
        name=name,
        seed=seed,
        model_type="SimpleMLP",
        input_dim=784,
        hidden_dims=[64, 32],
        output_dim=10,
        batch_size=128,
        num_epochs=3,
        learning_rate=1e-3,
        early_stop_patience=10,  # larger than num_epochs; intended to disable early stopping
        save_artifacts=False,
        device="cpu",
    )
    config = TrainingConfig(**hyperparams)

    print(f"\n[{name}] Loading MNIST...")
    loaders = get_mnist_loaders(
        data_dir=Path("../../../data"),
        batch_size=config.batch_size,
        val_split=0.1,
        num_workers=0,  # single-process loading, chosen for reproducibility
        seed=seed,
    )
    train_loader, val_loader, test_loader = loaders

    # Layer sizes are read back from the config rather than repeated here.
    network = SimpleMLP(
        input_dim=config.input_dim,
        hidden_dims=config.hidden_dims,
        output_dim=config.output_dim,
    )

    # Fit on train/val, then score once on the held-out test loader.
    trainer = Trainer(config, network, device="cpu")
    history = trainer.train(train_loader, val_loader)
    test_metrics = trainer.evaluate(test_loader)

    print(f"[{name}] Final test accuracy: {test_metrics['accuracy']:.4f}")

    return history, test_metrics

# Run experiment 1
print("=" * 60, "Experiment 1: First run with seed=42", "=" * 60, sep="\n")
history1, metrics1 = run_training_experiment(seed=42, name="run1")

In [None]:
# Experiment 2: repeat the run with the very same seed; the results are
# compared against experiment 1 in the next cell.
print(
    "\n" + "=" * 60,
    "Experiment 2: Second run with seed=42 (should be identical)",
    "=" * 60,
    sep="\n",
)
history2, metrics2 = run_training_experiment(seed=42, name="run2")

In [None]:
# Verify reproducibility
print("\n" + "="*60)
print("Reproducibility Check")
print("="*60)

# Absolute tolerance for the "close enough" verdict. Differences are compared
# against it directly; np.allclose is avoided because its default rtol=1e-5
# adds a relative term that makes the effective tolerance looser than stated.
tolerance = 1e-6

# Compare final metrics
accuracy_diff = float(abs(metrics1['accuracy'] - metrics2['accuracy']))
print("\nFinal Test Accuracy:")
print(f"  Run 1: {metrics1['accuracy']:.6f}")
print(f"  Run 2: {metrics2['accuracy']:.6f}")
print(f"  Difference: {accuracy_diff:.10f}")

# Compare training histories
run1_train = np.asarray(history1['train_loss'], dtype=float)
run2_train = np.asarray(history2['train_loss'], dtype=float)
run1_val = np.asarray(history1['val_loss'], dtype=float)
run2_val = np.asarray(history2['val_loss'], dtype=float)

print("\nTraining History Differences:")
if run1_train.shape == run2_train.shape and run1_val.shape == run2_val.shape:
    train_loss_diff = run1_train - run2_train
    val_loss_diff = run1_val - run2_val
    # initial=0.0 keeps max() defined even for an empty history.
    max_train_diff = float(np.abs(train_loss_diff).max(initial=0.0))
    max_val_diff = float(np.abs(val_loss_diff).max(initial=0.0))
    print(f"  Train loss max diff: {max_train_diff:.10f}")
    print(f"  Val loss max diff: {max_val_diff:.10f}")
else:
    # Runs that stopped after a different number of epochs cannot be
    # subtracted element-wise; that alone proves they diverged.
    max_train_diff = max_val_diff = float("inf")
    print("  Histories have different lengths; element-wise comparison skipped")

# The verdict covers train loss, validation loss and test accuracy. Using
# all() with explicit comparisons also treats NaN as a mismatch.
all_diffs = (max_train_diff, max_val_diff, accuracy_diff)
if all(diff == 0 for diff in all_diffs):
    print("\n✅ REPRODUCIBLE: Results are bit-identical")
elif all(diff <= tolerance for diff in all_diffs):
    print(f"\n✅ REPRODUCIBLE: Results match within {tolerance:g} (not bit-identical)")
else:
    print("\n❌ NOT REPRODUCIBLE: Results differ!")
    print("   This could be due to:")
    print("   - GPU non-determinism")
    print("   - DataLoader workers")
    print("   - Missing seed setting")

## Part 4: Visualize Training Dynamics

In [None]:
# Two panels side by side: losses on the left, validation accuracy on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
loss_ax, acc_ax = axes

# Epochs are numbered from 1 on the x-axis.
epochs = np.arange(1, len(history1['train_loss']) + 1)

# Loss curves
loss_ax.plot(epochs, history1['train_loss'], 'o-', label='Train Loss', linewidth=2, markersize=6)
loss_ax.plot(epochs, history1['val_loss'], 's-', label='Val Loss', linewidth=2, markersize=6)
loss_ax.set_xlabel('Epoch', fontsize=12)
loss_ax.set_ylabel('Loss', fontsize=12)
loss_ax.set_title('Learning Curves (Reproducible Run)', fontsize=14, fontweight='bold')
loss_ax.legend(fontsize=11)
loss_ax.grid(True, alpha=0.3)

# Accuracy curve
acc_ax.plot(epochs, history1['val_accuracy'], 'o-', color='green', linewidth=2, markersize=6)
acc_ax.set_xlabel('Epoch', fontsize=12)
acc_ax.set_ylabel('Validation Accuracy', fontsize=12)
acc_ax.set_title('Validation Accuracy', fontsize=14, fontweight='bold')
acc_ax.grid(True, alpha=0.3)
# Fixed y-range so the curve is shown on the full 0..1 scale.
acc_ax.set_ylim([0, 1])

# Build the output path once so the saved file and the message cannot drift apart.
curves_path = reports_dir / "learning_curves.png"
fig.tight_layout()
fig.savefig(curves_path, dpi=150, bbox_inches='tight')
plt.show()

print(f"\n✓ Saved to {curves_path}")

## Part 5: Non-Determinism Demo

Let's intentionally break reproducibility to understand what can go wrong.

In [None]:
print("Demonstration: What breaks reproducibility?\n")

# Case 1: Different seeds
print("Case 1: Different seeds")
set_seed(42)
result1 = torch.randn(3)
set_seed(123)  # Different seed!
result2 = torch.randn(3)
print(f"  Seed 42: {result1}")
print(f"  Seed 123: {result2}")
print("  → Different results (as expected)\n")

# Case 2: Forgot to set seed
print("Case 2: Forgot to set seed before second run")
set_seed(42)
result1 = torch.randn(3)
# Forgot set_seed here! The second draw continues from the advanced RNG state
# instead of restarting from seed 42.
result2 = torch.randn(3)
print(f"  Run 1: {result1}")
print(f"  Run 2: {result2}")
print("  → Non-reproducible!\n")

# Case 3: GPU non-determinism (demonstration only; nothing is executed on a GPU)
print("Case 3: GPU operations can be non-deterministic")
print("  Some CUDA ops use atomic operations that aren't deterministic")
print("  Solution: torch.use_deterministic_algorithms(True)")
print("  Trade-off: May be slower, some ops not supported\n")

## Part 6: Save Summary

In [None]:
# Save metrics

# Reproducibility verdict stored in the summary. It requires the test accuracy
# AND both loss histories to agree; accuracy alone can coincide even when the
# training trajectories differ.
tolerance = 1e-6
accuracy_diff = float(abs(metrics1['accuracy'] - metrics2['accuracy']))

histories_match = True
for key in ('train_loss', 'val_loss'):
    run1_values = np.asarray(history1[key], dtype=float)
    run2_values = np.asarray(history2[key], dtype=float)
    # A length mismatch means the runs diverged; rtol=0 keeps the comparison
    # at exactly `tolerance` instead of np.allclose's looser default.
    if run1_values.shape != run2_values.shape or not np.allclose(
        run1_values, run2_values, rtol=0.0, atol=tolerance
    ):
        histories_match = False

summary = {
    "experiment": "reproducibility_demo",
    "seed": 42,
    "num_epochs": len(history1['train_loss']),
    "final_train_loss": float(history1['train_loss'][-1]),
    "final_val_loss": float(history1['val_loss'][-1]),
    "final_val_accuracy": float(history1['val_accuracy'][-1]),
    "test_accuracy": float(metrics1['accuracy']),
    "test_loss": float(metrics1['loss']),
    "reproducibility_check": {
        "run1_test_acc": float(metrics1['accuracy']),
        "run2_test_acc": float(metrics2['accuracy']),
        "difference": accuracy_diff,
        "is_reproducible": bool(accuracy_diff < tolerance and histories_match),
    }
}

# Build the path once so the written file and the message always agree.
summary_path = reports_dir / "summary.json"
with open(summary_path, 'w', encoding="utf-8") as f:
    json.dump(summary, f, indent=2)

print(f"✓ Saved summary to {summary_path}")
print("\nSummary:")
print(json.dumps(summary, indent=2))

## Key Takeaways

### ✅ Best Practices

1. **Always set seeds** at the start of experiments
2. **Use CPU** for bit-identical reproducibility (GPU has non-deterministic ops)
3. **Disable DataLoader workers** (`num_workers=0`) or set worker seed function
4. **Log seeds** with experiments for reproducibility
5. **Verify reproducibility** on small experiments before scaling up

### ⚠️ Common Pitfalls

1. **Forgetting to reset seed** between runs
2. **Using GPU without deterministic mode** → slight differences each run
3. **DataLoader multiprocessing** → each worker has different RNG state
4. **Async operations** → execution order varies
5. **Library updates** ‚Üí different PyTorch versions may have different numerics

### 🔧 Debugging Checklist

When results aren't reproducible:

- [ ] Did you call `set_seed()` before training?
- [ ] Are you using CPU or GPU? (GPU has non-determinism)
- [ ] Did you set `num_workers=0` in DataLoader?
- [ ] Are you using the same PyTorch version?
- [ ] Did you save the random seed with the experiment?
- [ ] Are there any async operations (e.g., `.to(device, non_blocking=True)`)?

## Exercises

Complete these exercises to test your understanding:

### Exercise 1: Break Determinism

Modify the training code to intentionally make it non-reproducible. Run it twice and verify the results differ.

In [None]:
# Your code here
# Hint: Remove the set_seed() call or use a different seed each time

### Exercise 2: Measure Seed Sensitivity

Train the same model with 5 different seeds (e.g., 1, 2, 3, 4, 5). Plot the distribution of final test accuracies. How much does the seed matter?

In [None]:
# Your code here
# Hint: Use a loop to run training with different seeds

### Exercise 3: Identify Non-Deterministic Operation

The following code snippet has a non-deterministic bug. Find it and fix it.

```python
set_seed(42)
model = SimpleMLP(10, [20], 2)
data = torch.randn(100, 10)
labels = torch.randint(0, 2, (100,))

# Training loop
optimizer = torch.optim.Adam(model.parameters())
for epoch in range(3):
    perm = torch.randperm(100)  # BUG: creates new random permutation
    x_batch = data[perm[:32]]
    y_batch = labels[perm[:32]]
    
    output = model(x_batch)
    loss = nn.CrossEntropyLoss()(output, y_batch)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
```

In [None]:
# Your fixed code here

---

## Solutions

<details>
<summary>Click to reveal solutions</summary>

### Solution 1: Break Determinism

In [None]:
# Solution: Comment out set_seed or use random seed
import time

def run_non_deterministic():
    """Run a tiny forward pass under a seed that changes on every call.

    Returns:
        The sum of the model output for a random ``(4, 10)`` input, as a
        Python float.
    """
    # Use a clock reading as seed (different each run). The high-resolution
    # nanosecond counter is used instead of millisecond wall-clock time, which
    # can repeat when two calls land in the same millisecond and would then
    # make both runs identical. The modulo keeps the seed below 2**32, the
    # upper bound accepted by NumPy's legacy seeding.
    seed = time.perf_counter_ns() % (2**32)
    set_seed(seed)

    model = SimpleMLP(10, [20], 2)
    x = torch.randn(4, 10)
    return model(x).sum().item()

result1 = run_non_deterministic()
result2 = run_non_deterministic()
print(f"Run 1: {result1:.6f}")
print(f"Run 2: {result2:.6f}")
print(f"Different: {result1 != result2}")

### Solution 2: Seed Sensitivity

In [None]:
# Solution: repeat the training experiment once per seed and collect the
# resulting test accuracies.
seeds = list(range(1, 6))
accuracies = []

for seed in seeds:
    print(f"\nTraining with seed={seed}...")
    _, metrics = run_training_experiment(seed=seed, name=f"seed_{seed}")
    accuracies.append(metrics['accuracy'])

# One bar per seed, with the mean drawn as a dashed reference line.
mean_accuracy = np.mean(accuracies)
plt.figure(figsize=(8, 5))
plt.bar(seeds, accuracies, color='steelblue', alpha=0.7)
plt.axhline(mean_accuracy, color='red', linestyle='--', label=f'Mean: {mean_accuracy:.4f}')
plt.title('Seed Sensitivity Analysis', fontsize=14, fontweight='bold')
plt.xlabel('Seed', fontsize=12)
plt.ylabel('Test Accuracy', fontsize=12)
plt.grid(True, alpha=0.3, axis='y')
plt.legend()
plt.tight_layout()
plt.savefig(reports_dir / "seed_sensitivity.png", dpi=150)
plt.show()

# Numeric summary of the spread across seeds.
print(
    f"\nAccuracies: {accuracies}",
    f"Mean: {mean_accuracy:.4f}",
    f"Std: {np.std(accuracies):.4f}",
    f"Range: [{min(accuracies):.4f}, {max(accuracies):.4f}]",
    sep="\n",
)

### Solution 3: Fix Non-Deterministic Bug

In [None]:
# Solution: seed the RNG right before the loop (Fix 1), or let a DataLoader
# with its own seeded generator do the shuffling (Fix 2)
set_seed(42)
model = SimpleMLP(10, [20], 2)
data = torch.randn(100, 10)
labels = torch.randint(0, 2, (100,))

# Fix 1: Set seed at start (affects all subsequent random ops)
torch.manual_seed(42)

# The loss module is built once; re-creating it on every iteration is wasted work.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())
for epoch in range(3):
    perm = torch.randperm(100)  # Now deterministic
    # The first 32 shuffled indices form this step's mini-batch.
    x_batch = data[perm[:32]]
    y_batch = labels[perm[:32]]

    output = model(x_batch)
    loss = criterion(output, y_batch)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Alternative Fix 2: Use DataLoader with fixed seed
# This is cleaner and how you should do it in practice: the shuffle order comes
# from the loader's own generator instead of the global RNG.
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(data, labels)
loader = DataLoader(dataset, batch_size=32, shuffle=True,
                   generator=torch.Generator().manual_seed(42))
print("✓ Bug fixed: now deterministic")

</details>

---

## Next Steps

- **Notebook 02**: Sanity checks - learn to overfit a tiny batch (the #1 debugging technique)
- **Notebook 03**: Optimization dynamics - LR schedules, optimizers, regularization
- **Notebook 04**: Monitoring & error analysis - confusion matrices, worst errors

**Key lesson**: Reproducibility isn't optional - it's the foundation of reliable ML engineering.