# Demo 4: Performance Profiling & Optimization
## Identifying and Fixing Bottlenecks in ML Training

**Duration**: 15-20 minutes

This notebook demonstrates:
- GPU utilization monitoring
- Memory profiling
- Finding bottlenecks (I/O vs compute)
- Optimization strategies

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets, transforms, models
import time
import matplotlib.pyplot as plt
import subprocess

# Select the compute device: CUDA when a GPU is visible, CPU otherwise.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Device: {device}")

# The GPU name is only queried when the CUDA device was actually selected.
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Helper function to get GPU stats
def get_gpu_stats():
    """Read the CUDA memory counters and convert them to megabytes.

    Returns:
        A 3-tuple ``(allocated, reserved, max_allocated)`` taken from
        ``torch.cuda.memory_allocated``, ``torch.cuda.memory_reserved`` and
        ``torch.cuda.max_memory_allocated``, each divided by 1024 twice
        (bytes -> MB). When CUDA is unavailable the result is ``(0, 0, 0)``.
    """
    if not torch.cuda.is_available():
        return 0, 0, 0

    raw_bytes = (
        torch.cuda.memory_allocated(),
        torch.cuda.memory_reserved(),
        torch.cuda.max_memory_allocated(),
    )
    allocated, reserved, max_allocated = (value / 1024 / 1024 for value in raw_bytes)
    return allocated, reserved, max_allocated

def print_gpu_stats(label):
    """Print a labelled summary of the values returned by ``get_gpu_stats``.

    Args:
        label: Heading printed (followed by a colon) above the three readings.
    """
    allocated_mb, reserved_mb, peak_mb = get_gpu_stats()
    report_lines = (
        f"{label}:",
        f"  Allocated: {allocated_mb:.1f} MB",
        f"  Reserved:  {reserved_mb:.1f} MB",
        f"  Peak:      {peak_mb:.1f} MB",
    )
    for report_line in report_lines:
        print(report_line)

# Start peak-memory tracking from a clean slate before anything is allocated.
# The reset is a CUDA-only call that raises when no GPU is present, so it is
# skipped on the CPU fallback path selected by `device`.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()
print_gpu_stats("Initial")

In [None]:
# Create model: a ResNet-50 without pretrained weights and with a 10-class
# output, moved to the selected device and switched to evaluation mode.
model = models.resnet50(pretrained=False, num_classes=10)
model = model.to(device)
model.eval()

param_count = sum(p.numel() for p in model.parameters())
print(f"\nModel Parameters: {param_count:,}")
print_gpu_stats("After model loading")

In [None]:
# === BENCHMARK 1: I/O Efficiency ===
# Measures how long it takes to pull 50 CIFAR-10 batches through a DataLoader
# and copy the images to `device`, for several num_workers settings.
print("\n" + "="*60)
print("BENCHMARK 1: Data Loading Efficiency")
print("="*60)

# Load CIFAR-10 (downloaded into ./data when not already present)
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)

# Test with different num_workers settings
num_workers_list = [0, 2, 4, 8]
io_times = {}  # num_workers -> elapsed seconds for the timed batches

# CUDA-only calls below are guarded so the cell also runs on the CPU fallback.
use_cuda = torch.cuda.is_available()

print(f"\nDataset size: {len(trainset)}")
print(f"Testing with batch_size=256\n")

for num_workers in num_workers_list:
    loader = DataLoader(
        trainset,
        batch_size=256,
        shuffle=False,
        num_workers=num_workers,
        # Pinned host memory only helps host-to-GPU copies; skip it without CUDA.
        pin_memory=use_cuda
    )

    # Time 50 batches. Synchronize first so previously queued GPU work is not
    # attributed to this measurement.
    if use_cuda:
        torch.cuda.synchronize()
    start = time.perf_counter()  # monotonic clock, suited to interval timing

    for i, (images, labels) in enumerate(loader):
        images = images.to(device)
        if i >= 49:
            break

    # Wait for outstanding copies to finish before stopping the clock.
    if use_cuda:
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

    io_times[num_workers] = elapsed
    throughput = (50 * 256) / elapsed

    print(f"num_workers={num_workers}: {elapsed:.3f}s (throughput: {throughput:.0f} samples/sec)")

In [None]:
# Visualize I/O efficiency: bars show elapsed time per num_workers setting,
# the red line (on a second y-axis) shows the corresponding throughput.
fig, ax = plt.subplots(figsize=(10, 6))
ax2 = ax.twinx()

workers = list(io_times)
times = [io_times[w] for w in workers]
throughputs = [(50 * 256) / t for t in times]
worker_labels = [str(w) for w in workers]

bars = ax.bar(worker_labels, times, alpha=0.7, color='skyblue', label='Loading Time')
line = ax2.plot(worker_labels, throughputs, 'ro-', linewidth=2,
                markersize=8, label='Throughput')

# Titles, axis labels and grid
ax.set_title('Impact of num_workers on Data Loading', fontsize=14)
ax.set_xlabel('Number of Workers', fontsize=12)
ax.set_ylabel('Time (seconds)', fontsize=12, color='skyblue')
ax2.set_ylabel('Throughput (samples/sec)', fontsize=12, color='red')
ax.grid(True, alpha=0.3, axis='y')

# One legend per axis, placed in opposite corners
ax.legend(loc='upper left')
ax2.legend(loc='upper right')

plt.tight_layout()
plt.show()

# Report the setting with the highest throughput, relative to the first entry
best_throughput = max(throughputs)
print(f"\nKey Insight:")
print(f"Optimal num_workers: {workers[throughputs.index(best_throughput)]}")
print(f"Speedup vs num_workers=0: {best_throughput / throughputs[0]:.2f}x")

In [None]:
# === BENCHMARK 2: Batch Size Impact ===
# Runs no-grad forward passes over a fixed set of random images at several
# batch sizes, recording peak GPU memory and throughput for each.
print("\n" + "="*60)
print("BENCHMARK 2: Batch Size vs GPU Memory")
print("="*60)

batch_sizes = [32, 64, 128, 256, 512]
memory_usage = []     # peak MB per batch size, or None when that size failed
throughputs_bs = []   # samples/sec per batch size, or None when that size failed

# CUDA-only calls below are guarded so the cell also runs on the CPU fallback.
use_cuda = torch.cuda.is_available()

# Create random data to avoid I/O
num_samples = 1000
dummy_images = torch.randn(num_samples, 3, 224, 224, device=device)
dummy_labels = torch.randint(0, 10, (num_samples,), device=device)
dummy_dataset = TensorDataset(dummy_images, dummy_labels)

print(f"\nTesting forward pass with different batch sizes\n")

for bs in batch_sizes:
    try:
        loader = DataLoader(dummy_dataset, batch_size=bs, shuffle=False)

        if use_cuda:
            # Reset so the peak reflects only this batch size's run.
            torch.cuda.reset_peak_memory_stats()
            torch.cuda.synchronize()
        start = time.perf_counter()  # monotonic clock, suited to interval timing

        with torch.no_grad():
            for images, labels in loader:
                _ = model(images)

        # Wait for queued kernels to finish before stopping the clock.
        if use_cuda:
            torch.cuda.synchronize()
        elapsed = time.perf_counter() - start

        peak_mem = torch.cuda.max_memory_allocated() / 1024 / 1024 if use_cuda else 0.0
        throughput = num_samples / elapsed

        memory_usage.append(peak_mem)
        throughputs_bs.append(throughput)

        print(f"Batch size {bs:3d}: Memory {peak_mem:7.1f}MB | Throughput {throughput:6.0f} samples/sec")
    except RuntimeError as e:
        # Only memory exhaustion is an expected outcome here; any other
        # RuntimeError is a real failure and must not be reported as OOM.
        message = str(e).lower()
        if "out of memory" not in message and "not enough memory" not in message:
            raise
        print(f"Batch size {bs:3d}: CUDA Out of Memory")
        memory_usage.append(None)
        throughputs_bs.append(None)
        # Hand cached blocks back to the driver so the failed attempt does not
        # distort the remaining iterations.
        if use_cuda:
            torch.cuda.empty_cache()

In [None]:
# Visualize batch size impact: peak memory on the left, throughput on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Keep only the batch sizes whose run produced a measurement (non-None entries)
valid_bs = [size for size, mem in zip(batch_sizes, memory_usage) if mem is not None]
valid_mem = [mem for mem in memory_usage if mem is not None]
valid_tp = [tp for tp in throughputs_bs if tp is not None]

# (axis, y-values, line colour, y-axis label, title) for each panel
panels = (
    (ax1, valid_mem, 'orange', 'Peak GPU Memory (MB)', 'Memory Usage vs Batch Size'),
    (ax2, valid_tp, 'green', 'Throughput (samples/sec)', 'Training Throughput vs Batch Size'),
)
for panel_ax, y_values, line_color, y_label, panel_title in panels:
    panel_ax.plot(valid_bs, y_values, 'o-', linewidth=2, markersize=8, color=line_color)
    panel_ax.set_xlabel('Batch Size', fontsize=12)
    panel_ax.set_ylabel(y_label, fontsize=12)
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summarize the best throughput and the memory used by the last measured size
best_tp = max(valid_tp)
print(f"\nKey Insights:")
print(f"Maximum throughput: {best_tp:.0f} samples/sec at batch size {valid_bs[valid_tp.index(best_tp)]}")
print(f"Memory usage at max batch size: {valid_mem[-1]:.1f} MB")

In [None]:
# === BENCHMARK 3: Mixed Precision Speedup ===
# Times the same no-grad forward pass with and without autocast and reports
# the ratio of the two wall-clock times.
print("\n" + "="*60)
print("BENCHMARK 3: FP32 vs Mixed Precision Comparison")
print("="*60)

from torch.cuda.amp import autocast
import time


def _time_forward_passes(net, batch, iterations, use_amp):
    """Return the seconds taken by `iterations` no-grad forward passes.

    Args:
        net: Module to call on `batch`.
        batch: Input tensor passed unchanged to `net` on every iteration.
        iterations: Number of timed forward passes.
        use_amp: Forwarded as `enabled` to `torch.cuda.amp.autocast`.

    Returns:
        Elapsed wall-clock time in seconds, excluding one untimed warm-up pass.
    """
    cuda_ok = torch.cuda.is_available()
    # Older PyTorch: autocast() takes no device_type/dtype args, only `enabled`.
    with torch.no_grad(), autocast(enabled=use_amp):
        # Untimed warm-up so one-off costs for this input shape / precision
        # are not charged to whichever variant happens to run first.
        _ = net(batch)
        if cuda_ok:
            torch.cuda.synchronize()
        start = time.perf_counter()  # monotonic clock, suited to interval timing

        for _ in range(iterations):
            _ = net(batch)

        # Wait for queued kernels to finish before stopping the clock.
        if cuda_ok:
            torch.cuda.synchronize()
        return time.perf_counter() - start


# Create fresh data
dummy_input = torch.randn(256, 3, 224, 224, device=device)
num_iterations = 50

model.eval()

# FP32 benchmark
time_fp32 = _time_forward_passes(model, dummy_input, num_iterations, use_amp=False)

# Mixed Precision benchmark
time_amp = _time_forward_passes(model, dummy_input, num_iterations, use_amp=True)

speedup = time_fp32 / time_amp

print(f"\nBatch size: {dummy_input.shape[0]}")
print(f"Iterations: {num_iterations}\n")
print(f"FP32 Time:  {time_fp32:.3f}s ({1000*time_fp32/num_iterations:.2f}ms per iteration)")
print(f"AMP Time:   {time_amp:.3f}s ({1000*time_amp/num_iterations:.2f}ms per iteration)")
print(f"\nSpeedup: {speedup:.2f}x faster with AMP")


In [None]:
# Visualization of optimization techniques as a bar chart of relative speedups
fig, ax = plt.subplots(figsize=(12, 6))

optimizations = [
    'Baseline\n(FP32)',
    'Mixed\nPrecision',
    'Multi-worker\nDataLoader',
    'Large\nBatch Size',
    'All\nOptimizations'
]

# NOTE: only the mixed-precision entry (`speedup`) is measured in this
# notebook; the remaining values are hard-coded illustrative figures.
speedups = [1.0, speedup, 1.5, 1.3, 2.8]
colors = ['lightcoral', 'lightyellow', 'lightgreen', 'lightblue', 'lightsteelblue']

bars = ax.bar(optimizations, speedups, color=colors, edgecolor='black', linewidth=1.5)

# Add value labels on bars
for bar, speedup_val in zip(bars, speedups):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{speedup_val:.2f}x',
            ha='center', va='bottom', fontsize=12, fontweight='bold')

ax.set_ylabel('Relative Speedup', fontsize=12)
ax.set_title('Training Speedup with Various Optimizations', fontsize=14)
# Keep the original 3.5 ceiling, but grow it when the measured speedup is
# larger so the tallest bar and its label are never clipped.
ax.set_ylim(0, max(3.5, max(speedups) * 1.15))
ax.axhline(y=1, color='red', linestyle='--', alpha=0.5, label='Baseline')
ax.grid(True, alpha=0.3, axis='y')
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# === OPTIMIZATION SUMMARY ===
# Prints a checklist that combines the values measured by the benchmarks above
# with general tuning advice.
print("\n" + "="*60)
print("OPTIMIZATION CHECKLIST")
print("="*60)

# Values derived from the earlier benchmark results
best_num_workers = workers[throughputs.index(max(throughputs))]
best_batch_size = valid_bs[valid_tp.index(max(valid_tp))]
# Largest batch size that completed without running out of memory, rather
# than assuming the biggest tested size always fits.
largest_fitting_batch = max(valid_bs)

print("\n✓ Data Loading Optimization:")
print(f"  - Use num_workers={best_num_workers} (optimal for your hardware)")
print(f"  - Enable pin_memory=True for faster GPU transfer")
print(f"  - Use persistent_workers=True to reduce overhead")

print("\n✓ Batch Size Tuning:")
print(f"  - Maximum throughput at batch size: {best_batch_size}")
print(f"  - Memory available on GPU allows batch size up to ~{largest_fitting_batch}")
print(f"  - Larger batches = better GPU utilization")

print("\n✓ Mixed Precision Training:")
print(f"  - {speedup:.2f}x speedup observed on your GPU")
print(f"  - Use torch.cuda.amp.autocast() for forward pass")
print(f"  - Use GradScaler for loss scaling in backward pass")

print("\n✓ Additional Tips:")
print(f"  - Monitor GPU with: nvidia-smi dmon -s pucm")
print(f"  - Target GPU utilization >80%")
print(f"  - Profile bottlenecks with PyTorch Profiler")
print(f"  - Use gradient accumulation if batch size constrained")

print("\n" + "="*60)