# 21 - Scaling Experiments

This notebook tests how data formats scale with different configurations:
- **Batch size scaling**: 16, 32, 64, 128, 256
- **Worker scaling**: 0, 2, 4, 8

**Goal:** Understand which formats scale better with:
- Larger batch sizes (GPU utilization)
- More workers (parallel data loading)

**Experiment Design:**
- A fixed number of batches (100) per configuration, after one warm-up batch
- Measure throughput (samples/second)
- Track resource utilization
- Compare across all formats

**Output:**
- Scaling metrics logged to `runs/<session>/train_scaling/summary.csv`
- Resource monitoring logs per configuration

In [None]:
import os
import sys
import time
import json
from pathlib import Path
from collections import defaultdict

import torch
import torch.nn as nn
from torchvision import models
from tqdm.auto import tqdm

# Load common utilities
%run ./10_common_utils.ipynb

## Configuration

In [None]:
# Treat the presence of KAGGLE_KERNEL_RUN_TYPE in the environment as "running on Kaggle"
IS_KAGGLE = os.environ.get("KAGGLE_KERNEL_RUN_TYPE") is not None
if IS_KAGGLE:
    BASE_DIR = Path('/kaggle/working/format-matters')
else:
    BASE_DIR = Path('..').resolve()

# Every session writes into its own timestamped directory under runs/
RUN_DIR = BASE_DIR.joinpath('runs', time.strftime('%Y%m%d-%H%M%S'), 'train_scaling')
RUN_DIR.mkdir(parents=True, exist_ok=True)

# Make sure the summary file exists up front (created empty if missing)
SUMMARY_CSV = RUN_DIR.joinpath('summary.csv')
SUMMARY_CSV.touch(exist_ok=True)

# Report the resolved locations
print(f"Environment: {'Kaggle' if IS_KAGGLE else 'Local'}")
print(f"Base directory: {BASE_DIR}")
print(f"Run directory: {RUN_DIR}")
print(f"Summary log: {SUMMARY_CSV}")

# Record system information next to the results
write_sysinfo(RUN_DIR.joinpath('sysinfo.json'))

## Scaling Configuration

In [None]:
# Values swept along each scaling dimension
BATCH_SIZES = [16, 32, 64, 128, 256]
NUM_WORKERS = [0, 2, 4, 8]

# Settings shared by every test
BASE_CONFIG = dict(
    num_batches=100,  # Run for 100 batches per test
    seed=42,
)

# One entry per format: (format name, loader notebook, variant)
FORMATS = [
    ('csv', '11_loader_csv.ipynb', 'default'),
    ('webdataset', '12_loader_webdataset.ipynb', 'shard256_none'),
    ('tfrecord', '13_loader_tfrecord.ipynb', 'shard256_none'),
    ('lmdb', '14_loader_lmdb.ipynb', 'compress_none'),
]

# Pick the first dataset (in priority order) that has a built directory
BUILT_DIR = BASE_DIR / 'data' / 'built'
DATASET = next(
    (
        candidate
        for candidate in ('cifar10', 'imagenet-mini', 'tiny-imagenet-200')
        if (BUILT_DIR / candidate).exists()
    ),
    None,
)

if DATASET is None:
    raise RuntimeError("No datasets found. Run dataset preparation notebooks first.")

# Echo the effective configuration
print("\nScaling Configuration:")
print(f"  Dataset: {DATASET}")
print(f"  Batch sizes: {BATCH_SIZES}")
print(f"  Worker counts: {NUM_WORKERS}")
print(f"  Batches per test: {BASE_CONFIG['num_batches']}")
print(f"\nFormats to test: {len(FORMATS)}")
for fmt_name, _, variant in FORMATS:
    print(f"  - {fmt_name} ({variant})")

## Throughput Measurement

In [None]:
def measure_throughput(dataloader, device, num_batches):
    """
    Measure data loading throughput.

    One batch is fetched and moved to ``device`` before timing starts
    (warmup). A fresh pass over ``dataloader`` is then timed for at most
    ``num_batches`` batches, moving every batch to ``device``.

    Args:
        dataloader: Iterable yielding ``(images, labels)`` batches
            (e.g. a PyTorch DataLoader)
        device: ``torch.device`` to transfer data to
        num_batches: Maximum number of batches to measure

    Returns:
        Dictionary with throughput metrics: ``total_samples``,
        ``elapsed_time`` (seconds), ``samples_per_sec`` and
        ``batches_per_sec``. Both rates are based on the batches that were
        actually processed, which may be fewer than ``num_batches`` when the
        dataloader is short. If the dataloader yields no batches at all,
        ``{'error': 'Empty dataloader'}`` is returned instead.
    """
    from itertools import islice

    # Warmup (first batch is often slower)
    try:
        images, labels = next(iter(dataloader))
        images = images.to(device)
        if device.type == 'cuda':
            torch.cuda.synchronize()
    except StopIteration:
        return {'error': 'Empty dataloader'}

    total_samples = 0
    batches_done = 0

    # perf_counter is monotonic and high-resolution, unlike the wall clock.
    # NOTE: the timed pass creates its own iterator, so any per-iterator
    # startup cost of the dataloader is part of the measurement.
    start_time = time.perf_counter()

    # islice stops after num_batches items without pulling one extra batch,
    # so the timed window only contains batches that are counted.
    for images, labels in islice(dataloader, max(num_batches, 0)):
        # Transfer to device
        images = images.to(device)
        labels = labels.to(device)

        # Wait for the (asynchronous) CUDA work so it is included in the timing
        if device.type == 'cuda':
            torch.cuda.synchronize()

        total_samples += images.size(0)
        batches_done += 1

    elapsed_time = time.perf_counter() - start_time

    return {
        'total_samples': total_samples,
        'elapsed_time': elapsed_time,
        'samples_per_sec': total_samples / elapsed_time if elapsed_time > 0 else 0,
        'batches_per_sec': batches_done / elapsed_time if elapsed_time > 0 else 0,
    }

## Scaling Experiment Runner

In [None]:
def run_scaling_experiment(
    format_name,
    loader_notebook,
    variant,
    dataset,
    batch_size,
    num_workers,
    num_batches,
    device
):
    """
    Run scaling experiment for a single configuration.
    
    Args:
        format_name: Name of format
        loader_notebook: Path to loader notebook
        variant: Format variant
        dataset: Dataset name
        batch_size: Batch size to test
        num_workers: Number of workers to test
        num_batches: Number of batches to measure
        device: Device to use
    
    Returns:
        Dictionary with results
    """
    print(f"\nTesting {format_name} - batch_size={batch_size}, workers={num_workers}")
    
    try:
        # Load dataloader
        %run ./{loader_notebook}
        
        # Create dataloader
        dataloader = make_dataloader(
            dataset=dataset,
            split='train',
            batch_size=batch_size,
            num_workers=num_workers,
            variant=variant,
            shuffle=True,
            pin_memory=True
        )
        
        # Start resource monitoring
        log_path = RUN_DIR / f"{format_name}_bs{batch_size}_w{num_workers}_metrics.csv"
        monitor_thread, stop_event = start_monitor(log_path, interval=0.5)
        
        # Measure throughput
        metrics = measure_throughput(dataloader, device, num_batches)
        
        # Stop monitoring
        stop_monitor(monitor_thread, stop_event)
        
        # Compute resource metrics
        resource_metrics = compute_metrics_from_logs(log_path)
        
        # Combine results
        results = {
            'format': format_name,
            'variant': variant,
            'dataset': dataset,
            'batch_size': batch_size,
            'num_workers': num_workers,
            **metrics,
            **resource_metrics,
        }
        
        # Log to summary
        append_to_summary(SUMMARY_CSV, results)
        
        print(f"  ✓ Throughput: {metrics['samples_per_sec']:.1f} samples/s")
        
        return results
        
    except Exception as e:
        print(f"  ✗ Failed: {e}")
        return {
            'format': format_name,
            'variant': variant,
            'dataset': dataset,
            'batch_size': batch_size,
            'num_workers': num_workers,
            'error': str(e),
        }

## Run Batch Size Scaling Experiments

In [None]:
# Resolve the device once; it is reused by every experiment below
device = get_device()
print(f"Using device: {device}\n")

# Worker count held constant while the batch size is swept
FIXED_WORKERS = 4

# One result dictionary per (format, batch size) pair
batch_size_results = []

print("="*80)
print("BATCH SIZE SCALING EXPERIMENTS")
print("="*80)

for format_name, loader_notebook, variant in FORMATS:
    # Per-format banner
    print(f"\n{'='*80}")
    print(f"Format: {format_name} ({variant})")
    print(f"{'='*80}")

    for batch_size in BATCH_SIZES:
        batch_size_results.append(
            run_scaling_experiment(
                format_name,
                loader_notebook,
                variant,
                DATASET,
                batch_size,
                FIXED_WORKERS,
                BASE_CONFIG['num_batches'],
                device,
            )
        )

        # Short pause between configurations
        time.sleep(2)

print(f"\n{'='*80}")
print("Batch size scaling experiments completed!")
print(f"{'='*80}")

## Run Worker Scaling Experiments

In [None]:
# Batch size held constant while the worker count is swept
FIXED_BATCH_SIZE = 64

# One result dictionary per (format, worker count) pair
worker_results = []

print("\n" + "="*80)
print("WORKER SCALING EXPERIMENTS")
print("="*80)

for format_name, loader_notebook, variant in FORMATS:
    # Per-format banner
    print(f"\n{'='*80}")
    print(f"Format: {format_name} ({variant})")
    print(f"{'='*80}")

    for num_workers in NUM_WORKERS:
        worker_results.append(
            run_scaling_experiment(
                format_name,
                loader_notebook,
                variant,
                DATASET,
                FIXED_BATCH_SIZE,
                num_workers,
                BASE_CONFIG['num_batches'],
                device,
            )
        )

        # Short pause between configurations
        time.sleep(2)

print(f"\n{'='*80}")
print("Worker scaling experiments completed!")
print(f"{'='*80}")

## Results Summary

In [None]:
import pandas as pd
import numpy as np


def _util_or_zero(row, column):
    """Return ``row[column]``, or 0 when the column is absent or the value is missing (None/NaN)."""
    value = row.get(column, 0)
    # NaN is truthy, so `value or 0` would let it through and print "nan%"
    return 0 if pd.isna(value) else value


# Load the summary and keep only rows that carry a throughput measurement
df = None
if SUMMARY_CSV.exists() and SUMMARY_CSV.stat().st_size > 0:
    df = pd.read_csv(SUMMARY_CSV)
    if 'samples_per_sec' in df.columns:
        df = df.dropna(subset=['samples_per_sec'])
    else:
        df = df.iloc[0:0]

if df is not None and not df.empty:
    # The (FIXED_BATCH_SIZE, FIXED_WORKERS) configuration is measured in both
    # sweeps; show only its latest measurement so the tables have one row per
    # configuration. The best-configuration ranking still uses every row.
    unique_df = df.drop_duplicates(
        subset=['format', 'batch_size', 'num_workers'], keep='last'
    )

    print("\n" + "="*80)
    print("SCALING EXPERIMENTS SUMMARY")
    print("="*80)

    # Batch Size Scaling Results
    print(f"\n\nBatch Size Scaling (workers={FIXED_WORKERS}):\n")
    print(f"{'Format':<15} {'Batch Size':<12} {'Throughput (samp/s)':<20} {'GPU Util %':<12}")
    print("-" * 70)

    batch_df = unique_df[unique_df['num_workers'] == FIXED_WORKERS].sort_values(['format', 'batch_size'])
    for _, row in batch_df.iterrows():
        gpu_util = _util_or_zero(row, 'gpu_util_mean')
        print(f"{row['format']:<15} {row['batch_size']:<12} "
              f"{row['samples_per_sec']:>18.1f} {gpu_util:>10.1f}%")

    # Worker Scaling Results
    print(f"\n\nWorker Scaling (batch_size={FIXED_BATCH_SIZE}):\n")
    print(f"{'Format':<15} {'Workers':<10} {'Throughput (samp/s)':<20} {'CPU Util %':<12}")
    print("-" * 70)

    worker_df = unique_df[unique_df['batch_size'] == FIXED_BATCH_SIZE].sort_values(['format', 'num_workers'])
    for _, row in worker_df.iterrows():
        cpu_util = _util_or_zero(row, 'cpu_util_mean')
        print(f"{row['format']:<15} {row['num_workers']:<10} "
              f"{row['samples_per_sec']:>18.1f} {cpu_util:>10.1f}%")

    # Best configurations
    print(f"\n\nBest Configurations (by throughput):\n")
    print(f"{'Format':<15} {'Batch Size':<12} {'Workers':<10} {'Throughput (samp/s)':<20}")
    print("-" * 70)

    for format_name in df['format'].unique():
        format_df = df[df['format'] == format_name]
        # Rows without throughput were dropped above, so idxmax always has a valid value
        best = format_df.loc[format_df['samples_per_sec'].idxmax()]
        print(f"{best['format']:<15} {best['batch_size']:<12} "
              f"{best['num_workers']:<10} {best['samples_per_sec']:>18.1f}")

    print("\n" + "="*80)
    print(f"\nResults saved to: {SUMMARY_CSV}")
    print(f"Resource logs saved to: {RUN_DIR}")
else:
    print("No results available")

## ✅ Scaling Experiments Complete

**What was measured:**
- Throughput scaling with batch size
- Throughput scaling with number of workers
- Resource utilization at different scales
- Optimal configurations per format

**Key questions these results help answer:**
- Which formats scale better with larger batches?
- Which formats benefit most from parallel workers?
- What are the optimal configurations for each format?
- Where are the bottlenecks (GPU, CPU, I/O)?

**Next steps:**
1. Aggregate all results (30_analysis_summary.ipynb)
2. Create visualizations (31_analysis_plots.ipynb)
3. Generate decision guide (40_decision_guide.ipynb)