# 01 - I/O Microbenchmarks

This notebook conducts comprehensive I/O performance analysis across different:
- **Storage types**: NVMe, NFS, object stores, memory
- **Access patterns**: Sequential vs random, different block sizes
- **File characteristics**: Size, format, compression
- **Concurrency levels**: Single vs multi-threaded access

Results help understand storage bottlenecks in ML data pipelines and guide optimization decisions.

## Experiment Matrix
1. **Sequential Read Performance** across block sizes and file sizes
2. **Random Access Patterns** with different seek behaviors
3. **Format Comparison** (Parquet vs CSV vs binary formats)
4. **Small File vs Large File** performance characteristics
5. **Memory-mapped vs Regular File I/O** comparisons
6. **Multi-threaded Access** patterns

In [None]:
import os
import sys
import time
import mmap
import json
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Dict, List, Any

import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import psutil

# Add src to path for imports
sys.path.append(str(Path.cwd().parent))
from src.bench.io_bench import seq_read_bench, rand_read_bench
from src.bench.data_generator import create_synthetic_tabular, create_mixed_workload_data
from src.bench.plotting import lineplot_csv

# Ensure the results directory exists before any cell writes CSVs or plots into it
Path('../results').mkdir(parents=True, exist_ok=True)

print("üîß I/O Microbenchmarks Setup")
print(f"Working directory: {Path.cwd()}")
print(f"Available memory: {psutil.virtual_memory().available / (1024**3):.1f} GB")
print(f"CPU cores: {psutil.cpu_count()}")

# Decide whether synthetic test data is needed. Fewer than 10 entries under
# ../data is treated as "no dataset present". The five synthetic files written
# below never reach that threshold on their own, so this branch is taken on
# every run until more data is added.
data_dir = Path('../data')
if not data_dir.exists() or len(list(data_dir.glob('**/*'))) < 10:
    print("üìä Generating test data...")
    data_dir.mkdir(parents=True, exist_ok=True)

    # Synthetic files from 1 KB to 10 MB, filled with bytes from os.urandom
    test_sizes = [1024, 10*1024, 100*1024, 1024*1024, 10*1024*1024]
    for size in test_sizes:
        test_file = data_dir / f'test_{size//1024}kb.dat'

        # Reuse a file left by an earlier run: rewriting it with fresh random
        # bytes each time would change the inputs between runs and waste I/O.
        if test_file.is_file() and test_file.stat().st_size == size:
            print(f"   Reusing {test_file} ({size/1024:.1f} KB)")
            continue

        with open(test_file, 'wb') as f:
            f.write(os.urandom(size))
        print(f"   Created {test_file} ({size/1024:.1f} KB)")

    print("‚úÖ Test data generated")
else:
    print("‚úÖ Using existing test data")

In [None]:
# Enhanced sequential read benchmarks
print("\nüîÑ Running Sequential Read Benchmarks")
print("=" * 50)


def categorize_file_size(size_bytes):
    """Bucket a file size into a coarse category for grouped analysis.

    This is defined before the benchmark loop because the loop calls it; when
    it was defined afterwards, every iteration on a fresh kernel raised
    NameError, which the loop's broad ``except`` turned into a printed warning
    and an empty result set.

    Args:
        size_bytes: File size in bytes.

    Returns:
        'small' below 1 MiB, 'medium' from 1 MiB up to (but not including)
        10 MiB, and 'large' otherwise.
    """
    if size_bytes < 1024**2:  # < 1MB
        return 'small'
    if size_bytes < 10 * 1024**2:  # < 10MB
        return 'medium'
    return 'large'


# Collect all non-empty files under ../data. Sorted because glob order is
# arbitrary and a stable order keeps runs comparable.
test_files = sorted(
    p for p in Path('../data').glob('**/*') if p.is_file() and p.stat().st_size > 0
)
print(f"Found {len(test_files)} test files")

# Test different block sizes for sequential reads
block_sizes_kb = [4, 16, 64, 256, 1024, 4096]  # 4KB to 4MB blocks
seq_results = []

for file_path in tqdm(test_files, desc="Files"):
    file_size = file_path.stat().st_size

    # Only use block sizes that fit inside the file (capped at 4 MB). Files
    # smaller than 4 KB therefore produce no sequential measurements.
    max_block_kb = min(4096, file_size // 1024)
    relevant_blocks = [b for b in block_sizes_kb if b <= max_block_kb]

    for block_kb in relevant_blocks:
        # Best-effort: one failing file/block combination is reported and
        # skipped so the remaining configurations still run.
        try:
            # Three runs per configuration to smooth out run-to-run noise
            throughputs = [
                seq_read_bench(str(file_path), block_kb=block_kb)['mb_s']
                for _ in range(3)
            ]

            seq_results.append({
                'file_path': str(file_path.relative_to(Path('../data'))),
                'file_size_mb': file_size / (1024**2),
                'file_size_category': categorize_file_size(file_size),
                'block_kb': block_kb,
                'throughput_mb_s': np.mean(throughputs),
                'throughput_std': np.std(throughputs),
                'throughput_min': np.min(throughputs),
                'throughput_max': np.max(throughputs),
                'access_pattern': 'sequential'
            })

        except Exception as e:
            print(f"‚ö†Ô∏è Error with {file_path} @ {block_kb}KB: {e}")

seq_df = pd.DataFrame(seq_results)
if not seq_df.empty:
    seq_df.to_csv('../results/io_sequential_detailed.csv', index=False)
    print(f"\nüìä Sequential Read Results:")
    print(seq_df.groupby(['file_size_category', 'block_kb'])['throughput_mb_s'].agg(['mean', 'std', 'count']))
else:
    print("‚ö†Ô∏è No sequential benchmark results collected")

In [None]:
# Enhanced random access benchmarks
print("\nüé≤ Running Random Access Benchmarks")
print("=" * 50)

# Focus on files large enough for meaningful random access
large_files = [f for f in test_files if f.stat().st_size > 1024**2]  # > 1MB

if not large_files:
    print("Creating larger test file for random access...")
    large_test_file = Path('../data/large_random_test.dat')
    large_test_file.parent.mkdir(parents=True, exist_ok=True)
    with open(large_test_file, 'wb') as f:
        # Write 50 chunks of 1 MB so the whole 50 MB is never held in memory
        for _ in range(50):
            f.write(os.urandom(1024*1024))
    large_files = [large_test_file]
    print(f"Created {large_test_file} (50 MB)")

rand_results = []
block_sizes_kb = [4, 16, 64, 256]  # Smaller blocks for random access
sample_counts = [100, 500, 1000]  # Different numbers of random accesses

for file_path in tqdm(large_files, desc="Large files"):
    file_size = file_path.stat().st_size

    for block_kb in block_sizes_kb:
        for n_samples in sample_counts:
            # Best-effort: a failing configuration is reported and skipped.
            try:
                throughputs = []
                random_elapsed_s = 0.0
                for _ in range(3):  # Multiple runs for stability
                    # Time the random-read call itself. IOPS used to be
                    # derived from extra *sequential* runs, which measured the
                    # wrong access pattern and tripled the work per row.
                    run_start = time.perf_counter()
                    result = rand_read_bench(
                        str(file_path),
                        block_kb=block_kb,
                        samples=n_samples
                    )
                    random_elapsed_s += time.perf_counter() - run_start
                    throughputs.append(result['mb_s'])

                rand_results.append({
                    'file_path': str(file_path.relative_to(Path('../data'))),
                    'file_size_mb': file_size / (1024**2),
                    'block_kb': block_kb,
                    'n_samples': n_samples,
                    'throughput_mb_s': np.mean(throughputs),
                    'throughput_std': np.std(throughputs),
                    # Reads per second over the three runs. Wall-clock time
                    # includes any per-call setup inside rand_read_bench, so
                    # this is a slightly conservative figure.
                    # NOTE(review): assumes each call performs `samples` reads - confirm.
                    'iops': (n_samples * 3) / random_elapsed_s if random_elapsed_s > 0 else float('nan'),
                    'access_pattern': 'random'
                })

            except Exception as e:
                print(f"‚ö†Ô∏è Random access error {file_path} @ {block_kb}KB/{n_samples}: {e}")

rand_df = pd.DataFrame(rand_results)
if not rand_df.empty:
    rand_df.to_csv('../results/io_random_detailed.csv', index=False)
    print(f"\nüìä Random Access Results:")
    print(rand_df.groupby(['block_kb', 'n_samples'])['throughput_mb_s'].agg(['mean', 'std']))
    print(f"\nüíæ IOPS Results:")
    print(rand_df.groupby(['block_kb'])['iops'].agg(['mean', 'std']))
else:
    print("‚ö†Ô∏è No random access results collected")

In [None]:
# Memory-mapped file benchmarks
print("\nüó∫Ô∏è Running Memory-Mapped File Benchmarks")
print("=" * 50)

def mmap_read_bench(file_path: str, access_pattern: str = 'sequential', n_accesses: int = 1000):
    """Benchmark reads through a read-only memory map of a file.

    The timed region covers opening the file, creating the map and all reads.

    Args:
        file_path: Path of the file to map.
        access_pattern: 'sequential' scans the whole file in 64 KB slices;
            'random' reads ``n_accesses`` 4 KB slices at pseudo-random offsets.
        n_accesses: Number of reads for the 'random' pattern; ignored for
            'sequential'.

    Returns:
        Dict with 'throughput_mb_s', 'duration' (seconds), 'total_bytes'
        (bytes actually read) and 'file_size' (bytes).

    Raises:
        ValueError: If ``access_pattern`` is not 'sequential' or 'random'
            (it previously fell through and reported 0 bytes), or if
            ``mmap`` rejects the file (e.g. an empty file).
        OSError: If the file cannot be opened or stat'ed.
    """
    if access_pattern not in ('sequential', 'random'):
        raise ValueError(
            f"access_pattern must be 'sequential' or 'random', got {access_pattern!r}"
        )

    file_size = os.path.getsize(file_path)
    total_bytes = 0

    start_time = time.perf_counter()

    with open(file_path, 'rb') as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            if access_pattern == 'sequential':
                # Sequential scan
                block_size = 64 * 1024  # 64KB blocks
                for i in range(0, len(mm), block_size):
                    # Slicing copies the bytes out of the map, forcing the read
                    total_bytes += len(mm[i:i+block_size])
            else:
                # Random access
                block_size = 4 * 1024  # 4KB blocks
                # Private generator with a fixed seed: same offset sequence on
                # every call, without reseeding the process-wide NumPy RNG
                # that other code may depend on.
                rng = np.random.RandomState(42)
                # Keep the upper bound at least 1 so files smaller than one
                # block are still read (from offset 0, as a short read).
                max_offset = max(len(mm) - block_size, 1)
                for _ in range(n_accesses):
                    offset = rng.randint(0, max_offset)
                    total_bytes += len(mm[offset:offset+block_size])

    duration = time.perf_counter() - start_time
    # Guard against a zero reading from a coarse timer on tiny inputs
    throughput_mb_s = (total_bytes / (1024**2)) / duration if duration > 0 else 0.0

    return {
        'throughput_mb_s': throughput_mb_s,
        'duration': duration,
        'total_bytes': total_bytes,
        'file_size': file_size
    }

# One summary row per (large file, access pattern), each averaged over 3 runs
mmap_results = []

for file_path in tqdm(large_files, desc="mmap benchmarks"):
    file_size = file_path.stat().st_size

    for pattern in ('sequential', 'random'):
        try:
            # Repeat the measurement three times and keep only the throughput
            samples = [
                mmap_read_bench(str(file_path), access_pattern=pattern)['throughput_mb_s']
                for _ in range(3)
            ]

            row = {
                'file_path': str(file_path.relative_to(Path('../data'))),
                'file_size_mb': file_size / (1024**2),
                'access_pattern': f'mmap_{pattern}',
                'throughput_mb_s': np.mean(samples),
                'throughput_std': np.std(samples),
                'method': 'memory_mapped'
            }
            mmap_results.append(row)

        except Exception as e:
            # A failing file/pattern pair is reported and skipped
            print(f"‚ö†Ô∏è mmap error {file_path} ({pattern}): {e}")

mmap_df = pd.DataFrame(mmap_results)
if mmap_df.empty:
    print("‚ö†Ô∏è No memory-mapped results collected")
else:
    mmap_df.to_csv('../results/io_mmap_detailed.csv', index=False)
    print(f"\nüìä Memory-Mapped Results:")
    by_pattern = mmap_df.groupby('access_pattern')['throughput_mb_s']
    print(by_pattern.agg(['mean', 'std']))

In [None]:
# Multi-threaded I/O benchmarks
print("\nüîÄ Running Multi-threaded I/O Benchmarks")
print("=" * 50)

def threaded_read_worker(args):
    """Run one sequential-read benchmark as a thread-pool task.

    Args:
        args: A ``(file_path, block_kb, thread_id)`` tuple.

    Returns:
        The benchmark result with a 'thread_id' entry added, or, if the
        benchmark raised, a dict holding 'error' (the exception message)
        and 'thread_id'.
    """
    path, block, worker_id = args
    try:
        outcome = seq_read_bench(path, block_kb=block)
        outcome['thread_id'] = worker_id
    except Exception as exc:
        # Report the failure as data so one bad task does not abort the pool
        return {'error': str(exc), 'thread_id': worker_id}
    return outcome

def benchmark_concurrent_reads(file_paths, n_threads, block_kb=1024):
    """Run ``n_threads`` sequential-read benchmarks at the same time.

    Each task is handed to ``threaded_read_worker``; files are assigned
    round-robin, so the same file is read by several threads when there are
    more threads than files.

    Args:
        file_paths: Non-empty sequence of files to read.
        n_threads: Number of concurrent read tasks (and pool workers).
        block_kb: Block size in KB passed through to each read.

    Returns:
        Dict with:
            'n_threads': the requested thread count.
            'total_time': wall-clock seconds for the whole batch.
            'individual_results': results of the tasks that succeeded.
            'errors': results of the tasks that failed (each carries 'error'
                and 'thread_id'), so failures are visible to the caller
                instead of silently lowering the totals.
            'aggregate_throughput': sum of the per-task 'mb_s' values.
            'avg_per_thread': mean per-task 'mb_s', or 0 if none succeeded.

    Raises:
        ValueError: If ``file_paths`` is empty, or if ``n_threads`` is not
            positive (raised by ThreadPoolExecutor).
    """
    if not file_paths:
        # Previously surfaced as an opaque ZeroDivisionError from the modulo
        raise ValueError("file_paths must contain at least one file")

    # One task per thread, cycling through the available files
    args_list = [
        (str(file_paths[i % len(file_paths)]), block_kb, i)
        for i in range(n_threads)
    ]

    start_time = time.perf_counter()

    results = []
    errors = []
    with ThreadPoolExecutor(max_workers=n_threads) as executor:
        futures = [executor.submit(threaded_read_worker, args) for args in args_list]

        for future in as_completed(futures):
            result = future.result()
            if 'error' in result:
                errors.append(result)
            else:
                results.append(result)

    total_time = time.perf_counter() - start_time

    per_thread_mb_s = [r['mb_s'] for r in results]

    return {
        'n_threads': n_threads,
        'total_time': total_time,
        'individual_results': results,
        'errors': errors,
        'aggregate_throughput': sum(per_thread_mb_s),
        'avg_per_thread': np.mean(per_thread_mb_s) if per_thread_mb_s else 0
    }

# Test different thread counts. The set removes the duplicate that appears
# when the CPU-derived value equals one of the fixed counts, sorting keeps the
# scaling curve monotonic in x, and `or 1` covers psutil.cpu_count()
# returning None when the count cannot be determined.
thread_counts = sorted({1, 2, 4, 8, min(16, psutil.cpu_count() or 1)})
concurrent_results = []

# Use a subset of files for concurrent testing
test_files_subset = test_files[:min(8, len(test_files))]

# Mean aggregate throughput of the single-thread run, used as the scaling
# baseline for every other thread count. Stays None if that run fails.
baseline_throughput = None

for n_threads in tqdm(thread_counts, desc="Thread counts"):
    # Best-effort: a failing thread count is reported and skipped.
    try:
        # Run multiple times for stability
        run_throughputs = [
            benchmark_concurrent_reads(test_files_subset, n_threads)['aggregate_throughput']
            for _ in range(3)
        ]
        mean_throughput = np.mean(run_throughputs)

        if n_threads == 1:
            baseline_throughput = mean_throughput

        # Parallel efficiency: speedup over the single-thread baseline divided
        # by the thread count (1.0 = ideal linear scaling). The earlier formula
        # divided by the first run of the *same* thread count, which always
        # came out close to 1 regardless of scaling.
        if n_threads == 1:
            scaling_efficiency = 1.0
        elif baseline_throughput is not None and baseline_throughput > 0:
            scaling_efficiency = mean_throughput / (baseline_throughput * n_threads)
        else:
            scaling_efficiency = float('nan')  # no usable baseline

        concurrent_results.append({
            'n_threads': n_threads,
            'aggregate_throughput_mb_s': mean_throughput,
            'throughput_std': np.std(run_throughputs),
            'scaling_efficiency': scaling_efficiency,
            'method': 'concurrent_reads'
        })

    except Exception as e:
        print(f"‚ö†Ô∏è Concurrent benchmark error @ {n_threads} threads: {e}")

concurrent_df = pd.DataFrame(concurrent_results)
if not concurrent_df.empty:
    concurrent_df.to_csv('../results/io_concurrent_detailed.csv', index=False)
    print(f"\nüìä Concurrent Read Results:")
    print(concurrent_df[['n_threads', 'aggregate_throughput_mb_s', 'scaling_efficiency']])
else:
    print("‚ö†Ô∏è No concurrent benchmark results collected")

In [None]:
# Comprehensive analysis and visualization
print("\nüìà Analysis and Visualization")
print("=" * 50)

# Combine all results for comprehensive analysis: each non-empty result table
# is copied and tagged with its benchmark type before concatenation.
all_results = []
for frame, label in (
    (seq_df, 'sequential'),
    (rand_df, 'random'),
    (mmap_df, 'memory_mapped'),
):
    if not frame.empty:
        tagged = frame.copy()
        tagged['benchmark_type'] = label
        all_results.append(tagged)

if all_results:
    combined_df = pd.concat(all_results, ignore_index=True)
    combined_df.to_csv('../results/io_benchmarks_combined.csv', index=False)

    # Create visualizations: 2x2 grid, a panel stays blank if its data is missing
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('I/O Performance Microbenchmarks', fontsize=16)

    # 1. Sequential throughput by block size
    if not seq_df.empty:
        seq_by_block = seq_df.groupby('block_kb')['throughput_mb_s'].agg(['mean', 'std']).reset_index()
        axes[0,0].errorbar(seq_by_block['block_kb'], seq_by_block['mean'],
                          yerr=seq_by_block['std'], marker='o', capsize=5)
        axes[0,0].set_xlabel('Block Size (KB)')
        axes[0,0].set_ylabel('Throughput (MB/s)')
        axes[0,0].set_title('Sequential Read Performance')
        axes[0,0].set_xscale('log', base=2)
        axes[0,0].grid(True, alpha=0.3)

    # 2. Random vs Sequential comparison
    if not rand_df.empty and not seq_df.empty:
        # Only block sizes measured by both benchmarks are comparable
        common_blocks = sorted(set(seq_df['block_kb']).intersection(set(rand_df['block_kb'])))
        if common_blocks:
            # reindex pins bar order to the tick labels explicitly
            seq_comp = seq_df.groupby('block_kb')['throughput_mb_s'].mean().reindex(common_blocks)
            rand_comp = rand_df.groupby('block_kb')['throughput_mb_s'].mean().reindex(common_blocks)

            x = np.arange(len(common_blocks))
            width = 0.35

            axes[0,1].bar(x - width/2, seq_comp.values, width, label='Sequential', alpha=0.8)
            axes[0,1].bar(x + width/2, rand_comp.values, width, label='Random', alpha=0.8)
            axes[0,1].set_xlabel('Block Size (KB)')
            axes[0,1].set_ylabel('Throughput (MB/s)')
            axes[0,1].set_title('Sequential vs Random Access')
            axes[0,1].set_xticks(x)
            axes[0,1].set_xticklabels([f'{int(b)}' for b in common_blocks])
            axes[0,1].legend()
            axes[0,1].grid(True, alpha=0.3)

    # 3. File size impact
    if not seq_df.empty and 'file_size_category' in seq_df.columns:
        size_impact = seq_df.groupby('file_size_category')['throughput_mb_s'].agg(['mean', 'std']).reset_index()
        axes[1,0].bar(size_impact['file_size_category'], size_impact['mean'],
                     yerr=size_impact['std'], capsize=5, alpha=0.8)
        axes[1,0].set_xlabel('File Size Category')
        axes[1,0].set_ylabel('Throughput (MB/s)')
        axes[1,0].set_title('Throughput by File Size')
        axes[1,0].grid(True, alpha=0.3)

    # 4. Concurrent scaling
    if not concurrent_df.empty:
        # Sort by thread count so the line is drawn left to right even if the
        # rows were recorded in a different order
        scaling_df = concurrent_df.sort_values('n_threads')
        axes[1,1].plot(scaling_df['n_threads'], scaling_df['aggregate_throughput_mb_s'],
                      marker='o', linewidth=2, markersize=8)
        axes[1,1].set_xlabel('Number of Threads')
        axes[1,1].set_ylabel('Aggregate Throughput (MB/s)')
        axes[1,1].set_title('Multi-threaded Scaling')
        axes[1,1].grid(True, alpha=0.3)

        # Add ideal scaling line. It needs the single-thread measurement,
        # which is absent if that run failed; skip the line in that case
        # rather than raising IndexError.
        single_thread_rows = scaling_df.loc[scaling_df['n_threads'] == 1, 'aggregate_throughput_mb_s']
        if len(scaling_df) > 1 and not single_thread_rows.empty:
            baseline = single_thread_rows.iloc[0]
            ideal_line = [baseline * t for t in scaling_df['n_threads']]
            axes[1,1].plot(scaling_df['n_threads'], ideal_line, '--', alpha=0.5, label='Ideal scaling')
            axes[1,1].legend()

    plt.tight_layout()
    plt.savefig('../results/io_benchmarks_analysis.png', dpi=150, bbox_inches='tight')
    plt.show()

    print("\nüìä BENCHMARK SUMMARY:")
    print("=" * 40)

    if not seq_df.empty:
        best_seq = seq_df.loc[seq_df['throughput_mb_s'].idxmax()]
        print(f"üèÜ Best Sequential: {best_seq['throughput_mb_s']:.1f} MB/s @ {best_seq['block_kb']}KB blocks")

    if not rand_df.empty:
        best_rand = rand_df.loc[rand_df['throughput_mb_s'].idxmax()]
        print(f"üé≤ Best Random: {best_rand['throughput_mb_s']:.1f} MB/s @ {best_rand['block_kb']}KB blocks")

    if not concurrent_df.empty:
        best_concurrent = concurrent_df.loc[concurrent_df['aggregate_throughput_mb_s'].idxmax()]
        print(f"üîÄ Best Concurrent: {best_concurrent['aggregate_throughput_mb_s']:.1f} MB/s @ {best_concurrent['n_threads']} threads")

    # The notebook writes io_sequential_detailed.csv, io_random_detailed.csv,
    # io_mmap_detailed.csv, io_concurrent_detailed.csv and
    # io_benchmarks_combined.csv, so the pattern shown must be io_*.csv.
    print(f"\nüìÑ Results saved to ../results/io_*.csv")
    print(f"üìà Visualization saved to ../results/io_benchmarks_analysis.png")

else:
    print("‚ö†Ô∏è No benchmark results to analyze")

In [None]:
# Performance recommendations and insights
print("\nüí° PERFORMANCE INSIGHTS & RECOMMENDATIONS")
print("=" * 60)

# Human-readable findings derived from the result tables; each section is
# skipped when the data it needs is missing or unusable.
insights = []

# Block size analysis
if not seq_df.empty:
    optimal_block = seq_df.loc[seq_df['throughput_mb_s'].idxmax(), 'block_kb']
    insights.append(f"üîß Optimal block size for sequential reads: {optimal_block}KB")

    # Check if larger blocks are consistently better: compare the mean
    # throughput of the largest block size against the smallest
    block_perf = seq_df.groupby('block_kb')['throughput_mb_s'].mean().sort_index()
    if len(block_perf) > 1:
        if block_perf.iloc[-1] > block_perf.iloc[0] * 1.5:
            insights.append("üìà Larger block sizes show significant performance gains")
        else:
            insights.append("üìä Block size impact is moderate - other factors may dominate")

# Random vs Sequential analysis
if not rand_df.empty and not seq_df.empty:
    avg_seq = seq_df['throughput_mb_s'].mean()
    avg_rand = rand_df['throughput_mb_s'].mean()

    # A zero or NaN random average would give an infinite or NaN ratio;
    # report nothing rather than a meaningless multiplier
    if avg_rand > 0:
        ratio = avg_seq / avg_rand

        if ratio > 10:
            insights.append(f"‚ö° Sequential access is {ratio:.1f}x faster - prioritize sequential data layouts")
        elif ratio > 3:
            insights.append(f"üìã Sequential access advantage ({ratio:.1f}x) - consider data organization")
        else:
            insights.append(f"üîç Random access penalty is moderate ({ratio:.1f}x) - storage may have good random performance")

# Concurrency analysis
if not concurrent_df.empty and len(concurrent_df) > 1:
    single_thread_rows = concurrent_df.loc[concurrent_df['n_threads'] == 1, 'aggregate_throughput_mb_s']

    # The single-thread row is missing if that run failed; without a positive
    # baseline no speedup can be computed
    if not single_thread_rows.empty and single_thread_rows.iloc[0] > 0:
        single_thread = single_thread_rows.iloc[0]
        max_threads_row = concurrent_df.loc[concurrent_df['aggregate_throughput_mb_s'].idxmax()]
        max_throughput = max_threads_row['aggregate_throughput_mb_s']
        optimal_threads = max_threads_row['n_threads']
        scaling = max_throughput / single_thread

        # When one thread is already the fastest configuration the speedup is
        # 1.0x, which would pass the "> 0.8 * threads" test below and be
        # reported as excellent scaling; treat it as limited scaling instead.
        if optimal_threads > 1 and scaling > optimal_threads * 0.8:  # Good scaling
            insights.append(f"üöÄ Excellent scaling: {scaling:.1f}x speedup with {optimal_threads} threads")
        elif optimal_threads > 1 and scaling > optimal_threads * 0.5:  # Moderate scaling
            insights.append(f"üìà Moderate scaling: {scaling:.1f}x speedup with {optimal_threads} threads")
        else:  # Poor scaling
            insights.append(f"‚ö†Ô∏è Limited scaling: {scaling:.1f}x speedup - I/O may be bottleneck")

# Memory-mapped analysis
if not mmap_df.empty:
    mmap_seq = mmap_df[mmap_df['access_pattern'] == 'mmap_sequential']['throughput_mb_s'].mean()
    if not seq_df.empty:
        regular_seq = seq_df['throughput_mb_s'].mean()

        # mmap_seq is NaN when no sequential mmap rows exist; comparing NaN
        # would fall through to the "perform similarly" message
        if pd.notna(mmap_seq) and regular_seq > 0:
            mmap_advantage = mmap_seq / regular_seq

            if mmap_advantage > 1.2:
                insights.append(f"üó∫Ô∏è Memory mapping shows {mmap_advantage:.1f}x advantage - consider for large files")
            elif mmap_advantage < 0.8:
                insights.append(f"üìÅ Regular file I/O outperforms memory mapping - system may have good page cache")
            else:
                insights.append(f"‚öñÔ∏è Memory mapping and regular I/O perform similarly")

# General recommendations
print("\nüéØ KEY INSIGHTS:")
for i, insight in enumerate(insights, 1):
    print(f"   {i}. {insight}")

print("\nüîß OPTIMIZATION RECOMMENDATIONS:")
recommendations = [
    "Configure data loaders with optimal block sizes identified above",
    "Use sequential access patterns when possible (sort data, use columnar formats)",
    "Consider multi-threaded data loading based on scaling results",
    "For large datasets, test memory mapping vs regular I/O",
    "Monitor storage utilization during ML training to identify bottlenecks",
    "Consider NVMe caching for frequently accessed data",
    "Use compression if CPU is available and storage is slow"
]

for i, rec in enumerate(recommendations, 1):
    print(f"   {i}. {rec}")

print("\n‚úÖ I/O Microbenchmarks Complete!")
print(f"üìä Check ../results/ for detailed CSV files and analysis plots")