# Flight Router Benchmarking Suite

**Three separate benchmarks** for accurate performance measurement:

| Benchmark | What it measures | API calls | Purpose |
|-----------|------------------|-----------|----------|
| **Part A: Routing** | Pure algorithm performance | None | Algorithm optimization |
| **Part B: Validation** | API latency only | Yes (fixed routes) | API capacity planning |
| **Part C: Full Pipeline** | End-to-end performance | Yes | Realistic user experience |

## 1. Setup

In [None]:
import sys
import os
import time
import asyncio
import itertools
import statistics
from datetime import datetime, date
from pathlib import Path
from dataclasses import dataclass, field
from typing import List, Set, Dict, Any, Optional

import matplotlib.pyplot as plt
import numpy as np

# Setup path: put the project root on sys.path so the `src.flight_router`
# imports that follow can be resolved.
# NOTE(review): assumes the working directory is three levels below the
# project root — confirm against the repository layout.
NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = NOTEBOOK_DIR.parent.parent.parent
# Insert only once so re-running the cell does not add duplicate entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Core imports
from src.flight_router.application import FindOptimalRoutes
from src.flight_router.services import RouteValidationService
from src.flight_router.adapters.validators import DuffelOfferValidator
from src.flight_router.schemas.validation import ValidationConfig

# Configuration
# NOTE(review): DEMO_DB is relative to the current working directory — confirm
# the notebook is always started from the directory that contains data/.
DEMO_DB = Path("data/demo_flights.db")
DEPARTURE_DATE = datetime(2026, 7, 13)
RETURN_DATE = datetime(2026, 7, 19)

# API Token for validation (empty string when the variable is not set)
DUFFEL_API_TOKEN = os.environ.get("DUFFEL_API_TOKEN", "")
HAS_API_TOKEN = bool(DUFFEL_API_TOKEN)


def mask_token(token: str) -> str:
    """Return a display-safe preview of an API token.

    Notebook output is stored with the notebook, so the preview must never
    reveal most of the secret. The first 8 and last 4 characters are shown
    only when at least as many characters stay hidden; shorter tokens are
    replaced entirely by asterisks.

    Args:
        token: The secret to preview.

    Returns:
        ``"<first 8>...<last 4>"`` for tokens of 24 characters or more,
        otherwise a string of ``*`` with the same length as the token.
    """
    head, tail = 8, 4
    if len(token) < 2 * (head + tail):
        return "*" * len(token)
    return f"{token[:head]}...{token[-tail:]}"


print("Setup complete!")
print(f"Database: {DEMO_DB}")
print(f"Dates: {DEPARTURE_DATE.date()} to {RETURN_DATE.date()}")
if HAS_API_TOKEN:
    print(f"API Token: {mask_token(DUFFEL_API_TOKEN)}")
else:
    print("No API token - Parts B and C will be skipped")

In [None]:
# Discover available destinations
ORIGIN = "WAW"

# Ask the router which airports it knows about and keep those for which
# has_route(ORIGIN, city) is true.
with FindOptimalRoutes(db_path=DEMO_DB) as router:
    all_airports = router.get_available_airports()
    reachable = {city for city in all_airports if router.has_route(ORIGIN, city)}

# Candidate hubs in a fixed order. DESTINATIONS preserves that order, which
# matters because the benchmarks build combinations from DESTINATIONS[:k+2].
CANDIDATE_HUBS = ['LHR', 'CDG', 'FRA', 'AMS', 'MUC', 'FCO', 'VIE', 'ZRH', 'BRU', 'BCN', 'MAD', 'CPH']
DESTINATIONS = [h for h in CANDIDATE_HUBS if h in reachable]

print(f"Origin: {ORIGIN}")
# Only the first eight destinations are shown; the full count follows.
print(f"Available destinations: {DESTINATIONS[:8]}...")
print(f"Total: {len(DESTINATIONS)} destinations")

---

# Part A: Routing Benchmark

**Measures**: Pure routing algorithm performance (no API calls)

**Why separate**: Routing has O(2^k) complexity due to power-set state space.
The algorithm tracks which cities are visited (2^k subsets), not visit order (k! permutations).

In [None]:
@dataclass
class RoutingBenchmarkConfig:
    """Tunable parameters for the routing-only benchmark (Part A).

    Attributes:
        origin: Airport code every search starts from.
        max_k: Largest number of destinations per search.
        combos_per_k: Destination combinations timed for each k.
        runs_per_combo: Timed repetitions of each combination.
        warmup_runs: Untimed searches executed before the timed runs of each
            combination; they are not recorded in the results.
    """

    origin: str = "WAW"
    max_k: int = 3
    combos_per_k: int = 3
    runs_per_combo: int = 3
    warmup_runs: int = 1


@dataclass
class RoutingResult:
    """One timed routing search.

    Attributes:
        k: Number of destinations in the search.
        destinations: The destination combination that was searched.
        run_index: Zero-based repetition number for this combination.
        time_ms: Wall-clock duration of the search in milliseconds.
        num_routes: Number of routes the search returned.
    """

    k: int
    destinations: tuple
    run_index: int
    time_ms: float
    num_routes: int


# Part A configuration. The values equal the dataclass defaults; they are
# spelled out here so they are easy to change in one place.
routing_config = RoutingBenchmarkConfig(
    max_k=3,
    combos_per_k=3,
    runs_per_combo=3,
)

print("Routing Benchmark Configuration:")
print(f"  Max k: {routing_config.max_k}")
print(f"  Combinations per k: {routing_config.combos_per_k}")
print(f"  Runs per combination: {routing_config.runs_per_combo}")

# Estimate the number of timed runs. For each k the benchmark draws
# combinations from the first k+2 destinations and keeps at most combos_per_k
# of them, each repeated runs_per_combo times. Warmup runs are not counted.
total_runs = sum(
    min(routing_config.combos_per_k, len(list(itertools.combinations(DESTINATIONS[:k+2], k))))
    * routing_config.runs_per_combo
    for k in range(1, routing_config.max_k + 1)
)
print(f"  Total runs: {total_runs}")

In [None]:
def run_routing_benchmark(config: RoutingBenchmarkConfig, destinations: List[str]) -> List[RoutingResult]:
    """Time the router on growing destination sets, without any API calls.

    For every k from 1 to ``config.max_k`` the first ``config.combos_per_k``
    combinations of size k are taken from the first k+2 destinations. Each
    combination is searched ``config.warmup_runs`` times untimed and then
    ``config.runs_per_combo`` times with a wall-clock timer around the search.

    Args:
        config: Benchmark parameters.
        destinations: Ordered destination codes to build combinations from.

    Returns:
        One RoutingResult per timed search, in execution order.
    """
    collected = []

    with FindOptimalRoutes(db_path=DEMO_DB) as router:
        for k in range(1, config.max_k + 1):
            if len(destinations) < k:
                print(f"Skipping k={k}: not enough destinations")
                continue

            # Combinations come from a small prefix so the set of combos stays bounded.
            pool = destinations[:k + 2]
            combos = list(itertools.combinations(pool, k))[:config.combos_per_k]

            print(f"\nk={k}: {len(combos)} combinations x {config.runs_per_combo} runs")
            print("-" * 50)

            for combo in combos:
                search_kwargs = dict(
                    origin=config.origin,
                    destinations=set(combo),
                    departure_date=DEPARTURE_DATE,
                    return_date=RETURN_DATE,
                )

                # Untimed warmup searches; their results are discarded.
                for _ in range(config.warmup_runs):
                    router.search(**search_kwargs)

                combo_times = []
                num_routes = 0
                for attempt in range(config.runs_per_combo):
                    t0 = time.perf_counter()
                    found = router.search(**search_kwargs)
                    elapsed_ms = (time.perf_counter() - t0) * 1000

                    num_routes = len(found)
                    combo_times.append(elapsed_ms)
                    collected.append(
                        RoutingResult(
                            k=k,
                            destinations=combo,
                            run_index=attempt,
                            time_ms=elapsed_ms,
                            num_routes=num_routes,
                        )
                    )

                avg_time = statistics.mean(combo_times)
                # A standard deviation needs at least two samples.
                std_time = 0 if len(combo_times) <= 1 else statistics.stdev(combo_times)
                print(f"  {combo}: {avg_time:.0f}ms (+/-{std_time:.0f}ms), {num_routes} routes")

    return collected


# Run Part A on the discovered destinations; later cells read routing_results.
print("Starting Routing Benchmark...")
print("=" * 60)
routing_results = run_routing_benchmark(routing_config, DESTINATIONS)
print(f"\nCompleted {len(routing_results)} routing benchmark runs")

In [None]:
# Analyze routing results: group the timed runs by k, then summarise each group.
by_k = {}
for r in routing_results:
    by_k.setdefault(r.k, []).append(r)

routing_stats = {}
for k in sorted(by_k):
    runs = by_k[k]
    times = [r.time_ms for r in runs]
    routes = [r.num_routes for r in runs]
    routing_stats[k] = dict(
        mean=statistics.mean(times),
        # A standard deviation needs at least two samples.
        std=0 if len(times) <= 1 else statistics.stdev(times),
        min=min(times),
        max=max(times),
        avg_routes=statistics.mean(routes),
        n=len(times),
    )

print("Routing Benchmark Results")
print("=" * 60)
print(f"{'k':>3} | {'Mean (ms)':>12} | {'Std':>8} | {'Min':>8} | {'Max':>8} | {'Routes':>8}")
print("-" * 60)
for k, s in routing_stats.items():
    row = f"{k:>3} | {s['mean']:>12.0f} | {s['std']:>8.0f} | {s['min']:>8.0f} | {s['max']:>8.0f} | {s['avg_routes']:>8.1f}"
    print(row)

# Scaling analysis: ratio of mean times between consecutive k values.
print("\nScaling Analysis:")
k_values = sorted(routing_stats.keys())
for k_prev, k_curr in zip(k_values, k_values[1:]):
    ratio = routing_stats[k_curr]['mean'] / routing_stats[k_prev]['mean']
    print(f"  k={k_prev} -> k={k_curr}: {ratio:.2f}x increase")

---

# Part B: Validation Benchmark

**Measures**: Pure API latency (fixed routes, no routing overhead)

**Why separate**: 
- Uses fixed, pre-computed routes to isolate API latency
- Strict pacing (50 requests/minute by default, i.e. at least 1.2 s between validation calls) to avoid rate-limiting variance
- Measures what matters for capacity planning: API response time

In [None]:
@dataclass
class ValidationBenchmarkConfig:
    """Tunable parameters for the validation-only benchmark (Part B).

    Attributes:
        num_routes_to_validate: How many routes are taken from the search
            result and validated.
        runs_per_route: Number of times each route is validated.
        requests_per_minute: Pacing target; the minimum gap between
            validation calls is ``60 / requests_per_minute`` seconds.
            Kept below the limit to avoid HTTP 429 responses.
    """

    num_routes_to_validate: int = 5
    runs_per_route: int = 2
    requests_per_minute: float = 50.0


@dataclass
class ValidationResult:
    """One timed validation call for a single route.

    Attributes:
        route_index: Position of the route in the list being benchmarked.
        run_index: Zero-based repetition number for this route.
        num_segments: Number of segments in the route.
        validation_time_ms: Wall-clock duration of the call in milliseconds.
        status: Validation status value, or "N/A" when none was returned.
        confidence: Average confidence reported by the validation, or 0.0
            when none was returned.
        is_bookable: Whether the validated route was reported as bookable.
    """

    route_index: int
    run_index: int
    num_segments: int
    validation_time_ms: float
    status: str
    confidence: float
    is_bookable: bool


# Part B configuration. The values equal the dataclass defaults; they are
# spelled out here so they are easy to change in one place.
validation_config = ValidationBenchmarkConfig(
    num_routes_to_validate=5,
    runs_per_route=2,
    requests_per_minute=50.0,
)

print("Validation Benchmark Configuration:")
print(f"  Routes to validate: {validation_config.num_routes_to_validate}")
print(f"  Runs per route: {validation_config.runs_per_route}")
print(f"  Rate limit: {validation_config.requests_per_minute} req/min")
# 60 s divided by the per-minute budget gives the enforced gap between calls.
print(f"  Min interval: {60.0 / validation_config.requests_per_minute:.2f}s between requests")

if not HAS_API_TOKEN:
    print("\nWARNING: No API token - this benchmark will be skipped")

In [None]:
async def run_validation_benchmark(
    config: ValidationBenchmarkConfig,
) -> List[ValidationResult]:
    """Time validation calls on a fixed set of routes, paced between calls.

    One WAW -> {LHR, CDG} search provides the routes; the first
    ``config.num_routes_to_validate`` of them are each validated
    ``config.runs_per_route`` times. Only the validation call is inside the
    timer; the pacing sleep happens before the timer starts.

    Args:
        config: Benchmark parameters.

    Returns:
        One ValidationResult per validation call, or an empty list when no
        API token is configured or the search returns no routes.
    """
    if not HAS_API_TOKEN:
        print("Skipping: No API token")
        return []

    min_interval = 60.0 / config.requests_per_minute
    collected = []

    # Get some routes to validate (using k=2 for variety)
    print("Getting routes to validate...")
    with FindOptimalRoutes(db_path=DEMO_DB) as router:
        found = router.search(
            origin=ORIGIN,
            destinations={'LHR', 'CDG'},
            departure_date=DEPARTURE_DATE,
            return_date=RETURN_DATE,
        )
        test_routes = found[:config.num_routes_to_validate]

    if not test_routes:
        print("No routes found to validate")
        return []

    print(f"Found {len(test_routes)} routes to benchmark")

    validator_service = RouteValidationService(
        DuffelOfferValidator(api_token=DUFFEL_API_TOKEN)
    )

    print(f"\nValidating {len(test_routes)} routes x {config.runs_per_route} runs")
    print(f"Pacing: {min_interval:.1f}s between validation requests")
    print("-" * 60)

    # 0.0 means "no call made yet": the very first call is never delayed.
    last_request_time = 0.0

    for route_idx, route in enumerate(test_routes):
        num_segments = len(route.segments)

        for run_idx in range(config.runs_per_route):
            # Sleep off whatever remains of the minimum gap since the last call.
            if last_request_time > 0:
                remaining = min_interval - (time.perf_counter() - last_request_time)
                if remaining > 0:
                    await asyncio.sleep(remaining)

            t0 = time.perf_counter()
            validated = await validator_service.validate_routes(
                routes=[route],
                departure_date=DEPARTURE_DATE.date(),
                validate_top_n=1,
            )
            elapsed_ms = (time.perf_counter() - t0) * 1000
            last_request_time = time.perf_counter()

            # Fall back to neutral values when nothing usable came back.
            vr = validated[0] if validated else None
            if vr and vr.validation:
                status = vr.validation.status.value
                confidence = vr.validation.average_confidence
            else:
                status = "N/A"
                confidence = 0.0
            is_bookable = vr.is_bookable if vr else False

            collected.append(
                ValidationResult(
                    route_index=route_idx,
                    run_index=run_idx,
                    num_segments=num_segments,
                    validation_time_ms=elapsed_ms,
                    status=status,
                    confidence=confidence,
                    is_bookable=is_bookable,
                )
            )

            print(f"  Route {route_idx+1} run {run_idx+1}: {elapsed_ms:.0f}ms, status={status}, confidence={confidence:.1f}%")

    return collected


# Run Part B. The top-level `await` works because notebook cells (IPython)
# support it; it would be a SyntaxError in a plain Python script.
if HAS_API_TOKEN:
    print("Starting Validation Benchmark...")
    print("=" * 60)
    validation_results = await run_validation_benchmark(validation_config)
    print(f"\nCompleted {len(validation_results)} validation benchmark runs")
else:
    print("Validation benchmark skipped (no API token)")
    # Keep the name defined so later cells can test it for truthiness.
    validation_results = []

In [None]:
# Summarise Part B. validation_stats is set to None when there is nothing to
# analyse so later cells can test it for truthiness.
if validation_results:
    times = [r.validation_time_ms for r in validation_results]
    validation_stats = {
        'mean': statistics.mean(times),
        'std': statistics.stdev(times) if len(times) > 1 else 0,  # stdev needs >= 2 samples
        'min': min(times),
        'max': max(times),
        'p50': statistics.median(times),
        'n': len(times),
        # Share of runs reported as bookable, as a percentage.
        'bookable_rate': sum(1 for r in validation_results if r.is_bookable) / len(validation_results) * 100,
        'avg_confidence': statistics.mean(r.confidence for r in validation_results),
    }
    
    # Coefficient of variation (std relative to mean, in percent); guarded
    # against a zero mean.
    cv = (validation_stats['std'] / validation_stats['mean'] * 100) if validation_stats['mean'] > 0 else 0
    
    print("Validation Benchmark Results")
    print("=" * 60)
    print(f"Samples: {validation_stats['n']}")
    print(f"Mean: {validation_stats['mean']:.0f}ms")
    print(f"Std dev: {validation_stats['std']:.0f}ms")
    print(f"Min: {validation_stats['min']:.0f}ms")
    print(f"Max: {validation_stats['max']:.0f}ms")
    print(f"Median (P50): {validation_stats['p50']:.0f}ms")
    print(f"Coefficient of Variation: {cv:.1f}%")
    print(f"\nBookable rate: {validation_stats['bookable_rate']:.1f}%")
    print(f"Avg confidence: {validation_stats['avg_confidence']:.1f}%")
    
    # 30% is the threshold this notebook uses to call the latency consistent.
    if cv < 30:
        print("\nVariance is LOW - results are consistent")
    else:
        print("\nVariance is HIGH - consider stricter pacing")
else:
    validation_stats = None
    print("No validation results to analyze")

---

# Part C: Full Pipeline Benchmark

**Measures**: Realistic end-to-end performance (routing + validation)

**When to use**: Understanding total user-facing latency for production planning.

In [None]:
@dataclass
class PipelineBenchmarkConfig:
    """Tunable parameters for the full pipeline benchmark (Part C).

    The defaults are deliberately small because every run includes API time.

    Attributes:
        origin: Airport code every search starts from.
        max_k: Largest number of destinations per search.
        combos_per_k: Destination combinations run for each k.
        runs_per_combo: Repetitions of each combination.
        validate_top_n: Passed to the validation service as the number of
            top routes to validate.
        requests_per_minute: Pacing target; the minimum gap between
            validation calls is ``60 / requests_per_minute`` seconds.
    """

    origin: str = "WAW"
    max_k: int = 2
    combos_per_k: int = 2
    runs_per_combo: int = 2
    validate_top_n: int = 2
    requests_per_minute: float = 50.0


@dataclass
class PipelineResult:
    """One end-to-end run: a routing search followed by validation.

    Attributes:
        k: Number of destinations in the search.
        destinations: The destination combination that was searched.
        run_index: Zero-based repetition number for this combination.
        routing_time_ms: Wall-clock duration of the search in milliseconds.
        validation_time_ms: Wall-clock duration of the validation call in
            milliseconds (pacing sleeps excluded).
        num_routes_found: Number of routes the search returned.
        num_routes_validated: Routes flagged as validated.
        num_routes_bookable: Routes flagged as bookable.
    """

    k: int
    destinations: tuple
    run_index: int
    routing_time_ms: float
    validation_time_ms: float
    num_routes_found: int
    num_routes_validated: int
    num_routes_bookable: int

    @property
    def total_time_ms(self) -> float:
        """Routing plus validation time, in milliseconds."""
        routing, validation = self.routing_time_ms, self.validation_time_ms
        return routing + validation


# Part C configuration. The values equal the dataclass defaults; they are
# spelled out here so they are easy to change in one place.
pipeline_config = PipelineBenchmarkConfig(
    max_k=2,
    combos_per_k=2,
    runs_per_combo=2,
    validate_top_n=2,
    requests_per_minute=50.0,
)

print("Full Pipeline Benchmark Configuration:")
print(f"  Max k: {pipeline_config.max_k}")
print(f"  Combinations per k: {pipeline_config.combos_per_k}")
print(f"  Runs per combination: {pipeline_config.runs_per_combo}")
print(f"  Validate top N: {pipeline_config.validate_top_n}")
print(f"  Rate limit: {pipeline_config.requests_per_minute} req/min")

if not HAS_API_TOKEN:
    print("\nWARNING: No API token - this benchmark will be skipped")

In [None]:
async def run_pipeline_benchmark(
    config: PipelineBenchmarkConfig,
    destinations: List[str],
) -> List[PipelineResult]:
    """Time routing and validation together for growing destination sets.

    For every k from 1 to ``config.max_k`` the first ``config.combos_per_k``
    combinations of size k are taken from the first k+2 destinations, and each
    is run ``config.runs_per_combo`` times. The search and the validation call
    are timed separately; the pacing sleep between them is not timed.

    NOTE(review): pacing is applied once per validation call. If a call with
    ``validate_top_n > 1`` issues several API requests, the real request rate
    can exceed ``requests_per_minute`` — confirm against the validator.

    Args:
        config: Benchmark parameters.
        destinations: Ordered destination codes to build combinations from.

    Returns:
        One PipelineResult per run, or an empty list without an API token.
    """
    if not HAS_API_TOKEN:
        print("Skipping: No API token")
        return []

    collected = []
    min_interval = 60.0 / config.requests_per_minute
    # 0.0 means "no call made yet": the very first call is never delayed.
    last_request_time = 0.0

    validator_service = RouteValidationService(
        DuffelOfferValidator(api_token=DUFFEL_API_TOKEN)
    )

    with FindOptimalRoutes(db_path=DEMO_DB) as router:
        for k in range(1, config.max_k + 1):
            if len(destinations) < k:
                continue

            pool = destinations[:k + 2]
            combos = list(itertools.combinations(pool, k))[:config.combos_per_k]

            print(f"\nk={k}: {len(combos)} combinations x {config.runs_per_combo} runs")
            print("-" * 60)

            for combo in combos:
                targets = set(combo)

                for run_idx in range(config.runs_per_combo):
                    # Phase 1: Routing
                    t_route = time.perf_counter()
                    routes = router.search(
                        origin=config.origin,
                        destinations=targets,
                        departure_date=DEPARTURE_DATE,
                        return_date=RETURN_DATE,
                    )
                    routing_time_ms = (time.perf_counter() - t_route) * 1000

                    # Phase 2: Validation, after sleeping off the rest of the minimum gap
                    if last_request_time > 0:
                        remaining = min_interval - (time.perf_counter() - last_request_time)
                        if remaining > 0:
                            await asyncio.sleep(remaining)

                    t_validate = time.perf_counter()
                    validated = await validator_service.validate_routes(
                        routes=routes,
                        departure_date=DEPARTURE_DATE.date(),
                        validate_top_n=config.validate_top_n,
                    )
                    validation_time_ms = (time.perf_counter() - t_validate) * 1000
                    last_request_time = time.perf_counter()

                    num_validated = len([v for v in validated if v.is_validated])
                    num_bookable = len([v for v in validated if v.is_bookable])

                    collected.append(
                        PipelineResult(
                            k=k,
                            destinations=combo,
                            run_index=run_idx,
                            routing_time_ms=routing_time_ms,
                            validation_time_ms=validation_time_ms,
                            num_routes_found=len(routes),
                            num_routes_validated=num_validated,
                            num_routes_bookable=num_bookable,
                        )
                    )

                    print(f"  {combo} run {run_idx+1}: routing={routing_time_ms:.0f}ms, "
                          f"validation={validation_time_ms:.0f}ms, total={routing_time_ms + validation_time_ms:.0f}ms")

    return collected


# Run Part C. The top-level `await` works because notebook cells (IPython)
# support it; it would be a SyntaxError in a plain Python script.
if HAS_API_TOKEN:
    print("Starting Full Pipeline Benchmark...")
    print("=" * 60)
    pipeline_results = await run_pipeline_benchmark(pipeline_config, DESTINATIONS)
    print(f"\nCompleted {len(pipeline_results)} pipeline benchmark runs")
else:
    print("Full pipeline benchmark skipped (no API token)")
    # Keep the name defined so later cells can test it for truthiness.
    pipeline_results = []

In [None]:
# Summarise Part C per k. pipeline_stats is set to None when there is nothing
# to analyse so later cells can test it for truthiness.
if pipeline_results:
    pipeline_stats = {}
    # Group the runs by number of destinations.
    by_k = {}
    for r in pipeline_results:
        if r.k not in by_k:
            by_k[r.k] = []
        by_k[r.k].append(r)
    
    for k, runs in sorted(by_k.items()):
        routing_times = [r.routing_time_ms for r in runs]
        validation_times = [r.validation_time_ms for r in runs]
        total_times = [r.total_time_ms for r in runs]
        
        pipeline_stats[k] = {
            'routing_mean': statistics.mean(routing_times),
            'validation_mean': statistics.mean(validation_times),
            'total_mean': statistics.mean(total_times),
            'total_std': statistics.stdev(total_times) if len(total_times) > 1 else 0,  # stdev needs >= 2 samples
            'avg_routes': statistics.mean(r.num_routes_found for r in runs),
            'avg_bookable': statistics.mean(r.num_routes_bookable for r in runs),
        }
    
    print("Full Pipeline Benchmark Results")
    print("=" * 80)
    print(f"{'k':>3} | {'Routing (ms)':>14} | {'Validation (ms)':>16} | {'Total (ms)':>14} | {'Bookable':>8}")
    print("-" * 80)
    for k, s in pipeline_stats.items():
        print(f"{k:>3} | {s['routing_mean']:>14.0f} | {s['validation_mean']:>16.0f} | "
              f"{s['total_mean']:>14.0f} | {s['avg_bookable']:>8.1f}")
    
    # Share of the mean total time spent in each phase, for the largest k measured.
    print("\nTime breakdown at max k:")
    max_k = max(pipeline_stats.keys())
    s = pipeline_stats[max_k]
    total = s['total_mean']
    print(f"  Routing:    {s['routing_mean']:>8.0f}ms ({s['routing_mean']/total*100:.1f}%)")
    print(f"  Validation: {s['validation_mean']:>8.0f}ms ({s['validation_mean']/total*100:.1f}%)")
else:
    pipeline_stats = None
    print("No pipeline results to analyze")

---

# Visualization

In [None]:
# Complexity fitting function
def fit_exponential(k_vals, times):
    """
    Fit the model T(k) = c × b^k by least squares on log(T) versus k.

    Taking logs turns the model into a straight line,
    log(T) = log(c) + k × log(b), so a degree-1 polynomial fit recovers both
    parameters.

    Returns (c, b, r_squared) where:
    - c: multiplicative coefficient
    - b: base of the exponential (close to 2.0 for O(2^k) behaviour)
    - r_squared: goodness of fit measured in log space; 0 when the log-times
      have no variance
    """
    xs = np.array(k_vals)
    ys = np.log(times)

    # np.polyfit returns the highest-degree coefficient first.
    coeffs = np.polyfit(xs, ys, 1)
    slope, intercept = coeffs[0], coeffs[1]

    # Residual and total sums of squares, both in log space.
    residuals = ys - (intercept + slope * xs)
    ss_res = np.sum(residuals ** 2)
    centered = ys - np.mean(ys)
    ss_tot = np.sum(centered ** 2)

    if ss_tot > 0:
        r_squared = 1 - (ss_res / ss_tot)
    else:
        r_squared = 0

    return np.exp(intercept), np.exp(slope), r_squared

# Fit the complexity model to the mean routing time per k
k_vals = sorted(routing_stats.keys())
means = [routing_stats[k]['mean'] for k in k_vals]
stds = [routing_stats[k]['std'] for k in k_vals]

c_fit, b_fit, r_sq = fit_exponential(k_vals, means)

# Determine if O(2^k) is confirmed: the fitted base must be close to 2 and the
# log-linear fit must explain almost all of the variance.
is_exp_2 = (1.7 < b_fit < 2.4) and (r_sq > 0.95)
deviation_from_2 = abs(b_fit - 2.0) / 2.0 * 100

print("Complexity Analysis:")
print(f"  Fitted model: T(k) = {c_fit:.0f} × {b_fit:.2f}^k")
print(f"  Fitted base: {b_fit:.3f} (deviation from 2.0: {deviation_from_2:.1f}%)")
print(f"  R² (log space): {r_sq:.4f}")
print(f"  Expected for O(2^k): base ≈ 2.0, R² ≈ 1.0")
print(f"  Conclusion: {'O(2^k) confirmed' if is_exp_2 else 'Check fit details'}")

# Determine what to plot based on available data
has_validation = bool(validation_results)
has_pipeline = bool(pipeline_results)

# Row 1 always holds the two routing plots. A second row is added as soon as
# either Part B or Part C produced data, so a benchmark that ran is never
# silently left out of the figure. squeeze=False keeps `axes` two-dimensional
# for every grid shape, so axes[row][col] indexing always works.
n_rows = 2 if (has_validation or has_pipeline) else 1
fig, axes = plt.subplots(n_rows, 2, figsize=(14, 5 * n_rows), squeeze=False)

fig.suptitle('Flight Router Benchmark Results', fontsize=14, fontweight='bold')

# Plot 1: Routing benchmark results (mean per k, error bars = std dev)
ax1 = axes[0][0]
ax1.bar(k_vals, means, yerr=stds, capsize=5, color='#2196F3', alpha=0.8)
ax1.set_xlabel('Number of Destinations (k)')
ax1.set_ylabel('Time (ms)')
ax1.set_title('Part A: Routing Performance')
ax1.set_xticks(k_vals)
ax1.grid(True, alpha=0.3, axis='y')

# Plot 2: Routing scaling (log) with theoretical fit
ax2 = axes[0][1]

# Observed data points
ax2.semilogy(k_vals, means, 'o', color='#2196F3', markersize=12, label='Observed', zorder=3)

# Theoretical fit line, extended slightly beyond the measured k range
k_theory = np.linspace(min(k_vals) - 0.2, max(k_vals) + 0.2, 100)
t_theory = c_fit * (b_fit ** k_theory)
ax2.semilogy(k_theory, t_theory, '--', color='#E53935', linewidth=2,
             label=f'Fit: {c_fit:.0f} × {b_fit:.2f}^k (R²={r_sq:.3f})', zorder=2)

# Reference O(2^k) line (normalized to first data point)
c_ref = means[0] / (2 ** k_vals[0])
t_ref = c_ref * (2 ** k_theory)
ax2.semilogy(k_theory, t_ref, ':', color='#4CAF50', linewidth=2, alpha=0.7,
             label='Reference: O(2^k)', zorder=1)

ax2.set_xlabel('Number of Destinations (k)')
ax2.set_ylabel('Time (ms) - log scale')
ax2.set_title(f'Routing Scaling: O({b_fit:.2f}^k) ≈ O(2^k)')
ax2.set_xticks(k_vals)
ax2.legend(loc='upper left')
ax2.grid(True, alpha=0.3)

if n_rows == 2:
    # Plot 3: Validation latency distribution (panel hidden when Part B has no data)
    ax3 = axes[1][0]
    if has_validation:
        times = [r.validation_time_ms for r in validation_results]
        ax3.hist(times, bins=10, color='#FF9800', alpha=0.8, edgecolor='white')
        ax3.axvline(validation_stats['mean'], color='red', linestyle='--', label=f"Mean: {validation_stats['mean']:.0f}ms")
        ax3.set_xlabel('Validation Time (ms)')
        ax3.set_ylabel('Frequency')
        ax3.set_title('Part B: Validation Latency Distribution')
        ax3.legend()
        ax3.grid(True, alpha=0.3, axis='y')
    else:
        ax3.set_visible(False)

    # Plot 4: Pipeline breakdown (panel hidden when Part C has no data)
    ax4 = axes[1][1]
    if has_pipeline:
        k_vals_p = sorted(pipeline_stats.keys())
        routing_p = [pipeline_stats[k]['routing_mean'] for k in k_vals_p]
        validation_p = [pipeline_stats[k]['validation_mean'] for k in k_vals_p]
        x = np.arange(len(k_vals_p))
        # Stack validation on top of routing so the bar height is the total time.
        ax4.bar(x, routing_p, 0.6, label='Routing', color='#2196F3', alpha=0.8)
        ax4.bar(x, validation_p, 0.6, bottom=routing_p, label='Validation', color='#FF9800', alpha=0.8)
        ax4.set_xlabel('Number of Destinations (k)')
        ax4.set_ylabel('Time (ms)')
        ax4.set_title('Part C: Full Pipeline Breakdown')
        ax4.set_xticks(x)
        ax4.set_xticklabels(k_vals_p)
        ax4.legend()
        ax4.grid(True, alpha=0.3, axis='y')
    else:
        ax4.set_visible(False)

plt.tight_layout()
plt.savefig('benchmark_results.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nPlot saved to: benchmark_results.png")

---

# Export Results

In [None]:
import json
from datetime import datetime as dt

# Collect everything into one JSON-serialisable structure. The 'validation'
# and 'pipeline' entries are None when the corresponding benchmark produced
# no statistics.
export_data = {
    'timestamp': dt.now().isoformat(),
    'benchmarks': {
        'routing': {
            'config': {
                'max_k': routing_config.max_k,
                'combos_per_k': routing_config.combos_per_k,
                'runs_per_combo': routing_config.runs_per_combo,
            },
            # Keys are converted to strings so they are explicit JSON object keys.
            'statistics': {str(k): v for k, v in routing_stats.items()},
            'raw_results': [
                {'k': r.k, 'destinations': list(r.destinations), 'time_ms': r.time_ms, 'num_routes': r.num_routes}
                for r in routing_results
            ],
        },
        'validation': {
            'statistics': validation_stats,
            'raw_results': [
                {'route_index': r.route_index, 'time_ms': r.validation_time_ms, 'status': r.status, 'confidence': r.confidence}
                for r in validation_results
            ] if validation_results else [],
        } if validation_stats else None,
        # The conditional expression evaluates the dict only when pipeline_stats
        # is truthy, so .items() is never called on None.
        'pipeline': {
            'statistics': {str(k): v for k, v in pipeline_stats.items()},
            'raw_results': [
                {'k': r.k, 'routing_ms': r.routing_time_ms, 'validation_ms': r.validation_time_ms, 'total_ms': r.total_time_ms}
                for r in pipeline_results
            ],
        } if pipeline_stats else None,
    },
}

# Written relative to the current working directory.
output_file = Path('benchmark_results.json')
with open(output_file, 'w') as f:
    json.dump(export_data, f, indent=2)

print(f"Results exported to: {output_file}")

---

# Summary

In [None]:
# Final recap of all three benchmarks.
print("=" * 70)
print("BENCHMARK SUMMARY")
print("=" * 70)

print("\nPart A: Routing Benchmark")
print("-" * 40)
for k in sorted(routing_stats.keys()):
    s = routing_stats[k]
    print(f"  k={k}: {s['mean']:.0f}ms (+/-{s['std']:.0f}ms)")

# Parts B and C can come back empty even with a token configured (for example
# when the search finds no routes to validate), so report the real reason
# instead of always blaming a missing token.
skip_reason = "no API token" if not HAS_API_TOKEN else "no results"

if validation_stats:
    print("\nPart B: Validation Benchmark")
    print("-" * 40)
    print(f"  Mean latency: {validation_stats['mean']:.0f}ms")
    print(f"  Std dev: {validation_stats['std']:.0f}ms")
    print(f"  Bookable rate: {validation_stats['bookable_rate']:.1f}%")
else:
    print(f"\nPart B: Validation Benchmark - SKIPPED ({skip_reason})")

if pipeline_stats:
    print("\nPart C: Full Pipeline Benchmark")
    print("-" * 40)
    for k in sorted(pipeline_stats.keys()):
        s = pipeline_stats[k]
        print(f"  k={k}: routing={s['routing_mean']:.0f}ms + validation={s['validation_mean']:.0f}ms = {s['total_mean']:.0f}ms")
else:
    print(f"\nPart C: Full Pipeline Benchmark - SKIPPED ({skip_reason})")

print("\n" + "=" * 70)
print("Files generated:")
print("  - benchmark_results.png")
print("  - benchmark_results.json")