# Monte Carlo Walk-Forward Validation - Statistical Arbitrage Strategy

**Strategy**: Hypothesis 5 - Statistical Arbitrage Pairs Trading  
**Project ID**: 26140717  
**Optimized Sharpe**: 1.829  
**Baseline Sharpe**: 0.127  

## Optimized Parameters to Validate:
- z_entry_threshold: 1.5
- z_exit_threshold: 1.0
- lookback_period: 30
- position_size_per_pair: 0.40
- max_holding_days: 30
- stop_loss_z: 4.0

## Approach:
Uses QuantBook (the QuantConnect research API) for data access; the remaining steps run locally in this notebook:
1. Access historical data for pairs (PNC/KBE, ARCC/AMLP, RBA/SMFG, ENB/WEC)
2. Run Monte Carlo splits (random train/test periods)
3. Execute strategy logic locally in Python
4. Calculate Sharpe ratio for each period
5. Analyze degradation (train vs test)

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import random
from collections import Counter, deque
import json

# LOCAL DEBUG MODE - Use mock QuantConnect API
# The banner makes it obvious in the cell output that the mock data source,
# not a live QuantConnect research session, is backing this run.
print("="*70)
print("LOCAL DEBUG MODE - Using Mock QuantConnect API")
print("="*70)
# Project-local stand-in that provides the QuantBook and Resolution names
# used by the later cells (AddEquity / History calls).
from mock_quantbook import QuantBook, Resolution

# Initialize QuantBook (mock)
# `qb` is the shared data-access handle used by every later cell.
qb = QuantBook()

print("✓ Mock QuantConnect Research environment initialized")

In [None]:
# ==================== CONFIGURATION ====================
# Single dictionary read by every later cell: the pairs to trade, the overall
# analysis window, the Monte Carlo settings and the strategy parameters.

config = {
    'project_id': 26140717,

    # Pairs to trade ('name' is the key used for symbols and per-pair data)
    'pairs': [
        {'long': 'PNC', 'short': 'KBE', 'name': 'PNC_KBE'},
        {'long': 'ARCC', 'short': 'AMLP', 'name': 'ARCC_AMLP'},
        {'long': 'RBA', 'short': 'SMFG', 'name': 'RBA_SMFG'},
        {'long': 'ENB', 'short': 'WEC', 'name': 'ENB_WEC'}
    ],

    # Total period for analysis
    'total_period': {
        'start': datetime(2022, 1, 1),
        'end': datetime(2025, 10, 31)
    },

    # Monte Carlo configuration
    'train_test_split': 0.70,   # fraction of the period used for training
    'monte_carlo_runs': 20,  # Gradual scaling: 20 → 50 → 100 → ... → 1000+
    'random_seed': 42,          # set to None for non-reproducible runs

    # Optimized parameters to test
    'parameters': {
        'z_entry_threshold': 1.5,
        'z_exit_threshold': 1.0,
        'lookback_period': 30,
        'position_size_per_pair': 0.40,
        'max_holding_days': 30,
        'stop_loss_z': 4.0
    },

    # Reference Sharpe that the analysis cell compares the mean test Sharpe to.
    # NOTE(review): the notebook header lists 1.829 as the *optimized* Sharpe
    # and 0.127 as the baseline - confirm this key holds the intended value.
    'baseline_sharpe': 1.829,
    'initial_capital': 100000
}

# Set random seed.
# Compare against None rather than relying on truthiness, so that a seed of 0
# is honoured instead of silently disabling seeding.
if config['random_seed'] is not None:
    random.seed(config['random_seed'])
    np.random.seed(config['random_seed'])

print("Configuration:")
print(f"  Pairs: {len(config['pairs'])}")
print(f"  Period: {config['total_period']['start'].date()} to {config['total_period']['end'].date()}")
print(f"  Train/Test: {config['train_test_split']*100:.0f}%/{(1-config['train_test_split'])*100:.0f}%")
print(f"  Monte Carlo runs: {config['monte_carlo_runs']} (testing gradually toward 1000+)")
print(f"  Parameters: {config['parameters']}")
print(f"  Baseline Sharpe: {config['baseline_sharpe']:.3f}")

In [None]:
# ==================== SUBSCRIBE TO SECURITIES ====================
# Registers both legs of every configured pair with QuantBook at daily
# resolution and keeps the returned Symbol objects in `symbols`, keyed by the
# pair name and then by leg ('long' / 'short').

print("Subscribing to securities...")

symbols = {}
for pair_cfg in config['pairs']:
    pair_key = pair_cfg['name']
    legs = {}
    # Long leg first, then short leg (dict order matters for the call order).
    for leg in ('long', 'short'):
        legs[leg] = qb.AddEquity(pair_cfg[leg], Resolution.Daily).Symbol
    symbols[pair_key] = legs
    print(f"  ✓ {pair_key}: {pair_cfg['long']}/{pair_cfg['short']}")

print(f"\n✓ Subscribed to {len(symbols)} pairs")

In [None]:
# ==================== DEBUG: DATA ALIGNMENT ISSUE ====================
# Diagnostic cell: fetches both legs of one pair for a problematic test
# window and compares two ways of aligning their close prices by date.


def _debug_show_full(frame):
    """Print every row and column of ``frame``.

    A bare expression such as ``frame.head(n)`` is only rendered by a notebook
    when it is the last statement of a cell, so the frames are printed
    explicitly instead.
    """
    print(frame.to_string())


def _debug_close_series(history):
    """Return the 'close' column of ``history`` indexed by timestamp only.

    When the frame has a MultiIndex, its first level is dropped. The check is
    done per frame, so the two legs are handled independently.
    """
    closes = history['close']
    if isinstance(history.index, pd.MultiIndex):
        closes = closes.droplevel(0)
    return closes


# Lift pandas display limits once (global options; they also affect any frame
# rendered by later cells).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

print("="*70)
print("DEBUGGING DATA ALIGNMENT ISSUE")
print("="*70)
print()

# Test with one pair in a problematic test period
test_pair = config['pairs'][0]  # PNC/KBE
test_start = datetime(2025, 3, 23)  # From the error screenshot
# Deliberately extends past config['total_period']['end'] (2025-10-31).
test_end = datetime(2026, 5, 16)

print(f"Testing {test_pair['name']}: {test_pair['long']}/{test_pair['short']}")
print(f"Period: {test_start.date()} to {test_end.date()} ({(test_end - test_start).days} days)")
print()

# Fetch data
long_hist = qb.History([symbols[test_pair['name']]['long']], test_start, test_end, Resolution.Daily)
short_hist = qb.History([symbols[test_pair['name']]['short']], test_start, test_end, Resolution.Daily)

print(f"Raw fetch: Long={long_hist.shape[0]} rows, Short={short_hist.shape[0]} rows")
print()

# DISPLAY long history
print("Long history (full):")
_debug_show_full(long_hist)

print()

# DISPLAY short history
print("Short history (full):")
_debug_show_full(short_hist)

print()

# Extract close prices
long_close = _debug_close_series(long_hist)
short_close = _debug_close_series(short_hist)

print(f"After extract: Long={len(long_close)}, Short={len(short_close)}")
print(f"Long NaN count: {long_close.isna().sum()}")
print(f"Short NaN count: {short_close.isna().sum()}")
print()

# Date overlap analysis
long_dates = set(long_close.index)
short_dates = set(short_close.index)
common_dates = long_dates & short_dates
long_only = long_dates - short_dates
short_only = short_dates - long_dates

print("="*70)
print("DATE OVERLAP ANALYSIS")
print("="*70)
print(f"Long-only dates:  {len(long_only)}")
print(f"Short-only dates: {len(short_only)}")
print(f"Common dates:     {len(common_dates)}")
print()

# Current method: building the frame from two Series aligns them on the union
# of their indexes (outer join); dropna then removes incomplete rows.
df_current = pd.DataFrame({'long_price': long_close, 'short_price': short_close})
print("="*70)
print("CURRENT METHOD (outer join + dropna)")
print("="*70)
print(f"Before dropna: {df_current.shape[0]} rows")
print(f"  Long NaN:    {df_current['long_price'].isna().sum()}")
print(f"  Short NaN:   {df_current['short_price'].isna().sum()}")
print(f"  Both valid:  {(df_current['long_price'].notna() & df_current['short_price'].notna()).sum()}")
print()
print("df_current (full):")
_debug_show_full(df_current)

print()

df_current_clean = df_current.dropna()
print(f"After dropna:  {df_current_clean.shape[0]} rows")
print()
print("df_current_clean (full):")
_debug_show_full(df_current_clean)

print()

# Recommended method: restrict both series to the dates they share first.
print("="*70)
print("RECOMMENDED METHOD (reindex to common dates)")
print("="*70)
common_dates_sorted = sorted(common_dates)
df_recommended = pd.DataFrame({
    'long_price': long_close.reindex(common_dates_sorted),
    'short_price': short_close.reindex(common_dates_sorted)
})
print(f"After reindex: {df_recommended.shape[0]} rows")
print(f"  Long NaN:    {df_recommended['long_price'].isna().sum()}")
print(f"  Short NaN:   {df_recommended['short_price'].isna().sum()}")
print()
print("df_recommended (full):")
_debug_show_full(df_recommended)

print()

df_recommended_clean = df_recommended.dropna()
print(f"After dropna:  {df_recommended_clean.shape[0]} rows")
print()
print("df_recommended_clean (full):")
_debug_show_full(df_recommended_clean)

print()

# Diagnosis
# NOTE(review): with unique date indexes both methods should keep exactly the
# rows where both legs have a price, so a non-zero difference would point at
# irregular index entries rather than at the join itself - confirm on real data.
row_gain = df_recommended_clean.shape[0] - df_current_clean.shape[0]
print("="*70)
print("DIAGNOSIS")
print("="*70)
print(f"Expected (common dates):     {len(common_dates)}")
print(f"Current method result:       {df_current_clean.shape[0]}")
print(f"Recommended method result:   {df_recommended_clean.shape[0]}")
# Signed format so a negative difference prints as "-N" instead of "+-N".
print(f"Improvement:                 {row_gain:+d} rows")
print()

print("="*70)
print("ROOT CAUSE")
print("="*70)
if len(long_only) > 0 or len(short_only) > 0:
    print(f"DATE MISALIGNMENT: {len(long_only) + len(short_only)} non-overlapping trading days")
else:
    print("NO DATE MISALIGNMENT")

if long_close.isna().sum() > 0 or short_close.isna().sum() > 0:
    print(f"NaN VALUES: Long has {long_close.isna().sum()} NaN, Short has {short_close.isna().sum()} NaN")
else:
    print("NO NaN VALUES")
print("="*70)

In [None]:
# ==================== HELPER FUNCTIONS ====================

def generate_random_split(start_date, end_date, train_pct, seed=None):
    """Generate a random train/test split for one Monte Carlo run.

    The training window has a fixed length of ``int(total_days * train_pct)``
    days and starts at a random offset from ``start_date``. The test window
    starts the day after training ends and always stops at ``end_date``, so
    later training windows leave a shorter test window.

    Args:
        start_date: First day of the overall analysis period (datetime).
        end_date: Last day of the overall analysis period (datetime).
        train_pct: Fraction of the overall period used for training.
        seed: Optional seed. When given, the offset is drawn from a private
            ``random.Random(seed)`` so the global RNG state is left untouched;
            when None, the module-level ``random`` generator is used.

    Returns:
        Tuple ``(train_start, train_end, test_start, test_end)``.

    Note:
        If the period leaves no days for testing (e.g. ``train_pct == 1``),
        the returned test window is empty (``test_start > test_end``).
    """
    rng = random.Random(seed) if seed is not None else random

    total_days = (end_date - start_date).days
    train_days = int(total_days * train_pct)
    test_days = total_days - train_days

    # Random start for training window. The offset is capped one day short of
    # the test length so that the test window never starts after end_date.
    max_offset = max(0, test_days - 1)
    offset = rng.randint(0, max_offset)

    train_start = start_date + timedelta(days=offset)
    train_end = train_start + timedelta(days=train_days)
    test_start = train_end + timedelta(days=1)
    # Shifting the whole window by `offset` would push its end past end_date
    # (outside the configured analysis period), so clamp it.
    test_end = min(train_start + timedelta(days=total_days), end_date)

    return train_start, train_end, test_start, test_end


def calculate_spread(long_prices, short_prices):
    """Return the log-price spread of a pair: ``log(long) - log(short)``.

    Args:
        long_prices: Prices of the long leg (anything ``np.log`` accepts).
        short_prices: Prices of the short leg, same shape/index as the long leg.

    Returns:
        Element-wise difference of the natural logs of the two inputs.
    """
    log_long = np.log(long_prices)
    log_short = np.log(short_prices)
    return log_long - log_short


def calculate_zscore(spread, lookback):
    """Calculate the rolling z-score of a spread series.

    Each value is ``(spread - rolling_mean) / rolling_std`` over the trailing
    ``lookback`` observations (the current one included), using the sample
    standard deviation (``ddof=1``).

    Args:
        spread: pandas Series of spread values.
        lookback: Rolling window length in observations.

    Returns:
        Series with the same index as ``spread``. Entries are NaN while the
        window is not yet full and wherever the rolling standard deviation is
        zero (the z-score is undefined there).
    """
    if len(spread) < lookback:
        # Not enough history for even one full window.
        return pd.Series(np.nan, index=spread.index, dtype=float)

    window = spread.rolling(window=lookback)
    rolling_mean = window.mean()
    rolling_std = window.std(ddof=1)

    # A flat window has zero dispersion; dividing by it could yield +/-inf,
    # which would pass any `abs(z) > threshold` test. Mask it to NaN instead.
    usable_std = rolling_std.where(rolling_std > 0)

    return (spread - rolling_mean) / usable_std


def _open_pair_pnl(pos, long_price, short_price):
    """Return the dollar P&L of an open pair position at the given prices."""
    long_leg = pos['long_shares'] * (long_price - pos['entry_long'])
    short_leg = pos['short_shares'] * (short_price - pos['entry_short'])
    if pos['direction'] == 'long_spread':
        # Long the long leg, short the short leg.
        return long_leg - short_leg
    # 'short_spread': long the short leg, short the long leg.
    return short_leg - long_leg


def simulate_strategy(data, params):
    """
    Simulate statistical arbitrage strategy on historical data

    Walks through the union of all pair dates in order. For every pair that
    has a row on the current date it first checks the exit rules of an open
    position, then the entry rule, and finally records the portfolio value.

    Args:
        data: Dict mapping pair name -> DataFrame with 'long_price',
            'short_price' and 'zscore' columns, indexed by date. Only the
            pairs present in this dict are traded, so pairs that the caller
            skipped (e.g. for lack of data) are simply ignored.
        params: Strategy parameters ('lookback_period', 'z_entry_threshold',
            'z_exit_threshold', 'max_holding_days', 'stop_loss_z',
            'position_size_per_pair').

    Returns:
        equity_curve: DataFrame indexed by 'date' with an 'equity' column
            holding the marked-to-market portfolio value (realized capital
            plus the open P&L of all positions at their last seen prices).
        trades: List of trade records (one dict per closed position).
    """
    capital = config['initial_capital']
    trades = []

    if not data:
        # Nothing to simulate: return an empty curve with the usual layout.
        empty_curve = pd.DataFrame({'equity': pd.Series(dtype=float)})
        empty_curve.index.name = 'date'
        return empty_curve, trades

    # Get all dates (union of all pair dates)
    all_dates = sorted(set().union(*(set(df.index) for df in data.values())))

    # Track positions and the number of rows consumed so far for each pair.
    positions = {pair_name: None for pair_name in data}
    rows_seen = dict.fromkeys(data, 0)
    equity_curve = []

    for date in all_dates:
        # Process each pair
        for pair_name, df in data.items():
            if date not in df.index:
                continue

            # Warm-up: skip until `lookback_period` rows (current included)
            # are available. Counting rows replaces re-slicing the frame on
            # every date; assumes each frame's index is sorted and unique.
            rows_seen[pair_name] += 1
            if rows_seen[pair_name] < params['lookback_period']:
                continue

            # Get current prices and z-score
            long_price = df.loc[date, 'long_price']
            short_price = df.loc[date, 'short_price']
            z_score = df.loc[date, 'zscore']

            if np.isnan(z_score):
                continue

            pos = positions[pair_name]

            # Check exit conditions
            if pos is not None:
                days_held = (date - pos['entry_date']).days

                # Mark the position to market; the value is kept on the
                # position so the equity curve can include open P&L.
                pnl = _open_pair_pnl(pos, long_price, short_price)
                pos['last_pnl'] = pnl

                # Exit rules, in priority order.
                exit_reason = None
                if abs(z_score) < params['z_exit_threshold']:
                    exit_reason = 'mean_reversion'
                elif days_held >= params['max_holding_days']:
                    exit_reason = 'timeout'
                elif abs(z_score) > params['stop_loss_z']:
                    exit_reason = 'stop_loss'

                if exit_reason is not None:
                    capital += pnl
                    trades.append({
                        'pair': pair_name,
                        'entry_date': pos['entry_date'],
                        'exit_date': date,
                        'entry_z': pos['entry_z'],
                        'exit_z': z_score,
                        'pnl': pnl,
                        'exit_reason': exit_reason,
                        'days_held': days_held
                    })
                    positions[pair_name] = None

            # Check entry conditions (if no position)
            # NOTE(review): a position closed above by 'timeout' or
            # 'stop_loss' can be re-opened on the same bar here, because the
            # z-score may still exceed the entry threshold - confirm intended.
            if positions[pair_name] is None and abs(z_score) > params['z_entry_threshold']:
                # Calculate position sizes (dollar-neutral): half of the
                # pair's capital allocation goes to each leg.
                pair_capital = capital * params['position_size_per_pair']

                positions[pair_name] = {
                    'entry_date': date,
                    'entry_z': z_score,
                    'entry_long': long_price,
                    'entry_short': short_price,
                    'long_shares': pair_capital / (2 * long_price),
                    'short_shares': pair_capital / (2 * short_price),
                    # z > 0: short the spread; otherwise long the spread.
                    'direction': 'short_spread' if z_score > 0 else 'long_spread',
                    'last_pnl': 0
                }

        # Record equity: realized capital plus open P&L. Recording realized
        # capital alone would make the curve flat between exits and distort
        # any return-based statistic computed from it.
        open_pnl = sum(p['last_pnl'] for p in positions.values() if p is not None)
        equity_curve.append({'date': date, 'equity': capital + open_pnl})

    return pd.DataFrame(equity_curve).set_index('date'), trades


def calculate_sharpe(equity_curve):
    """Calculate the annualized Sharpe ratio of an equity curve.

    Returns are the row-to-row percentage changes of the 'equity' column and
    are annualized with sqrt(252), i.e. rows are treated as daily
    observations. No risk-free rate is subtracted.

    Args:
        equity_curve: DataFrame with an 'equity' column.

    Returns:
        The Sharpe ratio as a float, or 0.0 when it is undefined: fewer than
        two returns, zero volatility, or non-finite volatility.
    """
    returns = equity_curve['equity'].pct_change().dropna()
    if len(returns) < 2:
        # The sample standard deviation of a single return is NaN.
        return 0.0

    volatility = returns.std()
    if not np.isfinite(volatility) or volatility == 0:
        return 0.0

    return float(returns.mean() / volatility * np.sqrt(252))  # Annualized


# Status line: reaching this point means every helper definition above ran.
print("✓ Helper functions loaded")

In [None]:
# ==================== MONTE CARLO WALK-FORWARD ====================
# For every run: draw a random train/test split, build the per-pair price,
# spread and z-score frames for both windows, simulate the strategy on each
# and record the train vs test Sharpe ratio. A failing run is logged in
# `errors` and does not stop the remaining runs.

import traceback


def _close_series(history):
    """Return the 'close' column of ``history`` indexed by timestamp only.

    When the frame has a MultiIndex its first level is dropped; the check is
    made per frame so both legs are handled independently.
    """
    closes = history['close']
    if isinstance(history.index, pd.MultiIndex):
        closes = closes.droplevel(0)
    return closes


def _load_period_data(period_start, period_end):
    """Build the strategy input frames for one window.

    Fetches daily history for both legs of every configured pair, aligns the
    close prices on common dates and adds 'spread' and 'zscore' columns.
    Pairs with no data, or with fewer rows than the lookback period, are
    reported and left out of the result.

    Returns:
        Dict mapping pair name -> prepared DataFrame.
    """
    lookback = config['parameters']['lookback_period']
    period_data = {}

    for pair in config['pairs']:
        pair_name = pair['name']

        # Fetch history - use list with single symbol to get clean DataFrame
        long_hist = qb.History([symbols[pair_name]['long']], period_start, period_end, Resolution.Daily)
        short_hist = qb.History([symbols[pair_name]['short']], period_start, period_end, Resolution.Daily)

        if long_hist.empty or short_hist.empty:
            print(f"  ⚠ Skipping {pair_name}: no data")
            continue

        # Create aligned DataFrame (rows missing either leg are dropped)
        df = pd.DataFrame({
            'long_price': _close_series(long_hist),
            'short_price': _close_series(short_hist)
        }).dropna()

        # Only require lookback period worth of data
        if len(df) < lookback:
            print(f"  ⚠ Skipping {pair_name}: insufficient data ({len(df)} rows, need {lookback})")
            continue

        # Calculate spread and z-score
        df['spread'] = calculate_spread(df['long_price'], df['short_price'])
        df['zscore'] = calculate_zscore(df['spread'], lookback)

        period_data[pair_name] = df
        print(f"  ✓ {pair_name}: {len(df)} days")

    return period_data


print("="*70)
print("MONTE CARLO WALK-FORWARD ANALYSIS - STATISTICAL ARBITRAGE")
print("="*70)
print()

results = []
errors = []

# Per-run seeds are derived from the configured seed so that changing
# `random_seed` actually changes the splits; None disables seeding. The
# comparison is against None so that a seed of 0 remains valid.
base_seed = config['random_seed']

for run in range(config['monte_carlo_runs']):
    print(f"\n{'='*70}")
    print(f"Monte Carlo Run {run + 1}/{config['monte_carlo_runs']}")
    print(f"{'='*70}")

    try:
        # 1. Generate random train/test split
        run_seed = None if base_seed is None else base_seed + run
        train_start, train_end, test_start, test_end = generate_random_split(
            config['total_period']['start'],
            config['total_period']['end'],
            config['train_test_split'],
            seed=run_seed
        )

        print(f"Training:  {train_start.date()} to {train_end.date()} ({(train_end - train_start).days} days)")
        print(f"Testing:   {test_start.date()} to {test_end.date()} ({(test_end - test_start).days} days)")

        # 2. Fetch historical data for TRAINING period
        print(f"\nFetching training data...")
        train_data = _load_period_data(train_start, train_end)
        if not train_data:
            raise ValueError("No training data available for any pair")
        print(f"  ✓ Fetched data for {len(train_data)} pairs")

        # 3. Run strategy on TRAINING data
        print(f"Running strategy on training period...")
        train_equity, train_trades = simulate_strategy(train_data, config['parameters'])
        train_sharpe = calculate_sharpe(train_equity)
        print(f"  ✓ Training Sharpe: {train_sharpe:.3f} ({len(train_trades)} trades)")

        # 4. Fetch historical data for TESTING period
        print(f"\nFetching testing data...")
        test_data = _load_period_data(test_start, test_end)
        if not test_data:
            raise ValueError("No testing data available for any pair")
        print(f"  ✓ Fetched data for {len(test_data)} pairs")

        # 5. Run strategy on TESTING data
        print(f"Running strategy on testing period...")
        test_equity, test_trades = simulate_strategy(test_data, config['parameters'])
        test_sharpe = calculate_sharpe(test_equity)
        print(f"  ✓ Testing Sharpe: {test_sharpe:.3f} ({len(test_trades)} trades)")

        # 6. Calculate degradation: relative drop from train to test Sharpe.
        # It is only meaningful for a positive training Sharpe; otherwise the
        # run is counted as fully degraded.
        if train_sharpe > 0:
            degradation = (train_sharpe - test_sharpe) / train_sharpe
        else:
            degradation = 1.0

        print(f"  Degradation: {degradation*100:.1f}%")

        # Store results
        results.append({
            'run': run + 1,
            'train_start': train_start,
            'train_end': train_end,
            'test_start': test_start,
            'test_end': test_end,
            'train_sharpe': float(train_sharpe),
            'test_sharpe': float(test_sharpe),
            'degradation': float(degradation),
            'train_trades': len(train_trades),
            'test_trades': len(test_trades)
        })

        print(f"  ✓ Run {run + 1} complete")

    except Exception as e:
        # Run-level boundary: record the failure with its traceback and move
        # on to the next run instead of aborting the whole analysis.
        error_msg = str(e)
        traceback_str = traceback.format_exc()
        print(f"  ✗ Error in run {run + 1}: {error_msg}")
        print(f"  Traceback:\n{traceback_str}")
        errors.append({'run': run + 1, 'error': error_msg, 'traceback': traceback_str})

print(f"\n{'='*70}")
print(f"Monte Carlo Walk-Forward Complete")
print(f"  Successful runs: {len(results)}/{config['monte_carlo_runs']}")
print(f"  Failed runs: {len(errors)}/{config['monte_carlo_runs']}")
print(f"{'='*70}")

In [None]:
# ==================== ANALYSIS ====================
# Aggregates the Monte Carlo runs, derives the overfitting indicators, turns
# them into a 0-100 robustness score, prints a decision and saves everything
# to a timestamped JSON file. Each indicator's label and score are decided in
# one place so the printed interpretation and the points cannot disagree.


def _json_float(value):
    """Return ``value`` as a float, or None when it is NaN or infinite.

    NaN and Infinity are not valid JSON, so they are written as null.
    """
    value = float(value)
    return value if np.isfinite(value) else None


if not results:
    print("✗ No successful runs to analyze")
else:
    df_results = pd.DataFrame(results)
    n_runs = len(df_results)

    print("\n" + "="*70)
    print("AGGREGATE RESULTS")
    print("="*70)

    # Basic statistics (std is NaN when there is only a single run)
    mean_train = df_results['train_sharpe'].mean()
    std_train = df_results['train_sharpe'].std()
    mean_test = df_results['test_sharpe'].mean()
    std_test = df_results['test_sharpe'].std()
    mean_deg = df_results['degradation'].mean()
    std_deg = df_results['degradation'].std()

    print(f"\nPerformance Metrics:")
    print(f"  Baseline Sharpe (original):  {config['baseline_sharpe']:.3f}")
    print(f"  Mean Training Sharpe:         {mean_train:.3f} ± {std_train:.3f}")
    print(f"  Mean Testing Sharpe:          {mean_test:.3f} ± {std_test:.3f}")
    print(f"  Mean Degradation:             {mean_deg*100:.1f}% ± {std_deg*100:.1f}%")

    # ==================== BETTER OVERFITTING INDICATORS ====================

    print(f"\n" + "="*70)
    print("OVERFITTING INDICATORS")
    print("="*70)

    # 1. Test Sharpe Stability (Coefficient of Variation)
    # Only defined for a positive mean: with a negative mean the ratio would
    # be negative and pass the "< 0.5" test, rating a losing strategy stable.
    test_sharpe_cv = (std_test / mean_test) if mean_test > 0 else float('inf')
    if test_sharpe_cv < 0.5:
        stability_label, stability_pts = 'STABLE', 20
    elif test_sharpe_cv < 1.0:
        stability_label, stability_pts = 'MODERATE', 10
    else:
        # Also reached when the CV is NaN (single run) or infinite.
        stability_label, stability_pts = 'UNSTABLE', 0
    print(f"\n1. Test Sharpe Stability:")
    print(f"   Coefficient of Variation: {test_sharpe_cv:.2f}")
    print(f"   Interpretation: {stability_label}")
    print(f"   (Lower is better: <0.5 stable, 0.5-1.0 moderate, >1.0 unstable)")

    # 2. Walk-Forward Efficiency
    # Requires a positive in-sample Sharpe: two negative means would otherwise
    # divide to a positive "efficiency" and be rated as excellent.
    wf_efficiency = mean_test / mean_train if mean_train > 0 else 0.0
    if wf_efficiency > 0.80:
        wf_label, wf_pts = 'EXCELLENT', 30
    elif wf_efficiency > 0.60:
        wf_label, wf_pts = 'GOOD', 25
    elif wf_efficiency > 0.40:
        wf_label, wf_pts = 'ACCEPTABLE', 15
    elif wf_efficiency > 0.25:
        wf_label, wf_pts = 'WEAK', 5
    else:
        wf_label, wf_pts = 'SEVERE OVERFIT', 0
    print(f"\n2. Walk-Forward Efficiency:")
    print(f"   OOS Sharpe / IS Sharpe: {wf_efficiency:.1%}")
    print(f"   Interpretation: {wf_label}")
    print(f"   (Expected: 25-80% for robust strategies)")

    # 3. Test Sharpe vs Baseline (relative difference; abs() keeps the sign
    # meaningful should the reference Sharpe ever be negative)
    baseline = config['baseline_sharpe']
    test_vs_baseline = (mean_test - baseline) / abs(baseline) if baseline != 0 else 0
    if abs(test_vs_baseline) < 0.15:
        baseline_pts = 20
    elif abs(test_vs_baseline) < 0.30:
        baseline_pts = 10
    else:
        baseline_pts = 0
    print(f"\n3. Test Sharpe vs Baseline:")
    print(f"   Difference: {test_vs_baseline:+.1%}")
    print(f"   Mean Test: {mean_test:.3f} vs Baseline: {baseline:.3f}")
    if abs(test_vs_baseline) < 0.15:
        print(f"   Interpretation: CONSISTENT (within 15%)")
    elif test_vs_baseline >= 0.15:
        print(f"   Interpretation: UNEXPECTED - Test > Baseline (possible data issue)")
    else:
        print(f"   Interpretation: DEGRADED - Test << Baseline")

    # 4. Trade Count Analysis (judged on the run with the fewest test trades)
    mean_train_trades = df_results['train_trades'].mean()
    mean_test_trades = df_results['test_trades'].mean()
    min_test_trades = df_results['test_trades'].min()
    if min_test_trades >= 30:
        trades_label, trades_pts = 'GOOD', 15
    elif min_test_trades >= 15:
        trades_label, trades_pts = 'MARGINAL', 8
    else:
        trades_label, trades_pts = 'INSUFFICIENT', 0
    print(f"\n4. Trade Count Analysis:")
    print(f"   Mean Training Trades: {mean_train_trades:.1f}")
    print(f"   Mean Testing Trades:  {mean_test_trades:.1f}")
    print(f"   Min Testing Trades:   {min_test_trades}")
    print(f"   Statistical Reliability: {trades_label}")
    print(f"   (Need 30+ trades for statistical significance)")

    # 5. Consistency Analysis (% of runs with positive test Sharpe)
    positive_runs = int((df_results['test_sharpe'] > 0).sum())
    positive_test_pct = positive_runs / n_runs
    if positive_test_pct >= 0.90:
        consistency_label, consistency_pts = 'HIGHLY CONSISTENT', 15
    elif positive_test_pct >= 0.75:
        consistency_label, consistency_pts = 'CONSISTENT', 12
    elif positive_test_pct >= 0.60:
        consistency_label, consistency_pts = 'MODERATE', 8
    else:
        consistency_label, consistency_pts = 'INCONSISTENT', 0
    print(f"\n5. Consistency Analysis:")
    print(f"   Runs with positive test Sharpe: {positive_runs}/{n_runs} ({positive_test_pct:.0%})")
    print(f"   Interpretation: {consistency_label}")

    # 6. Sample Size Assessment
    if n_runs >= 1000:
        power_label = 'ROBUST'
    elif n_runs >= 100:
        power_label = 'ACCEPTABLE'
    elif n_runs >= 50:
        power_label = 'WEAK'
    elif n_runs >= 20:
        power_label = 'INSUFFICIENT'
    else:
        power_label = 'ANECDOTAL'
    print(f"\n6. Sample Size Assessment:")
    print(f"   Total runs: {n_runs}")
    print(f"   Statistical Power: {power_label}")
    print(f"   Minimum required: 1000+ runs")
    if n_runs < 1000:
        print(f"   ⚠ WARNING: Results not statistically reliable with {n_runs} runs")

    # ==================== OVERALL ASSESSMENT ====================

    print(f"\n" + "="*70)
    print("OVERALL ROBUSTNESS ASSESSMENT")
    print("="*70 + "\n")

    # Robustness score: (name, maximum points, earned points); the maxima
    # (20 + 30 + 20 + 15 + 15) add up to 100.
    score_components = [
        ('Sharpe Stability', 20, stability_pts),
        ('WF Efficiency', 30, wf_pts),
        ('Test vs Baseline', 20, baseline_pts),
        ('Trade Count', 15, trades_pts),
        ('Consistency', 15, consistency_pts),
    ]

    total_score = sum(earned for _, _, earned in score_components)
    max_score = sum(maximum for _, maximum, _ in score_components)

    print("Score Breakdown:")
    for name, max_pts, earned_pts in score_components:
        print(f"  {name}: {earned_pts}/{max_pts}")

    print(f"\nTotal Robustness Score: {total_score}/{max_score} ({total_score/max_score*100:.0f}%)")

    # Final decision (the sample-size gate takes precedence over the score)
    if n_runs < 1000:
        decision = "INSUFFICIENT_SAMPLES"
        reason = f"Only {n_runs} runs (need 1000+ for validation)"
        recommendation = "Continue scaling to 1000+ runs before making conclusion"
    elif total_score >= 85:
        decision = "ROBUST_STRATEGY"
        reason = f"Score {total_score}/{max_score} - strong generalization"
        recommendation = "Strategy passes validation - ready for paper trading"
    elif total_score >= 70:
        decision = "PROCEED_WITH_CAUTION"
        reason = f"Score {total_score}/{max_score} - acceptable but not excellent"
        recommendation = "Strategy shows reasonable robustness - additional validation recommended"
    elif total_score >= 50:
        decision = "WEAK_ROBUSTNESS"
        reason = f"Score {total_score}/{max_score} - multiple concerns"
        recommendation = "Strategy shows weak generalization - use with caution or re-optimize"
    else:
        decision = "ABANDON_STRATEGY"
        reason = f"Score {total_score}/{max_score} - severe overfitting"
        recommendation = "Strategy fails validation - consider new hypothesis"

    print(f"\n✓ Decision: {decision}")
    print(f"  Reason: {reason}")
    print(f"  Recommendation: {recommendation}")

    # Save results (undefined statistics are stored as null)
    output_data = {
        'strategy': 'Statistical Arbitrage Pairs Trading',
        'hypothesis_id': 5,
        'project_id': config['project_id'],
        'summary': {
            'sample_size': len(results),
            'successful_runs': len(results),
            'failed_runs': len(errors),
            'mean_train_sharpe': _json_float(mean_train),
            'std_train_sharpe': _json_float(std_train),
            'mean_test_sharpe': _json_float(mean_test),
            'std_test_sharpe': _json_float(std_test),
            'mean_degradation': _json_float(mean_deg),
            'std_degradation': _json_float(std_deg),
            'test_sharpe_cv': _json_float(test_sharpe_cv),
            'wf_efficiency': _json_float(wf_efficiency),
            'test_vs_baseline_pct': _json_float(test_vs_baseline),
            'mean_train_trades': _json_float(mean_train_trades),
            'mean_test_trades': _json_float(mean_test_trades),
            'min_test_trades': int(min_test_trades),
            'positive_test_pct': _json_float(positive_test_pct),
            'robustness_score': int(total_score),
            'max_score': int(max_score),
            'decision': decision,
            'reason': reason,
            'recommendation': recommendation
        },
        'detailed_results': results,
        'errors': errors
    }

    filename = f"walkforward_stat_arb_h5_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(filename, 'w', encoding='utf-8') as f:
        # default=str serializes the datetime values stored in each run record.
        json.dump(output_data, f, indent=2, default=str)

    print(f"\n✓ Results saved to: {filename}")
    print("\n" + "="*70)
    print("MONTE CARLO WALK-FORWARD ANALYSIS COMPLETE")
    print("="*70)