# Multi-Run Cross-Strategy Analysis

This notebook analyzes results across multiple parameter sweep runs to build comprehensive ensembles.

**Use Cases:**
- Combine results from different indicator sweeps (bollinger, rsi, momentum, etc.)
- Build ensembles from diverse strategy types
- Compare performance across different market conditions
- Leverage strategy hashing to avoid duplicates

In [None]:
# parameters
# --- Inputs -----------------------------------------------------------------
# Each entry is the directory of one parameter-sweep run to include.
run_dirs = [
    "/path/to/results/run_20250623_143030",  # bollinger run
    "/path/to/results/run_20250624_090000",  # rsi run
    # Add more runs as needed
]

# --- Selection thresholds ---------------------------------------------------
min_sharpe = 1.0              # lowest Sharpe ratio kept by the performance filter
correlation_threshold = 0.7   # |correlation| bound used in the correlation query
max_strategies_per_type = 5   # Limit strategies per type for diversity

# --- Output -----------------------------------------------------------------
output_name = "multi_run_analysis"  # filename prefix for the exported artifacts

## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import duckdb
import json
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: dark-grid theme and a wide default canvas
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# In-memory DuckDB session used by the later cells for signal queries
con = duckdb.connect(database=':memory:')

print(f"Analyzing {len(run_dirs)} runs")

## Load All Strategy Indices

In [None]:
# Collect the strategy index of every run, tagging each row with its origin.
all_strategies = []
run_metadata = {}

for run_dir in run_dirs:
    run_path = Path(run_dir)

    # Guard clauses: skip runs that are absent or have no index file.
    if not run_path.exists():
        print(f"⚠️ Run directory not found: {run_dir}")
        continue

    index_path = run_path / 'strategy_index.parquet'
    if not index_path.exists():
        print(f"⚠️ No strategy index found in {run_dir}")
        continue

    # Read the index and record which run each strategy came from.
    strategies = pd.read_parquet(index_path).assign(
        run_dir=str(run_path),
        run_id=run_path.name,
    )
    all_strategies.append(strategies)

    # Optional run-level metadata from config.json
    config_path = run_path / 'config.json'
    if config_path.exists():
        with open(config_path) as f:
            config = json.load(f)
        run_metadata[run_path.name] = {
            'config_name': config.get('name', 'unknown'),
            'symbols': config.get('symbols', []),
            'timeframe': config.get('timeframe', 'unknown')
        }

    print(f"✅ Loaded {len(strategies)} strategies from {run_path.name}")

# Merge the per-run frames and collapse strategies sharing a hash.
if not all_strategies:
    print("❌ No strategies loaded")
    combined_strategies = pd.DataFrame()
else:
    combined_strategies = pd.concat(all_strategies, ignore_index=True)
    print(f"\n📊 Total strategies across all runs: {len(combined_strategies)}")

    # Hashes that occur more than once across the combined index
    duplicates = (
        combined_strategies
        .groupby('strategy_hash')
        .size()
        .loc[lambda counts: counts > 1]
    )
    if len(duplicates) > 0:
        print(f"⚠️ Found {len(duplicates)} duplicate strategies (same hash)")
        # The earliest occurrence (in run_dirs order) is the one kept.
        combined_strategies = combined_strategies.drop_duplicates(subset=['strategy_hash'], keep='first')
        print(f"📊 Unique strategies: {len(combined_strategies)}")

## Create Unified Signal Views

In [None]:
# Create a DuckDB view (`all_signals`) that unions the signal/trace parquet
# files referenced by the combined strategy index.
if len(combined_strategies) > 0:
    # Resolve each strategy's trace file and keep only those present on disk.
    # Rows with a missing run_dir/trace_path are skipped instead of crashing Path().
    trace_refs = combined_strategies[['run_dir', 'trace_path']].dropna()
    signal_paths = []
    for run_dir_value, trace_path in zip(trace_refs['run_dir'], trace_refs['trace_path']):
        full_path = Path(run_dir_value) / trace_path
        if full_path.exists():
            signal_paths.append(str(full_path))

    if signal_paths:
        # Limit for initial testing: only the first files are placed in the view.
        max_signal_files = 100
        view_paths = signal_paths[:max_signal_files]
        if len(view_paths) < len(signal_paths):
            print(f"⚠️ Limiting view to the first {len(view_paths)} of {len(signal_paths)} signal files")

        print(f"Creating unified view of {len(view_paths)} signal files...")

        # The view reads the files on demand. Single quotes in a path are
        # doubled so the path stays a valid SQL string literal.
        union_parts = []
        for path in view_paths:
            escaped_path = path.replace("'", "''")
            union_parts.append(f"SELECT * FROM read_parquet('{escaped_path}')")

        query = " UNION ALL ".join(union_parts)
        con.execute(f"CREATE OR REPLACE VIEW all_signals AS {query}")

        # Test the view
        signal_count = con.execute("SELECT COUNT(*) as cnt FROM all_signals").fetchone()[0]
        print(f"✅ Created unified signal view with {signal_count:,} signals")

## Cross-Run Performance Analysis

In [None]:
# Summarise how the combined strategies are spread over runs and types.
if len(combined_strategies) > 0:
    print("Strategy Distribution:")
    print("=" * 50)

    # Per-run summary: number of strategies and a {type: count} mapping
    per_run = combined_strategies.groupby('run_id')
    by_run = pd.DataFrame({
        'total_strategies': per_run['strategy_hash'].count(),
        'strategy_types': per_run['strategy_type'].agg(lambda types: types.value_counts().to_dict()),
    })

    print("\nBy Run:")
    for run_id, row in by_run.iterrows():
        # Runs without a config.json have no metadata entry.
        config_name = run_metadata.get(run_id, {}).get('config_name', 'unknown')
        print(f"\n{run_id} ({config_name}):")
        print(f"  Total: {row['total_strategies']}")
        print(f"  Types: {row['strategy_types']}")

    # Totals per strategy type over all runs
    print("\nOverall by Strategy Type:")
    type_counts = combined_strategies['strategy_type'].value_counts()
    for stype, count in type_counts.items():
        print(f"  {stype}: {count}")

## Filter High-Performance Strategies

In [None]:
# Select the strategies that feed the ensemble.
# Note: performance metrics are assumed to be present in the strategy index;
# if they are not, they would have to be calculated from the signals.

if 'sharpe_ratio' in combined_strategies.columns:
    # Filter by performance
    high_performers = combined_strategies[combined_strategies['sharpe_ratio'] >= min_sharpe].copy()
    print(f"\nHigh performers (Sharpe >= {min_sharpe}): {len(high_performers)}")

    # Apply diversity constraint: keep at most N top-Sharpe strategies per type
    top_per_type = [
        type_strategies.nlargest(max_strategies_per_type, 'sharpe_ratio')
        for _, type_strategies in high_performers.groupby('strategy_type', sort=False)
    ]
    if top_per_type:
        diverse_performers = pd.concat(top_per_type, ignore_index=True)
    else:
        # pd.concat raises on an empty list; when nothing passes the filter,
        # keep an empty frame that still carries the expected columns.
        diverse_performers = high_performers.iloc[0:0].reset_index(drop=True)
    print(f"Diverse high performers (max {max_strategies_per_type} per type): {len(diverse_performers)}")

    # Display top strategies
    print("\nTop Strategies Across All Runs:")
    print("=" * 80)
    # Only show columns that exist (e.g. total_return may be absent from the index)
    display_cols = [
        col for col in ['strategy_type', 'sharpe_ratio', 'total_return', 'run_id']
        if col in diverse_performers.columns
    ]
    print(diverse_performers.nlargest(20, 'sharpe_ratio')[display_cols].to_string(index=False))
else:
    print("⚠️ Performance metrics not found in strategy index. Need to calculate from signals.")
    diverse_performers = combined_strategies  # Use all for now

## Cross-Strategy Correlation Analysis

In [None]:
# Build (but do not execute) a DuckDB query that estimates pairwise strategy
# correlation from hourly-averaged signals in the `all_signals` view.
# This signal-overlap approach avoids loading all return series.

if len(diverse_performers) > 1:
    print("Calculating strategy correlations using signal overlap...")

    # Sample strategies for correlation calculation. A fixed seed keeps the
    # sample, and therefore the generated query, identical across re-runs.
    sample_size = min(50, len(diverse_performers))
    sampled_strategies = diverse_performers.sample(sample_size, random_state=42)

    # Quoted, comma-separated hash list for the SQL IN (...) clause
    hash_list = ','.join(f"'{h}'" for h in sampled_strategies['strategy_hash'])

    correlation_query = f"""
    WITH strategy_signals AS (
        SELECT 
            strategy_hash,
            DATE_TRUNC('hour', ts) as hour_ts,
            AVG(val) as avg_signal
        FROM all_signals
        WHERE strategy_hash IN ({hash_list})
        GROUP BY strategy_hash, hour_ts
    ),
    signal_pairs AS (
        SELECT 
            s1.strategy_hash as hash1,
            s2.strategy_hash as hash2,
            CORR(s1.avg_signal, s2.avg_signal) as correlation
        FROM strategy_signals s1
        JOIN strategy_signals s2 ON s1.hour_ts = s2.hour_ts
        WHERE s1.strategy_hash < s2.strategy_hash
        GROUP BY s1.strategy_hash, s2.strategy_hash
        HAVING COUNT(*) > 100  -- Minimum overlap
    )
    SELECT * FROM signal_pairs
    WHERE ABS(correlation) < {correlation_threshold}
    ORDER BY correlation
    """

    # This query might be slow for many strategies, so it is only provided as
    # a template here; run it with con.execute(correlation_query) when needed.
    # NOTE(review): the query assumes all_signals exposes strategy_hash, ts and
    # val columns — confirm against the trace file schema.
    print("\n💡 To run correlation analysis, execute the correlation query above")
    print("   or use the correlation analysis snippet for detailed analysis")

## Build Multi-Run Ensemble

In [None]:
# Simple ensemble selection based on diversity: one strategy per strategy type.
ensemble = []
selected_types = set()

# The upstream filter falls back to the unfiltered index when no performance
# metrics exist, so the Sharpe column cannot be assumed to be present here.
has_sharpe = 'sharpe_ratio' in diverse_performers.columns

if 'strategy_type' in diverse_performers.columns:
    for stype, type_strategies in diverse_performers.groupby('strategy_type', sort=False):
        if has_sharpe:
            # Best strategy of this type by Sharpe ratio
            type_best = type_strategies.nlargest(1, 'sharpe_ratio')
        else:
            # No ranking metric available: take the first strategy of the type
            type_best = type_strategies.head(1)
        if len(type_best) > 0:
            ensemble.append(type_best.iloc[0])
            selected_types.add(stype)

ensemble_df = pd.DataFrame(ensemble)
print(f"\n🎯 Multi-Run Ensemble ({len(ensemble_df)} strategies from {len(selected_types)} types):")
print("=" * 80)

for _, strategy in ensemble_df.iterrows():
    print(f"\n{strategy['strategy_type']} from {strategy['run_id']}:")
    print(f"  Hash: {str(strategy['strategy_hash'])[:8]}")
    if 'sharpe_ratio' in strategy:
        print(f"  Sharpe: {strategy['sharpe_ratio']:.2f}")
    # param_names may be a scalar or a list/array; pd.notna on an array would
    # yield an array and make the `if` ambiguous, so only scalars are null-tested.
    params = strategy.get('param_names')
    if params is not None and (not pd.api.types.is_scalar(params) or pd.notna(params)):
        print(f"  Params: {params}")

## Export Multi-Run Results

In [None]:
# Export the analysis summary (JSON) and the combined strategy index (parquet).

# One timestamp for both the payload and the filename so they always agree.
generated_at = datetime.now()

# combined_strategies is a column-less frame when no run could be loaded, so
# column lookups are guarded instead of raising KeyError.
has_hash = 'strategy_hash' in combined_strategies.columns
has_type = 'strategy_type' in combined_strategies.columns

export_data = {
    'analysis_info': {
        'generated_at': generated_at.isoformat(),
        'runs_analyzed': run_dirs,
        # Counted on the combined index as it stands at export time
        'total_strategies': len(combined_strategies),
        'unique_strategies': len(combined_strategies['strategy_hash'].unique()) if has_hash else 0,
        # high_performers only exists when the index carried a Sharpe column
        'high_performers': len(high_performers) if 'high_performers' in locals() else 0
    },
    'run_metadata': run_metadata,
    'ensemble': ensemble_df.to_dict('records'),
    'strategy_type_distribution': (
        combined_strategies['strategy_type'].value_counts().to_dict() if has_type else {}
    )
}

# Save to file; default=str stringifies values json cannot encode natively
output_path = Path(f"{output_name}_{generated_at.strftime('%Y%m%d_%H%M%S')}.json")
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(export_data, f, indent=2, default=str)

print(f"\n✅ Results exported to {output_path}")

# Also save the combined strategy index
strategies_path = f"{output_name}_strategies.parquet"
combined_strategies.to_parquet(strategies_path)
print(f"✅ Combined strategy index saved to {strategies_path}")

## Visualization

In [None]:
# Visualize strategy distribution (left) and Sharpe ratio spread by type (right).
if len(combined_strategies) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    # Strategy types across runs: stacked bar of counts per run
    type_by_run = combined_strategies.groupby(['run_id', 'strategy_type']).size().unstack(fill_value=0)
    type_by_run.plot(kind='bar', stacked=True, ax=axes[0])
    axes[0].set_title('Strategy Distribution by Run')
    axes[0].set_xlabel('Run ID')
    axes[0].set_ylabel('Number of Strategies')
    axes[0].legend(title='Strategy Type', bbox_to_anchor=(1.05, 1), loc='upper left')

    # Performance distribution if available
    if 'sharpe_ratio' in combined_strategies.columns:
        combined_strategies.boxplot(column='sharpe_ratio', by='strategy_type', ax=axes[1])
        axes[1].set_title('Sharpe Ratio Distribution by Strategy Type')
        axes[1].set_xlabel('Strategy Type')
        axes[1].set_ylabel('Sharpe Ratio')
        fig.suptitle('')  # Remove the default title pandas adds for grouped boxplots
    else:
        # Hide the unused panel rather than showing an empty set of axes
        axes[1].set_visible(False)

    plt.tight_layout()
    plt.show()
else:
    # An empty, column-less frame would make the groupby above raise KeyError
    print("⚠️ No strategies loaded - nothing to visualize")