# Store-Based Strategy Analysis

This notebook demonstrates run-invariant analysis by querying the global strategy store directly.
No run IDs or run directories are required — just query by strategy type and parameters.

In [None]:
# Parameters - these can be set by papermill or manually
#
# How the cells below use them:
#   strategy_type     - exact match on the index's 'strategy_type' column;
#                       a falsy value or the string 'all' skips this filter.
#   symbol, timeframe - exact match on the 'symbol' / 'timeframe' columns;
#                       a falsy value (None or '') skips the filter.
#   min_strategies    - if fewer strategies match, the filter cell prints a
#                       warning; it does not stop the notebook.
#   performance_limit - caps how many loaded strategies are passed to the
#                       performance calculation in section 6.
strategy_type = 'bollinger_bands'  # Query specific strategy type
symbol = 'SPY'  # Optional: filter by symbol
timeframe = '5m'  # Optional: filter by timeframe
min_strategies = 5  # Minimum strategies needed for analysis
performance_limit = 100  # Max strategies to calculate detailed performance for

In [None]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
from datetime import datetime
import warnings
# Suppress all warnings for the rest of the session
warnings.filterwarnings('ignore')

# Pandas display options: max_columns and width set to None, cell text capped at 50 characters
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = 50

# Matplotlib defaults: seaborn dark-grid style sheet and a 12x6 default figure
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# Import analytics modules
from src.analytics.modules.core import (
    load_global_traces,
    load_strategy_index,
    load_market_data,
    calculate_returns
)

from src.analytics.modules.signal_analysis import (
    count_signals,
    calculate_signal_persistence,
    analyze_signal_patterns
)

from src.analytics.modules.performance import (
    calculate_strategy_performance,
    calculate_ensemble_sharpe
)

from src.analytics.modules.visualization import (
    plot_signal_heatmap,
    plot_performance_distribution,
    plot_parameter_sensitivity
)

## 1. Query Strategy Store

Load strategies directly from the global store by type and parameters.

In [None]:
# Fetch the global strategy index and report what it contains:
# the total number of rows and the row count per strategy type.
strategy_index = load_strategy_index()

print(
    f"Total strategies in store: {len(strategy_index)}",
    f"\nStrategy types available:",
    strategy_index['strategy_type'].value_counts(),
    sep="\n",
)

In [None]:
# Narrow the strategy index down to the rows matching the notebook parameters.
# Each entry is (index column, wanted value); a falsy wanted value means
# "do not filter on this column". The special strategy_type 'all' is treated
# the same as no filter.
requested_filters = (
    ('strategy_type', strategy_type if strategy_type != 'all' else None),
    ('symbol', symbol),
    ('timeframe', timeframe),
)

filtered_strategies = strategy_index.copy()

for filter_column, filter_value in requested_filters:
    if not filter_value:
        continue
    keep_rows = filtered_strategies[filter_column] == filter_value
    filtered_strategies = filtered_strategies[keep_rows]
    print(f"Filtered to {filter_value}: {len(filtered_strategies)} strategies")

match_count = len(filtered_strategies)
print(f"\nTotal matching strategies: {match_count}")

if match_count == 0:
    # Nothing matched: list the combinations that do exist in the store
    print("\nNo strategies match the criteria!")
    print("\nAvailable combinations:")
    available = strategy_index.groupby(['strategy_type', 'symbol', 'timeframe']).size()
    print(available.to_frame('count'))
elif match_count < min_strategies:
    # Too few matches: warn, but let the notebook continue
    print(f"\nWarning: Only {match_count} strategies found (minimum {min_strategies} required)")
    print("Consider broadening your search criteria.")

In [None]:
# Describe every parameter column of the filtered strategies.
# A "parameter column" is any index column that is not one of the known
# metadata columns listed below.
if len(filtered_strategies) > 0:
    print("\nParameter columns found:")
    metadata_cols = {
        'strategy_hash', 'strategy_type', 'component_type',
        'symbol', 'timeframe', 'constraints', 'trace_path',
        'first_seen', 'full_config',
    }
    param_cols = [name for name in filtered_strategies.columns if name not in metadata_cols]

    for col in param_cols:
        column_values = filtered_strategies[col]
        if not pd.api.types.is_numeric_dtype(column_values):
            # Non-numeric parameter: only the number of distinct values is shown
            print(f"\n{col}: {column_values.nunique()} unique values")
            continue
        # Numeric parameter: full descriptive statistics
        print(f"\n{col}:")
        print(column_values.describe())

## 2. Load Signal Data

Load the actual signal traces for our filtered strategies.

In [None]:
# Load signals for all matching strategies.
#
# Both outputs are created before the guard so that later cells can safely
# test `combined_signals.empty` and `len(all_signals)` even when no strategy
# matched the filters and the loading loop below never runs.
#   all_signals      - list of per-strategy signal frames (non-empty ones only)
#   combined_signals - all of those frames concatenated, or an empty frame
all_signals = []
combined_signals = pd.DataFrame()

if len(filtered_strategies) > 0:
    print("Loading signal data...")

    for idx, strategy in filtered_strategies.iterrows():
        try:
            signals = load_global_traces(strategy_hash=strategy['strategy_hash'])
            if not signals.empty:
                # Tag every signal row with the strategy it came from
                signals['strategy_hash'] = strategy['strategy_hash']
                signals['strategy_type'] = strategy['strategy_type']

                # Copy the strategy's parameter values onto its signal rows
                for col in param_cols:
                    if col in strategy:
                        signals[col] = strategy[col]

                all_signals.append(signals)
        except Exception as e:
            # Best effort: report the failing strategy and keep loading the rest
            print(f"Error loading {strategy['strategy_hash']}: {e}")

    if all_signals:
        combined_signals = pd.concat(all_signals, ignore_index=True)
        print(f"\nLoaded {len(combined_signals):,} signal records from {len(all_signals)} strategies")

        # Convert timestamp to datetime (only on the combined frame; the
        # per-strategy frames in all_signals keep their original values)
        combined_signals['timestamp'] = pd.to_datetime(combined_signals['timestamp'])

        # Show date range
        print(f"Date range: {combined_signals['timestamp'].min()} to {combined_signals['timestamp'].max()}")
    else:
        print("No signal data could be loaded!")

## 3. Signal Analysis

Analyze the signals without needing market data or execution results.

In [None]:
# Count signals by strategy.
#
# `signal_counts` starts as an empty frame so the summary and export cells can
# check `signal_counts.empty` even when there are no signals to aggregate.
signal_counts = pd.DataFrame()

if not combined_signals.empty:
    signal_counts = combined_signals.groupby('strategy_hash').agg(
        # number of non-null 'signal' values per strategy
        total_bars=('signal', 'count'),
        # number of rows whose direction is anything other than 'flat'
        active_signals=('direction', lambda x: (x != 'flat').sum()),
    )

    signal_counts['signal_rate'] = signal_counts['active_signals'] / signal_counts['total_bars']

    # Add strategy info (the groupby result is indexed by strategy_hash)
    signal_counts = signal_counts.merge(
        filtered_strategies[['strategy_hash'] + param_cols],
        left_index=True,
        right_on='strategy_hash'
    )

    print("\nSignal Statistics by Strategy:")
    print(signal_counts.sort_values('signal_rate', ascending=False).head(10))

In [None]:
# Analyze signal patterns.
#
# `pattern_df` starts as an empty frame so the parameter-analysis, summary and
# export cells can test `len(pattern_df)` even when there are no signals.
pattern_df = pd.DataFrame()

if not combined_signals.empty:
    pattern_results = []

    # One pass over the data: groupby hands us each strategy's rows directly
    # instead of re-scanning the whole frame once per strategy hash.
    # sort=False keeps strategies in order of first appearance.
    for strategy_hash, strategy_signals in combined_signals.groupby('strategy_hash', sort=False):
        # Calculate signal persistence
        persistence = calculate_signal_persistence(strategy_signals)

        # Get pattern statistics
        patterns = analyze_signal_patterns(strategy_signals)

        pattern_results.append({
            'strategy_hash': strategy_hash,
            'avg_signal_duration': persistence['avg_duration'],
            'max_signal_duration': persistence['max_duration'],
            'num_signal_changes': patterns['num_changes'],
            'long_ratio': patterns['long_ratio'],
            'short_ratio': patterns['short_ratio']
        })

    pattern_df = pd.DataFrame(pattern_results)

    # Add strategy metadata
    pattern_df = pattern_df.merge(
        filtered_strategies[['strategy_hash'] + param_cols],
        on='strategy_hash'
    )

    print("\nSignal Pattern Analysis:")
    print(pattern_df.describe())

## 4. Parameter Analysis

Analyze how parameters affect signal generation.

In [None]:
# Analyze parameter relationships.
#
# `numeric_params` starts as an empty list so the summary cell can test it
# even when there are too few strategies for the plots below.
numeric_params = []

if len(pattern_df) > 5 and param_cols:
    # Find numeric parameters
    numeric_params = [col for col in param_cols
                      if pd.api.types.is_numeric_dtype(pattern_df[col])]

    if numeric_params:
        # One row of plots per numeric parameter. squeeze=False always returns
        # a 2-D axes array, so axes[i, j] works for a single parameter too.
        fig, axes = plt.subplots(
            len(numeric_params), 2,
            figsize=(15, 5 * len(numeric_params)),
            squeeze=False,
        )

        for i, param in enumerate(numeric_params):
            # Left column: signal rate vs parameter
            signal_param_df = signal_counts[[param, 'signal_rate']].dropna()
            if len(signal_param_df) > 0:
                axes[i, 0].scatter(signal_param_df[param], signal_param_df['signal_rate'], alpha=0.6)
                axes[i, 0].set_xlabel(param)
                axes[i, 0].set_ylabel('Signal Rate')
                axes[i, 0].set_title(f'Signal Rate vs {param}')

            # Right column: average signal duration vs parameter
            duration_param_df = pattern_df[[param, 'avg_signal_duration']].dropna()
            if len(duration_param_df) > 0:
                axes[i, 1].scatter(duration_param_df[param], duration_param_df['avg_signal_duration'], alpha=0.6)
                axes[i, 1].set_xlabel(param)
                axes[i, 1].set_ylabel('Avg Signal Duration')
                axes[i, 1].set_title(f'Signal Duration vs {param}')

        plt.tight_layout()
        plt.show()

## 5. Signal Correlation Analysis

Analyze correlations between different parameter configurations.

In [None]:
# Calculate signal correlation matrix.
#
# `low_corr_pairs` starts as an empty list so the summary cell can test it
# even when there are too few strategies to correlate.
low_corr_pairs = []

if len(all_signals) > 5:
    print("Calculating signal correlations...")

    # Build one numeric direction series per strategy
    signal_matrix = []
    strategy_info = []

    for signals in all_signals[:50]:  # Limit to 50 for performance
        if not signals.empty:
            strategy_hash = signals['strategy_hash'].iloc[0]
            # Convert directions to numeric; unmapped values become 0
            signal_series = signals.set_index('timestamp')['direction'].map(
                {'long': 1, 'short': -1, 'flat': 0}
            ).fillna(0)

            signal_matrix.append(signal_series)
            strategy_info.append(strategy_hash)

    if signal_matrix:
        # Align all series on the union of their timestamps: one column per strategy
        signal_df = pd.DataFrame(signal_matrix).T
        signal_df.columns = strategy_info

        # Calculate correlation
        corr_matrix = signal_df.corr()

        # Plot correlation heatmap, hiding the (redundant) strict upper triangle
        plt.figure(figsize=(12, 10))
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
        sns.heatmap(corr_matrix, mask=mask, cmap='coolwarm', center=0,
                    vmin=-1, vmax=1, square=True, linewidths=0.5,
                    cbar_kws={"shrink": 0.8})
        plt.title(f'{strategy_type} Strategy Signal Correlations')
        plt.tight_layout()
        plt.show()

        # Find low correlation pairs (each unordered pair is visited once)
        low_corr_threshold = 0.3
        n_strategies = len(corr_matrix)

        for i in range(n_strategies):
            for j in range(i + 1, n_strategies):
                pair_corr = corr_matrix.iloc[i, j]
                # NaN correlations fail this comparison and are skipped
                if abs(pair_corr) < low_corr_threshold:
                    low_corr_pairs.append({
                        'strategy1': corr_matrix.index[i],
                        'strategy2': corr_matrix.index[j],
                        'correlation': pair_corr
                    })

        print(f"\nFound {len(low_corr_pairs)} low-correlation strategy pairs (< {low_corr_threshold})")
        if low_corr_pairs:
            print("\nTop diversification candidates:")
            # Rank by absolute correlation so the least-correlated pairs come first
            ranked_pairs = pd.DataFrame(low_corr_pairs).sort_values(
                'correlation', key=lambda s: s.abs()
            )
            print(ranked_pairs.head(10))

## 6. Performance Estimation (Optional)

If we have market data available, we can estimate performance without execution.

In [None]:
# Try to load market data for performance estimation (optional section).
#
# `perf_df` is reset to an empty frame first so the summary cell sees an empty
# result, not a missing or stale variable, when this section is skipped or fails.
perf_df = pd.DataFrame()

if not combined_signals.empty and symbol:
    # Step 1: load market data. Only failures in this step are reported as
    # "Could not load market data".
    has_market_data = False
    try:
        print(f"\nAttempting to load market data for {symbol}...")
        market_data = load_market_data(symbol)
        has_market_data = market_data is not None and not market_data.empty
        if not has_market_data:
            print("No market data found - skipping performance estimation")
    except Exception as e:
        print(f"Could not load market data: {e}")
        print("Skipping performance estimation")

    # Step 2: estimate, plot and export. Still best-effort (this section is
    # optional), but failures are reported as estimation failures.
    if has_market_data:
        try:
            print(f"Loaded {len(market_data):,} bars of market data")

            # Calculate performance for a subset of strategies
            perf_results = []
            strategies_to_analyze = min(performance_limit, len(all_signals))

            print(f"\nCalculating performance for {strategies_to_analyze} strategies...")

            for i, signals in enumerate(all_signals[:strategies_to_analyze]):
                if signals.empty:
                    continue

                # Resolve the hash before the try so the error message below
                # always names the strategy that actually failed.
                strategy_hash = signals['strategy_hash'].iloc[0]
                try:
                    perf = calculate_strategy_performance(
                        signals,
                        market_data,
                        initial_capital=100000
                    )
                    perf['strategy_hash'] = strategy_hash
                    perf_results.append(perf)
                except Exception as e:
                    print(f"  Error calculating performance for {strategy_hash}: {e}")
                else:
                    if (i + 1) % 10 == 0:
                        print(f"  Processed {i + 1}/{strategies_to_analyze} strategies")

            if perf_results:
                perf_df = pd.DataFrame(perf_results)

                # Add strategy metadata
                perf_df = perf_df.merge(
                    filtered_strategies[['strategy_hash'] + param_cols],
                    on='strategy_hash'
                )

                print("\nPerformance Summary:")
                print(perf_df[['total_return', 'sharpe_ratio', 'max_drawdown',
                               'win_rate', 'profit_factor']].describe())

                # Plot performance distribution (2x2 grid)
                fig, axes = plt.subplots(2, 2, figsize=(15, 10))

                perf_df['sharpe_ratio'].hist(bins=30, ax=axes[0, 0])
                axes[0, 0].set_title('Sharpe Ratio Distribution')
                axes[0, 0].set_xlabel('Sharpe Ratio')

                perf_df['total_return'].hist(bins=30, ax=axes[0, 1])
                axes[0, 1].set_title('Total Return Distribution')
                axes[0, 1].set_xlabel('Total Return')

                perf_df.plot.scatter('max_drawdown', 'sharpe_ratio', ax=axes[1, 0])
                axes[1, 0].set_title('Risk-Return Profile')
                axes[1, 0].set_xlabel('Max Drawdown')
                axes[1, 0].set_ylabel('Sharpe Ratio')

                perf_df.plot.scatter('win_rate', 'profit_factor', ax=axes[1, 1])
                axes[1, 1].set_title('Win Rate vs Profit Factor')
                axes[1, 1].set_xlabel('Win Rate')
                axes[1, 1].set_ylabel('Profit Factor')

                plt.tight_layout()
                plt.show()

                # Save performance results to a timestamped CSV in the working directory
                perf_df.to_csv(f'{strategy_type}_performance_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv', index=False)
                print(f"\nPerformance results saved to CSV")
        except Exception as e:
            print(f"Performance estimation failed: {e}")
            print("Skipping performance estimation")

## 7. Summary and Recommendations

Summarize findings and suggest next steps.

In [None]:
# Generate summary.
#
# Several inputs are produced by earlier cells only when their guards pass
# (enough strategies, signals available, market data present). They are looked
# up defensively here so the summary still renders, with the relevant sections
# omitted, when an upstream cell was skipped.
upstream_results = globals()
summary_signal_counts = upstream_results.get('signal_counts', pd.DataFrame())
summary_pattern_df = upstream_results.get('pattern_df', pd.DataFrame())
summary_perf_df = upstream_results.get('perf_df', pd.DataFrame())
summary_low_corr_pairs = upstream_results.get('low_corr_pairs', [])
summary_numeric_params = upstream_results.get('numeric_params', [])

print("=" * 80)
print(f"ANALYSIS SUMMARY: {strategy_type} Strategies")
print("=" * 80)

if len(filtered_strategies) > 0:
    print(f"\n✓ Analyzed {len(filtered_strategies)} {strategy_type} strategies")
    print(f"✓ Total signal records: {len(combined_signals):,}")

    if not summary_signal_counts.empty:
        print(f"\n📊 Signal Statistics:")
        print(f"   - Average signal rate: {summary_signal_counts['signal_rate'].mean():.2%}")
        print(f"   - Signal rate range: {summary_signal_counts['signal_rate'].min():.2%} - {summary_signal_counts['signal_rate'].max():.2%}")

    if len(summary_pattern_df) > 0:
        print(f"\n📈 Pattern Analysis:")
        print(f"   - Avg signal duration: {summary_pattern_df['avg_signal_duration'].mean():.1f} bars")
        print(f"   - Long/Short ratio: {summary_pattern_df['long_ratio'].mean():.2f} / {summary_pattern_df['short_ratio'].mean():.2f}")

    if not summary_perf_df.empty:
        print(f"\n💰 Performance Estimates:")
        print(f"   - Median Sharpe: {summary_perf_df['sharpe_ratio'].median():.2f}")
        print(f"   - Best Sharpe: {summary_perf_df['sharpe_ratio'].max():.2f}")
        print(f"   - Strategies with Sharpe > 1.0: {(summary_perf_df['sharpe_ratio'] > 1.0).sum()}")

    print(f"\n🎯 Recommendations:")
    if len(filtered_strategies) < 20:
        print(f"   - Consider generating more {strategy_type} variations for robust analysis")

    if len(summary_low_corr_pairs) > 0:
        print(f"   - Found {len(summary_low_corr_pairs)} low-correlation pairs suitable for ensemble")

    if summary_numeric_params:
        print(f"   - Key parameters to optimize: {', '.join(summary_numeric_params)}")

else:
    print(f"\n❌ No {strategy_type} strategies found in the store")
    print("\nNext steps:")
    print("1. Run signal generation with this strategy type")
    print("2. Check available strategy types in the store")
    print("3. Adjust filter criteria (symbol, timeframe)")

print("\n" + "=" * 80)

In [None]:
# Export key results as timestamped CSV files in the working directory.
#
# `signal_counts` and `pattern_df` are produced by earlier cells only when
# signal data was loaded, so they are looked up defensively: a missing result
# is treated like an empty one and its export is skipped.
if len(filtered_strategies) > 0:
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    export_signal_counts = globals().get('signal_counts', pd.DataFrame())
    export_pattern_df = globals().get('pattern_df', pd.DataFrame())

    # Save strategy metadata
    filtered_strategies.to_csv(f'{strategy_type}_strategies_{timestamp}.csv', index=False)
    print(f"\nExported {len(filtered_strategies)} strategy definitions")

    # Save signal statistics if available
    if not export_signal_counts.empty:
        export_signal_counts.to_csv(f'{strategy_type}_signal_stats_{timestamp}.csv', index=False)
        print(f"Exported signal statistics")

    # Save pattern analysis if available
    if len(export_pattern_df) > 0:
        export_pattern_df.to_csv(f'{strategy_type}_patterns_{timestamp}.csv', index=False)
        print(f"Exported pattern analysis")

    print(f"\nAll results exported with timestamp: {timestamp}")