# Growth Scorer Evaluation

This notebook evaluates the Growth Scorer ensemble against the targets defined in `docs/metrics_and_evaluation.md`.

## 1. Setup

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from models.growth_scorer import GrowthScorer
from backend.utils.growth_data import (
    load_fundamentals,
    load_technical_indicators,
    load_prices,
    merge_growth_features,
    compute_forward_returns,
    engineer_growth_features,
    split_growth_data,
    validate_growth_features
)
from backend.utils.growth_metrics import (
    compute_all_growth_metrics,
    spearman_correlation,
    top_k_precision,
    compute_excess_return,
    information_ratio,
    decile_analysis,
    hit_rate
)
from backend.utils.config import config

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'whitegrid' style for the plots produced below.
sns.set_style('whitegrid')
print("Imports complete")

## 2. Load Model and Data

In [None]:
# Resolve the checkpoint to evaluate. The GROWTH_CHECKPOINT environment
# variable takes precedence; otherwise the conventional "latest" file is used.
checkpoint_path = os.environ.get('GROWTH_CHECKPOINT', 'models/checkpoints/growth_scorer_latest.pkl')

if not os.path.exists(checkpoint_path):
    # Fall back to the last growth_scorer_*.pkl file, ordered by filename,
    # in the checkpoint directory. Stays None when nothing suitable exists.
    checkpoint_dir = 'models/checkpoints'
    checkpoint_path = None
    if os.path.exists(checkpoint_dir):
        candidates = sorted(
            name for name in os.listdir(checkpoint_dir)
            if name.startswith('growth_scorer_') and name.endswith('.pkl')
        )
        if candidates:
            checkpoint_path = os.path.join(checkpoint_dir, candidates[-1])
            print(f"Using most recent checkpoint: {checkpoint_path}")
        else:
            print("No growth scorer checkpoints found. Please train a model first using scripts/train_growth_scorer.py")

checkpoint_available = bool(checkpoint_path) and os.path.exists(checkpoint_path)
if not checkpoint_available:
    # Downstream cells check for a missing model / empty metadata.
    print("ERROR: No checkpoint found. Please train a model first.")
    model = None
    metadata = {}
else:
    # NOTE(review): the .pkl extension suggests pickle-based loading; only
    # point GROWTH_CHECKPOINT at trusted files.
    model, metadata = GrowthScorer.load_checkpoint(checkpoint_path)
    checkpoint_args = metadata.get('args', {})
    model_type = checkpoint_args.get('model_type', 'unknown')
    num_features = metadata.get('num_features', len(model.feature_names))
    split_type = metadata.get('split_type', 'unknown')
    print(f"Loaded model from: {checkpoint_path}")
    print(f"Model type: {model_type}")
    print(f"Number of features: {num_features}")
    print(f"Split type: {split_type}")

# Load data
# Input locations and the forecast horizon are read from the args recorded in
# the checkpoint metadata, falling back to the defaults below when absent.
training_args = metadata.get('args', {})
fundamentals_dir = training_args.get('fundamentals_dir', 'data/raw/fundamentals')
technical_dir = training_args.get('technical_dir', 'data/processed')
price_dir = training_args.get('price_dir', 'data/raw/prices')
horizon_days = training_args.get('horizon_days', 60)

print("Loading data from:")
for source_label, source_dir in (
    ('Fundamentals', fundamentals_dir),
    ('Technicals', technical_dir),
    ('Prices', price_dir),
):
    print(f"  {source_label}: {source_dir}")
print(f"  Horizon: {horizon_days} days")

# Raw inputs. Technical indicators are loaded while price_df does not yet
# carry the forward-return column; keep this ordering.
price_df = load_prices(price_dir)
print(f"Loaded {len(price_df)} price records")

fund_df = load_fundamentals(fundamentals_dir)
print(f"Loaded {len(fund_df)} fundamental records")

tech_df = load_technical_indicators(technical_dir, price_df=price_df)
print(f"Loaded {len(tech_df)} technical indicator records")

# Target: forward return over the configured horizon, attached to price_df.
price_df['fwd_return'] = compute_forward_returns(price_df, horizon_days)

# Build the modelling frame. Without fundamentals, fall back to technicals
# joined with the target on (ticker, date).
if fund_df.empty:
    target_frame = price_df[['ticker', 'date', 'fwd_return']]
    merged_df = tech_df.merge(target_frame, on=['ticker', 'date'], how='inner')
else:
    merged_df = merge_growth_features(fund_df, tech_df, price_df)

# Rows without a realized forward return cannot be evaluated.
merged_df = merged_df.dropna(subset=['fwd_return'])
print(f"Merged dataset size: {len(merged_df)} records")

# Split data (use same method as training)
# NOTE(review): the checkpoint metadata records a split_type, but it is not
# passed here - confirm split_growth_data's default matches the training split.
train_df, val_df, test_df = split_growth_data(merged_df)
split_sizes = f"Train={len(train_df)}, Val={len(val_df)}, Test={len(test_df)}"
print(f"Split sizes: {split_sizes}")

# Engineer features
# The scalers returned by the training-split call are reused for the
# validation and test splits.
print("Engineering features...")
train_engineered, feature_names, scalers = engineer_growth_features(train_df, is_train=True)
val_engineered, _, _ = engineer_growth_features(val_df, is_train=False, scalers=scalers)
test_engineered, _, _ = engineer_growth_features(test_df, is_train=False, scalers=scalers)

# Extract feature matrices and targets
X_val = val_engineered[feature_names].values
y_val = val_df['fwd_return'].values

X_test = test_engineered[feature_names].values
y_test = test_df['fwd_return'].values

# Features come from the engineered frames while targets come from the
# pre-engineering frames. If feature engineering drops rows, scores and
# returns would be silently misaligned, so fail loudly here instead.
assert len(X_val) == len(y_val), (
    f"Validation features ({len(X_val)} rows) and targets ({len(y_val)} rows) are misaligned"
)
assert len(X_test) == len(y_test), (
    f"Test features ({len(X_test)} rows) and targets ({len(y_test)} rows) are misaligned"
)

print(f"Validation feature shape: {X_val.shape}")
print(f"Test feature shape: {X_test.shape}")

# Generate predictions
# Compare against None explicitly: truthiness of a model object depends on
# whether its class defines __bool__/__len__, which is not a reliable
# "was a model loaded?" signal.
if model is not None:
    val_scores = model.predict(X_val)
    test_scores = model.predict(X_test)
    print(f"Predictions generated")
    print(f"Val scores range: [{val_scores.min():.4f}, {val_scores.max():.4f}]")
    print(f"Test scores range: [{test_scores.min():.4f}, {test_scores.max():.4f}]")
else:
    # Later cells check these for None before computing metrics or plotting.
    print("ERROR: No model loaded")
    val_scores = None
    test_scores = None

## 3. Validation Metrics

Targets: Spearman correlation >= 0.30, Top-10 precision >= 70%, excess return >= 3%

In [None]:
# Compute validation metrics and report each against its target.
if val_scores is not None:
    val_metrics = compute_all_growth_metrics(val_scores, y_val, benchmark_return=0.0)

    print("VALIDATION METRICS")
    print("=" * 60)
    print(f"Spearman Correlation: {val_metrics['correlation']:.4f} (p={val_metrics.get('spearman_p_value', 1.0):.4f})")
    print(f"  Target: >= 0.30")
    print(f"  Status: {'PASS ✓' if val_metrics['correlation'] >= 0.30 else 'FAIL ✗'}")
    print()

    if 'top_k_precision' in val_metrics:
        print("Top-K Precision:")
        for k, prec in sorted(val_metrics['top_k_precision'].items()):
            # Only Top-10 has a target. Report a miss explicitly, like the
            # other metrics do, instead of leaving the status blank.
            if k == 10:
                status = 'PASS ✓' if prec >= 0.70 else 'FAIL ✗'
            else:
                status = ''
            print(f"  Top-{k}: {prec:.4f} {status}")
        print(f"  Target (Top-10): >= 0.70")
    print()

    print(f"Excess Return: {val_metrics['excess_return']:.4f} ({val_metrics['excess_return']*100:.2f}%)")
    print(f"  p-value: {val_metrics.get('excess_p_value', 1.0):.4f}")
    print(f"  Target: >= 0.03 (3%)")
    print(f"  Status: {'PASS ✓' if val_metrics['excess_return'] >= 0.03 else 'FAIL ✗'}")
    print()

    # Decile spread is reported for information only; no status is printed.
    if 'decile_spread' in val_metrics:
        print(f"Decile Spread (Top - Bottom): {val_metrics['decile_spread']:.4f} ({val_metrics['decile_spread']*100:.2f}%)")
    print("=" * 60)
else:
    print("No validation scores available")


In [None]:
## 4. Test Set Metrics


In [None]:
# Compute test metrics and report each against its target.
if test_scores is not None:
    test_metrics = compute_all_growth_metrics(test_scores, y_test, benchmark_return=0.0)

    print("TEST METRICS")
    print("=" * 60)
    print(f"Spearman Correlation: {test_metrics['correlation']:.4f} (p={test_metrics.get('spearman_p_value', 1.0):.4f})")
    print(f"  Target: >= 0.30")
    print(f"  Status: {'PASS ✓' if test_metrics['correlation'] >= 0.30 else 'FAIL ✗'}")
    print()

    if 'top_k_precision' in test_metrics:
        print("Top-K Precision:")
        for k, prec in sorted(test_metrics['top_k_precision'].items()):
            # Only Top-10 has a target. Report a miss explicitly, like the
            # other metrics do, instead of leaving the status blank.
            if k == 10:
                status = 'PASS ✓' if prec >= 0.70 else 'FAIL ✗'
            else:
                status = ''
            print(f"  Top-{k}: {prec:.4f} {status}")
        print(f"  Target (Top-10): >= 0.70")
    print()

    print(f"Excess Return: {test_metrics['excess_return']:.4f} ({test_metrics['excess_return']*100:.2f}%)")
    print(f"  p-value: {test_metrics.get('excess_p_value', 1.0):.4f}")
    print(f"  Target: >= 0.03 (3%)")
    print(f"  Status: {'PASS ✓' if test_metrics['excess_return'] >= 0.03 else 'FAIL ✗'}")
    print()

    # Decile spread is reported for information only; no status is printed.
    if 'decile_spread' in test_metrics:
        print(f"Decile Spread (Top - Bottom): {test_metrics['decile_spread']:.4f} ({test_metrics['decile_spread']*100:.2f}%)")
    print("=" * 60)
else:
    print("No test scores available")


In [None]:
## 5. Visualizations


In [None]:
# Score-vs-return scatter plots (validation on the left, test on the right),
# each with a fitted first-degree trend line.
if test_scores is None:
    print("No scores available for plotting")
else:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # (axes, panel label, scores, realized returns, metrics, extra scatter kwargs)
    panels = [
        (ax1, 'Validation', val_scores, y_val, val_metrics, {}),
        (ax2, 'Test', test_scores, y_test, test_metrics, {'color': 'orange'}),
    ]

    for ax, panel_label, scores, returns, metrics, scatter_kwargs in panels:
        ax.scatter(scores, returns, alpha=0.5, s=20, **scatter_kwargs)
        ax.set_xlabel('Predicted Growth Score')
        ax.set_ylabel('Realized Return')
        ax.set_title(f'{panel_label}: Score vs Return (ρ={metrics["correlation"]:.3f})')
        ax.grid(True, alpha=0.3)

        # Least-squares line over the observed score range.
        trend = np.poly1d(np.polyfit(scores, returns, 1))
        x_line = np.linspace(scores.min(), scores.max(), 100)
        ax.plot(x_line, trend(x_line), 'r--', alpha=0.8, linewidth=2)

    plt.tight_layout()
    plt.savefig('growth_scorer_correlation.png', dpi=300, bbox_inches='tight')
    plt.show()


In [None]:
# Decile analysis: bar chart of mean realized return per predicted-score
# decile on the test split, with std error bars and per-decile counts.
if test_scores is None:
    print("No scores available for decile analysis")
else:
    deciles = decile_analysis(test_scores, y_test, num_deciles=10)

    if deciles.empty:
        print("Insufficient data for decile analysis")
    else:
        n_deciles = len(deciles)
        x_pos = np.arange(n_deciles)
        colors = plt.cm.RdYlGn(np.linspace(0.3, 0.9, n_deciles))

        fig, ax = plt.subplots(figsize=(10, 6))
        bars = ax.bar(
            x_pos,
            deciles['mean'],
            yerr=deciles['std'],
            capsize=5,
            color=colors,
            alpha=0.8,
            edgecolor='black',
        )

        ax.set_xlabel('Decile (1=Highest Score, 10=Lowest Score)', fontsize=12)
        ax.set_ylabel('Mean Realized Return', fontsize=12)
        ax.set_title('Decile Analysis: Mean Return by Predicted Score Decile', fontsize=14, fontweight='bold')
        ax.set_xticks(x_pos)
        ax.set_xticklabels([f'D{decile}' for decile in deciles.index])
        ax.axhline(0, color='black', linestyle='--', linewidth=1, alpha=0.5)
        ax.grid(axis='y', alpha=0.3)

        # Sample-size label placed just above the top of each error bar.
        label_offset = 0.005
        for bar_idx, (_, stats) in enumerate(deciles.iterrows()):
            label_y = stats['mean'] + stats['std'] + label_offset
            ax.text(bar_idx, label_y, f"n={int(stats['count'])}",
                    ha='center', va='bottom', fontsize=8)

        plt.tight_layout()
        plt.savefig('growth_scorer_deciles.png', dpi=300, bbox_inches='tight')
        plt.show()

        print("\nDecile Statistics:")
        print(deciles.round(4))


In [None]:
# Feature importance: horizontal bar chart of the highest-ranked features.
# Compare against None explicitly; truthiness of a model object depends on
# whether its class defines __bool__/__len__.
if model is not None and hasattr(model, 'get_feature_importances'):
    importance_df = model.get_feature_importances()
    top_n = 20

    # head() returns fewer rows when the model has fewer than top_n features,
    # so label the output with the number actually shown.
    # NOTE(review): assumes importance_df is already sorted by descending
    # importance - confirm against get_feature_importances.
    top_features = importance_df.head(top_n)
    n_shown = len(top_features)

    fig, ax = plt.subplots(figsize=(10, 8))

    y_pos = np.arange(n_shown)

    ax.barh(y_pos, top_features['importance'], color='steelblue', alpha=0.8)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(top_features['feature'])
    ax.invert_yaxis()  # first row of top_features at the top of the chart
    ax.set_xlabel('Importance', fontsize=12)
    ax.set_title(f'Top {n_shown} Feature Importances', fontsize=14, fontweight='bold')
    ax.grid(axis='x', alpha=0.3)

    plt.tight_layout()
    plt.savefig('growth_scorer_feature_importance.png', dpi=300, bbox_inches='tight')
    plt.show()

    print(f"\nTop {n_shown} Features:")
    print(top_features.to_string(index=False))
else:
    print("Feature importances not available")


In [None]:
## 6. Summary and Conclusions


In [None]:
# Summary table: headline metrics for validation and test against targets,
# followed by an overall pass/fail assessment.
if test_scores is not None and val_scores is not None:

    def _headline_values(metrics):
        """Return (correlation, top-10 precision, excess return, decile spread).

        Optional entries (top-10 precision, decile spread) default to 0 when
        the metrics dict does not contain them.
        """
        return (
            metrics['correlation'],
            metrics.get('top_k_precision', {}).get(10, 0),
            metrics['excess_return'],
            metrics.get('decile_spread', 0),
        )

    def _target_checks(values):
        """Return one boolean per headline metric: did it meet its target?"""
        corr, top10, excess, spread = values
        return [corr >= 0.30, top10 >= 0.70, excess >= 0.03, spread > 0]

    def _format_values(values):
        """Format headline values for display (returns/spread as percentages)."""
        corr, top10, excess, spread = values
        return [
            f"{corr:.4f}",
            f"{top10:.4f}",
            f"{excess*100:.2f}%",
            f"{spread*100:.2f}%",
        ]

    def _status_marks(checks):
        """Map pass/fail booleans to check/cross marks."""
        return ['✓' if ok else '✗' for ok in checks]

    val_values = _headline_values(val_metrics)
    test_values = _headline_values(test_metrics)
    val_checks = _target_checks(val_values)
    test_checks = _target_checks(test_values)

    summary_data = {
        'Metric': [
            'Spearman Correlation',
            'Top-10 Precision',
            'Excess Return (%)',
            'Decile Spread (%)'
        ],
        'Target': [
            '≥ 0.30',
            '≥ 0.70',
            '≥ 3.0%',
            'Positive'
        ],
        'Validation': _format_values(val_values),
        'Test': _format_values(test_values),
        'Val Status': _status_marks(val_checks),
        'Test Status': _status_marks(test_checks),
    }

    summary_df = pd.DataFrame(summary_data)
    print("\n" + "=" * 80)
    print("GROWTH SCORER EVALUATION SUMMARY")
    print("=" * 80)
    print(summary_df.to_string(index=False))
    print("=" * 80)

    # Overall assessment uses the three targeted metrics only; the decile
    # spread is shown in the table but does not gate the verdict.
    val_pass = all(val_checks[:3])
    test_pass = all(test_checks[:3])

    print("\nOVERALL ASSESSMENT:")
    print(f"Validation: {'PASS ✓' if val_pass else 'FAIL ✗'}")
    print(f"Test:       {'PASS ✓' if test_pass else 'FAIL ✗'}")
    # f-string is required here so the strong/limited wording is interpolated
    # rather than printed as a literal expression.
    print(f"\nThe model demonstrates {'strong' if test_pass else 'limited'} predictive power for growth scoring.")
else:
    print("Insufficient data for summary")
