# Forecasting Model Evaluation

Comprehensive evaluation of LSTM, GRU, and Transformer forecasting models.

Aligned with **docs/metrics_and_evaluation.md** specifications:
- MAE reduction ≥15% vs ARIMA baseline
- Directional Accuracy ≥55% (1-day) / ≥53% (3-day) / ≥52% (7-day)
- RMSE reduction ≥12%
- Sharpe ratio ≥1.0
- Inference latency ≤300ms p95

## 1. Setup

In [None]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from scipy import stats

# Add project root to path.
# Only step up one level when the working directory itself is named
# 'notebooks'. A substring test on the full path would also match any ancestor
# whose name merely contains "notebooks" (e.g. ~/notebooks_archive/<repo>) and
# would then point project_root one level above the repository.
# NOTE(review): assumes the notebook is launched either from the repository
# root or from its 'notebooks' directory - confirm for other layouts.
_cwd = Path.cwd()
project_root = _cwd.parent if _cwd.name == 'notebooks' else _cwd
sys.path.insert(0, str(project_root))

from backend.utils.config import config
from backend.utils.dataset import ForecastingDataset
from backend.utils.preprocessing import load_scaler_metadata
from backend.utils.model_utils import load_latest_checkpoint, count_model_parameters, measure_inference_latency, get_device
from backend.utils.baselines import ARIMABaseline, MovingAverageBaseline, ExponentialSmoothingBaseline, evaluate_baseline
from backend.utils.metrics import compute_all_metrics, mean_absolute_error, root_mean_squared_error, directional_accuracy
from backend.utils.trading_sim import TradingSimulator, backtest_model
from models import LSTMForecaster, GRUForecaster, TransformerForecaster

# Plot defaults applied to every figure created below
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Setup complete!")

## 2. Configuration

In [None]:
# Set paths and parameters
TICKER = 'RELIANCE.NS'  # Change as needed
DATA_DIR = project_root / 'data' / 'processed' / TICKER
CHECKPOINT_DIR = project_root / 'models' / 'checkpoints'

# Find latest processed data: the most recently modified sub-directory of
# DATA_DIR is used. Fail with an actionable message instead of a bare
# IndexError when nothing has been processed for this ticker yet.
if not DATA_DIR.is_dir():
    raise FileNotFoundError(f"Processed data directory not found: {DATA_DIR}")

subdirs = [d for d in DATA_DIR.iterdir() if d.is_dir()]
if not subdirs:
    raise FileNotFoundError(f"No processed data runs found under: {DATA_DIR}")

subdirs.sort(key=lambda d: d.stat().st_mtime, reverse=True)
DATA_PATH = subdirs[0]

print(f"Ticker: {TICKER}")
print(f"Data path: {DATA_PATH}")
print(f"Checkpoint directory: {CHECKPOINT_DIR}")

# Device
device = get_device()
print(f"Device: {device}")

## 3. Data Loading

In [None]:
# Build the held-out test split from the selected processed-data directory
test_dataset = ForecastingDataset(DATA_PATH, split='test')

# Read the metadata file stored next to the processed arrays
import json
metadata = json.loads((DATA_PATH / 'metadata.json').read_text())

# Quick sanity summary of what was loaded
print(f"Test samples: {len(test_dataset)}")
print(f"Input features: {len(metadata['feature_names'])}")
print(f"Date range: {metadata.get('split_dates', {})}")
print(f"\nFeature names: {metadata['feature_names'][:5]}...")

## 4. Model Loading

In [None]:
# Load models
input_dim = len(metadata['feature_names'])
models_dict = {}

# Display name -> forecaster class. The checkpoint lookup key is the
# lower-cased display name ('lstm', 'gru', 'transformer').
_forecaster_classes = {
    'LSTM': LSTMForecaster,
    'GRU': GRUForecaster,
    'Transformer': TransformerForecaster,
}

# One shared loading routine replaces three copy-pasted try/except blocks.
# A missing checkpoint is reported and skipped, so models_dict ends up holding
# only the models that were actually found.
for _display_name, _forecaster_cls in _forecaster_classes.items():
    try:
        _ckpt_path, _ckpt_meta = load_latest_checkpoint(
            CHECKPOINT_DIR, _display_name.lower(), TICKER
        )
        _model = _forecaster_cls(input_dim=input_dim)
        _model.load_checkpoint(_ckpt_path)
        _model.to(device)
        _model.eval()  # inference mode for the evaluation cells below
        models_dict[_display_name] = _model
        print(f"Loaded {_display_name}: {count_model_parameters(_model):,} parameters")
    except FileNotFoundError:
        print(f"{_display_name} checkpoint not found")

## 5. Baseline Models

In [None]:
# Target arrays used to fit (train) and score (test) the baselines
train_targets = np.load(DATA_PATH / 'train_targets.npy')
test_targets = np.load(DATA_PATH / 'test_targets.npy')

baselines_dict = {}


def _register_baseline(name, baseline):
    """Evaluate `baseline` at horizon=1, store it under `name`, and print MAE/RMSE."""
    scores = evaluate_baseline(baseline, train_targets, test_targets, horizon=1)
    baselines_dict[name] = {'model': baseline, 'metrics': scores}
    print(f"{name} - MAE: {scores['mae']:.4f}, RMSE: {scores['rmse']:.4f}")


# ARIMA (best effort: a failure is reported and the notebook continues)
try:
    _register_baseline('ARIMA', ARIMABaseline())
except Exception as e:
    print(f"ARIMA failed: {e}")

# Moving Average (not guarded: any error here stops the cell)
_register_baseline('Moving Average', MovingAverageBaseline(window=20))

# Exponential Smoothing (best effort, same as ARIMA)
try:
    _register_baseline('Exp Smoothing', ExponentialSmoothingBaseline())
except Exception as e:
    print(f"Exponential Smoothing failed: {e}")

## 6. Generate Predictions

In [None]:
# Generate predictions for all models
predictions_dict = {}

test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)


def _first_step(values):
    """Return a 1-D array with the first element along every trailing axis.

    A 1-D input is returned as-is. An (N, H) or (N, 1) input yields column 0.
    Always collapsing to 1-D matters: leaving predictions as (N, 1) next to
    (N,) actuals makes `actuals - predictions` broadcast to an (N, N) matrix
    in the later error computations.
    """
    values = np.asarray(values)
    if values.ndim <= 1:
        return values.reshape(-1)
    return values[(slice(None),) + (0,) * (values.ndim - 1)]


for model_name, model in models_dict.items():
    all_preds = []
    all_targets = []

    # Inference only: no gradients needed
    with torch.no_grad():
        for features, targets in test_loader:
            features = features.to(device)
            preds = model(features)
            all_preds.append(preds.cpu().numpy())
            all_targets.append(targets.cpu().numpy())

    # Take first forecast step if multi-step, and always end up 1-D
    preds = _first_step(np.concatenate(all_preds))
    targets = _first_step(np.concatenate(all_targets))

    predictions_dict[model_name] = {'predictions': preds, 'actuals': targets}
    print(f"{model_name}: {len(preds)} predictions generated")

# Create results DataFrame
if predictions_dict:
    # Actuals come from the first model that produced predictions
    actual_values = next(iter(predictions_dict.values()))['actuals']
else:
    # No checkpoint was loaded (the loading cell tolerates that); fall back to
    # the raw test targets so the baseline columns can still be tabulated.
    # NOTE(review): assumes test_targets.npy holds the same targets the test
    # dataset yields - confirm.
    actual_values = _first_step(test_targets)

results_df = pd.DataFrame({'actual': actual_values})

for model_name, data in predictions_dict.items():
    results_df[f'{model_name.lower()}_pred'] = data['predictions']

# Add baseline predictions
# NOTE(review): predict(1) is called once per test sample without feeding the
# baseline any new observations; if predict() is stateless every row of the
# column will hold the same value - confirm this is the intended comparison.
for baseline_name, baseline_data in baselines_dict.items():
    baseline_preds = []
    for _ in range(len(test_targets)):
        pred = baseline_data['model'].predict(1)
        baseline_preds.append(pred[0] if isinstance(pred, np.ndarray) else pred)
    results_df[f'{baseline_name.lower().replace(" ", "_")}_pred'] = baseline_preds[:len(results_df)]

print("\nResults DataFrame:")
print(results_df.head())

## 7. Metrics Computation

In [None]:
# Score every neural model on its own (actuals, predictions) pair
metrics_results = {
    model_name: compute_all_metrics(data['actuals'], data['predictions'])
    for model_name, data in predictions_dict.items()
}

# Baseline metrics were already computed when the baselines were evaluated
metrics_results.update(
    (baseline_name, baseline_data['metrics'])
    for baseline_name, baseline_data in baselines_dict.items()
)

# One row per model/baseline, one column per metric
metrics_df = pd.DataFrame(metrics_results).T.round(4)

rule = "=" * 80
print("\n" + rule)
print("METRICS COMPARISON")
print(rule)
print(metrics_df)
print(rule)

# Best performer per metric: columns prefixed 'da_' are treated as
# higher-is-better, every other column as lower-is-better
print("\nBest Performers:")
for col in metrics_df.columns:
    column = metrics_df[col]
    best = column.idxmax() if col.startswith('da_') else column.idxmin()
    print(f"{col}: {best} ({metrics_df.loc[best, col]:.4f})")

## 8. Statistical Significance Tests

In [None]:
# Paired t-test for MAE against ARIMA baseline
if 'ARIMA' in baselines_dict:
    # Work on 1-D arrays throughout. The prediction cell allows targets to be
    # multi-step (2-D), and subtracting arrays of different rank either fails
    # or silently broadcasts into a matrix of meaningless "errors".
    arima_preds = np.ravel(baselines_dict['ARIMA']['model'].predict(len(test_targets)))
    arima_actuals = np.asarray(test_targets)
    if arima_actuals.ndim > 1:
        # Keep the first forecast step of each sample
        arima_actuals = arima_actuals[(slice(None),) + (0,) * (arima_actuals.ndim - 1)]
    arima_errors = np.abs(arima_actuals[:len(arima_preds)] - arima_preds)

    print("\nStatistical Significance (Paired t-test vs ARIMA):")
    print("-" * 60)

    for model_name, data in predictions_dict.items():
        model_errors = np.abs(np.ravel(data['actuals']) - np.ravel(data['predictions']))
        # Pair the two error series over their common prefix
        min_len = min(len(arima_errors), len(model_errors))

        if min_len < 2:
            # A paired t-test is undefined with fewer than two pairs
            print(f"{model_name:15s}: skipped (only {min_len} paired sample(s))")
            continue

        arima_slice = arima_errors[:min_len]
        model_slice = model_errors[:min_len]
        t_stat, p_value = stats.ttest_rel(arima_slice, model_slice)

        significance = "***" if p_value < 0.001 else "**" if p_value < 0.01 else "*" if p_value < 0.05 else "ns"
        # Relative MAE improvement over ARIMA; undefined when ARIMA's MAE is 0
        arima_mae = arima_slice.mean()
        improvement = ((arima_mae - model_slice.mean()) / arima_mae) * 100 if arima_mae else float('nan')

        print(f"{model_name:15s}: t={t_stat:7.3f}, p={p_value:.4f} {significance:3s} | Improvement: {improvement:+6.2f}%")

    print("-" * 60)
    print("Significance: *** p<0.001, ** p<0.01, * p<0.05, ns=not significant")

## 9. Visualizations

In [None]:
# Plot actual vs predicted
# The grid is sized from predictions_dict (the mapping actually iterated), and
# the cell is skipped when it is empty: plt.subplots() rejects a grid with
# zero rows, which is what happens when no checkpoint could be loaded.
n_panels = len(predictions_dict)
if n_panels == 0:
    print("No model predictions available; skipping prediction plots.")
else:
    # squeeze=False always returns a 2-D axes array, so a single model needs
    # no special-casing
    fig, axes = plt.subplots(n_panels, 1, figsize=(14, 4 * n_panels), squeeze=False)

    for ax, (model_name, data) in zip(axes[:, 0], predictions_dict.items()):
        # Only the first 100 samples are drawn to keep the lines readable
        ax.plot(data['actuals'][:100], label='Actual', linewidth=2)
        ax.plot(data['predictions'][:100], label='Predicted', linewidth=2, alpha=0.7)
        ax.set_title(f'{model_name} Predictions vs Actual (first 100 samples)')
        ax.set_xlabel('Sample')
        ax.set_ylabel('Value')
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Error distributions
# The grid is sized from predictions_dict (the mapping actually iterated), and
# the cell is skipped when it is empty: plt.subplots() rejects a grid with
# zero columns, which is what happens when no checkpoint could be loaded.
n_panels = len(predictions_dict)
if n_panels == 0:
    print("No model predictions available; skipping error distribution plots.")
else:
    # squeeze=False always returns a 2-D axes array, so a single model needs
    # no special-casing
    fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 4), squeeze=False)

    for ax, (model_name, data) in zip(axes[0, :], predictions_dict.items()):
        # Signed error: positive means the model under-predicted
        errors = data['actuals'] - data['predictions']
        ax.hist(errors, bins=50, edgecolor='black', alpha=0.7)
        ax.set_title(f'{model_name} Error Distribution')
        ax.set_xlabel('Prediction Error')
        ax.set_ylabel('Frequency')
        ax.axvline(0, color='red', linestyle='--', linewidth=2)  # zero-error reference
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 10. Latency Measurement

In [None]:
# Time single-sample inference for every loaded model
# NOTE(review): the middle dimension (60) is presumably the input window
# length the models were trained with - confirm against the dataset config.
sample_input = torch.randn(1, 60, input_dim).to(device)

rule = "=" * 80
print("\n" + rule)
print("INFERENCE LATENCY (Target: ≤300ms p95)")
print(rule)

latency_results = {}

for model_name, model in models_dict.items():
    mean_ms, p95_ms, p99_ms = measure_inference_latency(model, sample_input, num_runs=100)
    latency_results[model_name] = dict(mean=mean_ms, p95=p95_ms, p99=p99_ms)

    # Pass/fail is judged on the 95th-percentile latency only
    if p95_ms <= 300:
        status = "✓ PASS"
    else:
        status = "✗ FAIL"
    print(f"{model_name:15s}: Mean={mean_ms:6.2f}ms, P95={p95_ms:6.2f}ms, P99={p99_ms:6.2f}ms {status}")

print(rule)

## 11. Final Summary

In [None]:
# Create evaluation summary
print("\n" + "="*80)
print("PHASE 3 EVALUATION SUMMARY")
print("="*80)

# Check targets from docs/metrics_and_evaluation.md
print("\nTarget Achievements:")
print("-" * 80)


def _report(label, value_text, passed, target_text):
    """Print one summary row: label padded to a fixed width, value, ✓/✗ mark, target."""
    mark = "✓" if passed else "✗"
    print(f"{label:<27}{value_text} {mark} (target: {target_text})")


def _pct_reduction(baseline, value):
    """Return how far `value` undercuts `baseline`, in percent (NaN for a zero baseline)."""
    return ((baseline - value) / baseline) * 100 if baseline else float('nan')


for model_name in models_dict.keys():
    print(f"\n{model_name}:")

    if model_name in metrics_results:
        metrics = metrics_results[model_name]

        if 'ARIMA' in baselines_dict:
            arima_metrics = baselines_dict['ARIMA']['metrics']

            # MAE reduction vs ARIMA
            reduction = _pct_reduction(arima_metrics['mae'], metrics['mae'])
            _report("  MAE reduction vs ARIMA:", f"{reduction:6.2f}%", reduction >= 15, "≥15%")

            # RMSE reduction: listed as a target at the top of this notebook
            # but previously never checked here.
            # NOTE(review): assumed to be measured against the same ARIMA
            # baseline as MAE - confirm in docs/metrics_and_evaluation.md.
            if 'rmse' in metrics and 'rmse' in arima_metrics:
                rmse_reduction = _pct_reduction(arima_metrics['rmse'], metrics['rmse'])
                _report("  RMSE reduction vs ARIMA:", f"{rmse_reduction:6.2f}%", rmse_reduction >= 12, "≥12%")

        # Directional Accuracy; a metric missing from the results is reported as 0
        for days, threshold in ((1, 0.55), (3, 0.53), (7, 0.52)):
            da_value = metrics.get(f'da_{days}day', 0)
            _report(f"  DA ({days}-day):", f"{da_value:6.2%}", da_value >= threshold, f"≥{threshold:.0%}")

    # Latency
    if model_name in latency_results:
        p95 = latency_results[model_name]['p95']
        _report("  Inference latency (P95):", f"{p95:6.2f}ms", p95 <= 300, "≤300ms")

print("\n" + "="*80)
print("Evaluation complete! Review metrics above.")
print("="*80)

## 12. Export Results

In [None]:
# Write the metrics table and per-sample predictions to <project_root>/results
output_dir = project_root / 'results'
output_dir.mkdir(exist_ok=True)

# Timestamp suffix keeps successive runs from overwriting each other
timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
run_tag = f'{TICKER}_{timestamp}'

# Metrics keep their index (model / baseline names)
metrics_csv = output_dir / f'metrics_{run_tag}.csv'
metrics_df.to_csv(metrics_csv)
print(f"Metrics saved to: {metrics_csv}")

# Predictions are written without the positional index
predictions_csv = output_dir / f'predictions_{run_tag}.csv'
results_df.to_csv(predictions_csv, index=False)
print(f"Predictions saved to: {predictions_csv}")