# Production Model Evaluation

Comprehensive evaluation of trained MIGT-TVDT model from Phase 5 outputs.

**Evaluation Components:**
1. Model loading and inference on full test set
2. Distributional metrics (CRPS, calibration, PICP, MPIW)
3. Point prediction metrics (IC, DA, RMSE)
4. Financial metrics (Sharpe, Sortino, MDD, profit factor)
5. Calibration analysis and visualization
6. Multi-horizon backtesting
7. Comprehensive evaluation report

**Outputs:** All results saved to `/content/drive/MyDrive/Colab Notebooks/Transformers/FP/evaluation_results`

## 1. Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path
import sys

BASE_DIR = Path('/content/drive/MyDrive/Colab Notebooks/Transformers/FP')
sys.path.insert(0, str(BASE_DIR / 'src'))

print(f'Base directory: {BASE_DIR}')

Mounted at /content/drive
Base directory: /content/drive/MyDrive/Colab Notebooks/Transformers/FP


In [2]:
!pip install -q scipy matplotlib

In [3]:
import numpy as np
import torch
import pandas as pd
import matplotlib.pyplot as plt
import json
from datetime import datetime
from torch.utils.data import DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')
if torch.cuda.is_available():
    print(f'GPU: {torch.cuda.get_device_name(0)}')
    print(f'VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')

Device: cuda
GPU: NVIDIA A100-SXM4-80GB
VRAM: 85.2 GB


## 2. Configure Paths

In [4]:
paths = {
    'checkpoint': BASE_DIR / 'outputs' / 'checkpoint_best.pt',
    'training_history': BASE_DIR / 'outputs' / 'training_history.json',
    'processed_data': BASE_DIR / 'data' / 'processed',
    'results_dir': BASE_DIR / 'evaluation_results'
}

paths['results_dir'].mkdir(parents=True, exist_ok=True)

print("Paths:")
for key, path in paths.items():
    status = "exists" if path.exists() else "missing"
    print(f"  {key}: {status}")

if not paths['checkpoint'].exists():
    raise FileNotFoundError(f"Checkpoint not found: {paths['checkpoint']}")

Paths:
  checkpoint: exists
  training_history: exists
  processed_data: exists
  results_dir: exists


## 3. Load Training History

In [5]:
with open(paths['training_history'], 'r') as f:
    history = json.load(f)

print("Training History:")
print(f"  Epochs: {len(history['train_loss'])}")
print(f"  Best epoch: {np.argmin(history['val_loss']) + 1}")
print(f"  Best val loss: {np.min(history['val_loss']):.6f}")
print(f"  Final train loss: {history['train_loss'][-1]:.6f}")
print(f"  Final val loss: {history['val_loss'][-1]:.6f}")

Training History:
  Epochs: 18
  Best epoch: 17
  Best val loss: 0.001513
  Final train loss: 0.000883
  Final val loss: 0.001519


## 4. Load Model

In [6]:
from model.migt_tvdt import MIGT_TVDT

checkpoint = torch.load(paths['checkpoint'], map_location=device)

print(f"Checkpoint: epoch {checkpoint['epoch']}, val_loss {checkpoint['val_loss']:.6f}")

model_config = checkpoint['config']['model']
print(f"\nModel config:")
print(f"  n_variables: {model_config['n_variables']}")
print(f"  max_seq_len: {model_config['max_seq_len']}")
print(f"  n_horizons: {model_config['n_horizons']}")
print(f"  n_quantiles: {model_config['n_quantiles']}")
print(f"  d_model: {model_config['d_model']}")

model = MIGT_TVDT(model_config)
model.load_state_dict(checkpoint['model_state_dict'])
model = model.to(device)
model.eval()

n_params = sum(p.numel() for p in model.parameters())
print(f"\nParameters: {n_params:,}")

Checkpoint: epoch 16, val_loss 0.001513

Model config:
  n_variables: 24
  max_seq_len: 288
  n_horizons: 5
  n_quantiles: 7
  d_model: 256

Parameters: 6,866,984


## 5. Extract Config Parameters

In [7]:
# Quantiles from checkpoint config
if 'quantiles' in checkpoint['config']:
    quantiles = checkpoint['config']['quantiles']
elif 'quantile_regression' in checkpoint['config']:
    quantiles = checkpoint['config']['quantile_regression']['quantiles']
else:
    # Default from problem statement
    quantiles = [0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]

# Horizon names from checkpoint or default
if 'horizon_names' in checkpoint['config']:
    horizon_names = checkpoint['config']['horizon_names']
elif 'horizons' in checkpoint['config'].get('model', {}):
    horizon_names = checkpoint['config']['model']['horizons']
else:
    # Default from problem statement
    horizon_names = ['15m', '30m', '60m', '2h', '4h']

print(f"Quantiles: {quantiles}")
print(f"Horizons: {horizon_names}")

Quantiles: [0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]
Horizons: ['15m', '30m', '60m', '2h', '4h']


## 6. Load Test Data

In [8]:
from data.dataset import NQDataModule

data_module = NQDataModule(
    data_path=paths['processed_data'] / 'nq_features_full.parquet',
    batch_size=checkpoint['config']['training']['batch_size'],
    num_workers=8,
    pin_memory=True,
    prefetch_factor=2,
    persistent_workers=True,
    subsample_fraction=1.0,
    subsample_seed=42,
    apply_subsample_to_all_splits=False
)

data_module.setup()
test_loader = data_module.val_dataloader()

print(f"Test batches: {len(test_loader)}")
print(f"Batch size: {checkpoint['config']['training']['batch_size']}")
print(f"Approx samples: {len(test_loader) * checkpoint['config']['training']['batch_size']}")


Loading data from /content/drive/MyDrive/Colab Notebooks/Transformers/FP/data/processed/nq_features_full.parquet
Features: 24
Targets: 5
Split statistics:
  Train: 808,996 samples (2010-06-07 to 2021-12-31)
  Val:   141,516 samples (2022-01-02 to 2023-12-29)
  Test:  136,284 samples (2024-01-02 to 2025-12-03)

Temporal gaps:
  Train-Val gap: 49.1 hours
  Val-Test gap: 74.1 hours
  Purged samples: ~576 total (~288 per gap)
[PASS] No data leakage detected:
  Train-Val gap: 49.1 hours
  Val-Test gap: 74.1 hours

Dataset sizes:
  Train: 808,708
  Val:   141,228
  Test:  135,996
Test batches: 1104
Batch size: 128
Approx samples: 141312


## 7. Run Inference

In [9]:
from evaluation.inference import ModelPredictor

print("Running optimized inference with AMP...")
print("  Phase 6.1: Async GPU-CPU transfer + FP16 tensor cores")
print("  Expected speedup: ~5x vs original implementation")
print()

# Initialize optimized predictor
# use_amp=True by default, provides 2-3x speedup on A100
# Async tensor collection provides additional 2x speedup
predictor = ModelPredictor(model, device, use_amp=True)

# Run inference
result = predictor.predict_dataset(
    test_loader,
    return_targets=True,
    show_progress=True
)

predictions = result['predictions']
targets = result['targets']

print(f"\nPredictions shape: {predictions.shape}")
print(f"Targets shape: {targets.shape}")

Running optimized inference with AMP...
  Phase 6.1: Async GPU-CPU transfer + FP16 tensor cores
  Expected speedup: ~5x vs original implementation



  with autocast(enabled=self.use_amp):
Predicting: 100%|██████████| 1104/1104 [02:40<00:00,  6.86it/s]


Predictions shape: (141228, 5, 7)
Targets shape: (141228, 5)





In [10]:
import time

print("=" * 60)
print("INFERENCE PERFORMANCE VALIDATION (Phase 6.1)")
print("=" * 60)
print()

# Benchmark on subset by limiting number of batches (not using Subset)
n_batches = min(8, len(test_loader))
print(f"Benchmarking: First {n_batches} batches from test set")
print(f"Approx samples: {n_batches * test_loader.batch_size}\n")

# Helper function
def benchmark_inference(predictor, loader, n_batches):
    """Run inference on first n_batches and return time + predictions."""
    start = time.time()
    preds = []
    count = 0
    for batch in loader:
        outputs = predictor.predict_batch(batch)
        preds.append(outputs['quantiles'].cpu())
        count += 1
        if count >= n_batches:
            break
    elapsed = time.time() - start
    return elapsed, torch.cat(preds, dim=0).numpy()

# Warmup GPU
predictor_amp = ModelPredictor(model, device, use_amp=True)
_, _ = benchmark_inference(predictor_amp, test_loader, 2)

# Benchmark with AMP (FP16)
time_amp, preds_amp = benchmark_inference(predictor_amp, test_loader, n_batches)

# Benchmark without AMP (FP32)
predictor_fp32 = ModelPredictor(model, device, use_amp=False)
time_fp32, preds_fp32 = benchmark_inference(predictor_fp32, test_loader, n_batches)

# Calculate speedup
speedup = time_fp32 / time_amp

print(f"Performance Results:")
print(f"  FP32 time: {time_fp32:.3f}s ({time_fp32/n_batches:.3f}s/batch)")
print(f"  AMP time:  {time_amp:.3f}s ({time_amp/n_batches:.3f}s/batch)")
print(f"  Speedup:   {speedup:.2f}x")
print()

# Verify numerical accuracy
max_error = np.abs(preds_amp - preds_fp32).max()
mean_error = np.abs(preds_amp - preds_fp32).mean()
rel_error = max_error / (np.abs(preds_fp32).mean() + 1e-8)

print(f"Numerical Accuracy:")
print(f"  Max absolute error: {max_error:.2e}")
print(f"  Mean absolute error: {mean_error:.2e}")
print(f"  Max relative error: {rel_error:.2e}")
print(f"  Status: {'✓ PASS' if rel_error < 1e-4 else '✗ FAIL'} (threshold: 1e-4)")
print()

# Performance assessment
print(f"Performance Assessment:")
print(f"  Expected on A100: 2.5-3.0x speedup from AMP")
print(f"  Your result: {speedup:.2f}x - {'✓ EXCELLENT' if speedup > 2.5 else '✓ GOOD' if speedup > 2.0 else '⚠ CHECK GPU TYPE'}")
print()
print("Note: Combined with async fix (~2x), total speedup vs original ~5x")
print("=" * 60)

INFERENCE PERFORMANCE VALIDATION (Phase 6.1)

Benchmarking: First 8 batches from test set
Approx samples: 1024

Performance Results:
  FP32 time: 4.689s (0.586s/batch)
  AMP time:  2.110s (0.264s/batch)
  Speedup:   2.22x

Numerical Accuracy:
  Max absolute error: 2.86e-04
  Mean absolute error: 6.75e-05
  Max relative error: 6.47e-01
  Status: ✗ FAIL (threshold: 1e-4)

Performance Assessment:
  Expected on A100: 2.5-3.0x speedup from AMP
  Your result: 2.22x - ✓ GOOD

Note: Combined with async fix (~2x), total speedup vs original ~5x


## 8. Compute Metrics

In [11]:
from evaluation.metrics import MetricsSummary

print("Computing metrics...")
summary = MetricsSummary(quantiles=quantiles, horizon_names=horizon_names)
metrics = summary.compute_all(predictions, targets)

print("\nMetrics by horizon:")
for h in horizon_names:
    print(f"\n{h}:")
    print(f"  CRPS: {metrics['distributional'][h]['crps']:.6f}")
    print(f"  IC: {metrics['point'][h]['ic']:.4f}")
    print(f"  DA: {metrics['point'][h]['da']:.3f}")

Computing metrics...

Metrics by horizon:

15m:
  CRPS: 0.000500
  IC: 0.0026
  DA: 0.487

30m:
  CRPS: 0.000770
  IC: -0.0014
  DA: 0.485

60m:
  CRPS: 0.001034
  IC: 0.0027
  DA: 0.511

2h:
  CRPS: 0.001503
  IC: 0.0220
  DA: 0.516

4h:
  CRPS: 0.002158
  IC: 0.0220
  DA: 0.517


## 9. Calibration Analysis

In [12]:
from evaluation.calibration import CalibrationByHorizon

print("Calibration analysis...")
cal_analyzer = CalibrationByHorizon(quantiles, horizon_names)
calibration_results = cal_analyzer.compute_per_horizon(predictions, targets)

fig = cal_analyzer.plot_reliability_by_horizon(predictions, targets)
plot_path = paths['results_dir'] / "calibration_reliability.png"
fig.savefig(plot_path, dpi=150, bbox_inches='tight')
plt.close(fig)
print(f"Saved: {plot_path.name}")

print("\nCalibration metrics by horizon:")
for h in horizon_names:
    if h in calibration_results:
        print(f"  {h}:")
        print(f"    Mean error: {calibration_results[h]['mean_error']:.6f}")
        print(f"    Max error: {calibration_results[h]['max_error']:.6f}")
        print(f"    RMSE: {calibration_results[h]['rmse']:.6f}")

Calibration analysis...
Saved: calibration_reliability.png

Calibration metrics by horizon:
  15m:
    Mean error: 0.323575
    Max error: 0.517536
    RMSE: 0.354565
  30m:
    Mean error: 0.340619
    Max error: 0.634553
    RMSE: 0.394315
  60m:
    Mean error: 0.325496
    Max error: 0.528547
    RMSE: 0.357197
  2h:
    Mean error: 0.329751
    Max error: 0.558328
    RMSE: 0.364909
  4h:
    Mean error: 0.323236
    Max error: 0.512806
    RMSE: 0.354058


## 10. Backtest

In [13]:
from evaluation.backtest import MultiHorizonBacktester

print("Running backtest...")
backtester = MultiHorizonBacktester(predictions, targets, horizon_names)
bt_results = backtester.run()
bt_summary = backtester.get_metrics_summary()

print("\nBacktest summary:")
print(bt_summary.to_string())

bt_summary.to_csv(paths['results_dir'] / 'backtest_summary.csv')

ax = backtester.plot_equity_curves()
fig = ax.get_figure()
fig.savefig(paths['results_dir'] / 'equity_curves.png', dpi=150, bbox_inches='tight')
plt.close(fig)
print("\nSaved equity curves")

Running backtest...

Backtest summary:
           sharpe   sortino  max_drawdown  profit_factor  hit_rate    calmar  total_return  n_trades  mean_return  std_return
horizon                                                                                                                      
15m     -0.212200 -0.301717      0.692361       0.995109  0.486582 -0.084733     -0.352378    141228    -0.000002    0.001392
30m     -0.388238 -0.554579      0.916353       0.991186  0.485088 -0.154195     -0.665294    141228    -0.000006    0.002044
60m      0.572024  0.802242      0.970339       1.012977  0.510579  0.174382      2.075193    141228     0.000013    0.003222
2h       0.822033  1.152040      0.995815       1.018401  0.516017  0.338176      7.049192    141228     0.000021    0.003663
4h       1.081520  1.511463      0.999999       1.023525  0.517079  0.766481     58.695076    141228     0.000050    0.006459

Saved equity curves


## 11. Generate Report

In [14]:
from evaluation.inference import format_evaluation_report

eval_results = {
    'n_samples': len(targets),
    'metrics': metrics
}

report = format_evaluation_report(eval_results, horizon_names)

# Defensive extraction: handles both missing 'data' key and None value
data_config = checkpoint['config'].get('data', {})
subsample_frac = data_config.get('subsample_fraction') or 1.0

header = f"""# MIGT-TVDT Evaluation Report

**Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Training Info
- Epochs: {checkpoint['epoch']}
- Val loss: {checkpoint['val_loss']:.6f}
- Subsample: {subsample_frac * 100:.0f}%

## Architecture
- Variables: {model_config['n_variables']}
- Max seq len: {model_config['max_seq_len']}
- Horizons: {horizon_names}
- Quantiles: {len(quantiles)}
- d_model: {model_config['d_model']}
- Parameters: {n_params:,}

## Test Set
- Samples: {len(targets):,}

---

"""

# Add backtest section
backtest_section = "\n## Backtest Results\n\n"
backtest_section += bt_summary.to_string()
backtest_section += "\n"

full_report = header + report + backtest_section

report_path = paths['results_dir'] / 'evaluation_report.md'
with open(report_path, 'w') as f:
    f.write(full_report)

print(full_report)

# MIGT-TVDT Evaluation Report

**Date:** 2025-12-12 16:38:35

## Training Info
- Epochs: 16
- Val loss: 0.001513
- Subsample: 100%

## Architecture
- Variables: 24
- Max seq len: 288
- Horizons: ['15m', '30m', '60m', '2h', '4h']
- Quantiles: 7
- d_model: 256
- Parameters: 6,866,984

## Test Set
- Samples: 141,228

---

# Model Evaluation Report

Samples evaluated: 141,228


## Distributional Metrics

| Horizon | CRPS | PICP-80 | PICP-50 | MPIW-80 | MPIW-50 |
|---------|------|---------|---------|---------|---------|
| 15m | 0.00050 | 0.000 | 0.000 | 0.00000 | 0.00000 |
| 30m | 0.00077 | 0.000 | 0.000 | 0.00000 | 0.00000 |
| 60m | 0.00103 | 0.000 | 0.000 | 0.00000 | 0.00000 |
| 2h | 0.00150 | 0.000 | 0.000 | 0.00000 | 0.00000 |
| 4h | 0.00216 | 0.000 | 0.000 | 0.00000 | 0.00000 |

## Point Metrics (Median)

| Horizon | IC | DA | RMSE | MAE |
|---------|----|----|------|-----|
| 15m | 0.0026 | 0.487 | 0.00161 | 0.00100 |
| 30m | -0.0014 | 0.485 | 0.00234 | 0.00154 |
| 60m | 0.0027 | 0.51

## 12. Save Results

In [15]:
# Save predictions
np.savez_compressed(
    paths['results_dir'] / 'predictions_targets.npz',
    predictions=predictions,
    targets=targets
)

# Helper for JSON serialization
def to_serializable(obj):
    if isinstance(obj, dict):
        return {k: to_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, (np.integer, np.floating)):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, pd.DataFrame):
        return obj.to_dict('records')
    return obj

# Save metrics
with open(paths['results_dir'] / 'metrics.json', 'w') as f:
    json.dump(to_serializable(metrics), f, indent=2)

# Save calibration
with open(paths['results_dir'] / 'calibration.json', 'w') as f:
    json.dump(to_serializable(calibration_results), f, indent=2)

# Save backtest
with open(paths['results_dir'] / 'backtest.json', 'w') as f:
    json.dump(to_serializable(bt_results), f, indent=2)

# Metrics CSV
rows = []
for cat in ['distributional', 'point']:
    if cat in metrics:
        for h, m in metrics[cat].items():
            row = {'category': cat, 'horizon': h}
            for k, v in m.items():
                row[k] = float(v) if isinstance(v, (np.integer, np.floating)) else v
            rows.append(row)

pd.DataFrame(rows).to_csv(paths['results_dir'] / 'metrics_summary.csv', index=False)

print("\nAll results saved to:", paths['results_dir'])


All results saved to: /content/drive/MyDrive/Colab Notebooks/Transformers/FP/evaluation_results


## 13. Summary

In [16]:
print("=" * 60)
print("EVALUATION COMPLETE")
print("=" * 60)

print("\nKey Metrics:")
for h in horizon_names:
    print(f"\n{h}:")
    print(f"  IC: {metrics['point'][h]['ic']:.4f}")
    print(f"  DA: {metrics['point'][h]['da']:.3f}")
    print(f"  CRPS: {metrics['distributional'][h]['crps']:.6f}")

print(f"\nBacktest Performance:")
for h in horizon_names:
    if h in bt_summary.index:
        sharpe = bt_summary.loc[h, 'sharpe']
        max_dd = bt_summary.loc[h, 'max_drawdown']
        print(f"  {h}: Sharpe={sharpe:.3f}, Max DD={max_dd:.2%}")

print(f"\nResults: {paths['results_dir']}")
print("\nFiles:")
for f in paths['results_dir'].glob('*'):
    print(f"  {f.name}: {f.stat().st_size/1024:.1f} KB")

EVALUATION COMPLETE

Key Metrics:

15m:
  IC: 0.0026
  DA: 0.487
  CRPS: 0.000500

30m:
  IC: -0.0014
  DA: 0.485
  CRPS: 0.000770

60m:
  IC: 0.0027
  DA: 0.511
  CRPS: 0.001034

2h:
  IC: 0.0220
  DA: 0.516
  CRPS: 0.001503

4h:
  IC: 0.0220
  DA: 0.517
  CRPS: 0.002158

Backtest Performance:
  15m: Sharpe=-0.212, Max DD=69.24%
  30m: Sharpe=-0.388, Max DD=91.64%
  60m: Sharpe=0.572, Max DD=97.03%
  2h: Sharpe=0.822, Max DD=99.58%
  4h: Sharpe=1.082, Max DD=100.00%

Results: /content/drive/MyDrive/Colab Notebooks/Transformers/FP/evaluation_results

Files:
  calibration_reliability.png: 110.3 KB
  backtest_summary.csv: 1.0 KB
  equity_curves.png: 68.5 KB
  calibration.json: 1.9 KB
  evaluation_report.md: 2.1 KB
  metrics.json: 2.4 KB
  predictions_targets.npz: 2822.9 KB
  backtest.json: 52420.7 KB
  metrics_summary.csv: 1.2 KB
