# Production Model Evaluation

Comprehensive evaluation of trained MIGT-TVDT model from Phase 5 outputs.

**Evaluation Components:**
1. Model loading and inference on full test set
2. Distributional metrics (CRPS, calibration, PICP, MPIW)
3. Point prediction metrics (IC, DA, RMSE)
4. Financial metrics (Sharpe, Sortino, MDD, profit factor)
5. Calibration analysis and visualization
6. Multi-horizon backtesting
7. Comprehensive evaluation report

**Outputs:** All results saved to `/content/drive/MyDrive/Colab Notebooks/Transformers/FP/evaluation_results`

## 1. Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path
import sys

BASE_DIR = Path('/content/drive/MyDrive/Colab Notebooks/Transformers/FP')
sys.path.insert(0, str(BASE_DIR / 'src'))

print(f'Base directory: {BASE_DIR}')

Mounted at /content/drive
Base directory: /content/drive/MyDrive/Colab Notebooks/Transformers/FP


In [2]:
!pip install -q scipy matplotlib

In [3]:
import numpy as np
import torch
import pandas as pd
import matplotlib.pyplot as plt
import json
from datetime import datetime

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')
if torch.cuda.is_available():
    print(f'GPU: {torch.cuda.get_device_name(0)}')
    print(f'VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')

Device: cuda
GPU: NVIDIA A100-SXM4-80GB
VRAM: 85.2 GB


## 2. Configure Paths

In [4]:
paths = {
    'checkpoint': BASE_DIR / 'outputs' / 'checkpoint_best.pt',
    'training_history': BASE_DIR / 'outputs' / 'training_history.json',
    'processed_data': BASE_DIR / 'data' / 'processed',
    'results_dir': BASE_DIR / 'evaluation_results'
}

paths['results_dir'].mkdir(parents=True, exist_ok=True)

print("Paths:")
for key, path in paths.items():
    status = "exists" if path.exists() else "missing"
    print(f"  {key}: {status}")

if not paths['checkpoint'].exists():
    raise FileNotFoundError(f"Checkpoint not found: {paths['checkpoint']}")

Paths:
  checkpoint: exists
  training_history: exists
  processed_data: exists
  results_dir: exists


## 3. Load Training History

In [5]:
with open(paths['training_history'], 'r') as f:
    history = json.load(f)

print("Training History:")
print(f"  Epochs: {len(history['train_loss'])}")
print(f"  Best epoch: {np.argmin(history['val_loss']) + 1}")
print(f"  Best val loss: {np.min(history['val_loss']):.6f}")
print(f"  Final train loss: {history['train_loss'][-1]:.6f}")
print(f"  Final val loss: {history['val_loss'][-1]:.6f}")

Training History:
  Epochs: 4
  Best epoch: 3
  Best val loss: 0.014659
  Final train loss: 0.013142
  Final val loss: 0.017043


## 4. Load Model

In [6]:
from model.migt_tvdt import MIGT_TVDT

checkpoint = torch.load(paths['checkpoint'], map_location=device)

print(f"Checkpoint: epoch {checkpoint['epoch']}, val_loss {checkpoint['val_loss']:.6f}")

model_config = checkpoint['config']['model']
print(f"\nModel config:")
print(f"  n_variables: {model_config['n_variables']}")
print(f"  max_seq_len: {model_config['max_seq_len']}")
print(f"  n_horizons: {model_config['n_horizons']}")
print(f"  n_quantiles: {model_config['n_quantiles']}")
print(f"  d_model: {model_config['d_model']}")

model = MIGT_TVDT(model_config)
model.load_state_dict(checkpoint['model_state_dict'])
model = model.to(device)
model.eval()

n_params = sum(p.numel() for p in model.parameters())
print(f"\nParameters: {n_params:,}")

Checkpoint: epoch 2, val_loss 0.014659

Model config:
  n_variables: 24
  max_seq_len: 288
  n_horizons: 5
  n_quantiles: 7
  d_model: 256

Parameters: 6,866,984


## 5. Extract Config Parameters

In [7]:
# Quantiles from checkpoint config
if 'quantiles' in checkpoint['config']:
    quantiles = checkpoint['config']['quantiles']
elif 'quantile_regression' in checkpoint['config']:
    quantiles = checkpoint['config']['quantile_regression']['quantiles']
else:
    # Default from problem statement
    quantiles = [0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]

# Horizon names from checkpoint or default
if 'horizon_names' in checkpoint['config']:
    horizon_names = checkpoint['config']['horizon_names']
elif 'horizons' in checkpoint['config'].get('model', {}):
    horizon_names = checkpoint['config']['model']['horizons']
else:
    # Default from problem statement
    horizon_names = ['15m', '30m', '60m', '2h', '4h']

print(f"Quantiles: {quantiles}")
print(f"Horizons: {horizon_names}")

Quantiles: [0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]
Horizons: ['15m', '30m', '60m', '2h', '4h']


## 6. Load Test Data

In [8]:
from data.dataset import NQDataModule

data_module = NQDataModule(
    data_path=paths['processed_data'] / 'nq_features_full.parquet',
    batch_size=checkpoint['config']['training']['batch_size'],
    num_workers=8,
    pin_memory=True,
    prefetch_factor=2,
    persistent_workers=True,
    subsample_fraction=0.10,
    subsample_seed=42,
    apply_subsample_to_all_splits=True
)

data_module.setup()
test_loader = data_module.test_dataloader()

print(f"Test batches: {len(test_loader)}")
print(f"Batch size: {checkpoint['config']['training']['batch_size']}")
print(f"Approx samples: {len(test_loader) * checkpoint['config']['training']['batch_size']}")


Loading data from /content/drive/MyDrive/Colab Notebooks/Transformers/FP/data/processed/nq_features_full.parquet
Features: 24
Targets: 5
Split statistics:
  Train: 808,996 samples (2010-06-07 to 2021-12-31)
  Val:   141,516 samples (2022-01-02 to 2023-12-29)
  Test:  136,284 samples (2024-01-02 to 2025-12-03)

Temporal gaps:
  Train-Val gap: 49.1 hours
  Val-Test gap: 74.1 hours
  Purged samples: ~576 total (~288 per gap)
[PASS] No data leakage detected:
  Train-Val gap: 49.1 hours
  Val-Test gap: 74.1 hours

Dataset sizes:
  Train: 80,870
  Val:   14,122
  Test:  13,599
Test batches: 71
Batch size: 192
Approx samples: 13632


## 7. Run Inference

In [9]:
from evaluation.inference import ModelPredictor

print("Running inference...")
predictor = ModelPredictor(model, device)
result = predictor.predict_dataset(test_loader, return_targets=True, show_progress=True)

predictions = result['predictions']
targets = result['targets']

print(f"\nPredictions shape: {predictions.shape}")
print(f"Targets shape: {targets.shape}")

Running inference...


Predicting: 100%|██████████| 71/71 [00:53<00:00,  1.32it/s]


Predictions shape: (13599, 5, 7)
Targets shape: (13599, 5)





## 8. Compute Metrics

In [11]:
from evaluation.metrics import MetricsSummary

print("Computing metrics...")
summary = MetricsSummary(quantiles=quantiles, horizon_names=horizon_names)
metrics = summary.compute_all(predictions, targets)

print("\nMetrics by horizon:")
for h in horizon_names:
    print(f"\n{h}:")
    print(f"  CRPS: {metrics['distributional'][h]['crps']:.6f}")
    print(f"  IC: {metrics['point'][h]['ic']:.4f}")
    print(f"  DA: {metrics['point'][h]['da']:.3f}")

Computing metrics...

Metrics by horizon:

15m:
  CRPS: 0.033306
  IC: 0.0011
  DA: 0.517

30m:
  CRPS: 0.001444
  IC: -0.0060
  DA: 0.518

60m:
  CRPS: 0.008692
  IC: -0.0014
  DA: 0.529

2h:
  CRPS: 0.012381
  IC: -0.0226
  DA: 0.536

4h:
  CRPS: 0.004305
  IC: -0.0260
  DA: 0.545


## 9. Calibration Analysis

In [14]:
from evaluation.calibration import CalibrationByHorizon

print("Calibration analysis...")
cal_analyzer = CalibrationByHorizon(quantiles, horizon_names)
calibration_results = cal_analyzer.compute_per_horizon(predictions, targets)

fig = cal_analyzer.plot_reliability_by_horizon(predictions, targets)
plot_path = paths['results_dir'] / "calibration_reliability.png"
fig.savefig(plot_path, dpi=150, bbox_inches='tight')
plt.close(fig)
print(f"Saved: {plot_path.name}")

print("\nCalibration metrics by horizon:")
for h in horizon_names:
    if h in calibration_results:
        print(f"  {h}:")
        print(f"    Mean error: {calibration_results[h]['mean_error']:.6f}")
        print(f"    Max error: {calibration_results[h]['max_error']:.6f}")
        print(f"    RMSE: {calibration_results[h]['rmse']:.6f}")

Calibration analysis...
Saved: calibration_reliability.png

Calibration metrics by horizon:
  15m:
    Mean error: 0.500000
    Max error: 0.950000
    RMSE: 0.609449
  30m:
    Mean error: 0.393354
    Max error: 0.714913
    RMSE: 0.471042
  60m:
    Mean error: 0.498193
    Max error: 0.938749
    RMSE: 0.606710
  2h:
    Mean error: 0.498456
    Max error: 0.945294
    RMSE: 0.607401
  4h:
    Mean error: 0.470586
    Max error: 0.886613
    RMSE: 0.575760


## 10. Backtest

In [15]:
from evaluation.backtest import MultiHorizonBacktester

print("Running backtest...")
backtester = MultiHorizonBacktester(predictions, targets, horizon_names)
bt_results = backtester.run()
bt_summary = backtester.get_metrics_summary()

print("\nBacktest summary:")
print(bt_summary.to_string())

bt_summary.to_csv(paths['results_dir'] / 'backtest_summary.csv')

ax = backtester.plot_equity_curves()
fig = ax.get_figure()
fig.savefig(paths['results_dir'] / 'equity_curves.png', dpi=150, bbox_inches='tight')
plt.close(fig)
print("\nSaved equity curves")

Running backtest...

Backtest summary:
           sharpe   sortino  max_drawdown  profit_factor  hit_rate    calmar  total_return  n_trades  mean_return  std_return
horizon                                                                                                                      
15m      2.088242  2.966822      0.069485       1.051661  0.517023  5.801443      0.263772     13599     0.000018    0.001204
30m      2.419255  3.374956      0.155986       1.058892  0.518494  4.578941      0.451304     13599     0.000029    0.001668
60m      1.438116  1.950728      0.293658       1.034214  0.528862  1.812895      0.343300     13599     0.000025    0.002396
2h       1.942117  2.639945      0.504079       1.045922  0.536363  2.459614      0.745920     13599     0.000047    0.003368
4h       3.311340  4.523842      0.706536       1.076169  0.544746  9.106573      3.000511     13599     0.000114    0.004806

Saved equity curves


## 11. Generate Report

In [16]:
from evaluation.inference import format_evaluation_report

eval_results = {
    'n_samples': len(targets),
    'metrics': metrics
}

report = format_evaluation_report(eval_results, horizon_names)

header = f"""# MIGT-TVDT Evaluation Report

**Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Training Info
- Epochs: {checkpoint['epoch']}
- Val loss: {checkpoint['val_loss']:.6f}
- Subsample: {checkpoint['config']['data'].get('subsample_fraction', 1.0)*100:.0f}%

## Architecture
- Variables: {model_config['n_variables']}
- Max seq len: {model_config['max_seq_len']}
- Horizons: {horizon_names}
- Quantiles: {len(quantiles)}
- d_model: {model_config['d_model']}
- Parameters: {n_params:,}

## Test Set
- Samples: {len(targets):,}

---

"""

# Add backtest section
backtest_section = "\n## Backtest Results\n\n"
backtest_section += bt_summary.to_string()
backtest_section += "\n"

full_report = header + report + backtest_section

report_path = paths['results_dir'] / 'evaluation_report.md'
with open(report_path, 'w') as f:
    f.write(full_report)

print(full_report)

# MIGT-TVDT Evaluation Report

**Date:** 2025-12-09 04:38:40

## Training Info
- Epochs: 2
- Val loss: 0.014659
- Subsample: 10%

## Architecture
- Variables: 24
- Max seq len: 288
- Horizons: ['15m', '30m', '60m', '2h', '4h']
- Quantiles: 7
- d_model: 256
- Parameters: 6,866,984

## Test Set
- Samples: 13,599

---

# Model Evaluation Report

Samples evaluated: 13,599


## Distributional Metrics

| Horizon | CRPS | PICP-80 | PICP-50 | MPIW-80 | MPIW-50 |
|---------|------|---------|---------|---------|---------|
| 15m | 0.03331 | 0.000 | 0.000 | 0.00689 | 0.00314 |
| 30m | 0.00144 | 0.169 | 0.026 | 0.00772 | 0.00315 |
| 60m | 0.00869 | 0.001 | 0.000 | 0.00755 | 0.00346 |
| 2h | 0.01238 | 0.003 | 0.001 | 0.00652 | 0.00294 |
| 4h | 0.00431 | 0.030 | 0.017 | 0.00487 | 0.00254 |

## Point Metrics (Median)

| Horizon | IC | DA | RMSE | MAE |
|---------|----|----|------|-----|
| 15m | 0.0011 | 0.517 | 0.07021 | 0.07016 |
| 30m | -0.0060 | 0.518 | 0.00488 | 0.00447 |
| 60m | -0.0014 | 0.529 |

## 12. Save Results

In [17]:
# Save predictions
np.savez_compressed(
    paths['results_dir'] / 'predictions_targets.npz',
    predictions=predictions,
    targets=targets
)

# Helper for JSON serialization
def to_serializable(obj):
    if isinstance(obj, dict):
        return {k: to_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, (np.integer, np.floating)):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, pd.DataFrame):
        return obj.to_dict('records')
    return obj

# Save metrics
with open(paths['results_dir'] / 'metrics.json', 'w') as f:
    json.dump(to_serializable(metrics), f, indent=2)

# Save calibration
with open(paths['results_dir'] / 'calibration.json', 'w') as f:
    json.dump(to_serializable(calibration_results), f, indent=2)

# Save backtest
with open(paths['results_dir'] / 'backtest.json', 'w') as f:
    json.dump(to_serializable(bt_results), f, indent=2)

# Metrics CSV
rows = []
for cat in ['distributional', 'point']:
    if cat in metrics:
        for h, m in metrics[cat].items():
            row = {'category': cat, 'horizon': h}
            for k, v in m.items():
                row[k] = float(v) if isinstance(v, (np.integer, np.floating)) else v
            rows.append(row)

pd.DataFrame(rows).to_csv(paths['results_dir'] / 'metrics_summary.csv', index=False)

print("\nAll results saved to:", paths['results_dir'])


All results saved to: /content/drive/MyDrive/Colab Notebooks/Transformers/FP/evaluation_results


## 13. Summary

In [20]:
print("=" * 60)
print("EVALUATION COMPLETE")
print("=" * 60)

print("\nKey Metrics:")
for h in horizon_names:
    print(f"\n{h}:")
    print(f"  IC: {metrics['point'][h]['ic']:.4f}")
    print(f"  DA: {metrics['point'][h]['da']:.3f}")
    print(f"  CRPS: {metrics['distributional'][h]['crps']:.6f}")

print(f"\nBacktest Performance:")
for h in horizon_names:
    if h in bt_summary.index:
        sharpe = bt_summary.loc[h, 'sharpe']
        max_dd = bt_summary.loc[h, 'max_drawdown']
        print(f"  {h}: Sharpe={sharpe:.3f}, Max DD={max_dd:.2%}")

print(f"\nResults: {paths['results_dir']}")
print("\nFiles:")
for f in paths['results_dir'].glob('*'):
    print(f"  {f.name}: {f.stat().st_size/1024:.1f} KB")

EVALUATION COMPLETE

Key Metrics:

15m:
  IC: 0.0011
  DA: 0.517
  CRPS: 0.033306

30m:
  IC: -0.0060
  DA: 0.518
  CRPS: 0.001444

60m:
  IC: -0.0014
  DA: 0.529
  CRPS: 0.008692

2h:
  IC: -0.0226
  DA: 0.536
  CRPS: 0.012381

4h:
  IC: -0.0260
  DA: 0.545
  CRPS: 0.004305

Backtest Performance:
  15m: Sharpe=2.088, Max DD=6.95%
  30m: Sharpe=2.419, Max DD=15.60%
  60m: Sharpe=1.438, Max DD=29.37%
  2h: Sharpe=1.942, Max DD=50.41%
  4h: Sharpe=3.311, Max DD=70.65%

Results: /content/drive/MyDrive/Colab Notebooks/Transformers/FP/evaluation_results

Files:
  calibration_reliability.png: 110.6 KB
  backtest_summary.csv: 1.0 KB
  equity_curves.png: 178.9 KB
  evaluation_report.md: 2.1 KB
  predictions_targets.npz: 1798.9 KB
  metrics.json: 2.4 KB
  calibration.json: 1.7 KB
  backtest.json: 5374.3 KB
  metrics_summary.csv: 1.1 KB
