# Deep Hedging Models Test - Utility-Based Evaluation

This notebook tests the four deep hedging models on synthetic data using utility-based evaluation methods:

1. **Augmented Testing**: Mix synthetic with real training data (50/50), train hedger, compare with real-only
2. **Algorithm Comparison**: Train 4 hedgers on both real and synthetic data, evaluate on test sets

Results are saved to the latest evaluation directory (a new timestamped one is created if none exists).

In [1]:
import sys
import numpy as np
import pandas as pd
import torch
from pathlib import Path
import json
from datetime import datetime

project_root = Path().resolve().parents[0]
sys.path.append(str(project_root))

from src.models.parametric.gbm import GeometricBrownianMotion
from src.models.parametric.ou_process import OUProcess
from src.models.parametric.merton_jump_diffusion import MertonJumpDiffusion
from src.models.parametric.garch11 import GARCH11
from src.models.parametric.de_jump_diffusion import DoubleExponentialJumpDiffusion
from src.models.non_parametric.block_bootstrap import BlockBootstrap

from src.deep_hedgers.feedforward_layers import FeedforwardDeepHedger
from src.deep_hedgers.feedforward_time import FeedforwardTimeDeepHedger
from src.deep_hedgers.rnn_hedger import RNNDeepHedger
from src.deep_hedgers.lstm_hedger import LSTMDeepHedger

from src.utils.preprocessing_utils import preprocess_data
from src.utils.configs_utils import get_dataset_cfgs
from src.taxonomies.utility import (
    AugmentedTestingEvaluator,
    AlgorithmComparisonEvaluator
)
from src.utils.metric_plot_utils import find_latest_evaluation_folder

In [2]:
# Fetch both dataset configurations, then preprocess each one.
# Every preprocess_data call is unpacked into three data splits
# (train/valid/test) followed by the three matching *_init values.
non_parametric_dataset_cfgs, parametric_dataset_cfgs = get_dataset_cfgs()

# Parametric configuration
(
    train_data_para,
    valid_data_para,
    test_data_para,
    train_init_para,
    valid_init_para,
    test_init_para,
) = preprocess_data(parametric_dataset_cfgs)

# Non-parametric configuration
(
    train_data_non_para, valid_data_non_para, test_data_non_para,
    train_init_non_para, valid_init_non_para, test_init_non_para,
) = preprocess_data(non_parametric_dataset_cfgs)

# Convert to numpy arrays for utility evaluators
def to_numpy(data):
    """Return ``data`` as a numpy array.

    Args:
        data: A ``torch.Tensor`` or anything ``np.asarray`` accepts.

    Returns:
        np.ndarray: For tensors, the values moved to CPU memory; otherwise
        the result of ``np.asarray(data)``.
    """
    if isinstance(data, torch.Tensor):
        # detach() first: Tensor.numpy() raises a RuntimeError for tensors
        # that are still attached to the autograd graph.
        return data.detach().cpu().numpy()
    return np.asarray(data)

# Real-data log returns as numpy arrays (univariate: R, L), one per split
real_train_log_returns, real_val_log_returns, real_test_log_returns = (
    to_numpy(split)
    for split in (train_data_non_para, valid_data_non_para, test_data_non_para)
)

# Settings reused by later cells
seq_length = train_data_non_para.shape[1]
original_data_path = non_parametric_dataset_cfgs.get("original_data_path")

# Report the shape of each split and the derived sequence length
print("Real data shapes:")
print(f"  Train: {real_train_log_returns.shape}")
print(f"  Val: {real_val_log_returns.shape}")
print(f"  Test: {real_test_log_returns.shape}")
print(f"Sequence length: {seq_length}")

Preprocessing data for AAPL
Preprocessing data for AAPL
Desired time series sample length (lag with max PACF >0): 103
PACF at that lag: 0.040741497942971425
Real data shapes:
  Train: (8975, 103)
  Val: (1122, 103)
  Test: (1122, 103)
Sequence length: 103


In [3]:
# Settings shared by every synthetic data generator
seed = 42
num_samples = 1000
generation_length = seq_length

print(f"Generating {num_samples} samples of length {generation_length}...")

# One instance of each parametric generator, keyed by its display name
parametric_models = {
    "GBM": GeometricBrownianMotion(),
    "OU Process": OUProcess(),
    "MJD": MertonJumpDiffusion(),
    "GARCH11": GARCH11(),
    "DEJD": DoubleExponentialJumpDiffusion(),
}

block_bootstrap = BlockBootstrap(block_size=generation_length)

# Fit every parametric model on the parametric training data
print("Fitting models...")
for name, model in parametric_models.items():
    print(f"Fitting {name}...")
    model.fit(train_data_para)

# Block Bootstrap is always handed a tensor: non-tensor input is first
# converted with torch.from_numpy(...).float().
# NOTE(review): this fits on train_data_para, not train_data_non_para — confirm intended.
print("Fitting Block Bootstrap...")
block_bootstrap.fit(
    train_data_para
    if isinstance(train_data_para, torch.Tensor)
    else torch.from_numpy(train_data_para).float()
)

# Generate synthetic data (univariate)
def _as_univariate_array(samples):
    """Convert generator output to a numpy array, keeping channel 0 of 3-D data.

    Args:
        samples: A ``torch.Tensor`` or anything ``np.asarray`` accepts.

    Returns:
        np.ndarray: ``samples`` as an array; a 3-D input is reduced to 2-D by
        selecting index 0 along its last axis.
    """
    if isinstance(samples, torch.Tensor):
        array = samples.cpu().numpy()
    else:
        array = np.asarray(samples)
    if array.ndim == 3:
        array = array[:, :, 0]
    return array


synthetic_data = {}
generation_kwargs = {"num_samples": num_samples, "generation_length": generation_length, "seed": seed}

# Every generator goes through the same conversion helper, replacing the
# four hand-copied tensor/array branches.
print("Generating synthetic data...")
for name, model in parametric_models.items():
    print(f"Generating {name}...")
    syn_data = _as_univariate_array(model.generate(**generation_kwargs))
    synthetic_data[name] = syn_data

print("Generating Block Bootstrap...")
syn_data_bb = _as_univariate_array(block_bootstrap.generate(**generation_kwargs))
synthetic_data["BlockBootstrap"] = syn_data_bb

# Report the shape produced by each generator
print("Synthetic data generation complete!")
for name, data in synthetic_data.items():
    print(f"{name}: {data.shape}")

Generating 1000 samples of length 103...
Fitting models...
Fitting GBM...
mu: 0.0005937263937975437, sigma: 0.0299900515594167
Fitting OU Process...
mu: 0.0005998738612877731, theta: 3.71058070622147, sigma: 0.08168810309975477
Fitting MJD...
mu: 0.001147970840732297, sigma: 0.029988395702553322, kappa: 0.009970418497181388, lam: 0.010490282685512367, mu_j: -0.0017242599942500548, sigma_j: 0.15261259320151815
Fitting GARCH11...
mu: 0.0015174979719101576, omega: 1.895775584754838e-05, alpha: 0.10009742465402663, beta: 0.8800128770993371
Fitting DEJD...
mu: 0.10455820676790209, sigma: 0.018964296626492114, lam: 0.10291519434628975, p: 0.5450643776824035, eta1: 15.493428897886261, eta2: 14.601712934992657, kappa: 1.0084483516750737
Fitting Block Bootstrap...
Generating synthetic data...
Generating GBM...
Generating OU Process...
Generating MJD...
Generating GARCH11...
Generating DEJD...
Generating Block Bootstrap...
Synthetic data generation complete!
GBM: (1000, 103)
OU Process: (1000, 1

In [4]:
# Split synthetic data into train/val/test
def split_data(data, train_ratio=0.8, val_ratio=0.1):
    """Split ``data`` along its first axis into train, validation and test slices.

    Args:
        data: Array-like with a ``shape`` attribute that supports slicing
            along the first axis.
        train_ratio: Fraction of rows assigned to the training slice.
        val_ratio: Fraction of rows assigned to the validation slice; the
            remaining rows form the test slice.

    Returns:
        tuple: ``(train, val, test)`` as consecutive, non-overlapping slices
        of ``data`` in the original row order.
    """
    R = data.shape[0]
    # Round before truncating: binary floating-point noise (for example
    # 0.7 + 0.1 == 0.7999999999999999) would otherwise pull a boundary one
    # row too low and can leave the validation slice empty.
    train_end = int(round(R * train_ratio, 9))
    val_end = int(round(R * (train_ratio + val_ratio), 9))
    return data[:train_end], data[train_end:val_end], data[val_end:]

# Split every generator's samples and keep the three parts under fixed keys
synthetic_splits = {}
for name, data in synthetic_data.items():
    train, val, test = split_data(data)
    synthetic_splits[name] = dict(train=train, val=val, test=test)
    print(f"{name} splits - Train: {train.shape}, Val: {val.shape}, Test: {test.shape}")

GBM splits - Train: (800, 103), Val: (100, 103), Test: (100, 103)
OU Process splits - Train: (800, 103), Val: (100, 103), Test: (100, 103)
MJD splits - Train: (800, 103), Val: (100, 103), Test: (100, 103)
GARCH11 splits - Train: (800, 103), Val: (100, 103), Test: (100, 103)
DEJD splits - Train: (800, 103), Val: (100, 103), Test: (100, 103)
BlockBootstrap splits - Train: (800, 103), Val: (100, 103), Test: (100, 103)


In [5]:
# Reuse the most recent evaluation directory; when the lookup raises
# FileNotFoundError, create a timestamped one under <project_root>/results.
try:
    latest_eval_dir = find_latest_evaluation_folder()
except FileNotFoundError:
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    latest_eval_dir = project_root / "results" / f"evaluation_{timestamp}"
    latest_eval_dir.mkdir(parents=True, exist_ok=True)
    print(f"Created new evaluation directory: {latest_eval_dir}")
else:
    print(f"Found latest evaluation directory: {latest_eval_dir}")

# Normalise to a Path so later cells can build file paths with "/"
eval_dir = Path(latest_eval_dir)

Created new evaluation directory: /Users/eddisonpham/Projects/Unified-benchmark-for-SDGFTS/results/evaluation_20251114_010548


In [None]:
# Run Augmented Testing Evaluation for each model
print("="*80)
print("AUGMENTED TESTING EVALUATION")
print("="*80)

# Number of leading synthetic samples handed to the evaluator per generator
AUGMENTED_SYN_TRAIN_SIZE = 500

augmented_results = {}
# Hedger architectures to evaluate, keyed by the name used in the results
hedger_classes = {
    "Feedforward_L-1": FeedforwardDeepHedger,
    "Feedforward_Time": FeedforwardTimeDeepHedger,
    "RNN": RNNDeepHedger,
    "LSTM": LSTMDeepHedger
}

for model_name, syn_data in synthetic_data.items():
    # Plain string instead of f"{"="*80}": reusing the enclosing quote
    # character inside an f-string is a SyntaxError before Python 3.12.
    print("=" * 80)
    print(f"Evaluating {model_name}")
    print("=" * 80)

    # Use the first AUGMENTED_SYN_TRAIN_SIZE samples for training
    syn_train = syn_data[:AUGMENTED_SYN_TRAIN_SIZE]

    evaluator = AugmentedTestingEvaluator(
        real_train_log_returns=real_train_log_returns,
        real_val_log_returns=real_val_log_returns,
        real_train_initial=train_init_non_para,
        real_val_initial=valid_init_non_para,
        synthetic_train_log_returns=syn_train,
        seq_length=seq_length,
        strike=None,
        hidden_size=64,
        num_epochs=50,
        batch_size=32,
        learning_rate=0.001
    )

    # One evaluation per hedger class, collected under the hedger's name
    model_results = {}
    for hedger_name, hedger_class in hedger_classes.items():
        print(f"Evaluating {hedger_name}...")
        results = evaluator.evaluate(hedger_class)
        model_results[hedger_name] = results

    augmented_results[model_name] = model_results

print("Augmented Testing Evaluation Complete!")

AUGMENTED TESTING EVALUATION
Evaluating GBM
Converting log returns to prices for real training data...
Converting log returns to prices for real validation data...
Converting log returns to prices for synthetic training data...
Evaluating Feedforward_L-1...


In [None]:
# Run Algorithm Comparison Evaluation for each model
print("="*80)
print("ALGORITHM COMPARISON EVALUATION")
print("="*80)

algorithm_comparison_results = {}

for model_name, splits in synthetic_splits.items():
    # Plain string instead of f"{"="*80}": reusing the enclosing quote
    # character inside an f-string is a SyntaxError before Python 3.12.
    print("=" * 80)
    print(f"Evaluating {model_name}")
    print("=" * 80)

    # The evaluator receives the real and synthetic train/val/test splits
    # together with the real initial values for each split.
    evaluator = AlgorithmComparisonEvaluator(
        real_train_log_returns=real_train_log_returns,
        real_val_log_returns=real_val_log_returns,
        real_test_log_returns=real_test_log_returns,
        synthetic_train_log_returns=splits["train"],
        synthetic_val_log_returns=splits["val"],
        synthetic_test_log_returns=splits["test"],
        real_train_initial=train_init_non_para,
        real_val_initial=valid_init_non_para,
        real_test_initial=test_init_non_para,
        seq_length=seq_length,
        original_data_path=original_data_path,
        strike=None,
        hidden_size=64,
        num_epochs=50,
        batch_size=32,
        learning_rate=0.001
    )

    results = evaluator.evaluate()
    algorithm_comparison_results[model_name] = results

print("Algorithm Comparison Evaluation Complete!")

In [None]:
# Bundle both evaluation outputs with a timestamp and the run settings
all_results = dict(
    augmented_testing=augmented_results,
    algorithm_comparison=algorithm_comparison_results,
    timestamp=datetime.now().strftime("%Y%m%d_%H%M%S"),
    config=dict(
        seq_length=seq_length,
        num_samples=num_samples,
        original_data_path=str(original_data_path),
        hedger_classes=list(hedger_classes.keys()),
    ),
)

# Combined JSON file; default=str stringifies values json cannot encode
results_file = eval_dir / "deep_hedging_utility_results.json"
with results_file.open("w") as f:
    json.dump(all_results, f, indent=2, default=str)

print(f"Results saved to: {results_file}")

# One sub-directory per generator holding only that generator's results
for model_name in synthetic_data:
    model_dir = eval_dir / model_name
    model_dir.mkdir(exist_ok=True)

    model_results = dict(
        augmented_testing=augmented_results.get(model_name, {}),
        algorithm_comparison=algorithm_comparison_results.get(model_name, {}),
    )

    model_file = model_dir / "utility_results.json"
    with model_file.open("w") as f:
        json.dump(model_results, f, indent=2, default=str)

print("All results saved successfully!")

In [None]:
# Display summary of results
print("="*80)
print("DEEP HEDGING UTILITY EVALUATION SUMMARY")
print("="*80)

# Lookups inside the f-strings use single quotes: reusing the enclosing
# double quote inside an f-string is a SyntaxError before Python 3.12.
for model_name in synthetic_data.keys():
    print(f"{model_name}:")
    print("  Augmented Testing:")
    if model_name in augmented_results:
        for hedger_name, results in augmented_results[model_name].items():
            # Missing entries are reported as "N/A" rather than raising
            score = results.get("score", {})
            print(f"    {hedger_name}:")
            print(f"      MSE Mean: {score.get('mse_mean', 'N/A')}")
            print(f"      MSE P95: {score.get('mse_p95', 'N/A')}")
            print(f"      MSE P05: {score.get('mse_p05', 'N/A')}")

    print("  Algorithm Comparison:")
    if model_name in algorithm_comparison_results:
        for hedger_name, results in algorithm_comparison_results[model_name].items():
            print(f"    {hedger_name}:")
            real_test = results.get("real_test", {})
            syn_test = results.get("synthetic_test", {})
            print(f"      Real Test MSE(X): {real_test.get('mse_X', 'N/A')}")
            print(f"      Synthetic Test MSE(X): {syn_test.get('mse_X', 'N/A')}")