# Deep Hedging Models Test - Utility-Based Evaluation

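This notebook tests four deep hedging models (feedforward-layers, feedforward-time, RNN, LSTM), alongside four non-deep baseline hedgers (Black-Scholes, delta-gamma, linear regression, XGBoost), on synthetic data using utility-based evaluation methods: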

1. **Augmented Testing**: Mix synthetic with real training data (50/50), train hedger, compare with real-only
2. **Algorithm Comparison**: Train 4 hedgers on both real and synthetic data, evaluate on test sets

Results are saved to the latest evaluation directory; a new timestamped one is created under `results/` if none exists.

In [1]:
import sys
import numpy as np
import torch
from pathlib import Path
import json
from datetime import datetime

# Take the parent of the current working directory as the project root and add
# it to sys.path so the `src.*` imports below can be resolved.
# NOTE(review): assumes the kernel's working directory is a first-level
# subfolder of the repository — confirm before running from elsewhere.
project_root = Path().resolve().parents[0]
sys.path.append(str(project_root))

from src.models.parametric.gbm import GeometricBrownianMotion
from src.models.parametric.ou_process import OUProcess
from src.models.parametric.merton_jump_diffusion import MertonJumpDiffusion
from src.models.parametric.garch11 import GARCH11
from src.models.parametric.de_jump_diffusion import DoubleExponentialJumpDiffusion
from src.models.non_parametric.block_bootstrap import BlockBootstrap
from src.models.non_parametric.time_vae import TimeVAE
from src.models.non_parametric.quant_gan import QuantGAN
from src.models.non_parametric.takahashi import TakahashiDiffusion

from src.hedging_models.deep_hedgers.feedforward_layers import FeedforwardLayers
from src.hedging_models.deep_hedgers.feedforward_time import FeedforwardTime
from src.hedging_models.deep_hedgers.rnn_hedger import RNN
from src.hedging_models.deep_hedgers.lstm_hedger import LSTM

from src.hedging_models.non_deep_hedgers.black_scholes import BlackScholes
from src.hedging_models.non_deep_hedgers.delta_gamma import DeltaGamma
from src.hedging_models.non_deep_hedgers.linear_regression import LinearRegression
from src.hedging_models.non_deep_hedgers.xgboost import XGBoost

from src.utils.preprocessing_utils import preprocess_data, create_dataloaders
from src.utils.configs_utils import get_dataset_cfgs
from src.taxonomies.utility import (
    AugmentedTestingEvaluator,
    AlgorithmComparisonEvaluator
)
from src.utils.metric_plot_utils import find_latest_evaluation_folder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load and preprocess data
non_parametric_dataset_cfgs, parametric_dataset_cfgs = get_dataset_cfgs()

# The parametric configuration is processed first, then the non-parametric
# one. Each call is unpacked into train/valid/test data followed by the
# matching train/valid/test `*_init_*` values.
(
    train_data_para,
    valid_data_para,
    test_data_para,
    train_init_para,
    valid_init_para,
    test_init_para,
) = preprocess_data(parametric_dataset_cfgs)

train_data_non_para, valid_data_non_para, test_data_non_para, train_init_non_para, valid_init_non_para, test_init_non_para = preprocess_data(
    non_parametric_dataset_cfgs
)

# Get configuration
original_data_path = non_parametric_dataset_cfgs.get("original_data_path")
# The second axis of the training tensor is used as the per-sample length.
seq_length = train_data_non_para.shape[1]

# Emit the shape summary in one write; the text is identical line for line.
print(
    "\n".join(
        [
            "Real data shapes:",
            f"  Train: {train_data_non_para.shape}",
            f"  Val: {valid_data_non_para.shape}",
            f"  Test: {test_data_non_para.shape}",
            f"Sequence length: {seq_length}",
        ]
    )
)

Preprocessing data for AAPL
Preprocessing data for AAPL
Desired time series sample length (lag with max PACF >0): 103
PACF at that lag: 0.04074734125599129
Real data shapes:
  Train: torch.Size([8981, 103])
  Val: torch.Size([1123, 103])
  Test: torch.Size([1123, 103])
Sequence length: 103


In [3]:
# Generate synthetic data for all models
num_samples = 10000
seed = 42
generation_length = seq_length

print(f"Generating {num_samples} samples of length {generation_length}...")

# Initialize parametric models (commented entries are currently disabled)
parametric_models = {
    "GBM": GeometricBrownianMotion(),
    # "OU Process": OUProcess(),
    # "MJD": MertonJumpDiffusion(),
    # "GARCH11": GARCH11(),
    "DEJD": DoubleExponentialJumpDiffusion(),
}

# NOTE(review): instantiated here but not fitted or used in this cell.
block_bootstrap = BlockBootstrap(block_size=generation_length)

# Fit parametric models
print("Fitting parametric models...")
for name in parametric_models:
    print(f"Fitting {name}...")
    model = parametric_models[name]
    model.fit(train_data_para)


print("Creating DataLoaders for non-parametric deep learning models...")
dataloader_options = dict(
    batch_size=128,
    train_seed=seed,
    train_initial=None,
    valid_initial=None,
    test_initial=None,
)
# Only the training loader is needed; the other two loaders are discarded.
train_loader_non_para, _, _ = create_dataloaders(
    train_data_non_para,
    valid_data_non_para,
    test_data_non_para,
    **dataloader_options,
)

# Initialize non-parametric deep learning models (all currently disabled)
non_parametric_models = {}
# non_parametric_models["TimeVAE"] = TimeVAE(
#     seq_len=seq_length,
#     input_dim=1,
#     latent_dim=30,
#     hidden_dim=128,
#     lr=1e-4,
#     device='cuda' if torch.cuda.is_available() else 'cpu'
# )
# non_parametric_models["QuantGAN"] = QuantGAN()
# non_parametric_models["TakahashiDiffusion"] = TakahashiDiffusion(
#     length=None,
#     num_channels=1,
#     num_steps=100,
#     beta_start=0.0001,
#     beta_end=0.02,
#     wavelet='haar',
#     lr=1e-5
# )

# Fit non-parametric deep learning models
print("Fitting non-parametric deep learning models...")
for name in non_parametric_models:
    print(f"Fitting {name}...")
    model = non_parametric_models[name]
    model.fit(train_loader_non_para, num_epochs=10)

# Generate synthetic data (univariate)
synthetic_data = {}
generation_kwargs = dict(
    num_samples=num_samples,
    generation_length=generation_length,
    seed=seed,
)

print("Generating synthetic data from parametric models...")
for name, model in parametric_models.items():
    print(f"Generating {name}...")
    syn_data = model.generate(**generation_kwargs)
    # A 3-D result is reduced to its first entry along the last axis.
    synthetic_data[name] = syn_data[:, :, 0] if syn_data.ndim == 3 else syn_data

print("Generating synthetic data from non-parametric deep learning models...")
for name, model in non_parametric_models.items():
    print(f"Generating {name}...")
    syn_data = model.generate(**generation_kwargs)
    rank = syn_data.ndim
    if rank == 3:
        # A 3-D result is reduced to its first entry along the last axis.
        syn_data = syn_data[:, :, 0]
    elif rank == 1:
        # A flat result is folded into rows of `generation_length` values.
        syn_data = syn_data.reshape(-1, generation_length)
    synthetic_data[name] = syn_data

print("Synthetic data generation complete!")
for name in synthetic_data:
    data = synthetic_data[name]
    print(f"{name}: {data.shape}")

Generating 10000 samples of length 103...
Fitting parametric models...
Fitting GBM...
mu: 0.0005921594767344989, sigma: 0.029982834528274226
Fitting DEJD...
mu: 0.10447647009804578, sigma: 0.018963797049591412, lam: 0.10283570561624186, p: 0.5450643776824035, eta1: 15.493428897886261, eta2: 14.601712934992657, kappa: 1.0084483516750737
Creating DataLoaders for non-parametric deep learning models...
Fitting non-parametric deep learning models...
Generating synthetic data from parametric models...
Generating GBM...
Generating DEJD...
Generating synthetic data from non-parametric deep learning models...
Synthetic data generation complete!
GBM: torch.Size([10000, 103])
DEJD: torch.Size([10000, 103])


In [4]:
# Split synthetic data into train/val/test
def split_data(data, train_ratio=0.8, val_ratio=0.1):
    """Split `data` along its first axis into contiguous train/val/test slices.

    Args:
        data: Any object exposing `.shape[0]` and supporting slice indexing.
        train_ratio: Fraction of rows assigned to the leading (train) slice.
        val_ratio: Fraction of rows assigned to the middle (validation) slice.

    Returns:
        A `(train, val, test)` tuple of slices of `data`, in order; the test
        slice receives whatever rows remain after train and validation.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more than 1.
    """
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1, "
            f"got train_ratio={train_ratio}, val_ratio={val_ratio}"
        )

    num_rows = data.shape[0]
    # Binary floats can land just below the intended boundary (for example
    # 0.7 + 0.1 == 0.7999999999999999), and a bare int() would then drop a
    # whole row. Rounding to 6 decimals removes that noise before truncating.
    train_end = int(round(num_rows * train_ratio, 6))
    val_end = int(round(num_rows * (train_ratio + val_ratio), 6))
    return data[:train_end], data[train_end:val_end], data[val_end:]

# Build the train/val/test partition for every generator and report the shapes.
synthetic_splits = {}
for name in synthetic_data:
    data = synthetic_data[name]
    train, val, test = split_data(data)
    synthetic_splits[name] = dict(train=train, val=val, test=test)
    print(f"{name} splits - Train: {train.shape}, Val: {val.shape}, Test: {test.shape}")

GBM splits - Train: torch.Size([8000, 103]), Val: torch.Size([1000, 103]), Test: torch.Size([1000, 103])
DEJD splits - Train: torch.Size([8000, 103]), Val: torch.Size([1000, 103]), Test: torch.Size([1000, 103])


In [5]:
# Find latest evaluation directory or create new one
try:
    latest_eval_dir = find_latest_evaluation_folder()
except FileNotFoundError:
    # No earlier evaluation folder was found: create a timestamped one
    # under <project_root>/results.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    latest_eval_dir = project_root / "results" / f"evaluation_{timestamp}"
    latest_eval_dir.mkdir(parents=True, exist_ok=True)
    print(f"Created new evaluation directory: {latest_eval_dir}")
else:
    print(f"Found latest evaluation directory: {latest_eval_dir}")

# Normalise to a Path so later cells can use the `/` operator.
eval_dir = Path(latest_eval_dir)

Created new evaluation directory: /Users/eddisonpham/Projects/Unified-benchmark-for-SDGFTS/results/evaluation_20251128_172242


In [6]:
# Run Augmented Testing Evaluation for each model
print("="*80)
print("AUGMENTED TESTING EVALUATION")
print("="*80)

# Results keyed first by generator name, then by hedger name.
augmented_results = {}

# Deep learning hedging models
hedger_classes = {
    "Feedforward_L-1": FeedforwardLayers,
    "Feedforward_Time": FeedforwardTime,
    "RNN": RNN,
    "LSTM": LSTM
}

# Non-deep learning hedging models
hedger_classes.update({
    "BlackScholes": BlackScholes,
    "DeltaGamma": DeltaGamma,
    "LinearRegression": LinearRegression,
    "XGBoost": XGBoost
})

for model_name, syn_data in synthetic_data.items():
    # Plain string expressions instead of f"{"="*80}": reusing the enclosing
    # quote type inside an f-string is a SyntaxError before Python 3.12.
    print("="*80)
    print(f"Evaluating {model_name}")
    print("="*80)

    # NOTE(review): the full synthetic set is passed here, not
    # synthetic_splits[model_name]["train"] — confirm that reusing samples
    # held out for the Algorithm Comparison cell is intended.
    evaluator = AugmentedTestingEvaluator(
        real_train_log_returns=train_data_non_para,
        real_val_log_returns=valid_data_non_para,
        real_train_initial=train_init_non_para,
        real_val_initial=valid_init_non_para,
        synthetic_train_log_returns=syn_data,
        seq_length=seq_length,
        num_epochs=2,
        batch_size=128,
        learning_rate=1e-3
    )

    # One evaluator per generator is reused across all hedger classes.
    model_results = {}
    for hedger_name, hedger_class in hedger_classes.items():
        print(f"Evaluating {hedger_name}...")
        results = evaluator.evaluate(hedger_class)
        model_results[hedger_name] = results

    augmented_results[model_name] = model_results

print("Augmented Testing Evaluation Complete!")

AUGMENTED TESTING EVALUATION
Evaluating GBM
[AugmentedTestingEvaluator] Initialization started...
[AugmentedTestingEvaluator] Converting log returns to prices for real training data...
[AugmentedTestingEvaluator] Converting log returns to prices for real validation data...
[AugmentedTestingEvaluator] Converting log returns to prices for synthetic training data...
[AugmentedTestingEvaluator] Initialization complete.
Evaluating Feedforward_L-1...
[AugmentedTestingEvaluator] Starting evaluation procedure...
[AugmentedTestingEvaluator] Mixing 8981 real and 8981 synthetic training samples (50/50)...
[AugmentedTestingEvaluator] Training hedger on mixed (synthetic + real) data...
Epoch 1/2, Loss: 28.476090
Epoch 2/2, Loss: 26.528441
Training completed. Final premium: 0.254263
[AugmentedTestingEvaluator] Training hedger on real data only...
Epoch 1/2, Loss: 56.664745
Epoch 2/2, Loss: 45.214780
Training completed. Final premium: 0.128314
[AugmentedTestingEvaluator] Evaluating both hedgers on re

In [7]:
# Run Algorithm Comparison Evaluation for each model
print("="*80)
print("ALGORITHM COMPARISON EVALUATION")
print("="*80)

# Results keyed by generator name.
algorithm_comparison_results = {}

for model_name, splits in synthetic_splits.items():
    # Plain string expressions instead of f"{"="*80}": reusing the enclosing
    # quote type inside an f-string is a SyntaxError before Python 3.12.
    print("="*80)
    print(f"Evaluating {model_name}")
    print("="*80)

    # Real and synthetic data are both supplied as train/val/test splits;
    # initial values are passed for the real splits only.
    evaluator = AlgorithmComparisonEvaluator(
        real_train_log_returns=train_data_non_para,
        real_val_log_returns=valid_data_non_para,
        real_test_log_returns=test_data_non_para,
        synthetic_train_log_returns=splits["train"],
        synthetic_val_log_returns=splits["val"],
        synthetic_test_log_returns=splits["test"],
        real_train_initial=train_init_non_para,
        real_val_initial=valid_init_non_para,
        real_test_initial=test_init_non_para,
        seq_length=seq_length,
        num_epochs=2,
        batch_size=128,
        learning_rate=1e-3
    )

    results = evaluator.evaluate()
    algorithm_comparison_results[model_name] = results

print("Algorithm Comparison Evaluation Complete!")

ALGORITHM COMPARISON EVALUATION
Evaluating GBM
[AlgorithmComparisonEvaluator] Initialization started...
Converting log returns to prices for real data...
[AlgorithmComparisonEvaluator] Converting log returns to prices for synthetic data...
[AlgorithmComparisonEvaluator] Initialization complete.
[AlgorithmComparisonEvaluator] Starting evaluation procedure for all hedging models...
[AlgorithmComparisonEvaluator] --- Evaluating Feedforward_L-1 ---
[AlgorithmComparisonEvaluator] Training Feedforward_L-1 on real data...
Epoch 1/2, Loss: 56.348771
Epoch 2/2, Loss: 52.362722
Training completed. Final premium: 0.136745
[AlgorithmComparisonEvaluator] Training Feedforward_L-1 on synthetic data...
Epoch 1/2, Loss: 0.461165
Epoch 2/2, Loss: 0.115399
Training completed. Final premium: 0.088098
[AlgorithmComparisonEvaluator] Evaluating Feedforward_L-1 on real test set...
[AlgorithmComparisonEvaluator] Evaluating Feedforward_L-1 on synthetic test set...
[AlgorithmComparisonEvaluator] Computing metric

In [8]:
# Save all results to evaluation directory
all_results = dict(
    augmented_testing=augmented_results,
    algorithm_comparison=algorithm_comparison_results,
    timestamp=datetime.now().strftime("%Y%m%d_%H%M%S"),
    config=dict(
        seq_length=seq_length,
        num_samples=num_samples,
        original_data_path=str(original_data_path),
        hedger_classes=list(hedger_classes.keys()),
    ),
)

# Save to JSON; values json cannot encode natively are written via str().
results_file = eval_dir / "deep_hedging_utility_results.json"
with open(results_file, "w") as f:
    json.dump(all_results, f, indent=2, default=str)

print(f"Results saved to: {results_file}")

# Also save individual model results, one sub-folder per generator
for model_name in synthetic_data:
    model_dir = eval_dir / model_name
    model_dir.mkdir(exist_ok=True)

    # A generator missing from either result set is stored as an empty dict.
    model_results = dict(
        augmented_testing=augmented_results.get(model_name, {}),
        algorithm_comparison=algorithm_comparison_results.get(model_name, {}),
    )

    model_file = model_dir / "utility_results.json"
    with open(model_file, "w") as f:
        json.dump(model_results, f, indent=2, default=str)

print("All results saved successfully!")

Results saved to: /Users/eddisonpham/Projects/Unified-benchmark-for-SDGFTS/results/evaluation_20251128_172242/deep_hedging_utility_results.json
All results saved successfully!
