# Deep Hedging Models Test - Utility-Based Evaluation

This notebook tests the four deep hedging models (Feedforward_L-1, Feedforward_Time, RNN, LSTM), together with non-deep-learning baseline hedgers, on synthetic data using utility-based evaluation methods:

1. **Augmented Testing**: Mix synthetic with real training data (50/50), train hedger, compare with real-only
2. **Algorithm Comparison**: Train 4 hedgers on both real and synthetic data, evaluate on test sets

Results are saved to the latest evaluation directory (a new timestamped `results/evaluation_*` directory is created if none exists).

In [None]:
import sys
import numpy as np
import pandas as pd
import torch
from pathlib import Path
import json
from datetime import datetime

# Make the directory one level above the current working directory importable
# so that the `src.*` imports below resolve.
project_root = Path().resolve().parents[0]
# Guard the append so re-running this cell does not pile up duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from src.models.parametric.gbm import GeometricBrownianMotion
from src.models.parametric.ou_process import OUProcess
from src.models.parametric.merton_jump_diffusion import MertonJumpDiffusion
from src.models.parametric.garch11 import GARCH11
from src.models.parametric.de_jump_diffusion import DoubleExponentialJumpDiffusion
from src.models.non_parametric.block_bootstrap import BlockBootstrap
from src.models.non_parametric.time_gan import TimeGAN
from src.models.non_parametric.time_vae import TimeVAE
from src.models.non_parametric.quant_gan import QuantGAN
from src.models.non_parametric.takahashi import TakahashiDiffusion

from src.hedging_models.feedforward_layers import FeedforwardLayers
from src.hedging_models.feedforward_time import FeedforwardTime
from src.hedging_models.rnn_hedger import RNN
from src.hedging_models.lstm_hedger import LSTM
from src.hedging_models.black_scholes import BlackScholes
from src.hedging_models.delta_gamma import DeltaGamma
from src.hedging_models.random_forest import RandomForest
from src.hedging_models.linear_regression import LinearRegression
from src.hedging_models.xgboost import XGBoost
from src.hedging_models.lightgbm import LightGBM
from src.utils.preprocessing_utils import preprocess_data, create_dataloaders
from src.utils.configs_utils import get_dataset_cfgs
from src.taxonomies.utility import (
    AugmentedTestingEvaluator,
    AlgorithmComparisonEvaluator
)
from src.utils.metric_plot_utils import find_latest_evaluation_folder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the dataset configurations and run preprocessing for both pipelines
non_parametric_dataset_cfgs, parametric_dataset_cfgs = get_dataset_cfgs()

# Parametric pipeline: preprocess_data yields six values, unpacked as the
# train/valid/test data followed by the matching *_init_* values.
(
    train_data_para,
    valid_data_para,
    test_data_para,
    train_init_para,
    valid_init_para,
    test_init_para,
) = preprocess_data(parametric_dataset_cfgs)

# Non-parametric pipeline, same unpacking order.
(
    train_data_non_para,
    valid_data_non_para,
    test_data_non_para,
    train_init_non_para,
    valid_init_non_para,
    test_init_non_para,
) = preprocess_data(non_parametric_dataset_cfgs)

# The non-parametric splits double as the real log returns (univariate: R, L)
real_train_log_returns, real_val_log_returns, real_test_log_returns = (
    train_data_non_para,
    valid_data_non_para,
    test_data_non_para,
)

# Configuration values reused by later cells
original_data_path = non_parametric_dataset_cfgs.get("original_data_path")
seq_length = real_train_log_returns.shape[1]

# One print call; the emitted text is line-for-line the same summary
print(
    "Real data shapes:\n"
    f"  Train: {real_train_log_returns.shape}\n"
    f"  Val: {real_val_log_returns.shape}\n"
    f"  Test: {real_test_log_returns.shape}\n"
    f"Sequence length: {seq_length}"
)

Preprocessing data for AAPL
Preprocessing data for AAPL
Desired time series sample length (lag with max PACF >0): 103
PACF at that lag: 0.040741497942971425
Real data shapes:
  Train: (8975, 103)
  Val: (1122, 103)
  Test: (1122, 103)
Sequence length: 103


In [4]:
# Generate synthetic data for all models
num_samples = 10000
seed = 42
generation_length = seq_length

# Seed the global NumPy and PyTorch RNGs up front so that code relying on them
# (e.g. model weight initialisation and training in this cell) starts from the
# same state after a kernel restart. `seed` is additionally passed explicitly
# to create_dataloaders() and generate() further down.
np.random.seed(seed)
torch.manual_seed(seed)

print(f"Generating {num_samples} samples of length {generation_length}...")

# Initialize parametric models (uncomment an entry to include that generator)
parametric_models = {}
# parametric_models["GBM"] = GeometricBrownianMotion()
# parametric_models["OU Process"] = OUProcess()
# parametric_models["MJD"] = MertonJumpDiffusion()
# parametric_models["GARCH11"] = GARCH11()
# parametric_models["DEJD"] = DoubleExponentialJumpDiffusion()

# block_bootstrap = BlockBootstrap(block_size=generation_length)

# Fit parametric models; the loop does nothing while the dict above is empty
print("Fitting parametric models...")
for name, model in parametric_models.items():
    print(f"Fitting {name}...")
    model.fit(train_data_para)

# Block Bootstrap is disabled together with its construction above, so its
# progress message is commented out as well (it used to announce a fit that
# never ran). Restore all of the following lines together.
# print("Fitting Block Bootstrap...")
# if isinstance(train_data_para, torch.Tensor):
#     block_bootstrap.fit(train_data_para)
# else:
#     block_bootstrap.fit(torch.from_numpy(train_data_para).float())

# Build the training DataLoader consumed by the non-parametric deep learning
# models. Only the log-return arrays are supplied; every *_initial argument
# is None because these models do not need initial values.
print("Creating DataLoaders for non-parametric deep learning models...")
train_loader_non_para, _, _ = create_dataloaders(
    train_data_non_para, valid_data_non_para, test_data_non_para,
    batch_size=128,
    train_seed=seed,
    train_initial=None, valid_initial=None, test_initial=None,
)

# Registry of non-parametric deep learning generators, keyed by display name.
# Commented-out entries are switched off.
non_parametric_models = {
    # "TimeGAN": TimeGAN(seq_len=generation_length, hidden_dim=24, num_layers=3, learning_rate=1e-5),
    "TimeVAE": TimeVAE(
        length=None,
        num_channels=1,
        latent_dim=10,
        hidden_layer_sizes=[100, 200, 400],
        trend_poly=0,
        custom_seas=None,
        use_residual_conn=True,
        reconstruction_wt=3.0,
        lr=1e-5,
    ),
    # "QuantGAN": QuantGAN(),
    # "TakahashiDiffusion": TakahashiDiffusion(
    #     length=None,
    #     num_channels=1,
    #     num_steps=100,
    #     beta_start=0.0001,
    #     beta_end=0.02,
    #     wavelet='haar',
    #     lr=1e-5
    # ),
}

# Train every registered generator on the training loader
print("Fitting non-parametric deep learning models...")
for name, model in non_parametric_models.items():
    print(f"Fitting {name}...")
    model.fit(train_loader_non_para, num_epochs=10)

def _to_2d_numpy(samples, length):
    """Normalise a generator's output to a NumPy array of univariate paths.

    Args:
        samples: A `torch.Tensor` or anything `np.asarray` accepts.
        length: Path length used to fold a flat 1-D output into rows.

    Returns:
        A NumPy array. 3-D input is reduced to its first channel via
        `[:, :, 0]`, 1-D input is reshaped to `(-1, length)`, and any other
        rank is returned unchanged.
    """
    if isinstance(samples, torch.Tensor):
        # detach() first: .numpy() raises on tensors that still require grad.
        samples = samples.detach().cpu().numpy()
    else:
        samples = np.asarray(samples)
    if samples.ndim == 3:
        samples = samples[:, :, 0]
    elif samples.ndim == 1:
        samples = samples.reshape(-1, length)
    return samples


# Generate synthetic data (univariate)
synthetic_data = {}
generation_kwargs = {"num_samples": num_samples, "generation_length": generation_length, "seed": seed}

print("Generating synthetic data from parametric models...")
for name, model in parametric_models.items():
    print(f"Generating {name}...")
    synthetic_data[name] = _to_2d_numpy(model.generate(**generation_kwargs), generation_length)

# Block Bootstrap generation is disabled, so its progress message is commented
# out with it (it used to announce a generation step that never ran).
# print("Generating Block Bootstrap...")
# syn_data_bb = block_bootstrap.generate(**generation_kwargs)
# synthetic_data["BlockBootstrap"] = _to_2d_numpy(syn_data_bb, generation_length)

print("Generating synthetic data from non-parametric deep learning models...")
for name, model in non_parametric_models.items():
    print(f"Generating {name}...")
    synthetic_data[name] = _to_2d_numpy(model.generate(**generation_kwargs), generation_length)

print("Synthetic data generation complete!")
for name, data in synthetic_data.items():
    print(f"{name}: {data.shape}")

Generating 10000 samples of length 103...
Fitting parametric models...
Fitting Block Bootstrap...
Creating DataLoaders for non-parametric deep learning models...
Fitting non-parametric deep learning models...
Fitting TimeVAE...
Inferred sequence length: 103
TimeVAE Epoch 1/10 - Loss: 5387.9596 | Recon Loss: 1794.0125 | KL Loss: 5.9219
TimeVAE Epoch 2/10 - Loss: 1408.1304 | Recon Loss: 465.3695 | KL Loss: 12.0220
TimeVAE Epoch 3/10 - Loss: 1098.8867 | Recon Loss: 364.3327 | KL Loss: 5.8884
TimeVAE Epoch 4/10 - Loss: 921.6994 | Recon Loss: 303.2312 | KL Loss: 12.0059
TimeVAE Epoch 5/10 - Loss: 790.1929 | Recon Loss: 254.9986 | KL Loss: 25.1970
TimeVAE Epoch 6/10 - Loss: 699.4535 | Recon Loss: 219.8711 | KL Loss: 39.8402
TimeVAE Epoch 7/10 - Loss: 632.8622 | Recon Loss: 194.3914 | KL Loss: 49.6879
TimeVAE Epoch 8/10 - Loss: 598.4615 | Recon Loss: 180.8657 | KL Loss: 55.8646
TimeVAE Epoch 9/10 - Loss: 552.1480 | Recon Loss: 164.9634 | KL Loss: 57.2580
TimeVAE Epoch 10/10 - Loss: 512.7692 |

In [5]:
# Split synthetic data into train/val/test
def split_data(data, train_ratio=0.8, val_ratio=0.1):
    """Split `data` along its first axis into contiguous train/val/test parts.

    Args:
        data: Object exposing `.shape[0]` and slice indexing (e.g. an ndarray).
        train_ratio: Fraction of rows assigned to the training split.
        val_ratio: Fraction of rows assigned to the validation split; whatever
            remains after train and val becomes the test split.

    Returns:
        Tuple `(train, val, test)` of slices of `data`, in original row order.

    Raises:
        ValueError: If either ratio is negative or the ratios sum to more
            than 1.
    """
    eps = 1e-9
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + eps:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1; "
            f"got train_ratio={train_ratio}, val_ratio={val_ratio}"
        )
    R = data.shape[0]
    # Add a tiny epsilon before truncating to absorb binary floating-point
    # error: 0.7 + 0.1 evaluates to 0.7999999999999999, which would otherwise
    # truncate one row short and shrink the validation split.
    train_end = int(R * train_ratio + eps)
    val_end = int(R * (train_ratio + val_ratio) + eps)
    return data[:train_end], data[train_end:val_end], data[val_end:]

# Per-generator train/val/test partitions of the synthetic paths, keyed by
# generator name and then by split name.
synthetic_splits = {}
for name, data in synthetic_data.items():
    train, val, test = split_data(data)
    synthetic_splits[name] = dict(train=train, val=val, test=test)
    print(f"{name} splits - Train: {train.shape}, Val: {val.shape}, Test: {test.shape}")

TimeVAE splits - Train: (8000, 103), Val: (1000, 103), Test: (1000, 103)


In [6]:
# Reuse the most recent evaluation directory; if the lookup raises
# FileNotFoundError, create a fresh timestamped one under <project_root>/results.
try:
    latest_eval_dir = find_latest_evaluation_folder()
except FileNotFoundError:
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    latest_eval_dir = project_root / "results" / f"evaluation_{timestamp}"
    latest_eval_dir.mkdir(parents=True, exist_ok=True)
    print(f"Created new evaluation directory: {latest_eval_dir}")
else:
    print(f"Found latest evaluation directory: {latest_eval_dir}")

# Normalise to a Path so later cells can build sub-paths with the "/" operator
eval_dir = Path(latest_eval_dir)

Created new evaluation directory: /Users/eddisonpham/Projects/Unified-benchmark-for-SDGFTS/results/evaluation_20251115_171027


In [9]:
# Run Augmented Testing Evaluation for each model
print("="*80)
print("AUGMENTED TESTING EVALUATION")
print("="*80)

# Results keyed by generator name, then by hedger name
augmented_results = {}

# Deep learning hedging models
hedger_classes = {
    "Feedforward_L-1": FeedforwardLayers,
    "Feedforward_Time": FeedforwardTime,
    "RNN": RNN,
    "LSTM": LSTM
}

# Non-deep learning hedging models
hedger_classes.update({
    "BlackScholes": BlackScholes,
    "DeltaGamma": DeltaGamma,
    "RandomForest": RandomForest,
    "LinearRegression": LinearRegression,
    "XGBoost": XGBoost,
    "LightGBM": LightGBM
})

for model_name, syn_data in synthetic_data.items():
    # Plain string literals for the separators: reusing the enclosing quote
    # character inside an f-string is a SyntaxError before Python 3.12.
    print("=" * 80)
    print(f"Evaluating {model_name}")
    print("=" * 80)

    # NOTE(review): the full synthetic set is passed here, not
    # synthetic_splits[model_name]["train"] — confirm this is intended.
    # NOTE(review): the saved output of this cell shows a TypeError about a
    # numpy.object_ array raised during evaluator initialisation; check the
    # dtypes of the arrays passed below.
    evaluator = AugmentedTestingEvaluator(
        real_train_log_returns=real_train_log_returns,
        real_val_log_returns=real_val_log_returns,
        real_train_initial=train_init_non_para,
        real_val_initial=valid_init_non_para,
        synthetic_train_log_returns=syn_data,
        seq_length=seq_length,
        strike=None,
        hidden_size=64,
        num_epochs=70,
        batch_size=128,
        learning_rate=0.001
    )

    # One evaluation per registered hedger, all sharing the same evaluator
    model_results = {}
    for hedger_name, hedger_class in hedger_classes.items():
        print(f"Evaluating {hedger_name}...")
        results = evaluator.evaluate(hedger_class)
        model_results[hedger_name] = results

    augmented_results[model_name] = model_results

print("Augmented Testing Evaluation Complete!")

AUGMENTED TESTING EVALUATION
Evaluating TimeVAE
[AugmentedTestingEvaluator] Initialization started...


TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.

In [None]:
# Run Algorithm Comparison Evaluation for each model
print("="*80)
print("ALGORITHM COMPARISON EVALUATION")
print("="*80)

# Results keyed by generator name
algorithm_comparison_results = {}

for model_name, splits in synthetic_splits.items():
    # Plain string literals for the separators: reusing the enclosing quote
    # character inside an f-string is a SyntaxError before Python 3.12.
    print("=" * 80)
    print(f"Evaluating {model_name}")
    print("=" * 80)

    evaluator = AlgorithmComparisonEvaluator(
        # Use the name defined in the data-loading cell; the former
        # `train_data_log_returns` is not defined anywhere in this notebook
        # and raised NameError on a fresh kernel.
        real_train_log_returns=real_train_log_returns,
        real_val_log_returns=real_val_log_returns,
        real_test_log_returns=real_test_log_returns,
        synthetic_train_log_returns=splits["train"],
        synthetic_val_log_returns=splits["val"],
        synthetic_test_log_returns=splits["test"],
        real_train_initial=train_init_non_para,
        real_val_initial=valid_init_non_para,
        real_test_initial=test_init_non_para,
        seq_length=seq_length,
        strike=None,
        hidden_size=64,
        num_epochs=70,
        batch_size=128,
        learning_rate=0.001
    )

    results = evaluator.evaluate()
    algorithm_comparison_results[model_name] = results

print("Algorithm Comparison Evaluation Complete!")

ALGORITHM COMPARISON EVALUATION
Evaluating GBM
[AlgorithmComparisonEvaluator] Initialization started...
Converting log returns to prices for real data...
[AlgorithmComparisonEvaluator] Converting log returns to prices for synthetic data...
[AlgorithmComparisonEvaluator] Initialization complete.
[AlgorithmComparisonEvaluator] Starting evaluation procedure for all hedgers...
[AlgorithmComparisonEvaluator] --- Evaluating Feedforward_L-1 ---
[AlgorithmComparisonEvaluator] Training Feedforward_L-1 on real data...
Epoch 10/70, Loss: 46.224549
Epoch 20/70, Loss: 44.316120
Epoch 30/70, Loss: 44.314973
Epoch 40/70, Loss: 43.117066
Epoch 50/70, Loss: 42.896970
Epoch 60/70, Loss: 42.509133
Epoch 70/70, Loss: 42.131216
Training completed. Final premium: 2.603165
[AlgorithmComparisonEvaluator] Training Feedforward_L-1 on synthetic data...
Epoch 10/70, Loss: 0.170632
Epoch 20/70, Loss: 0.160448
Epoch 30/70, Loss: 0.161203
Epoch 40/70, Loss: 0.159636
Epoch 50/70, Loss: 0.159843
Epoch 60/70, Loss: 0.1

In [None]:
def _json_default(obj):
    """Fallback encoder for values the `json` module cannot serialise itself.

    NumPy scalars, NumPy arrays and torch tensors are converted to native
    Python numbers/lists so they stay numeric in the file; anything else
    falls back to `str(obj)`.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, torch.Tensor):
        return obj.detach().cpu().tolist()
    return str(obj)


def _write_json(path, payload):
    """Write `payload` to `path` as indented JSON using `_json_default`."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, default=_json_default)


# Save all results to evaluation directory
all_results = {
    "augmented_testing": augmented_results,
    "algorithm_comparison": algorithm_comparison_results,
    "timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
    "config": {
        "seq_length": seq_length,
        "num_samples": num_samples,
        "original_data_path": str(original_data_path),
        "hedger_classes": list(hedger_classes.keys())
    }
}

# Save to JSON
results_file = eval_dir / "deep_hedging_utility_results.json"
_write_json(results_file, all_results)

print(f"Results saved to: {results_file}")

# Also save individual model results, one sub-directory per generator
for model_name in synthetic_data.keys():
    model_dir = eval_dir / model_name
    # parents=True so this also works if eval_dir itself is missing
    model_dir.mkdir(parents=True, exist_ok=True)

    # .get(..., {}) keeps the file well-formed when an evaluation was skipped
    model_results = {
        "augmented_testing": augmented_results.get(model_name, {}),
        "algorithm_comparison": algorithm_comparison_results.get(model_name, {})
    }

    model_file = model_dir / "utility_results.json"
    _write_json(model_file, model_results)

print("All results saved successfully!")

Results saved to: /Users/eddisonpham/Projects/Unified-benchmark-for-SDGFTS/results/evaluation_20251114_090602/deep_hedging_utility_results.json
All results saved successfully!
