In [1]:
import numpy as np
import pandas as pd
import random
import copy
import torch
from tqdm import tqdm
import warnings
from itertools import product
from torch.utils.data import DataLoader
from ivyspt.input_processing import split_surfaces, IVSurfaceDataset
from ivyspt.trainer import Trainer
from ivyspt.ivyspt import IvySPT

# Set the random seed for reproducibility
RANDOM_STATE = 0
N_JOBS = 8
torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

In [2]:
pre_train_data = pd.read_csv('data/pre_train_data.csv', parse_dates=True, index_col=[0, 1], date_format="ISO8601")
fine_tune_data = pd.read_csv('data/fine_tune_data.csv', parse_dates=True, index_col=[0, 1], date_format="ISO8601")
pre_train_surfaces_train, pre_train_surfaces_validation, pre_train_surfaces_test = split_surfaces(
    pre_train_data,
    toy_sample=True,
    max_points=20,
    max_surfaces=20,
    random_state=RANDOM_STATE
)
fine_tune_surfaces_train, fine_tune_surfaces_validation, fine_tune_surfaces_test = split_surfaces(
    fine_tune_data,
    toy_sample=True,
    max_points=20,
    max_surfaces=20,
    random_state=RANDOM_STATE
)

In [3]:
hyperparameters = {
    'Input Preprocessing' : {
        'Mask Proportions' : [0.1, 0.3, 0.5, 0.7],
        'Number of Query Points' : None,
        'Batch Size' : 10
    },
    'Surface Embedding' : {
        'Embedding Dimension' : 32,
    },
    'Surface Encoding' : {
        'Number of Heads' : 4,
        'FFN Hidden Dimension' : 128,
        'Attention Dropout' : 0.,
        'Gate Dropout' : 0.,
        'FFN Dropout' : 0.,
        'Number of Blocks' : 4,
        'External Feature Dimension' : 5,
        'Weight Initializer Std.' : 0.02,
        'Linear Bias Initializer' : 0.0,
        'Gate Bias Inititalizer' : 10.0
    },
    'Adaptive Loss Weights' : {
        'Asymmetry' : 1.5,
    },
    'Trainer' : {
        'Pre-Train' : {
            'Number of Epochs' : 10,
            'Warmup Ratio' : 0.15,
            'Peak Learning Rate' : 1e-3,
            'Minimal Learning Rate' : 1e-6,
            'Gradient Clipping' : None,
            'Adam Betas' : (0.9, 0.999),
            'Adam Epsilon' : 1e-8,
            'Adam Weight Decay' : 0.01,
            'Layer-Wise Decay' : None,
        },
        'Fine-Tune' : {
            'Number of Epochs' : 10,
            'Warmup Ratio' : 0.1,
            'Peak Learning Rate' : 1e-3,
            'Minimal Learning Rate' : 1e-6,
            'Gradient Clipping' : None,
            'Adam Betas' : (0.9, 0.999),
            'Adam Epsilon' : 1e-8,
            'Adam Weight Decay' : 0.01,
            'Layer-Wise Decay' : 0.9,
        }
    }
}

In [4]:
pre_train_dataset_train = IVSurfaceDataset(
    pre_train_surfaces_train, 
    hyperparameters['Input Preprocessing']['Mask Proportions'], 
    RANDOM_STATE, 
    hyperparameters['Input Preprocessing']['Number of Query Points'] 
)
pre_train_data_loader_train = DataLoader(
    pre_train_dataset_train, 
    batch_size=hyperparameters['Input Preprocessing']['Batch Size'], 
    shuffle=True, 
    num_workers=0, 
    collate_fn=IVSurfaceDataset.collate_fn
)
pre_train_dataset_validation = IVSurfaceDataset(
    pre_train_surfaces_validation, 
    hyperparameters['Input Preprocessing']['Mask Proportions'], 
    RANDOM_STATE, 
    hyperparameters['Input Preprocessing']['Number of Query Points'] 
)
pre_train_data_loader_validation = DataLoader(
    pre_train_dataset_validation, 
    batch_size=hyperparameters['Input Preprocessing']['Batch Size'], 
    shuffle=True, 
    num_workers=0, 
    collate_fn=IVSurfaceDataset.collate_fn
)
pre_train_dataset_test = IVSurfaceDataset(
    pre_train_surfaces_test, 
    hyperparameters['Input Preprocessing']['Mask Proportions'], 
    RANDOM_STATE, 
    hyperparameters['Input Preprocessing']['Number of Query Points'] 
)
pre_train_data_loader_test = DataLoader(
    pre_train_dataset_test, 
    batch_size=hyperparameters['Input Preprocessing']['Batch Size'], 
    shuffle=True, 
    num_workers=0, 
    collate_fn=IVSurfaceDataset.collate_fn
)

In [5]:
import torch
import copy
import warnings
import pandas as pd
from itertools import product
from tqdm import tqdm

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the grid of hyperparameters
gradient_clipping_values = [1e-1, 1, 10, 100, 1000]
peak_learning_rates = [1e-5, 1e-4, 1e-3, 1e-2]

# Create a list to store results
results = []

# Generate all combinations of gradient clipping values and peak learning rates
combinations = list(product(gradient_clipping_values, peak_learning_rates))

# Iterate over each combination of hyperparameters
for gradient_clip, peak_lr in tqdm(combinations, total=len(combinations)):
    # Deep copy the default hyperparameters to test specific combinations
    test_hyperparameters = copy.deepcopy(hyperparameters)
    
    # Set the specific hyperparameters
    test_hyperparameters['Trainer']['Pre-Train']['Peak Learning Rate'] = peak_lr
    test_hyperparameters['Trainer']['Pre-Train']['Gradient Clipping'] = gradient_clip
    test_hyperparameters['Trainer']['Pre-Train']['Minimal Learning Rate'] = max(1e-6, peak_lr * 0.01)
    
    # Initialize the model with the test hyperparameters
    torch.manual_seed(RANDOM_STATE)
    model_pre_train = IvySPT(
        test_hyperparameters['Surface Embedding']['Embedding Dimension'], 
        test_hyperparameters['Surface Encoding']['Number of Blocks'],
        test_hyperparameters['Surface Encoding']['Number of Heads'], 
        test_hyperparameters['Surface Encoding']['FFN Hidden Dimension'],
        test_hyperparameters['Surface Encoding']['Attention Dropout'], 
        test_hyperparameters['Surface Encoding']['Gate Dropout'],
        test_hyperparameters['Surface Encoding']['FFN Dropout'],
        test_hyperparameters['Surface Encoding']['External Feature Dimension'],
        test_hyperparameters['Surface Encoding']['Weight Initializer Std.'],
        test_hyperparameters['Surface Encoding']['Linear Bias Initializer'],
        test_hyperparameters['Surface Encoding']['Gate Bias Inititalizer']
    )
    
    # Initialize the trainer with the test hyperparameters
    warnings.filterwarnings("ignore", category=UserWarning)        
    pre_trainer = Trainer(
        model_pre_train,
        pre_train_data_loader_train,
        pre_train_data_loader_validation,
        pre_train_data_loader_test,
        test_hyperparameters['Trainer']['Pre-Train']['Number of Epochs'],
        test_hyperparameters['Trainer']['Pre-Train']['Warmup Ratio'],
        test_hyperparameters['Trainer']['Pre-Train']['Peak Learning Rate'],
        test_hyperparameters['Trainer']['Pre-Train']['Minimal Learning Rate'],
        test_hyperparameters['Trainer']['Pre-Train']['Gradient Clipping'],
        test_hyperparameters['Trainer']['Pre-Train']['Adam Betas'],
        test_hyperparameters['Trainer']['Pre-Train']['Adam Epsilon'],
        test_hyperparameters['Trainer']['Pre-Train']['Adam Weight Decay'],
        test_hyperparameters['Trainer']['Pre-Train']['Layer-Wise Decay'],
        test_hyperparameters['Adaptive Loss Weights']['Asymmetry'],
        device
    )
    
    # Train the model and get the validation loss components
    _, _, validate_loss_components_history = pre_trainer.train()
    
    # Get the final validation losses for each component
    final_validation_losses = validate_loss_components_history[-1]

    print({
        'Gradient Clipping': gradient_clip,
        'Peak Learning Rate': peak_lr,
        'MSE Loss': final_validation_losses[0],
        'Calendar Arbitrage Loss': final_validation_losses[1],
        'Butterfly Arbitrage Loss': final_validation_losses[2]
    })
    
    # Append results to the list
    results.append({
        'Gradient Clipping': gradient_clip,
        'Peak Learning Rate': peak_lr,
        'MSE Loss': final_validation_losses[0],
        'Calendar Arbitrage Loss': final_validation_losses[1],
        'Butterfly Arbitrage Loss': final_validation_losses[2]
    })

# Convert the results into a DataFrame
results_df = pd.DataFrame(results)

# Create a multi-index DataFrame with gradient clipping and peak learning rate as the index
results_df.set_index(['Gradient Clipping', 'Peak Learning Rate'], inplace=True)

# Rank the columns, where the lowest loss is rank 1
ranked_df = results_df.rank(axis=0, method='min', ascending=True)

# Create an average rank column and sort by it
ranked_df['Average Rank'] = ranked_df.mean(axis=1)
ranked_df.sort_values(by='Average Rank', ascending=True, inplace=True)

ranked_df


  5%|▌         | 1/20 [03:27<1:05:48, 207.80s/it]

{'Gradient Clipping': 0.1, 'Peak Learning Rate': 1e-05, 'MSE Loss': 0.007978086, 'Calendar Arbitrage Loss': 0.0020727417, 'Butterfly Arbitrage Loss': 0.8096614}


 10%|█         | 2/20 [06:51<1:01:39, 205.51s/it]

{'Gradient Clipping': 0.1, 'Peak Learning Rate': 0.0001, 'MSE Loss': 0.0074118595, 'Calendar Arbitrage Loss': 0.0064550326, 'Butterfly Arbitrage Loss': 4.9240584}


 15%|█▌        | 3/20 [10:18<58:26, 206.27s/it]  

{'Gradient Clipping': 0.1, 'Peak Learning Rate': 0.001, 'MSE Loss': 0.01865702, 'Calendar Arbitrage Loss': 0.0002020784, 'Butterfly Arbitrage Loss': 0.017323326}


 20%|██        | 4/20 [13:44<54:58, 206.19s/it]

{'Gradient Clipping': 0.1, 'Peak Learning Rate': 0.01, 'MSE Loss': 0.0042393375, 'Calendar Arbitrage Loss': 2.2148943e-06, 'Butterfly Arbitrage Loss': 0.0}


 25%|██▌       | 5/20 [17:08<51:18, 205.26s/it]

{'Gradient Clipping': 1, 'Peak Learning Rate': 1e-05, 'MSE Loss': 0.0075784232, 'Calendar Arbitrage Loss': 0.0011701168, 'Butterfly Arbitrage Loss': 1.2276402}


 30%|███       | 6/20 [20:32<47:48, 204.86s/it]

{'Gradient Clipping': 1, 'Peak Learning Rate': 0.0001, 'MSE Loss': 0.010340053, 'Calendar Arbitrage Loss': 0.0021425385, 'Butterfly Arbitrage Loss': 0.6851499}


 35%|███▌      | 7/20 [24:02<44:46, 206.63s/it]

{'Gradient Clipping': 1, 'Peak Learning Rate': 0.001, 'MSE Loss': 0.004120839, 'Calendar Arbitrage Loss': 0.0002127424, 'Butterfly Arbitrage Loss': 0.06854783}


 40%|████      | 8/20 [27:28<41:14, 206.23s/it]

{'Gradient Clipping': 1, 'Peak Learning Rate': 0.01, 'MSE Loss': 0.0042548305, 'Calendar Arbitrage Loss': 1.9511893e-07, 'Butterfly Arbitrage Loss': 0.0}


 45%|████▌     | 9/20 [30:52<37:40, 205.47s/it]

{'Gradient Clipping': 10, 'Peak Learning Rate': 1e-05, 'MSE Loss': 0.008406518, 'Calendar Arbitrage Loss': 0.0050522583, 'Butterfly Arbitrage Loss': 0.4986096}


 50%|█████     | 10/20 [34:07<33:42, 202.27s/it]

{'Gradient Clipping': 10, 'Peak Learning Rate': 0.0001, 'MSE Loss': 0.012325285, 'Calendar Arbitrage Loss': 0.007740541, 'Butterfly Arbitrage Loss': 0.29566038}


 55%|█████▌    | 11/20 [37:34<30:35, 203.95s/it]

{'Gradient Clipping': 10, 'Peak Learning Rate': 0.001, 'MSE Loss': 0.007920148, 'Calendar Arbitrage Loss': 0.0062787095, 'Butterfly Arbitrage Loss': 0.15583532}


 60%|██████    | 12/20 [41:04<27:26, 205.75s/it]

{'Gradient Clipping': 10, 'Peak Learning Rate': 0.01, 'MSE Loss': 0.0044159107, 'Calendar Arbitrage Loss': 3.4739884e-07, 'Butterfly Arbitrage Loss': 0.0}


 65%|██████▌   | 13/20 [44:30<24:00, 205.74s/it]

{'Gradient Clipping': 100, 'Peak Learning Rate': 1e-05, 'MSE Loss': 0.0069962144, 'Calendar Arbitrage Loss': 0.0023423047, 'Butterfly Arbitrage Loss': 0.50326407}


 70%|███████   | 14/20 [47:56<20:34, 205.83s/it]

{'Gradient Clipping': 100, 'Peak Learning Rate': 0.0001, 'MSE Loss': 0.00970708, 'Calendar Arbitrage Loss': 0.0030775196, 'Butterfly Arbitrage Loss': 0.5944328}


 75%|███████▌  | 15/20 [51:22<17:09, 205.89s/it]

{'Gradient Clipping': 100, 'Peak Learning Rate': 0.001, 'MSE Loss': 0.015611563, 'Calendar Arbitrage Loss': 0.0023918725, 'Butterfly Arbitrage Loss': 0.14590287}


 80%|████████  | 16/20 [54:50<13:45, 206.40s/it]

{'Gradient Clipping': 100, 'Peak Learning Rate': 0.01, 'MSE Loss': 0.006873709, 'Calendar Arbitrage Loss': 7.790407e-07, 'Butterfly Arbitrage Loss': 0.0}


 85%|████████▌ | 17/20 [58:12<10:15, 205.10s/it]

{'Gradient Clipping': 1000, 'Peak Learning Rate': 1e-05, 'MSE Loss': 0.008306649, 'Calendar Arbitrage Loss': 0.012935673, 'Butterfly Arbitrage Loss': 2.2684705}


 90%|█████████ | 18/20 [1:01:38<06:50, 205.29s/it]

{'Gradient Clipping': 1000, 'Peak Learning Rate': 0.0001, 'MSE Loss': 0.009268843, 'Calendar Arbitrage Loss': 0.003214235, 'Butterfly Arbitrage Loss': 0.37632728}


 95%|█████████▌| 19/20 [1:04:55<03:22, 202.97s/it]

{'Gradient Clipping': 1000, 'Peak Learning Rate': 0.001, 'MSE Loss': 0.022455947, 'Calendar Arbitrage Loss': 0.0062397327, 'Butterfly Arbitrage Loss': 1.03869415e-05}


100%|██████████| 20/20 [1:08:14<00:00, 204.72s/it]

{'Gradient Clipping': 1000, 'Peak Learning Rate': 0.01, 'MSE Loss': 0.009691524, 'Calendar Arbitrage Loss': 0.00049700326, 'Butterfly Arbitrage Loss': 0.00062475837}





Unnamed: 0_level_0,Unnamed: 1_level_0,MSE Loss,Calendar Arbitrage Loss,Butterfly Arbitrage Loss,Average Rank
Gradient Clipping,Peak Learning Rate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,0.01,3.0,1.0,1.0,1.666667
0.1,0.01,2.0,4.0,1.0,2.333333
10.0,0.01,4.0,2.0,1.0,2.333333
100.0,0.01,5.0,3.0,1.0,3.0
1.0,0.001,1.0,6.0,8.0,5.0
1000.0,0.01,14.0,7.0,6.0,9.0
0.1,0.001,19.0,5.0,7.0,10.333333
100.0,1e-05,6.0,11.0,14.0,10.333333
1.0,1e-05,8.0,8.0,18.0,11.333333
10.0,0.001,9.0,17.0,10.0,12.0


In [6]:
results_df

Unnamed: 0_level_0,Unnamed: 1_level_0,MSE Loss,Calendar Arbitrage Loss,Butterfly Arbitrage Loss
Gradient Clipping,Peak Learning Rate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.1,1e-05,0.007978,0.002072742,0.809661
0.1,0.0001,0.007412,0.006455033,4.924058
0.1,0.001,0.018657,0.0002020784,0.017323
0.1,0.01,0.004239,2.214894e-06,0.0
1.0,1e-05,0.007578,0.001170117,1.22764
1.0,0.0001,0.01034,0.002142539,0.68515
1.0,0.001,0.004121,0.0002127424,0.068548
1.0,0.01,0.004255,1.951189e-07,0.0
10.0,1e-05,0.008407,0.005052258,0.49861
10.0,0.0001,0.012325,0.007740541,0.29566


In [None]:
import torch
import copy
import warnings
import pandas as pd
from itertools import product
from tqdm import tqdm

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the grid of hyperparameters
embedding_dims = [4, 8, 16, 32]
num_blocks = [1, 2, 4, 8]
peak_learning_rates = [1e-4, 1e-3, 1e-2]

# Create a list to store results
results = []

# Generate all combinations of embedding dimensions, number of blocks, and peak learning rates
combinations = list(product(embedding_dims, num_blocks, peak_learning_rates))

# Iterate over each combination of hyperparameters
for embedding_dim, blocks, peak_lr in tqdm(combinations, total=len(combinations)):
    # Deep copy the default hyperparameters to test specific combinations
    test_hyperparameters = copy.deepcopy(hyperparameters)
    
    # Set the specific hyperparameters
    test_hyperparameters['Surface Embedding']['Embedding Dimension'] = embedding_dim
    test_hyperparameters['Surface Encoding']['Number of Blocks'] = blocks
    test_hyperparameters['Surface Encoding']['FFN Hidden Dimension'] = 4 * embedding_dim
    test_hyperparameters['Trainer']['Pre-Train']['Peak Learning Rate'] = peak_lr
    
    # Initialize the model with the test hyperparameters
    torch.manual_seed(RANDOM_STATE)
    model_pre_train = IvySPT(
        test_hyperparameters['Surface Embedding']['Embedding Dimension'], 
        test_hyperparameters['Surface Encoding']['Number of Blocks'],
        test_hyperparameters['Surface Encoding']['Number of Heads'], 
        test_hyperparameters['Surface Encoding']['FFN Hidden Dimension'],
        test_hyperparameters['Surface Encoding']['Attention Dropout'], 
        test_hyperparameters['Surface Encoding']['Gate Dropout'],
        test_hyperparameters['Surface Encoding']['FFN Dropout'],
        test_hyperparameters['Surface Encoding']['External Feature Dimension'],
        test_hyperparameters['Surface Encoding']['Weight Initializer Std.'],
        test_hyperparameters['Surface Encoding']['Linear Bias Initializer'],
        test_hyperparameters['Surface Encoding']['Gate Bias Inititalizer']
    )
    
    # Initialize the trainer with the test hyperparameters
    warnings.filterwarnings("ignore", category=UserWarning)        
    pre_trainer = Trainer(
        model_pre_train,
        pre_train_data_loader_train,
        pre_train_data_loader_validation,
        pre_train_data_loader_test,
        test_hyperparameters['Trainer']['Pre-Train']['Number of Epochs'],
        test_hyperparameters['Trainer']['Pre-Train']['Warmup Ratio'],
        test_hyperparameters['Trainer']['Pre-Train']['Peak Learning Rate'],
        test_hyperparameters['Trainer']['Pre-Train']['Minimal Learning Rate'],
        test_hyperparameters['Trainer']['Pre-Train']['Gradient Clipping'],
        test_hyperparameters['Trainer']['Pre-Train']['Adam Betas'],
        test_hyperparameters['Trainer']['Pre-Train']['Adam Epsilon'],
        test_hyperparameters['Trainer']['Pre-Train']['Adam Weight Decay'],
        test_hyperparameters['Trainer']['Pre-Train']['Layer-Wise Decay'],
        test_hyperparameters['Adaptive Loss Weights']['Asymmetry'],
        device
    )
    
    # Train the model and get the validation loss components
    _, _, validate_loss_components_history = pre_trainer.train()
    
    # Get the final validation losses for each component
    final_validation_losses = validate_loss_components_history[-1]
    
    # Append results to the list
    results.append({
        'Embedding Dimension': embedding_dim,
        'Number of Blocks': blocks,
        'Peak Learning Rate': peak_lr,
        'MSE Loss': final_validation_losses[0],
        'Calendar Arbitrage Loss': final_validation_losses[1],
        'Butterfly Arbitrage Loss': final_validation_losses[2]
    })

# Convert the results into a DataFrame
results_df = pd.DataFrame(results)

# Create a multi-index DataFrame with embedding dimension, number of blocks, and peak learning rate as the index
results_df.set_index(['Embedding Dimension', 'Number of Blocks', 'Peak Learning Rate'], inplace=True)

# Rank the columns, where the lowest loss is rank 1
ranked_df = results_df.rank(axis=0, method='min', ascending=True)

# Create an average rank column and sort by it
ranked_df['Average Rank'] = ranked_df.mean(axis=1)
ranked_df.sort_values(by='Average Rank', ascending=True, inplace=True)

ranked_df