In [1]:
import numpy as np
import pandas as pd
import random
import torch

In [2]:
# Set the random seed for reproducibility
RANDOM_STATE = 0
N_JOBS = 8
torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

In [3]:
HYPERPARAMETERS = {
    'Input Preprocessing' : {
        'Mask Proportions' : [0.1, 0.2, 0.4, 0.8],
        'Batch Size' : 4
    },
    'Input Embedding' : {
        'Surface Embedding' : {
            'Grid Dimension' : 3,
            'Channels Dimension' : 8,
        },
        'Pre-Encoder' : {
            'Branch Channels Dimension' : 4,
            'Number of Blocks' : 2,
        }
    },
    'Surface Encoding' : {
        'Encoder' : {
            'Number of Heads' : 4,
            'Hidden Dimension' : 16,
            'Dropout' : 0.1,
            'Number of Blocks' : 2,
            'External Feature Dimension' : 3,
        }
    },
    'Query Embedding' : {
        'Pre-Decoder' : {
            'Hidden Dimension' : 16,
            'Dropout' : 0.1,
            'Number of Blocks' : 2,
        }
    },
    'Surface Decoding' : {
        'Decoder' : {
            'Number of Heads' : 4,
            'Hidden Dimension' : 16,
            'Dropout' : 0.1,
            'Number of Blocks' : 2,
        }
    },
    'No-Arbitrage' : {
        'Butterfly' : 1,
        'Calendar' : 1,
    }
}

## Dataset

In [4]:
aapl_googl_data = pd.read_csv('volatility_surface_AAPL_GOOGL_2013_01_2013_06.csv', parse_dates=True, index_col=[0, 1], date_format="ISO8601")
aapl_googl_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Log Moneyness,Time to Maturity,Implied Volatility,Market Return,Market Volatility,Treasury Rate
Datetime,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-02,AAPL,-0.316688,0.007937,0.3726,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.316688,0.007937,0.6095,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.304266,0.007937,0.3726,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.304266,0.007937,0.6095,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.291996,0.007937,0.3726,0.025086,14.680000,0.055
...,...,...,...,...,...,...,...
2013-06-28,GOOGL,0.427518,2.253968,0.2430,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.434898,2.253968,0.2383,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.434898,2.253968,0.2426,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.442224,2.253968,0.2402,-0.004299,16.860001,0.030


In [5]:
import gc
from joblib_progress import joblib_progress
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from joblib import Parallel, delayed

def implied_volatility_surface_datasets(
    options_market_data, 
    proportions, 
    n_jobs=1,
    random_state=0,
    n_chunks=1
):
    def mask_surface(
        date, 
        symbol, 
        surface, 
        rng
    ):
        def mask_surface_with_proportion(
            surface_data, 
            proportion, 
        ):
            n_clusters = int(np.ceil(1 / proportion))
            points_coordinates = surface_data['points_coordinates']
            points_volatilities = surface_data['points_volatilities']

            # Create the clustering pipeline
            pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('kmeans', KMeans(n_clusters=n_clusters, random_state=random_state, n_init='auto'))
            ])
            
            # Fit the pipeline to the data points
            labels = pipeline.fit_predict(points_coordinates)
            
            single_surface_datasets = []
            for cluster in range(n_clusters):
                cluster_indices = np.where(labels == cluster)[0]
                num_to_mask = int(np.ceil(len(cluster_indices) * proportion))
                masked_indices = rng.choice(cluster_indices, size=num_to_mask, replace=False)
                
                for idx in masked_indices:
                    unmasked_indices = np.setdiff1d(cluster_indices, masked_indices)

                    single_surface_datasets.append({
                        'Datetime': surface_data['datetime'],
                        'Symbol': surface_data['symbol'],
                        'Market Features': surface_data['market_features'],
                        'Input Surface': {
                            'Log Moneyness': points_coordinates[unmasked_indices, 0],
                            'Time to Maturity': points_coordinates[unmasked_indices, 1],
                            'Implied Volatility': points_volatilities[unmasked_indices]
                        },
                        'Query Point': {
                            'Log Moneyness': points_coordinates[idx, 0],
                            'Time to Maturity': points_coordinates[idx, 1]
                        },
                        'Target Volatility': points_volatilities[idx]
                    })

            return single_surface_datasets
        
        surface_data = {
            'datetime': date,
            'symbol': symbol,
            'points_coordinates': surface[['Log Moneyness', 'Time to Maturity']].values,
            'points_volatilities': surface['Implied Volatility'].values,
            'market_features': {
                'Market Return': surface['Market Return'].values[0],
                'Market Volatility': surface['Market Volatility'].values[0],
                'Treasury Rate': surface['Treasury Rate'].values[0]
            }
        }
        
        datasets = []
        for proportion in proportions:
            datasets.extend(mask_surface_with_proportion(surface_data, proportion))

        return datasets

    rng = np.random.default_rng(random_state)
    all_surfaces = list(options_market_data.groupby(level=['Datetime', 'Symbol']))
    n_surfaces = len(all_surfaces)
    
    # Split the array into 'n_chunks' chunks
    chunks = np.array_split(range(n_surfaces), n_chunks)
    # Initialize the list to hold all results
    surface_datasets = []
    # Process each chunk sequentially
    with joblib_progress("Surfaces...", total=n_surfaces): 
        for chunk in chunks:
            # Process the current chunk in parallel
            output = Parallel(n_jobs=n_jobs)(
                delayed(mask_surface)(date, symbol, surface, rng)
                for (date, symbol), surface in [all_surfaces[i] for i in chunk]
            )
            # Extend the overall results with the current chunk's results
            surface_datasets.extend(output)
            gc.collect()  

    # Flatten the list of lists into a single list of datasets
    return [item for sublist in surface_datasets for item in sublist]

aapl_googl_dataset = implied_volatility_surface_datasets(
    aapl_googl_data,
    HYPERPARAMETERS['Input Preprocessing']['Mask Proportions'],
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
    n_chunks=4
)

Output()

KeyboardInterrupt: 

In [None]:
# import pickle

# with open('aapl_googl_dataset.pickle', 'wb') as handle:
#     pickle.dump(aapl_googl_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('aapl_googl_dataset.pickle', 'rb') as handle:
#     aapl_googl_dataset_ = pickle.load(handle)


In [None]:
len(aapl_googl_dataset)

863511

In [None]:
aapl_googl_dataset[0]

{'Datetime': Timestamp('2013-01-02 00:00:00'),
 'Symbol': 'AAPL',
 'Market Features': {'Market Return': 0.0250861159586972,
  'Market Volatility': 14.68000030517578,
  'Treasury Rate': 0.0549999997019767},
 'Input Surface': {'Log Moneyness': array([-0.74747141, -0.72842322, -0.72842322, -0.70973108, -0.69138194,
         -0.69138194, -0.67336344, -0.67336344, -0.63827212, -0.63827212,
         -0.62117768, -0.62117768, -0.60437057, -0.60437057, -0.58784126,
         -0.58784126, -0.57158074, -0.5555804 , -0.5555804 , -0.53983205,
         -0.53983205, -0.52432786, -0.52432786, -0.50906039, -0.50906039,
         -0.49402251, -0.49402251, -0.47920742, -0.47920742, -0.46460862,
         -0.46460862, -0.45021989, -0.45021989, -0.43603525, -0.43603525,
         -0.42204901, -0.42204901, -0.40825569, -0.40825569, -0.39465004,
         -0.39465004, -0.74747141, -0.74747141, -0.72842322, -0.70973108,
         -0.70973108, -0.69138194, -0.69138194, -0.67336344, -0.67336344,
         -0.65566386

In [None]:
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from torch.utils.data import Dataset

class IVSurfaceDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        data_point = self.data[idx]

        # Convert each component of the data point into tensors as appropriate
        return {
            'Datetime': data_point['Datetime'],
            'Symbol': data_point['Symbol'],
            'Market Features': {
                'Market Return': torch.tensor(data_point['Market Features']['Market Return'], dtype=torch.float32),
                'Market Volatility': torch.tensor(data_point['Market Features']['Market Volatility'], dtype=torch.float32),
                'Treasury Rate': torch.tensor(data_point['Market Features']['Treasury Rate'], dtype=torch.float32),
            },
            'Input Surface': {
                'Log Moneyness': torch.tensor(data_point['Input Surface']['Log Moneyness'], dtype=torch.float32),
                'Time to Maturity': torch.tensor(data_point['Input Surface']['Time to Maturity'], dtype=torch.float32),
                'Implied Volatility': torch.tensor(data_point['Input Surface']['Implied Volatility'], dtype=torch.float32),
            },
            'Query Point': {
                'Log Moneyness': torch.tensor(data_point['Query Point']['Log Moneyness'], dtype=torch.float32),
                'Time to Maturity': torch.tensor(data_point['Query Point']['Time to Maturity'], dtype=torch.float32),
            },
            'Target Volatility': torch.tensor(data_point['Target Volatility'], dtype=torch.float32),
        }

    def collate_fn(batch):
        # Organize batch data by structuring as a dictionary with batched components
        batched_data = {
            'Datetime': [item['Datetime'] for item in batch],
            'Symbol': [item['Symbol'] for item in batch],
            'Market Features': {
                'Market Return': default_collate([item['Market Features']['Market Return'] for item in batch]),
                'Market Volatility': default_collate([item['Market Features']['Market Volatility'] for item in batch]),
                'Treasury Rate': default_collate([item['Market Features']['Treasury Rate'] for item in batch]),
            },
            'Input Surface': {
                'Log Moneyness': [item['Input Surface']['Log Moneyness'] for item in batch],
                'Time to Maturity': [item['Input Surface']['Time to Maturity'] for item in batch],
                'Implied Volatility': [item['Input Surface']['Implied Volatility'] for item in batch],
            },
            'Query Point': {
                'Log Moneyness': default_collate([item['Query Point']['Log Moneyness'] for item in batch]),
                'Time to Maturity': default_collate([item['Query Point']['Time to Maturity'] for item in batch]),
            },
            'Target Volatility': default_collate([item['Target Volatility'] for item in batch]),
        }

        # Set requires_grad=True for query point values
        batched_data['Query Point']['Log Moneyness'].requires_grad_()
        batched_data['Query Point']['Time to Maturity'].requires_grad_()

        return batched_data



aapl_googl_data_loader = DataLoader(
    IVSurfaceDataset(aapl_googl_dataset), 
    batch_size=HYPERPARAMETERS['Input Preprocessing']['Batch Size'], 
    shuffle=True, 
    num_workers=0, 
    collate_fn=IVSurfaceDataset.collate_fn
)

# Fetch one batch from the DataLoader
batch = next(iter(aapl_googl_data_loader))
batch

{'Datetime': [Timestamp('2013-03-14 00:00:00'),
  Timestamp('2013-01-31 00:00:00'),
  Timestamp('2013-02-05 00:00:00'),
  Timestamp('2013-05-20 00:00:00')],
 'Symbol': ['GOOGL', 'AAPL', 'AAPL', 'AAPL'],
 'Market Features': {'Market Return': tensor([ 0.0056, -0.0026,  0.0104, -0.0007]),
  'Market Volatility': tensor([11.3000, 14.2800, 13.7200, 13.0200]),
  'Treasury Rate': tensor([0.0900, 0.0650, 0.0700, 0.0350])},
 'Input Surface': {'Log Moneyness': [tensor([-0.9121, -0.7450, -0.7197, -0.6950, -0.6829, -0.6591, -0.6358, -0.5800,
           -0.5479, -0.4670, -0.4670, -0.4383, -0.4013, -0.3922, -0.3655, -0.3482,
           -0.3396, -0.3311, -0.3226, -0.2977, -0.2342, -0.2039, -0.1745, -0.1250,
           -0.1113, -0.1045, -0.0911, -0.0779, -0.0648, -0.0455, -0.0203, -0.0141,
           -0.0080, -0.0080,  0.0163,  0.0573,  0.0744,  0.0856,  0.1023,  0.1240,
            0.1294,  0.1400,  0.1505,  0.1609,  0.1815,  0.2261,  0.2642,  0.3009,
            0.3621, -0.1601, -0.1601, -0.1459, -0.

## Surface Embedding

### Input Embedding

#### Components

In [None]:
import torch
import torch.nn as nn

class SurfaceBatchNorm(nn.Module):
    def __init__(self, num_features=1, momentum=0.1):
        super(SurfaceBatchNorm, self).__init__()
        self.log_moneyness_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.time_to_maturity_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.implied_volatility_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.market_return_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.market_volatility_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.treasury_rate_bn = nn.BatchNorm1d(num_features, momentum=momentum)

    def forward(self, batch):
        # Concatenate all tensors from the Input Surface into one tensor for each feature
        input_surface_log_moneyness = torch.cat([x for x in batch['Input Surface']['Log Moneyness']])
        input_surface_time_to_maturity = torch.cat([x for x in batch['Input Surface']['Time to Maturity']])
        input_surface_implied_volatility = torch.cat([x for x in batch['Input Surface']['Implied Volatility']])

        # Concatenate Input Surface tensors with Query Point tensors
        total_log_moneyness = torch.cat([input_surface_log_moneyness, batch['Query Point']['Log Moneyness']])
        total_time_to_maturity = torch.cat([input_surface_time_to_maturity, batch['Query Point']['Time to Maturity']])

        # Normalize Log Moneyness and Time to Maturity
        norm_log_moneyness = self.log_moneyness_bn(total_log_moneyness.unsqueeze(1)).squeeze(1)
        norm_time_to_maturity = self.time_to_maturity_bn(total_time_to_maturity.unsqueeze(1)).squeeze(1)

        # Normalize Implied Volatility (only from Input Surface)
        norm_implied_volatility = self.implied_volatility_bn(input_surface_implied_volatility.unsqueeze(1)).squeeze(1)

        # Split the normalized results back to corresponding structures
        input_surface_sizes = [len(x) for x in batch['Input Surface']['Log Moneyness']]
        total_input_size = sum(input_surface_sizes)

        # Normalizing Market Features
        market_features = batch['Market Features']
        norm_market_return = self.market_return_bn(market_features['Market Return'].unsqueeze(1)).squeeze(1)
        norm_market_volatility = self.market_volatility_bn(market_features['Market Volatility'].unsqueeze(1)).squeeze(1)
        norm_treasury_rate = self.treasury_rate_bn(market_features['Treasury Rate'].unsqueeze(1)).squeeze(1)

        # Reconstructing the batch with normalized data
        output = {
            'Datetime': batch['Datetime'],
            'Symbol': batch['Symbol'],
            'Market Features': {
                'Market Return': norm_market_return,
                'Market Volatility': norm_market_volatility,
                'Treasury Rate': norm_treasury_rate
            },
            'Input Surface': {
                'Log Moneyness': list(torch.split(norm_log_moneyness[:total_input_size], input_surface_sizes)),
                'Time to Maturity': list(torch.split(norm_time_to_maturity[:total_input_size], input_surface_sizes)),
                'Implied Volatility': list(torch.split(norm_implied_volatility, input_surface_sizes))
            },
            'Query Point': {
                'Log Moneyness': norm_log_moneyness[total_input_size:],
                'Time to Maturity': norm_time_to_maturity[total_input_size:]
            },
            'Target Volatility': batch['Target Volatility']
        }

        return output

# Usage
surfacebatchnorm = SurfaceBatchNorm()
processed_batch = surfacebatchnorm(batch)
processed_batch

{'Datetime': [Timestamp('2013-03-14 00:00:00'),
  Timestamp('2013-01-31 00:00:00'),
  Timestamp('2013-02-05 00:00:00'),
  Timestamp('2013-05-20 00:00:00')],
 'Symbol': ['GOOGL', 'AAPL', 'AAPL', 'AAPL'],
 'Market Features': {'Market Return': tensor([ 0.4010, -0.9509,  1.1927, -0.6427], grad_fn=<SqueezeBackward1>),
  'Market Volatility': tensor([-1.5886,  1.0710,  0.5712, -0.0535], grad_fn=<SqueezeBackward1>),
  'Treasury Rate': tensor([ 1.2539e+00, -1.0051e-07,  2.5078e-01, -1.5047e+00],
         grad_fn=<SqueezeBackward1>)},
 'Input Surface': {'Log Moneyness': [tensor([-2.2581e+00, -1.8348e+00, -1.7707e+00, -1.7081e+00, -1.6774e+00,
           -1.6171e+00, -1.5582e+00, -1.4166e+00, -1.3353e+00, -1.1304e+00,
           -1.1304e+00, -1.0576e+00, -9.6379e-01, -9.4085e-01, -8.7328e-01,
           -8.2921e-01, -8.0746e-01, -7.8590e-01, -7.6452e-01, -7.0143e-01,
           -5.4050e-01, -4.6371e-01, -3.8918e-01, -2.6380e-01, -2.2909e-01,
           -2.1191e-01, -1.7790e-01, -1.4434e-01, -1.11

In [None]:
import torch
import torch.nn as nn
import numpy as np

# class ParametricContinuousKernel(nn.Module):
#     def __init__(self, input_dim, hidden_dim, hidden_layers, output_dim=1, dropout_prob=0.1):
#         super(ParametricContinuousKernel, self).__init__()
#         layers = []
#         current_dim = input_dim
#         for _ in range(hidden_layers):
#             layers.append(nn.Linear(current_dim, hidden_dim))
#             layers.append(nn.GELU())
#             layers.append(nn.Dropout(dropout_prob))
#             current_dim = hidden_dim
#         layers.append(nn.Linear(hidden_dim, output_dim))
#         self.net = nn.Sequential(*layers)

#     def forward(self, x):
#         return self.net(x)

class EllipticalRBFKernel(nn.Module):
    def __init__(self, input_dim):
        super(EllipticalRBFKernel, self).__init__()
        # Initialize the bandwidth parameters for each dimension
        # We use log-space parameterization for stability in optimization (exp to ensure positivity)
        self.log_bandwidth = nn.Parameter(torch.zeros(input_dim))  # Initialized to exp(0) = 1

    def forward(self, distances):
        # Scale the distances by the bandwidths
        # torch.exp(self.log_bandwidth) converts log bandwidth back to the standard scale
        scaled_distances = distances / torch.exp(self.log_bandwidth)

        # Compute the RBF kernel output using the scaled distances
        # The RBF kernel formula exp(-0.5 * (scaled distance)^2)
        kernel_values = torch.exp(-0.5 * torch.sum(scaled_distances ** 2, dim=-1))

        return kernel_values

class SurfaceContinuousKernelEmbedding(nn.Module):
    def __init__(self, grid_dim):
        super(SurfaceContinuousKernelEmbedding, self).__init__()
        self.grid_dim = grid_dim
        self.kernel = EllipticalRBFKernel(input_dim=2)
        self.layer_norm = nn.LayerNorm([self.grid_dim, self.grid_dim])  # Normalizing across each image's dimensions

        # Create a regular grid in (0, 1)x(0, 1), excluding 0 and 1
        grid_points = torch.linspace(1 / (grid_dim + 1), 1 - 1 / (grid_dim + 1), grid_dim)
        mesh_x, mesh_y = torch.meshgrid(grid_points, grid_points, indexing='ij')
        self.grid_points = torch.stack([mesh_x.flatten(), mesh_y.flatten()], dim=-1)
        self.grid_points = torch.erfinv(2 * self.grid_points - 1) * np.sqrt(2)  # inverse CDF of normal

    def forward(self, input_surface_batch):
        batch_size = len(input_surface_batch['Log Moneyness'])
        batch_embedded_surfaces = []

        for i in range(batch_size):
            # Extract the coordinates and implied volatilities for each surface in the batch
            surface_coords = torch.stack([
                input_surface_batch['Log Moneyness'][i], 
                input_surface_batch['Time to Maturity'][i]
            ], dim=-1)
            surface_ivs = input_surface_batch['Implied Volatility'][i]

            # Initialize the output grid for the current surface
            embedded_surface = torch.zeros((self.grid_dim, self.grid_dim), dtype=torch.float32, device=surface_coords.device)

            # Compute the convolution for each point on the output grid
            for idx, grid_point in enumerate(self.grid_points):
                # Calculate the distance from each input point to the current grid point
                point_differences = surface_coords - grid_point

                # Apply the parametric kernel to these differences
                kernel_outputs = self.kernel(point_differences)

                # Compute the weighted sum of IVs based on the kernel outputs
                embedded_surface[idx // self.grid_dim, idx % self.grid_dim] = (kernel_outputs * surface_ivs).sum()

            # Normalize the embedded surface
            embedded_surface = self.layer_norm(embedded_surface)
            # Append the encoded surface for this input surface to the batch list
            batch_embedded_surfaces.append(embedded_surface)

        # Stack all encoded surfaces to form a batch tensor
        return torch.stack(batch_embedded_surfaces)


# Example of initializing and using this module
grid_dim = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Grid Dimension']
# kernel_hidden_dim = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Kernel Hidden Layer Dimension']
# kernel_hidden_layers = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Kernel Hidden Layer Count']
# kernel_dropout_prob = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Kernel Dropout Probability']

continuous_kernel_embedding = SurfaceContinuousKernelEmbedding(grid_dim=grid_dim)
continuous_kernel_embedding_batch = continuous_kernel_embedding(processed_batch['Input Surface'])
continuous_kernel_embedding_batch

tensor([[[ 0.6845,  0.7368,  1.5961],
         [-1.2593, -0.9681,  0.5388],
         [-1.1992, -0.8333,  0.7038]],

        [[ 1.1950,  0.7473,  0.2984],
         [ 0.8943, -0.0236, -0.9674],
         [ 0.6300, -0.7003, -2.0739]],

        [[ 0.0789, -0.4775, -1.1493],
         [ 1.0084, -0.0040, -1.1368],
         [ 1.9723,  0.6296, -0.9216]],

        [[ 2.1026,  0.7422, -0.5977],
         [ 0.8996, -0.2620, -1.1257],
         [-0.0421, -0.7001, -1.0169]]], grad_fn=<StackBackward0>)

In [None]:
import torch
import torch.nn as nn

class SurfaceProjectionEmbedding(nn.Module):
    def __init__(self, in_channels, d_embedding, grid_dim):
        super(SurfaceProjectionEmbedding, self).__init__()
        # Initialize the 1x1 convolution layer
        self.conv1x1 = nn.Conv2d(in_channels, d_embedding, kernel_size=1)
        # Initialize layer normalization across the channel, height, and width dimensions
        self.layer_norm = nn.LayerNorm([d_embedding, grid_dim, grid_dim])  # Normalizes across (channels, height, width)

    def forward(self, x):
        # Ensure x has dimensions: (batch_size, channels, height, width)
        # Add a channel dimension if necessary
        if x.dim() == 3:  # assuming x has dimensions (batch_size, height, width)
            x = x.unsqueeze(1)  # add channel dimension
        # Apply the 1x1 convolution to project the input to a higher dimensional space
        x = self.conv1x1(x)
        # Normalize the features across each channel, maintaining the spatial dimensions
        x = self.layer_norm(x)
        return x
    
d_embedding = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Channels Dimension']  # Desired number of output channels
torch.manual_seed(RANDOM_STATE)
# Create the module
projection_embedding = SurfaceProjectionEmbedding(1, d_embedding, grid_dim)   
projection_embedding_batch = projection_embedding(continuous_kernel_embedding_batch)
projection_embedding_batch

tensor([[[[ 0.2993,  0.2987,  0.2889],
          [ 0.3216,  0.3182,  0.3010],
          [ 0.3209,  0.3167,  0.2991]],

         [[ 1.4097,  1.4527,  2.1581],
          [-0.1859,  0.0532,  1.2902],
          [-0.1366,  0.1638,  1.4256]],

         [[-0.8816, -0.9475, -2.0297],
          [ 1.5665,  1.1998, -0.6981],
          [ 1.4909,  1.0300, -0.9060]],

         [[-0.6286, -0.6876, -1.6553],
          [ 1.5604,  1.2324, -0.4646],
          [ 1.4927,  1.0806, -0.6505]],

         [[-1.4224, -1.4532, -1.9597],
          [-0.2768, -0.4484, -1.3365],
          [-0.3122, -0.5278, -1.4338]],

         [[-0.2896, -0.2682,  0.0844],
          [-1.0873, -0.9677, -0.3494],
          [-1.0626, -0.9124, -0.2817]],

         [[-0.2086, -0.2102, -0.2362],
          [-0.1497, -0.1585, -0.2042],
          [-0.1515, -0.1626, -0.2092]],

         [[ 1.3301,  1.3936,  2.4362],
          [-1.0283, -0.6749,  1.1533],
          [-0.9554, -0.5114,  1.3536]]],


        [[[ 0.2935,  0.2986,  0.3037],
       

#### Block

In [None]:
class InputEmbedding(nn.Module):
    def __init__(self, grid_dim, d_embedding, momentum=0.1):
        super(InputEmbedding, self).__init__()
        # Initialize all sub-modules
        self.surface_batchnorm = SurfaceBatchNorm(1, momentum)
        self.surface_continuous_kernel_embedding = SurfaceContinuousKernelEmbedding(grid_dim)
        self.surface_projection_embedding = SurfaceProjectionEmbedding(1, d_embedding, grid_dim)

    def forward(self, batch):
        # Process the batch with SurfaceBatchNorm
        processed_batch = self.surface_batchnorm(batch)
        
        # Generate continuous kernel embeddings from the processed 'Input Surface'
        continuous_kernel_embedding_batch = self.surface_continuous_kernel_embedding(processed_batch['Input Surface'])
        
        # Project the embeddings using 1x1 convolution
        projection_embedding_batch = self.surface_projection_embedding(continuous_kernel_embedding_batch)

        # Return both the positionally encoded embeddings and the processed batch
        return projection_embedding_batch, processed_batch

torch.manual_seed(RANDOM_STATE)
grid_dim = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Grid Dimension']
d_embedding = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Channels Dimension']  # Desired number of output channels
input_embedding = InputEmbedding(grid_dim, d_embedding)
projection_embedding_batch, processed_batch = input_embedding(batch)
projection_embedding_batch, processed_batch

(tensor([[[[ 0.2993,  0.2987,  0.2889],
           [ 0.3216,  0.3182,  0.3010],
           [ 0.3209,  0.3167,  0.2991]],
 
          [[ 1.4097,  1.4527,  2.1581],
           [-0.1859,  0.0532,  1.2902],
           [-0.1366,  0.1638,  1.4256]],
 
          [[-0.8816, -0.9475, -2.0297],
           [ 1.5665,  1.1998, -0.6981],
           [ 1.4909,  1.0300, -0.9060]],
 
          [[-0.6286, -0.6876, -1.6553],
           [ 1.5604,  1.2324, -0.4646],
           [ 1.4927,  1.0806, -0.6505]],
 
          [[-1.4224, -1.4532, -1.9597],
           [-0.2768, -0.4484, -1.3365],
           [-0.3122, -0.5278, -1.4338]],
 
          [[-0.2896, -0.2682,  0.0844],
           [-1.0873, -0.9677, -0.3494],
           [-1.0626, -0.9124, -0.2817]],
 
          [[-0.2086, -0.2102, -0.2362],
           [-0.1497, -0.1585, -0.2042],
           [-0.1515, -0.1626, -0.2092]],
 
          [[ 1.3301,  1.3936,  2.4362],
           [-1.0283, -0.6749,  1.1533],
           [-0.9554, -0.5114,  1.3536]]],
 
 
         [[[ 

### Pre-Encoder

#### Block

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PreEncoder(nn.Module):
    def __init__(self, d_embedding, branch_channels, grid_dim):
        super(PreEncoder, self).__init__()
        # Initial channel configuration is common to all branches
        self.branch1 = nn.Sequential(
            nn.Conv2d(d_embedding, branch_channels, kernel_size=1),
            nn.BatchNorm2d(branch_channels),
            nn.GELU()
        )
        
        self.branch2 = nn.Sequential(
            nn.Conv2d(d_embedding, branch_channels, kernel_size=1),
            nn.BatchNorm2d(branch_channels),
            nn.GELU(),
            nn.Conv2d(branch_channels, branch_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(branch_channels),
            nn.GELU()
        )
        
        self.branch3 = nn.Sequential(
            nn.Conv2d(d_embedding, branch_channels, kernel_size=1),
            nn.BatchNorm2d(branch_channels),
            nn.GELU(),
            nn.Conv2d(branch_channels, branch_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(branch_channels),
            nn.GELU(),
            nn.Conv2d(branch_channels, branch_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(branch_channels),
            nn.GELU()
        )
        
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            nn.Conv2d(d_embedding, branch_channels, kernel_size=1),
            nn.BatchNorm2d(branch_channels),
            nn.GELU()
        )
        
        # Reduce the concatenated channels back to the original number of channels
        self.conv_reduce = nn.Conv2d(branch_channels * 4, d_embedding, kernel_size=1)
        self.bn_reduce = nn.BatchNorm2d(d_embedding)
        self.scale = nn.Parameter(torch.tensor(1.0))  # Learnable scale for residual connection
        self.layer_norm = nn.LayerNorm([d_embedding, grid_dim, grid_dim])  # Normalize across (C, H, W)

    def forward(self, x):
        # Apply each branch to the input
        out1 = self.branch1(x)
        out2 = self.branch2(x)
        out3 = self.branch3(x)
        out4 = self.branch4(x)
        
        # Concatenate the outputs from each branch
        concatenated = torch.cat([out1, out2, out3, out4], dim=1)
        
        # Reduce back to the initial number of channels
        reduced = self.conv_reduce(concatenated)
        reduced = self.bn_reduce(reduced)
        
        # Add the residual connection with scale
        residual = x + self.scale * reduced
        residual = F.gelu(residual)  # Apply GELU after adding the residual
        
        # Normalize the output
        output = self.layer_norm(residual)
        
        return output

torch.manual_seed(RANDOM_STATE)
branch_channels = HYPERPARAMETERS['Input Embedding']['Pre-Encoder']['Branch Channels Dimension']
pre_encoder = PreEncoder(d_embedding, branch_channels, grid_dim)
pre_encoded_batch = pre_encoder(projection_embedding_batch)
pre_encoded_batch

tensor([[[[-6.0837e-01,  1.8441e-01,  1.8344e+00],
          [ 9.6723e-01, -5.8042e-01, -4.7301e-02],
          [-7.0156e-01, -6.5279e-01, -7.5022e-01]],

         [[ 6.6131e-01,  2.8394e+00,  1.9287e+00],
          [-7.4843e-01, -7.4947e-01, -6.5527e-01],
          [ 3.4987e-01,  2.6305e-01, -3.6582e-01]],

         [[-7.3371e-01, -7.4444e-01, -5.6426e-01],
          [ 1.8031e+00,  6.6916e-01, -7.3786e-01],
          [ 1.8068e+00, -7.1917e-01, -7.4583e-01]],

         [[-7.2106e-01, -4.9608e-01, -6.5116e-01],
          [ 2.5162e+00,  2.6079e+00, -5.0983e-01],
          [-7.5009e-01, -7.4023e-01, -6.4303e-01]],

         [[-6.3948e-01, -5.6314e-01, -5.5121e-01],
          [-5.7120e-01, -7.5185e-01, -6.3542e-01],
          [ 2.4126e+00,  9.0394e-02, -7.0332e-01]],

         [[-7.5103e-01, -6.4380e-01, -6.7763e-01],
          [-7.5164e-01, -6.3369e-01, -5.7734e-01],
          [ 4.5396e-01,  1.3636e-02, -7.1074e-01]],

         [[-6.9868e-01, -6.6248e-01, -6.6785e-01],
          [ 6.6948e

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class SurfacePositionalEmbedding(nn.Module):
    def __init__(self, grid_dim, d_embedding):
        super(SurfacePositionalEmbedding, self).__init__()
        self.grid_dim = grid_dim
        self.d_embedding = d_embedding
        
        # Create a regular grid in (0, 1)x(0, 1), excluding 0 and 1
        grid_points = torch.linspace(1 / (grid_dim + 1), 1 - 1 / (grid_dim + 1), grid_dim)
        mesh_x, mesh_y = torch.meshgrid(grid_points, grid_points, indexing='ij')
        self.grid_points = torch.stack([mesh_x.flatten(), mesh_y.flatten()], dim=-1)
        self.grid_points = torch.erfinv(2 * self.grid_points - 1) * np.sqrt(2)  # inverse CDF of normal

        # Initialize learnable scaling parameter (the base for positional embedding)
        self.log_scale = nn.Parameter(torch.log(torch.tensor(10000.0)))
        self.factor = nn.Parameter(torch.tensor(1.0))  # Learnable scale for the positional embedding contribution

        # Layer normalization for final output
        self.layer_norm = nn.LayerNorm([d_embedding, grid_dim, grid_dim])  # Normalizes across (channels, height, width)


    def forward(self, x):
        # x is the output from the 1x1 convolution layer with shape (batch_size, d_embedding, grid_dim, grid_dim)
        scale = torch.exp(self.log_scale)
        pos_enc = torch.zeros_like(x)

        # Repeat grid_points to match the batch size and reshape for broadcasting
        batch_grid_points = self.grid_points.repeat(x.shape[0], 1, 1).view(x.shape[0], self.grid_dim*self.grid_dim, 2)
        
        for i in range(self.d_embedding // 4):
            # Calculate positional embeddings for both dimensions
            div_factor = scale ** (4 * i / self.d_embedding)
            pos_enc[:, 4 * i, :, :] = torch.sin(batch_grid_points[:, :, 0].view(x.shape[0], self.grid_dim, self.grid_dim) / div_factor)
            pos_enc[:, 4 * i + 1, :, :] = torch.cos(batch_grid_points[:, :, 0].view(x.shape[0], self.grid_dim, self.grid_dim) / div_factor)
            pos_enc[:, 4 * i + 2, :, :] = torch.sin(batch_grid_points[:, :, 1].view(x.shape[0], self.grid_dim, self.grid_dim) / div_factor)
            pos_enc[:, 4 * i + 3, :, :] = torch.cos(batch_grid_points[:, :, 1].view(x.shape[0], self.grid_dim, self.grid_dim) / div_factor)

        # Apply the learned scale to positional embedding and add to the input
        x = x + self.factor * pos_enc
        # Normalize the final output
        x = self.layer_norm(x) 

        return x

# Create the SurfacePositionalEmbedding module
positional_encoder = SurfacePositionalEmbedding(grid_dim, d_embedding)

# Apply positional embedding
positional_embedded_batch = positional_encoder(pre_encoded_batch)
positional_embedded_batch

tensor([[[[-1.5546, -0.8281,  0.6840],
          [ 0.4616, -0.9567, -0.4681],
          [-0.4954, -0.4507, -0.5400]],

         [[ 0.8970,  2.8930,  2.0585],
          [-0.1942, -0.1952, -0.1088],
          [ 0.6116,  0.5320, -0.0443]],

         [[-1.6694, -1.1070, -0.3696],
          [ 0.6553,  0.1885, -0.5286],
          [ 0.6587, -1.0838, -0.5359]],

         [[-0.3698,  0.0370, -0.3058],
          [ 2.5969,  2.8815, -0.1762],
          [-0.3964, -0.1867, -0.2983]],

         [[-1.0170, -0.9470, -0.9361],
          [-0.9482, -1.1138, -1.0071],
          [ 1.7924, -0.3357, -1.0631]],

         [[-0.1966, -0.0983, -0.1294],
          [-0.1972, -0.0891, -0.0374],
          [ 0.9077,  0.5041, -0.1597]],

         [[-1.0712, -1.0319, -1.0306],
          [ 0.1826,  0.7050, -0.6754],
          [-1.1193,  0.4496, -0.8681]],

         [[ 0.9818,  1.5753,  2.0672],
          [-0.0944,  0.6339,  1.8580],
          [-0.0139, -0.0991,  0.3298]]],


        [[[-0.5479, -1.0512, -1.5600],
       

### Final Block

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SurfaceEmbedding(nn.Module):
    def __init__(self, grid_dim, d_embedding, branch_channels, num_pre_encoder_blocks, momentum=0.1):
        super(SurfaceEmbedding, self).__init__()
        # Initialize the InputEmbedding module
        self.input_embedding = InputEmbedding(grid_dim, d_embedding, momentum)
        
        # Initialize multiple PreEncoder blocks
        self.pre_encoders = nn.ModuleList([
            PreEncoder(d_embedding, branch_channels, grid_dim) for _ in range(num_pre_encoder_blocks)
        ])
        
        # Initialize positional embedding, adjusted to apply before flattening
        self.positional_embedding = SurfacePositionalEmbedding(grid_dim, d_embedding)

    def forward(self, batch):
        # Process batch through InputEmbedding to get initial embeddings and the processed batch
        embedding_batch, processed_batch = self.input_embedding(batch)
        
        # Sequentially pass the output through each PreEncoder block
        pre_encoded_batch = embedding_batch
        for pre_encoder in self.pre_encoders:
            pre_encoded_batch = pre_encoder(pre_encoded_batch)
        
        # Apply positional embedding to the output of the last PreEncoder block
        positional_embedded_batch = self.positional_embedding(pre_encoded_batch)
        
        # Flatten the 2D spatial structure into a sequence of tokens
        batch_size, num_channels, height, width = positional_embedded_batch.shape
        tokenized_positional_embedded_batch = positional_embedded_batch.view(batch_size, num_channels, height * width).transpose(1, 2)
        
        # The final output is now suitable for processing by transformer encoders
        return tokenized_positional_embedded_batch, processed_batch


# Example usage
torch.manual_seed(RANDOM_STATE)
grid_dim = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Grid Dimension']
d_embedding = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Channels Dimension']
branch_channels = HYPERPARAMETERS['Input Embedding']['Pre-Encoder']['Branch Channels Dimension']
num_pre_encoder_blocks = HYPERPARAMETERS['Input Embedding']['Pre-Encoder']['Number of Blocks']

surface_embedding = SurfaceEmbedding(grid_dim, d_embedding, branch_channels, num_pre_encoder_blocks)
tokenized_positional_embedded_batch, processed_batch = surface_embedding(batch)
tokenized_positional_embedded_batch, processed_batch

(tensor([[[-1.2322e+00, -2.5396e-01, -1.4235e+00, -2.5456e-01, -9.4066e-01,
            4.5215e-03, -4.7360e-01,  1.2686e-01],
          [-1.4282e+00,  1.9080e+00, -9.0447e-01, -4.0972e-02, -9.2368e-01,
            1.9135e-01, -7.2363e-01,  3.4015e+00],
          [-1.3488e+00,  1.7327e+00,  2.1020e-01, -2.5321e-01, -2.9440e-01,
            4.5288e-01, -4.4926e-01,  2.9342e+00],
          [-5.6752e-01, -2.8359e-02,  1.0675e+00,  1.8990e+00, -8.9341e-01,
            5.5186e-01, -9.4258e-01, -2.1595e-02],
          [-3.5609e-01,  4.7226e-03, -6.7726e-01, -4.0126e-02, -8.9248e-01,
           -5.6153e-02, -9.1099e-01, -6.2356e-02],
          [-8.9268e-01, -1.9122e-02, -3.8458e-01,  1.9130e-01, -7.3351e-01,
            4.7243e-01, -5.4017e-01,  1.5274e+00],
          [ 1.1287e-01, -2.4079e-01,  1.2245e+00,  3.4947e+00, -9.2896e-01,
           -5.3189e-02, -9.3156e-02,  8.7661e-02],
          [-2.1814e-01, -1.3761e-01,  1.0871e+00,  8.8387e-01, -9.1330e-01,
           -4.6758e-02, -6.8705e-01

## Surface Encoding

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ScaledResidualNorm(nn.Module):
    def __init__(self, d_embedding):
        super(ScaledResidualNorm, self).__init__()
        self.scale = nn.Parameter(torch.tensor(1.0))
        self.norm = nn.LayerNorm(d_embedding)

    def forward(self, x, sublayer_output):
        return self.norm(x + self.scale * sublayer_output)

class FeedForwardNetwork(nn.Module):
    def __init__(self, d_embedding, hidden_dim, dropout):
        super(FeedForwardNetwork, self).__init__()
        self.feedforward = nn.Sequential(
            nn.Linear(d_embedding, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, d_embedding),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.feedforward(x)

class Encoder(nn.Module):
    def __init__(self, d_embedding, n_heads, hidden_dim, dropout, external_dim):
        super(Encoder, self).__init__()
        self.self_attention = nn.MultiheadAttention(embed_dim=d_embedding, num_heads=n_heads, batch_first=True)
        self.scaled_residual_norm1 = ScaledResidualNorm(d_embedding)
        self.feedforward1 = FeedForwardNetwork(d_embedding, hidden_dim, dropout)
        self.scaled_residual_norm2 = ScaledResidualNorm(d_embedding)
        
        self.external_attention = nn.MultiheadAttention(embed_dim=d_embedding, num_heads=n_heads, kdim=external_dim, vdim=external_dim, batch_first=True)
        self.scaled_residual_norm3 = ScaledResidualNorm(d_embedding)
        self.feedforward2 = FeedForwardNetwork(d_embedding, hidden_dim, dropout)
        self.scaled_residual_norm4 = ScaledResidualNorm(d_embedding)

    def forward(self, surface_tokens, external_features):
        # Self-attention block
        self_attn_output, _ = self.self_attention(surface_tokens, surface_tokens, surface_tokens)
        x = self.scaled_residual_norm1(surface_tokens, self_attn_output)
        
        # Feedforward network block 1
        ff_output = self.feedforward1(x)
        x = self.scaled_residual_norm2(x, ff_output)
        
        # External attention block
        ext_attn_output, _ = self.external_attention(x, external_features, external_features)
        x = self.scaled_residual_norm3(x, ext_attn_output)
        
        # Feedforward network block 2
        ff_output = self.feedforward2(x)
        x = self.scaled_residual_norm4(x, ff_output)
        
        return x

class SurfaceEncoding(nn.Module):
    def __init__(self, d_embedding, n_heads, hidden_dim, dropout, external_dim, num_encoder_blocks):
        super(SurfaceEncoding, self).__init__()
        self.encoders = nn.ModuleList([
            Encoder(d_embedding, n_heads, hidden_dim, dropout, external_dim) for _ in range(num_encoder_blocks)
        ])

    def forward(self, tokenized_positional_embedded_batch, processed_batch):
        # Extract market features from processed batch and create external_features tensor
        market_features = processed_batch['Market Features']
        external_features = torch.stack([
            market_features['Market Return'],
            market_features['Market Volatility'],
            market_features['Treasury Rate']
        ], dim=-1).unsqueeze(1)  # (batch, 1, features)
        
        # Pass the tokenized positional embeddings and external features through each encoder block
        x = tokenized_positional_embedded_batch
        for encoder in self.encoders:
            x = encoder(x, external_features)
        
        return x
    
# Example usage
torch.manual_seed(RANDOM_STATE)
n_heads = HYPERPARAMETERS['Surface Encoding']['Encoder']['Number of Heads']
hidden_dim = HYPERPARAMETERS['Surface Encoding']['Encoder']['Hidden Dimension']
dropout = HYPERPARAMETERS['Surface Encoding']['Encoder']['Dropout']
num_encoder_blocks = HYPERPARAMETERS['Surface Encoding']['Encoder']['Number of Blocks']
external_dim = 3  # Assuming 3 market features

surface_encoding = SurfaceEncoding(d_embedding, n_heads, hidden_dim, dropout, external_dim, num_encoder_blocks)
encoded_surface = surface_encoding(tokenized_positional_embedded_batch, processed_batch)    
encoded_surface

tensor([[[ 0.2469, -0.3497, -1.4286,  1.3753, -0.9483,  1.2850, -0.9019,
           0.7213],
         [-0.1632,  0.3022, -1.6315,  0.8646, -0.4627,  0.6740, -1.1551,
           1.5716],
         [-0.3699,  0.1405, -1.2775,  0.3203, -0.4004,  0.9784, -1.2458,
           1.8545],
         [ 0.6897, -0.1253, -0.3942,  1.4151, -1.2410,  1.0470, -1.6302,
           0.2389],
         [ 0.3982, -0.3709, -1.0674,  1.4139, -1.0317,  1.2783, -1.2252,
           0.6049],
         [ 0.6897, -0.4072, -1.0200,  1.2286, -1.2428,  0.6611, -1.1428,
           1.2333],
         [ 1.0847, -0.7412, -0.2555,  1.7976, -1.0018,  0.4697, -1.3238,
          -0.0297],
         [ 0.6992, -0.6551,  0.4165,  1.2582, -1.3620,  0.9885, -1.5638,
           0.2185],
         [ 0.6872, -0.4694, -0.9725,  1.0512, -1.1934,  1.3166, -1.2126,
           0.7928]],

        [[-1.4550,  0.5706, -0.7268, -0.1802,  0.6716,  0.4818, -1.1145,
           1.7525],
         [-1.4392,  0.8674, -0.5820, -0.2491,  0.5831,  0.4650, -1.2

## Query Embedding

### Point Embedding

In [None]:
import torch
import torch.nn as nn
import numpy as np

class PointEmbedding(nn.Module):
    def __init__(self, d_embedding, surface_embedding):
        super(PointEmbedding, self).__init__()
        self.d_embedding = d_embedding
        self.log_scale = surface_embedding.positional_embedding.log_scale  # Shared log_scale parameter from SurfacePositionalEmbedding
        self.factor = nn.Parameter(torch.tensor(1.0))  # Learnable scale for the positional embedding contribution
        self.learnable_embedding = nn.Parameter(torch.randn(d_embedding))  # Learnable embedding vector
        self.layer_norm = nn.LayerNorm(d_embedding)  # Normalize across the embedding dimension
        self.scaled_residual_norm = ScaledResidualNorm(d_embedding)  # Scaled residual normalization

    def forward(self, query_point_batch):
        log_moneyness = query_point_batch['Log Moneyness']
        time_to_maturity = query_point_batch['Time to Maturity']

        # Stack the query point coordinates
        query_coords = torch.stack([log_moneyness, time_to_maturity], dim=-1)  # Shape: (batch_size, 2)

        # Positional embedding calculation
        scale = torch.exp(self.log_scale)
        pos_enc = torch.zeros(query_coords.size(0), self.d_embedding, device=query_coords.device)

        for i in range(self.d_embedding // 4):
            div_factor = scale ** (4 * i / self.d_embedding)
            pos_enc[:, 4 * i] = torch.sin(query_coords[:, 0] / div_factor)
            pos_enc[:, 4 * i + 1] = torch.cos(query_coords[:, 0] / div_factor)
            pos_enc[:, 4 * i + 2] = torch.sin(query_coords[:, 1] / div_factor)
            pos_enc[:, 4 * i + 3] = torch.cos(query_coords[:, 1] / div_factor)

        # Use ScaledResidualNorm to combine learnable embedding and positional encoding
        point_embedded = self.scaled_residual_norm(self.learnable_embedding, pos_enc)

        return point_embedded

# Example usage:
torch.manual_seed(RANDOM_STATE)
point_embedding = PointEmbedding(d_embedding, surface_embedding)
point_embedded = point_embedding(processed_batch['Query Point'])
point_embedded

tensor([[ 0.5042, -0.3693, -1.9808,  1.0166, -0.7196, -0.2118,  0.3608,  1.3998],
        [ 1.4882,  0.0684, -1.4299,  0.5567, -1.2357, -0.6671,  0.0133,  1.2058],
        [ 1.3204,  0.0949, -1.9956,  0.6253, -0.8096, -0.3823,  0.1181,  1.0289],
        [ 0.4256, -0.2793, -1.4502,  1.0144, -1.1896, -0.5031,  0.2925,  1.6896]],
       grad_fn=<NativeLayerNormBackward0>)

### Pre-Decoder

In [None]:
class PreDecoder(nn.Module):
    def __init__(self, d_embedding, hidden_dim, dropout):
        super(PreDecoder, self).__init__()
        # Initialize the feedforward network
        self.feedforward = FeedForwardNetwork(d_embedding, hidden_dim, dropout)
        # Initialize the scaled residual normalization module
        self.scaled_residual_norm = ScaledResidualNorm(d_embedding)

    def forward(self, x):
        # Process the input through the feedforward network
        feedforward_output = self.feedforward(x)
        # Apply the scaled residual connection
        output = self.scaled_residual_norm(x, feedforward_output)
        
        return output
    
# Example usage
torch.manual_seed(RANDOM_STATE)
hidden_dim = HYPERPARAMETERS['Query Embedding']['Pre-Decoder']['Hidden Dimension']
dropout = HYPERPARAMETERS['Query Embedding']['Pre-Decoder']['Dropout']

pre_decoder = PreDecoder(d_embedding, hidden_dim, dropout)
pre_decoded_output = pre_decoder(point_embedded)  # Example input
pre_decoded_output

tensor([[ 5.6697e-02, -1.0196e-01, -2.0467e+00,  9.5017e-01, -1.8322e-01,
         -5.6851e-01,  3.3655e-01,  1.5570e+00],
        [ 1.1735e+00,  3.5872e-01, -1.5941e+00,  5.4041e-01, -9.4090e-01,
         -9.1696e-01, -1.1575e-02,  1.3909e+00],
        [ 1.0623e+00,  1.8758e-01, -2.1339e+00,  5.3964e-01, -3.6191e-01,
         -6.4388e-01,  1.5799e-01,  1.1921e+00],
        [ 5.5675e-04, -4.4207e-02, -1.5026e+00,  1.0834e+00, -7.7097e-01,
         -7.5784e-01,  1.5493e-01,  1.8367e+00]],
       grad_fn=<NativeLayerNormBackward0>)

### Final BLock

In [None]:
class QueryEmbedding(nn.Module):
    def __init__(self, d_embedding, surface_embedding, num_pre_decoder_blocks, hidden_dim, dropout):
        super(QueryEmbedding, self).__init__()
        # Initialize the PointEmbedding
        self.point_embedding = PointEmbedding(d_embedding, surface_embedding)

        # Initialize the PreDecoder blocks
        self.pre_decoders = nn.ModuleList([
            PreDecoder(d_embedding, hidden_dim, dropout) for _ in range(num_pre_decoder_blocks)
        ])

    def forward(self, processed_batch):
        query_point_batch = processed_batch['Query Point']

        # First apply the point embedding
        query_embedded = self.point_embedding(query_point_batch)
        
        # Sequentially apply each PreDecoder block
        for pre_decoder in self.pre_decoders:
            query_embedded = pre_decoder(query_embedded)
        
        # Reshape the output to (batch, 1, embedding) to make it a sequence of length 1
        query_embedded = query_embedded.unsqueeze(1)  # Add the sequence length dimension
        
        return query_embedded
    
# Example usage
torch.manual_seed(RANDOM_STATE)
hidden_dim = HYPERPARAMETERS['Query Embedding']['Pre-Decoder']['Hidden Dimension']
dropout = HYPERPARAMETERS['Query Embedding']['Pre-Decoder']['Dropout']
num_pre_decoder_blocks = HYPERPARAMETERS['Query Embedding']['Pre-Decoder']['Number of Blocks']    

query_embedding = QueryEmbedding(d_embedding, surface_embedding, num_pre_decoder_blocks, hidden_dim, dropout)
query_embedded = query_embedding(processed_batch)
query_embedded

tensor([[[ 0.7447, -0.0774, -1.7717,  1.0139, -1.1276, -0.3336,  0.1905,
           1.3613]],

        [[ 1.2842, -0.0804, -0.9078,  0.8424, -1.2601, -1.2203,  0.0255,
           1.3164]],

        [[ 1.4975, -0.0342, -1.5508,  0.7655, -0.8453, -0.9524,  0.0515,
           1.0682]],

        [[ 0.8848, -0.1789, -1.0814,  1.0093, -1.3445, -0.9847,  0.2222,
           1.4732]]], grad_fn=<UnsqueezeBackward0>)

## Surface Decoding

In [None]:
class DecoderBlock(nn.Module):
    def __init__(self, d_embedding, n_heads, hidden_dim, dropout):
        super(DecoderBlock, self).__init__()
        self.cross_attention = nn.MultiheadAttention(embed_dim=d_embedding, num_heads=n_heads, batch_first=True)
        self.scaled_residual_norm1 = ScaledResidualNorm(d_embedding)
        self.feedforward = FeedForwardNetwork(d_embedding, hidden_dim, dropout)
        self.scaled_residual_norm2 = ScaledResidualNorm(d_embedding)

    def forward(self, query, encoded_surface):
        # Cross-attention
        cross_attn_output, _ = self.cross_attention(query, encoded_surface, encoded_surface)
        x = self.scaled_residual_norm1(query, cross_attn_output)

        # Feedforward network
        ff_output = self.feedforward(x)
        x = self.scaled_residual_norm2(x, ff_output)

        return x

class SurfaceDecoder(nn.Module):
    def __init__(self, d_embedding, n_heads, hidden_dim, dropout, num_decoder_blocks):
        super(SurfaceDecoder, self).__init__()
        self.decoder_blocks = nn.ModuleList([
            DecoderBlock(d_embedding, n_heads, hidden_dim, dropout) for _ in range(num_decoder_blocks)
        ])

    def forward(self, query_embedded, encoded_surface):
        x = query_embedded

        for decoder_block in self.decoder_blocks:
            x = decoder_block(x, encoded_surface)

        return x
    
# Example usage:
torch.manual_seed(RANDOM_STATE)
n_heads = HYPERPARAMETERS['Surface Decoding']['Decoder']['Number of Heads']
hidden_dim = HYPERPARAMETERS['Surface Decoding']['Decoder']['Hidden Dimension']
dropout = HYPERPARAMETERS['Surface Decoding']['Decoder']['Dropout']
num_decoder_blocks = HYPERPARAMETERS['Surface Decoding']['Decoder']['Number of Blocks']

surface_decoder = SurfaceDecoder(d_embedding, n_heads, hidden_dim, dropout, num_decoder_blocks)
decoded_surface = surface_decoder(query_embedded, encoded_surface)
decoded_surface

tensor([[[ 1.0483, -0.2380, -1.5907,  0.8196, -1.4996,  0.9765, -0.1599,
           0.6439]],

        [[ 1.7490, -0.2841, -0.9002,  0.8834, -1.4725,  0.1600, -0.7979,
           0.6623]],

        [[ 1.3270, -0.5674, -1.4404,  0.9870, -1.2326,  0.9964, -0.4555,
           0.3855]],

        [[ 0.9850,  0.2246, -0.9211,  1.0486, -1.9522,  0.0174, -0.4226,
           1.0204]]], grad_fn=<NativeLayerNormBackward0>)

## IvySPT

In [None]:
class IvySPT(nn.Module):
    def __init__(self, hyperparameters):
        super(IvySPT, self).__init__()
        # Extract hyperparameters for each component
        input_embedding_hp = hyperparameters['Input Embedding']
        surface_encoding_hp = hyperparameters['Surface Encoding']['Encoder']
        query_embedding_hp = hyperparameters['Query Embedding']['Pre-Decoder']
        surface_decoding_hp = hyperparameters['Surface Decoding']['Decoder']

        # Initialize modules
        self.surface_embedding = SurfaceEmbedding(
            grid_dim=input_embedding_hp['Surface Embedding']['Grid Dimension'],
            d_embedding=input_embedding_hp['Surface Embedding']['Channels Dimension'],
            branch_channels=input_embedding_hp['Pre-Encoder']['Branch Channels Dimension'],
            num_pre_encoder_blocks=input_embedding_hp['Pre-Encoder']['Number of Blocks']
        )

        self.surface_encoding = SurfaceEncoding(
            d_embedding=input_embedding_hp['Surface Embedding']['Channels Dimension'],
            n_heads=surface_encoding_hp['Number of Heads'],
            hidden_dim=surface_encoding_hp['Hidden Dimension'],
            dropout=surface_encoding_hp['Dropout'],
            external_dim=surface_encoding_hp['External Feature Dimension'],
            num_encoder_blocks=surface_encoding_hp['Number of Blocks']
        )

        self.query_embedding = QueryEmbedding(
            d_embedding=input_embedding_hp['Surface Embedding']['Channels Dimension'],
            surface_embedding=self.surface_embedding,
            num_pre_decoder_blocks=query_embedding_hp['Number of Blocks'],
            hidden_dim=query_embedding_hp['Hidden Dimension'],
            dropout=query_embedding_hp['Dropout']
        )

        self.surface_decoder = SurfaceDecoder(
            d_embedding=input_embedding_hp['Surface Embedding']['Channels Dimension'],
            n_heads=surface_decoding_hp['Number of Heads'],
            hidden_dim=surface_decoding_hp['Hidden Dimension'],
            dropout=surface_decoding_hp['Dropout'],
            num_decoder_blocks=surface_decoding_hp['Number of Blocks']
        )

        # Final fully connected layer to predict implied volatility
        self.final_layer = nn.Linear(input_embedding_hp['Surface Embedding']['Channels Dimension'], 1)

    def forward(self, batch):
        # Pass batch through Surface Embedding
        tokenized_positional_embedded_batch, processed_batch = self.surface_embedding(batch)

        # Pass tokenized embeddings and market features to Surface Encoding
        encoded_output = self.surface_encoding(tokenized_positional_embedded_batch, processed_batch)

        # Pass query points through Query Embedding
        query_embedded = self.query_embedding(processed_batch)

        # Decode the query embeddings with the encoded surface data
        decoded_output = self.surface_decoder(query_embedded, encoded_output)

        # Apply the final fully connected layer
        final_output = self.final_layer(decoded_output.squeeze(1)).squeeze(1)  # Removing sequence length dimension and flattening

        return final_output

torch.manual_seed(RANDOM_STATE)    
ivy_spt = IvySPT(HYPERPARAMETERS)
iv_estimates = ivy_spt(batch)  
iv_estimates  

tensor([-0.0161,  0.1768,  0.0644,  0.0999], grad_fn=<SqueezeBackward1>)

## Surface Arbitrage Free Loss

In [None]:
class SurfaceArbitrageFreeLoss(nn.Module):
    def __init__(self, hyperparameters):
        super(SurfaceArbitrageFreeLoss, self).__init__()
        self.calendar_coeff = hyperparameters['No-Arbitrage']['Calendar']
        self.butterfly_coeff = hyperparameters['No-Arbitrage']['Butterfly']

    def forward(self, iv_estimates, batch):
        target_volatility = batch['Target Volatility']

        # Calculate mean squared error between model estimates and target volatilities
        mse_loss = F.mse_loss(iv_estimates, target_volatility)

        # Calculate the total implied variance
        time_to_maturity = batch['Query Point']['Time to Maturity']
        log_moneyness = batch['Query Point']['Log Moneyness']
        total_implied_variance = time_to_maturity * iv_estimates.pow(2)

        # Compute gradients needed for arbitrage conditions
        w_t = torch.autograd.grad(total_implied_variance.sum(), time_to_maturity, create_graph=True)[0] 
        w_x = torch.autograd.grad(total_implied_variance.sum(), log_moneyness, create_graph=True)[0]
        w_xx = torch.autograd.grad(w_x.sum(), log_moneyness, create_graph=True)[0]

        # Calculate Calendar Arbitrage Loss
        calendar_arbitrage_loss = torch.mean(torch.clamp(-w_t, min=0) ** 2)

        # Calculate Butterfly Arbitrage Loss
        w = total_implied_variance
        g = (1 - log_moneyness * w_x / (2 * w)) ** 2 - w_x / 4 * (1 / w + 1 / 4) + w_xx / 2
        butterfly_arbitrage_loss = torch.mean(torch.clamp(-g, min=0) ** 2)

        # Combine all losses with coefficients
        total_loss = mse_loss + self.calendar_coeff * calendar_arbitrage_loss + self.butterfly_coeff * butterfly_arbitrage_loss

        return total_loss
    
surface_arbitrage_free_loss = SurfaceArbitrageFreeLoss(HYPERPARAMETERS)    
total_loss = surface_arbitrage_free_loss(iv_estimates, batch)
total_loss

tensor(5.9122, grad_fn=<AddBackward0>)