In [1]:
import numpy as np
import pandas as pd
import random
import torch

In [2]:
# Set the random seed for reproducibility
RANDOM_STATE = 0
N_JOBS = 8
torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

In [3]:
HYPERPARAMETERS = {
    'Input Preprocessing' : {
        'Mask Proportions' : [0.1, 0.2, 0.4, 0.8],
        'Batch Size' : 4
    },
    'Input Embedding' : {
        'Surface Embedding' : {
            'Grid Dimension' : 3,
            'Channels Dimension' : 8,
        },
        'Pre-Encoder' : {
            'Branch Channels Dimension' : 4,
            'Number of Blocks' : 2,
        }
    },
    'Surface Encoding' : {
        'Encoder' : {
            'Number of Heads' : 4,
            'Hidden Dimension' : 16,
            'Dropout' : 0.1,
            'Number of Blocks' : 2,
            'External Feature Dimension' : 3,
        }
    },
    'Query Embedding' : {
        'Pre-Decoder' : {
            'Hidden Dimension' : 16,
            'Dropout' : 0.1,
            'Number of Blocks' : 2,
        }
    },
    'Surface Decoding' : {
        'Decoder' : {
            'Number of Heads' : 4,
            'Hidden Dimension' : 16,
            'Dropout' : 0.1,
            'Number of Blocks' : 2,
        }
    },
    'No-Arbitrage' : {
        'Butterfly' : 1,
        'Calendar' : 1,
    }
}

## Dataset

In [4]:
aapl_googl_data = pd.read_csv('volatility_surface_AAPL_GOOGL_2013_01_2013_06.csv', parse_dates=True, index_col=[0, 1], date_format="ISO8601")
aapl_googl_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Log Moneyness,Time to Maturity,Implied Volatility,Market Return,Market Volatility,Treasury Rate
Datetime,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-02,AAPL,-0.316688,0.007937,0.3726,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.316688,0.007937,0.6095,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.304266,0.007937,0.3726,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.304266,0.007937,0.6095,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.291996,0.007937,0.3726,0.025086,14.680000,0.055
...,...,...,...,...,...,...,...
2013-06-28,GOOGL,0.427518,2.253968,0.2430,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.434898,2.253968,0.2383,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.434898,2.253968,0.2426,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.442224,2.253968,0.2402,-0.004299,16.860001,0.030


In [5]:
import gc
from joblib_progress import joblib_progress
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from joblib import Parallel, delayed

def implied_volatility_surface_datasets(
    options_market_data, 
    proportions, 
    n_jobs=1,
    random_state=0,
    n_chunks=1
):
    def mask_surface(
        date, 
        symbol, 
        surface, 
        rng
    ):
        def mask_surface_with_proportion(
            surface_data, 
            proportion, 
        ):
            n_clusters = int(np.ceil(1 / proportion))
            points_coordinates = surface_data['points_coordinates']
            points_volatilities = surface_data['points_volatilities']

            # Create the clustering pipeline
            pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('kmeans', KMeans(n_clusters=n_clusters, random_state=random_state, n_init='auto'))
            ])
            
            # Fit the pipeline to the data points
            labels = pipeline.fit_predict(points_coordinates)
            
            single_surface_datasets = []
            for cluster in range(n_clusters):
                cluster_indices = np.where(labels == cluster)[0]
                num_to_mask = int(np.ceil(len(cluster_indices) * proportion))
                masked_indices = rng.choice(cluster_indices, size=num_to_mask, replace=False)
                
                for idx in masked_indices:
                    unmasked_indices = np.setdiff1d(cluster_indices, masked_indices)

                    single_surface_datasets.append({
                        'Datetime': surface_data['datetime'],
                        'Symbol': surface_data['symbol'],
                        'Market Features': surface_data['market_features'],
                        'Input Surface': {
                            'Log Moneyness': points_coordinates[unmasked_indices, 0],
                            'Time to Maturity': points_coordinates[unmasked_indices, 1],
                            'Implied Volatility': points_volatilities[unmasked_indices]
                        },
                        'Query Point': {
                            'Log Moneyness': points_coordinates[idx, 0],
                            'Time to Maturity': points_coordinates[idx, 1]
                        },
                        'Target Volatility': points_volatilities[idx]
                    })

            return single_surface_datasets
        
        surface_data = {
            'datetime': date,
            'symbol': symbol,
            'points_coordinates': surface[['Log Moneyness', 'Time to Maturity']].values,
            'points_volatilities': surface['Implied Volatility'].values,
            'market_features': {
                'Market Return': surface['Market Return'].values[0],
                'Market Volatility': surface['Market Volatility'].values[0],
                'Treasury Rate': surface['Treasury Rate'].values[0]
            }
        }
        
        datasets = []
        for proportion in proportions:
            datasets.extend(mask_surface_with_proportion(surface_data, proportion))

        return datasets

    rng = np.random.default_rng(random_state)
    all_surfaces = list(options_market_data.groupby(level=['Datetime', 'Symbol']))
    n_surfaces = len(all_surfaces)
    
    # Split the array into 'n_chunks' chunks
    chunks = np.array_split(range(n_surfaces), n_chunks)
    # Initialize the list to hold all results
    surface_datasets = []
    # Process each chunk sequentially
    with joblib_progress("Surfaces...", total=n_surfaces): 
        for chunk in chunks:
            # Process the current chunk in parallel
            output = Parallel(n_jobs=n_jobs)(
                delayed(mask_surface)(date, symbol, surface, rng)
                for (date, symbol), surface in [all_surfaces[i] for i in chunk]
            )
            # Extend the overall results with the current chunk's results
            surface_datasets.extend(output)
            gc.collect()  

    # Flatten the list of lists into a single list of datasets
    return [item for sublist in surface_datasets for item in sublist]

aapl_googl_dataset = implied_volatility_surface_datasets(
    aapl_googl_data,
    HYPERPARAMETERS['Input Preprocessing']['Mask Proportions'],
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
    n_chunks=4
)

Output()

In [6]:
# import pickle

# with open('aapl_googl_dataset.pickle', 'wb') as handle:
#     pickle.dump(aapl_googl_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('aapl_googl_dataset.pickle', 'rb') as handle:
#     aapl_googl_dataset_ = pickle.load(handle)


In [7]:
len(aapl_googl_dataset)

863511

In [8]:
aapl_googl_dataset[0]

{'Datetime': Timestamp('2013-01-02 00:00:00'),
 'Symbol': 'AAPL',
 'Market Features': {'Market Return': 0.0250861159586972,
  'Market Volatility': 14.68000030517578,
  'Treasury Rate': 0.0549999997019767},
 'Input Surface': {'Log Moneyness': array([-0.74747141, -0.72842322, -0.72842322, -0.70973108, -0.69138194,
         -0.69138194, -0.67336344, -0.67336344, -0.63827212, -0.63827212,
         -0.62117768, -0.62117768, -0.60437057, -0.60437057, -0.58784126,
         -0.58784126, -0.57158074, -0.5555804 , -0.5555804 , -0.53983205,
         -0.53983205, -0.52432786, -0.52432786, -0.50906039, -0.50906039,
         -0.49402251, -0.49402251, -0.47920742, -0.47920742, -0.46460862,
         -0.46460862, -0.45021989, -0.45021989, -0.43603525, -0.43603525,
         -0.42204901, -0.42204901, -0.40825569, -0.40825569, -0.39465004,
         -0.39465004, -0.74747141, -0.74747141, -0.72842322, -0.70973108,
         -0.70973108, -0.69138194, -0.69138194, -0.67336344, -0.67336344,
         -0.65566386

In [9]:
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from torch.utils.data import Dataset

class IVSurfaceDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        data_point = self.data[idx]

        # Convert each component of the data point into tensors as appropriate
        return {
            'Datetime': data_point['Datetime'],
            'Symbol': data_point['Symbol'],
            'Market Features': {
                'Market Return': torch.tensor(data_point['Market Features']['Market Return'], dtype=torch.float32),
                'Market Volatility': torch.tensor(data_point['Market Features']['Market Volatility'], dtype=torch.float32),
                'Treasury Rate': torch.tensor(data_point['Market Features']['Treasury Rate'], dtype=torch.float32),
            },
            'Input Surface': {
                'Log Moneyness': torch.tensor(data_point['Input Surface']['Log Moneyness'], dtype=torch.float32),
                'Time to Maturity': torch.tensor(data_point['Input Surface']['Time to Maturity'], dtype=torch.float32),
                'Implied Volatility': torch.tensor(data_point['Input Surface']['Implied Volatility'], dtype=torch.float32),
            },
            'Query Point': {
                'Log Moneyness': torch.tensor(data_point['Query Point']['Log Moneyness'], dtype=torch.float32),
                'Time to Maturity': torch.tensor(data_point['Query Point']['Time to Maturity'], dtype=torch.float32),
            },
            'Target Volatility': torch.tensor(data_point['Target Volatility'], dtype=torch.float32),
        }

    def collate_fn(batch):
        # Organize batch data by structuring as a dictionary with batched components
        batched_data = {
            'Datetime': [item['Datetime'] for item in batch],
            'Symbol': [item['Symbol'] for item in batch],
            'Market Features': {
                'Market Return': default_collate([item['Market Features']['Market Return'] for item in batch]),
                'Market Volatility': default_collate([item['Market Features']['Market Volatility'] for item in batch]),
                'Treasury Rate': default_collate([item['Market Features']['Treasury Rate'] for item in batch]),
            },
            'Input Surface': {
                'Log Moneyness': [item['Input Surface']['Log Moneyness'] for item in batch],
                'Time to Maturity': [item['Input Surface']['Time to Maturity'] for item in batch],
                'Implied Volatility': [item['Input Surface']['Implied Volatility'] for item in batch],
            },
            'Query Point': {
                'Log Moneyness': default_collate([item['Query Point']['Log Moneyness'] for item in batch]),
                'Time to Maturity': default_collate([item['Query Point']['Time to Maturity'] for item in batch]),
            },
            'Target Volatility': default_collate([item['Target Volatility'] for item in batch]),
        }

        # Set requires_grad=True for query point values
        batched_data['Query Point']['Log Moneyness'].requires_grad_()
        batched_data['Query Point']['Time to Maturity'].requires_grad_()

        return batched_data



aapl_googl_data_loader = DataLoader(
    IVSurfaceDataset(aapl_googl_dataset), 
    batch_size=HYPERPARAMETERS['Input Preprocessing']['Batch Size'], 
    shuffle=True, 
    num_workers=0, 
    collate_fn=IVSurfaceDataset.collate_fn
)

# Fetch one batch from the DataLoader
batch = next(iter(aapl_googl_data_loader))
batch

{'Datetime': [Timestamp('2013-03-19 00:00:00'),
  Timestamp('2013-03-19 00:00:00'),
  Timestamp('2013-04-08 00:00:00'),
  Timestamp('2013-03-27 00:00:00')],
 'Symbol': ['GOOGL', 'AAPL', 'GOOGL', 'AAPL'],
 'Market Features': {'Market Return': tensor([-0.0024, -0.0024,  0.0063, -0.0006]),
  'Market Volatility': tensor([14.3900, 14.3900, 13.1900, 13.1500]),
  'Treasury Rate': tensor([0.0700, 0.0700, 0.0550, 0.0800])},
 'Input Surface': {'Log Moneyness': [tensor([-0.1766, -0.1766, -0.1620, -0.1620, -0.1476, -0.1334, -0.0920, -0.0853,
           -0.0853, -0.0786, -0.0719, -0.0653, -0.0653, -0.0588, -0.0523, -0.0523,
           -0.0458, -0.0266, -0.0203, -0.0141, -0.0141, -0.0078,  0.0045,  0.0045,
            0.0106,  0.0106,  0.0167,  0.0167,  0.0228,  0.0347,  0.0407,  0.0466,
            0.0524,  0.0524,  0.0583,  0.0583,  0.0698,  0.0756,  0.0756,  0.0813,
            0.0813,  0.0869,  0.0926,  0.0926,  0.1037,  0.1093,  0.1148, -0.1476,
           -0.1334, -0.1056, -0.1056, -0.0920, -0

## Surface Embedding

### Input Embedding

#### Components

In [10]:
import torch
import torch.nn as nn

class SurfaceBatchNorm(nn.Module):
    def __init__(self, num_features=1, momentum=0.1):
        super(SurfaceBatchNorm, self).__init__()
        self.log_moneyness_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.time_to_maturity_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.implied_volatility_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.market_return_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.market_volatility_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.treasury_rate_bn = nn.BatchNorm1d(num_features, momentum=momentum)

    def forward(self, batch):
        # Concatenate all tensors from the Input Surface into one tensor for each feature
        input_surface_log_moneyness = torch.cat([x for x in batch['Input Surface']['Log Moneyness']])
        input_surface_time_to_maturity = torch.cat([x for x in batch['Input Surface']['Time to Maturity']])
        input_surface_implied_volatility = torch.cat([x for x in batch['Input Surface']['Implied Volatility']])

        # Concatenate Input Surface tensors with Query Point tensors
        total_log_moneyness = torch.cat([input_surface_log_moneyness, batch['Query Point']['Log Moneyness']])
        total_time_to_maturity = torch.cat([input_surface_time_to_maturity, batch['Query Point']['Time to Maturity']])

        # Normalize Log Moneyness and Time to Maturity
        norm_log_moneyness = self.log_moneyness_bn(total_log_moneyness.unsqueeze(1)).squeeze(1)
        norm_time_to_maturity = self.time_to_maturity_bn(total_time_to_maturity.unsqueeze(1)).squeeze(1)

        # Normalize Implied Volatility (only from Input Surface)
        norm_implied_volatility = self.implied_volatility_bn(input_surface_implied_volatility.unsqueeze(1)).squeeze(1)

        # Split the normalized results back to corresponding structures
        input_surface_sizes = [len(x) for x in batch['Input Surface']['Log Moneyness']]
        total_input_size = sum(input_surface_sizes)

        # Normalizing Market Features
        market_features = batch['Market Features']
        norm_market_return = self.market_return_bn(market_features['Market Return'].unsqueeze(1)).squeeze(1)
        norm_market_volatility = self.market_volatility_bn(market_features['Market Volatility'].unsqueeze(1)).squeeze(1)
        norm_treasury_rate = self.treasury_rate_bn(market_features['Treasury Rate'].unsqueeze(1)).squeeze(1)

        # Reconstructing the batch with normalized data
        output = {
            'Datetime': batch['Datetime'],
            'Symbol': batch['Symbol'],
            'Market Features': {
                'Market Return': norm_market_return,
                'Market Volatility': norm_market_volatility,
                'Treasury Rate': norm_treasury_rate
            },
            'Input Surface': {
                'Log Moneyness': list(torch.split(norm_log_moneyness[:total_input_size], input_surface_sizes)),
                'Time to Maturity': list(torch.split(norm_time_to_maturity[:total_input_size], input_surface_sizes)),
                'Implied Volatility': list(torch.split(norm_implied_volatility, input_surface_sizes))
            },
            'Query Point': {
                'Log Moneyness': norm_log_moneyness[total_input_size:],
                'Time to Maturity': norm_time_to_maturity[total_input_size:]
            },
            'Target Volatility': batch['Target Volatility']
        }

        return output

# Usage
surfacebatchnorm = SurfaceBatchNorm()
processed_batch = surfacebatchnorm(batch)
processed_batch

{'Datetime': [Timestamp('2013-03-19 00:00:00'),
  Timestamp('2013-03-19 00:00:00'),
  Timestamp('2013-04-08 00:00:00'),
  Timestamp('2013-03-27 00:00:00')],
 'Symbol': ['GOOGL', 'AAPL', 'GOOGL', 'AAPL'],
 'Market Features': {'Market Return': tensor([-0.5515, -0.5515,  1.2702, -0.1672], grad_fn=<SqueezeBackward1>),
  'Market Volatility': tensor([ 0.9997,  0.9997, -0.9669, -1.0325], grad_fn=<SqueezeBackward1>),
  'Treasury Rate': tensor([ 0.1320,  0.1320, -1.4519,  1.1879], grad_fn=<SqueezeBackward1>)},
 'Input Surface': {'Log Moneyness': [tensor([-0.8914, -0.8914, -0.8349, -0.8349, -0.7791, -0.7242, -0.5640, -0.5379,
           -0.5379, -0.5120, -0.4862, -0.4607, -0.4607, -0.4353, -0.4101, -0.4101,
           -0.3850, -0.3108, -0.2863, -0.2621, -0.2621, -0.2379, -0.1901, -0.1901,
           -0.1664, -0.1664, -0.1429, -0.1429, -0.1195, -0.0731, -0.0501, -0.0273,
           -0.0046, -0.0046,  0.0180,  0.0180,  0.0628,  0.0850,  0.0850,  0.1070,
            0.1070,  0.1290,  0.1508,  0.150

In [18]:
import torch
import torch.nn as nn
import numpy as np

# class ParametricContinuousKernel(nn.Module):
#     def __init__(self, input_dim, hidden_dim, hidden_layers, output_dim=1, dropout_prob=0.1):
#         super(ParametricContinuousKernel, self).__init__()
#         layers = []
#         current_dim = input_dim
#         for _ in range(hidden_layers):
#             layers.append(nn.Linear(current_dim, hidden_dim))
#             layers.append(nn.GELU())
#             layers.append(nn.Dropout(dropout_prob))
#             current_dim = hidden_dim
#         layers.append(nn.Linear(hidden_dim, output_dim))
#         self.net = nn.Sequential(*layers)

#     def forward(self, x):
#         return self.net(x)

# class EllipticalRBFKernel(nn.Module):
#     def __init__(self, input_dim):
#         super(EllipticalRBFKernel, self).__init__()
#         # Initialize the bandwidth parameters for each dimension
#         # We use log-space parameterization for stability in optimization (exp to ensure positivity)
#         self.log_bandwidth = nn.Parameter(torch.zeros(input_dim))  # Initialized to exp(0) = 1

#     def forward(self, distances):
#         # Scale the distances by the bandwidths
#         # torch.exp(self.log_bandwidth) converts log bandwidth back to the standard scale
#         scaled_distances = distances / torch.exp(self.log_bandwidth)

#         # Compute the RBF kernel output using the scaled distances
#         # The RBF kernel formula exp(-0.5 * (scaled distance)^2)
#         kernel_values = torch.exp(-0.5 * torch.sum(scaled_distances ** 2, dim=-1))

#         return kernel_values

# class SurfaceContinuousKernelEmbedding(nn.Module):
#     def __init__(self, grid_dim):
#         super(SurfaceContinuousKernelEmbedding, self).__init__()
#         self.grid_dim = grid_dim
#         self.kernel = EllipticalRBFKernel(input_dim=2)
#         self.layer_norm = nn.LayerNorm([self.grid_dim, self.grid_dim])  # Normalizing across each image's dimensions

#         # Create a regular grid in (0, 1)x(0, 1), excluding 0 and 1
#         grid_points = torch.linspace(1 / (grid_dim + 1), 1 - 1 / (grid_dim + 1), grid_dim)
#         mesh_x, mesh_y = torch.meshgrid(grid_points, grid_points, indexing='ij')
#         self.grid_points = torch.stack([mesh_x.flatten(), mesh_y.flatten()], dim=-1)
#         self.grid_points = torch.erfinv(2 * self.grid_points - 1) * np.sqrt(2)  # inverse CDF of normal

#     def forward(self, input_surface_batch):
#         batch_size = len(input_surface_batch['Log Moneyness'])
#         batch_embedded_surfaces = []

#         for i in range(batch_size):
#             # Extract the coordinates and implied volatilities for each surface in the batch
#             surface_coords = torch.stack([
#                 input_surface_batch['Log Moneyness'][i], 
#                 input_surface_batch['Time to Maturity'][i]
#             ], dim=-1)
#             surface_ivs = input_surface_batch['Implied Volatility'][i]

#             # Initialize the output grid for the current surface
#             embedded_surface = torch.zeros((self.grid_dim, self.grid_dim), dtype=torch.float32, device=surface_coords.device)

#             # Compute the convolution for each point on the output grid
#             for idx, grid_point in enumerate(self.grid_points):
#                 # Calculate the distance from each input point to the current grid point
#                 point_differences = surface_coords - grid_point

#                 # Apply the parametric kernel to these differences
#                 kernel_outputs = self.kernel(point_differences)

#                 # Compute the weighted sum of IVs based on the kernel outputs
#                 embedded_surface[idx // self.grid_dim, idx % self.grid_dim] = (kernel_outputs * surface_ivs).sum()

#             # Normalize the embedded surface
#             embedded_surface = self.layer_norm(embedded_surface)
#             # Append the encoded surface for this input surface to the batch list
#             batch_embedded_surfaces.append(embedded_surface)

#         # Stack all encoded surfaces to form a batch tensor
#         return torch.stack(batch_embedded_surfaces)


# # Example of initializing and using this module
# grid_dim = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Grid Dimension']
# # kernel_hidden_dim = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Kernel Hidden Layer Dimension']
# # kernel_hidden_layers = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Kernel Hidden Layer Count']
# # kernel_dropout_prob = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Kernel Dropout Probability']

# continuous_kernel_embedding = SurfaceContinuousKernelEmbedding(grid_dim=grid_dim)
# continuous_kernel_embedding_batch = continuous_kernel_embedding(processed_batch['Input Surface'])
# continuous_kernel_embedding_batch

In [19]:
# import torch
# import torch.nn as nn

# class SurfaceProjectionEmbedding(nn.Module):
#     def __init__(self, in_channels, d_embedding, grid_dim):
#         super(SurfaceProjectionEmbedding, self).__init__()
#         # Initialize the 1x1 convolution layer
#         self.conv1x1 = nn.Conv2d(in_channels, d_embedding, kernel_size=1)
#         # Initialize layer normalization across the channel, height, and width dimensions
#         self.layer_norm = nn.LayerNorm([d_embedding, grid_dim, grid_dim])  # Normalizes across (channels, height, width)

#     def forward(self, x):
#         # Ensure x has dimensions: (batch_size, channels, height, width)
#         # Add a channel dimension if necessary
#         if x.dim() == 3:  # assuming x has dimensions (batch_size, height, width)
#             x = x.unsqueeze(1)  # add channel dimension
#         # Apply the 1x1 convolution to project the input to a higher dimensional space
#         x = self.conv1x1(x)
#         # Normalize the features across each channel, maintaining the spatial dimensions
#         x = self.layer_norm(x)
#         return x
    
# d_embedding = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Channels Dimension']  # Desired number of output channels
# torch.manual_seed(RANDOM_STATE)
# # Create the module
# projection_embedding = SurfaceProjectionEmbedding(1, d_embedding, grid_dim)   
# projection_embedding_batch = projection_embedding(continuous_kernel_embedding_batch)
# projection_embedding_batch

In [20]:
import torch
import torch.nn as nn
import numpy as np

class RBFKernel(nn.Module):
    def __init__(self, bandwidth):
        super(RBFKernel, self).__init__()
        self.bandwidth = bandwidth

    def forward(self, distances):
        # Scale the distances by the bandwidths
        # torch.exp(self.log_bandwidth) converts log bandwidth back to the standard scale
        scaled_distances = distances / self.bandwidth

        # Compute the RBF kernel output using the scaled distances
        # The RBF kernel formula exp(-0.5 * (scaled distance)^2)
        kernel_values = torch.exp(-0.5 * torch.sum(scaled_distances ** 2, dim=-1))

        return kernel_values        

class SurfaceContinuousKernelEmbedding(nn.Module):
    def __init__(self, grid_dim, d_embedding):
        super(SurfaceContinuousKernelEmbedding, self).__init__()
        self.grid_dim = grid_dim
        self.d_embedding = d_embedding

        # Initialize multiple RBF kernels, each with a different fixed bandwidth
        self.kernels = nn.ModuleList()
        for i in range(1, d_embedding + 1):
            bandwidth_value = torch.erfinv(torch.tensor(i / (d_embedding + 1))) * np.sqrt(2)
            self.kernels.append(RBFKernel(bandwidth=bandwidth_value))

        self.layer_norm = nn.LayerNorm([d_embedding, self.grid_dim, self.grid_dim])

        # Create a regular grid in (0, 1)x(0, 1), excluding 0 and 1
        grid_points = torch.linspace(1 / (grid_dim + 1), 1 - 1 / (grid_dim + 1), grid_dim)
        mesh_x, mesh_y = torch.meshgrid(grid_points, grid_points, indexing='ij')
        self.grid_points = torch.stack([mesh_x.flatten(), mesh_y.flatten()], dim=-1)
        self.grid_points = torch.erfinv(2 * self.grid_points - 1) * np.sqrt(2)  # inverse CDF of normal

    def forward(self, input_surface_batch):
        batch_size = len(input_surface_batch['Log Moneyness'])
        batch_embedded_surfaces = []

        for i in range(batch_size):
            # Extract the coordinates and implied volatilities for each surface in the batch
            surface_coords = torch.stack([
                input_surface_batch['Log Moneyness'][i], 
                input_surface_batch['Time to Maturity'][i]
            ], dim=-1)
            surface_ivs = input_surface_batch['Implied Volatility'][i]

            # Initialize the output grid for the current surface with d_embedding channels
            embedded_surface = torch.zeros((self.d_embedding, self.grid_dim, self.grid_dim), dtype=torch.float32, device=surface_coords.device)

            for kernel_idx, kernel in enumerate(self.kernels):
                # Compute the convolution for each point on the output grid using the current RBF kernel
                for idx, grid_point in enumerate(self.grid_points):
                    # Calculate the distance from each input point to the current grid point
                    point_differences = surface_coords - grid_point

                    # Apply the RBF kernel to these differences
                    kernel_outputs = kernel(point_differences)

                    # Compute the weighted sum of IVs based on the kernel outputs
                    embedded_surface[kernel_idx, idx // self.grid_dim, idx % self.grid_dim] = (kernel_outputs * surface_ivs).sum()

            # Normalize the embedded surface
            embedded_surface = self.layer_norm(embedded_surface)
            # Append the encoded surface for this input surface to the batch list
            batch_embedded_surfaces.append(embedded_surface)

        # Stack all encoded surfaces to form a batch tensor
        return torch.stack(batch_embedded_surfaces)


# Example of initializing and using this module
grid_dim = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Grid Dimension']
d_embedding = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Channels Dimension']  # Desired number of output channels

continuous_kernel_embedding = SurfaceContinuousKernelEmbedding(grid_dim=grid_dim, d_embedding=d_embedding)
continuous_kernel_embedding_batch = continuous_kernel_embedding(processed_batch['Input Surface'])
continuous_kernel_embedding_batch

tensor([[[[ 1.4984e+00,  1.3326e+00,  1.3190e+00],
          [ 1.0022e+00,  1.2877e+00,  1.2780e+00],
          [ 1.2048e+00,  1.3194e+00,  1.2661e+00]],

         [[ 1.5617e+00,  1.2128e+00,  1.2340e+00],
          [ 4.0624e-01,  9.7030e-01,  1.1149e+00],
          [ 7.9845e-01,  1.0341e+00,  1.1132e+00]],

         [[ 1.3014e+00,  1.0261e+00,  1.0919e+00],
          [ 7.4383e-03,  4.0246e-01,  8.4581e-01],
          [ 2.8261e-01,  5.0006e-01,  8.6889e-01]],

         [[ 9.1361e-01,  7.0940e-01,  8.2258e-01],
          [-2.9023e-01, -1.7404e-01,  3.8278e-01],
          [-1.8617e-01, -9.1761e-02,  4.2655e-01]],

         [[ 4.8129e-01,  2.7844e-01,  4.2755e-01],
          [-5.6515e-01, -6.6247e-01, -1.7677e-01],
          [-5.6892e-01, -6.3394e-01, -1.3602e-01]],

         [[ 2.5201e-02, -2.1136e-01, -5.7113e-02],
          [-8.4624e-01, -1.0781e+00, -7.1884e-01],
          [-8.9579e-01, -1.0924e+00, -7.0055e-01]],

         [[-4.5790e-01, -7.1886e-01, -5.8478e-01],
          [-1.1497e

#### Block

In [22]:
# class InputEmbedding(nn.Module):
#     def __init__(self, grid_dim, d_embedding, momentum=0.1):
#         super(InputEmbedding, self).__init__()
#         # Initialize all sub-modules
#         self.surface_batchnorm = SurfaceBatchNorm(1, momentum)
#         self.surface_continuous_kernel_embedding = SurfaceContinuousKernelEmbedding(grid_dim)
#         self.surface_projection_embedding = SurfaceProjectionEmbedding(1, d_embedding, grid_dim)

#     def forward(self, batch):
#         # Process the batch with SurfaceBatchNorm
#         processed_batch = self.surface_batchnorm(batch)
        
#         # Generate continuous kernel embeddings from the processed 'Input Surface'
#         continuous_kernel_embedding_batch = self.surface_continuous_kernel_embedding(processed_batch['Input Surface'])
        
#         # Project the embeddings using 1x1 convolution
#         projection_embedding_batch = self.surface_projection_embedding(continuous_kernel_embedding_batch)

#         # Return both the positionally encoded embeddings and the processed batch
#         return projection_embedding_batch, processed_batch

import torch
import torch.nn as nn

class InputEmbedding(nn.Module):
    def __init__(self, grid_dim, d_embedding, momentum=0.1):
        super(InputEmbedding, self).__init__()
        # Initialize SurfaceBatchNorm
        self.surface_batchnorm = SurfaceBatchNorm(1, momentum)
        
        # Initialize SurfaceContinuousKernelEmbedding with multiple RBF kernels
        self.surface_continuous_kernel_embedding = SurfaceContinuousKernelEmbedding(grid_dim, d_embedding)

    def forward(self, batch):
        # Normalize the input surface batch using SurfaceBatchNorm
        processed_batch = self.surface_batchnorm(batch)
        
        # Generate continuous kernel embeddings with multiple RBF kernels
        continuous_kernel_embedding_batch = self.surface_continuous_kernel_embedding(processed_batch['Input Surface'])

        # Return the embedded surface and the processed batch
        return continuous_kernel_embedding_batch, processed_batch
    
torch.manual_seed(RANDOM_STATE)
grid_dim = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Grid Dimension']
d_embedding = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Channels Dimension']  # Desired number of output channels
input_embedding = InputEmbedding(grid_dim, d_embedding)
projection_embedding_batch, processed_batch = input_embedding(batch)
projection_embedding_batch, processed_batch

(tensor([[[[ 1.4984e+00,  1.3326e+00,  1.3190e+00],
           [ 1.0022e+00,  1.2877e+00,  1.2780e+00],
           [ 1.2048e+00,  1.3194e+00,  1.2661e+00]],
 
          [[ 1.5617e+00,  1.2128e+00,  1.2340e+00],
           [ 4.0624e-01,  9.7030e-01,  1.1149e+00],
           [ 7.9845e-01,  1.0341e+00,  1.1132e+00]],
 
          [[ 1.3014e+00,  1.0261e+00,  1.0919e+00],
           [ 7.4383e-03,  4.0246e-01,  8.4581e-01],
           [ 2.8261e-01,  5.0006e-01,  8.6889e-01]],
 
          [[ 9.1361e-01,  7.0940e-01,  8.2258e-01],
           [-2.9023e-01, -1.7404e-01,  3.8278e-01],
           [-1.8617e-01, -9.1761e-02,  4.2655e-01]],
 
          [[ 4.8129e-01,  2.7844e-01,  4.2755e-01],
           [-5.6515e-01, -6.6247e-01, -1.7677e-01],
           [-5.6892e-01, -6.3394e-01, -1.3602e-01]],
 
          [[ 2.5201e-02, -2.1136e-01, -5.7113e-02],
           [-8.4624e-01, -1.0781e+00, -7.1884e-01],
           [-8.9579e-01, -1.0924e+00, -7.0055e-01]],
 
          [[-4.5790e-01, -7.1886e-01, -5.8478e

### Pre-Encoder

#### Block

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PreEncoder(nn.Module):
    def __init__(self, d_embedding, branch_channels, grid_dim):
        super(PreEncoder, self).__init__()
        # Initial channel configuration is common to all branches
        self.branch1 = nn.Sequential(
            nn.Conv2d(d_embedding, branch_channels, kernel_size=1),
            nn.BatchNorm2d(branch_channels),
            nn.GELU()
        )
        
        self.branch2 = nn.Sequential(
            nn.Conv2d(d_embedding, branch_channels, kernel_size=1),
            nn.BatchNorm2d(branch_channels),
            nn.GELU(),
            nn.Conv2d(branch_channels, branch_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(branch_channels),
            nn.GELU()
        )
        
        self.branch3 = nn.Sequential(
            nn.Conv2d(d_embedding, branch_channels, kernel_size=1),
            nn.BatchNorm2d(branch_channels),
            nn.GELU(),
            nn.Conv2d(branch_channels, branch_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(branch_channels),
            nn.GELU(),
            nn.Conv2d(branch_channels, branch_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(branch_channels),
            nn.GELU()
        )
        
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            nn.Conv2d(d_embedding, branch_channels, kernel_size=1),
            nn.BatchNorm2d(branch_channels),
            nn.GELU()
        )
        
        # Reduce the concatenated channels back to the original number of channels
        self.conv_reduce = nn.Conv2d(branch_channels * 4, d_embedding, kernel_size=1)
        self.bn_reduce = nn.BatchNorm2d(d_embedding)
        self.scale = nn.Parameter(torch.tensor(1.0))  # Learnable scale for residual connection
        self.layer_norm = nn.LayerNorm([d_embedding, grid_dim, grid_dim])  # Normalize across (C, H, W)

    def forward(self, x):
        # Apply each branch to the input
        out1 = self.branch1(x)
        out2 = self.branch2(x)
        out3 = self.branch3(x)
        out4 = self.branch4(x)
        
        # Concatenate the outputs from each branch
        concatenated = torch.cat([out1, out2, out3, out4], dim=1)
        
        # Reduce back to the initial number of channels
        reduced = self.conv_reduce(concatenated)
        reduced = self.bn_reduce(reduced)
        
        # Add the residual connection with scale
        residual = x + self.scale * reduced
        residual = F.gelu(residual)  # Apply GELU after adding the residual
        
        # Normalize the output
        output = self.layer_norm(residual)
        
        return output

torch.manual_seed(RANDOM_STATE)
branch_channels = HYPERPARAMETERS['Input Embedding']['Pre-Encoder']['Branch Channels Dimension']
pre_encoder = PreEncoder(d_embedding, branch_channels, grid_dim)
pre_encoded_batch = pre_encoder(projection_embedding_batch)
pre_encoded_batch

tensor([[[[ 0.9560,  0.8386,  0.3857],
          [-0.2986,  0.9133, -0.2494],
          [ 1.4491,  0.6898,  0.7085]],

         [[ 2.8297,  2.3452,  2.0125],
          [ 2.0921,  3.3537, -0.3803],
          [ 0.3453,  0.9763,  0.3020]],

         [[-0.2553,  0.7970, -0.2106],
          [ 0.3756,  1.2573,  0.0268],
          [ 0.5103, -0.7254, -0.4265]],

         [[ 1.7953,  1.8887,  1.2367],
          [-0.4701,  0.2318, -0.5712],
          [-0.7851, -0.7337, -0.6657]],

         [[ 0.7664, -0.0679, -0.3749],
          [-0.6508, -0.5987, -0.5185],
          [-0.7772, -0.7751, -0.1373]],

         [[-0.5238, -0.7456, -0.7639],
          [-0.8012, -0.7744, -0.7971],
          [-0.7868, -0.7795, -0.7332]],

         [[ 0.0712, -0.7791, -0.7842],
          [-0.8017, -0.6425, -0.7922],
          [-0.7727, -0.8014, -0.7872]],

         [[-0.6307, -0.7551, -0.7778],
          [-0.8000, -0.7803, -0.7664],
          [-0.7695, -0.6774, -0.6590]]],


        [[[-0.4679, -0.3838, -0.4552],
       

In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class SurfacePositionalEmbedding(nn.Module):
    def __init__(self, grid_dim, d_embedding):
        super(SurfacePositionalEmbedding, self).__init__()
        self.grid_dim = grid_dim
        self.d_embedding = d_embedding
        
        # Create a regular grid in (0, 1)x(0, 1), excluding 0 and 1
        grid_points = torch.linspace(1 / (grid_dim + 1), 1 - 1 / (grid_dim + 1), grid_dim)
        mesh_x, mesh_y = torch.meshgrid(grid_points, grid_points, indexing='ij')
        self.grid_points = torch.stack([mesh_x.flatten(), mesh_y.flatten()], dim=-1)
        self.grid_points = torch.erfinv(2 * self.grid_points - 1) * np.sqrt(2)  # inverse CDF of normal

        # Initialize learnable scaling parameter (the base for positional embedding)
        self.log_scale = nn.Parameter(torch.log(torch.tensor(10000.0)))
        self.factor = nn.Parameter(torch.tensor(1.0))  # Learnable scale for the positional embedding contribution

        # Layer normalization for final output
        self.layer_norm = nn.LayerNorm([d_embedding, grid_dim, grid_dim])  # Normalizes across (channels, height, width)


    def forward(self, x):
        # x is the output from the 1x1 convolution layer with shape (batch_size, d_embedding, grid_dim, grid_dim)
        scale = torch.exp(self.log_scale)
        pos_enc = torch.zeros_like(x)

        # Repeat grid_points to match the batch size and reshape for broadcasting
        batch_grid_points = self.grid_points.repeat(x.shape[0], 1, 1).view(x.shape[0], self.grid_dim*self.grid_dim, 2)
        
        for i in range(self.d_embedding // 4):
            # Calculate positional embeddings for both dimensions
            div_factor = scale ** (4 * i / self.d_embedding)
            pos_enc[:, 4 * i, :, :] = torch.sin(batch_grid_points[:, :, 0].view(x.shape[0], self.grid_dim, self.grid_dim) / div_factor)
            pos_enc[:, 4 * i + 1, :, :] = torch.cos(batch_grid_points[:, :, 0].view(x.shape[0], self.grid_dim, self.grid_dim) / div_factor)
            pos_enc[:, 4 * i + 2, :, :] = torch.sin(batch_grid_points[:, :, 1].view(x.shape[0], self.grid_dim, self.grid_dim) / div_factor)
            pos_enc[:, 4 * i + 3, :, :] = torch.cos(batch_grid_points[:, :, 1].view(x.shape[0], self.grid_dim, self.grid_dim) / div_factor)

        # Apply the learned scale to positional embedding and add to the input
        x = x + self.factor * pos_enc
        # Normalize the final output
        x = self.layer_norm(x) 

        return x

# Create the SurfacePositionalEmbedding module
positional_encoder = SurfacePositionalEmbedding(grid_dim, d_embedding)

# Apply positional embedding
positional_embedded_batch = positional_encoder(pre_encoded_batch)
positional_embedded_batch

tensor([[[[-1.1634e-01, -2.1983e-01, -6.1912e-01],
          [-6.7181e-01,  3.9651e-01, -6.2851e-01],
          [ 1.4195e+00,  7.5005e-01,  7.6649e-01]],

         [[ 2.7745e+00,  2.3474e+00,  2.0541e+00],
          [ 2.3173e+00,  3.4295e+00,  1.3766e-01],
          [ 5.8431e-01,  1.1406e+00,  5.4612e-01]],

         [[-1.1842e+00,  2.9399e-01, -4.3738e-02],
          [-6.2802e-01,  6.9981e-01,  1.6560e-01],
          [-5.0931e-01, -1.0481e+00, -2.3406e-01]],

         [[ 1.8626e+00,  2.1379e+00,  1.3701e+00],
          [-1.3448e-01,  6.7728e-01, -2.2361e-01],
          [-4.1222e-01, -1.7387e-01, -3.0697e-01]],

         [[ 2.6105e-01, -4.7439e-01, -7.4507e-01],
          [-9.8235e-01, -9.3638e-01, -8.6566e-01],
          [-1.0879e+00, -1.0860e+00, -5.2368e-01]],

         [[ 1.1204e-02, -1.8435e-01, -2.0053e-01],
          [-2.3338e-01, -2.0977e-01, -2.2974e-01],
          [-2.2067e-01, -2.1425e-01, -1.7343e-01]],

         [[-3.5178e-01, -1.0954e+00, -1.0939e+00],
          [-1.1213e

### Final Block

In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SurfaceEmbedding(nn.Module):
    def __init__(self, grid_dim, d_embedding, branch_channels, num_pre_encoder_blocks, momentum=0.1):
        super(SurfaceEmbedding, self).__init__()
        # Initialize the InputEmbedding module
        self.input_embedding = InputEmbedding(grid_dim, d_embedding, momentum)
        
        # Initialize multiple PreEncoder blocks
        self.pre_encoders = nn.ModuleList([
            PreEncoder(d_embedding, branch_channels, grid_dim) for _ in range(num_pre_encoder_blocks)
        ])
        
        # Initialize positional embedding, adjusted to apply before flattening
        self.positional_embedding = SurfacePositionalEmbedding(grid_dim, d_embedding)

    def forward(self, batch):
        # Process batch through InputEmbedding to get initial embeddings and the processed batch
        embedding_batch, processed_batch = self.input_embedding(batch)
        
        # Sequentially pass the output through each PreEncoder block
        pre_encoded_batch = embedding_batch
        for pre_encoder in self.pre_encoders:
            pre_encoded_batch = pre_encoder(pre_encoded_batch)
        
        # Apply positional embedding to the output of the last PreEncoder block
        positional_embedded_batch = self.positional_embedding(pre_encoded_batch)
        
        # Flatten the 2D spatial structure into a sequence of tokens
        batch_size, num_channels, height, width = positional_embedded_batch.shape
        tokenized_positional_embedded_batch = positional_embedded_batch.view(batch_size, num_channels, height * width).transpose(1, 2)
        
        # The final output is now suitable for processing by transformer encoders
        return tokenized_positional_embedded_batch, processed_batch


# Example usage
torch.manual_seed(RANDOM_STATE)
grid_dim = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Grid Dimension']
d_embedding = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Channels Dimension']
branch_channels = HYPERPARAMETERS['Input Embedding']['Pre-Encoder']['Branch Channels Dimension']
num_pre_encoder_blocks = HYPERPARAMETERS['Input Embedding']['Pre-Encoder']['Number of Blocks']

surface_embedding = SurfaceEmbedding(grid_dim, d_embedding, branch_channels, num_pre_encoder_blocks)
tokenized_positional_embedded_batch, processed_batch = surface_embedding(batch)
tokenized_positional_embedded_batch, processed_batch

(tensor([[[-1.3103e+00,  2.9942e+00, -1.2890e+00,  2.0274e+00,  4.7882e-01,
            3.9665e-01, -8.5940e-01,  7.8119e-01],
          [-1.2433e+00,  2.6824e+00, -6.0633e-01,  1.5038e+00, -6.9886e-01,
           -3.5986e-02, -8.2732e-01,  7.5069e-01],
          [-1.3093e+00,  2.5279e+00, -3.5401e-01,  5.3681e-01, -2.9260e-01,
            1.7569e-01, -8.3765e-01,  6.2337e-01],
          [-8.4500e-01,  2.2993e+00, -1.3477e+00, -2.2292e-01, -7.4344e-01,
           -5.3275e-02, -8.4749e-01, -4.3185e-02],
          [-8.9896e-02,  2.9584e+00,  8.0548e-01, -3.3670e-02, -8.1568e-01,
            8.1423e-02, -7.0007e-01,  5.8728e-02],
          [-8.5408e-01,  3.1230e-02, -1.0822e-01, -4.9627e-02, -8.4836e-01,
           -4.4094e-02, -7.6418e-01,  7.9041e-01],
          [ 1.0855e+00, -1.7076e-01, -1.2241e+00, -1.3990e-01, -7.0493e-01,
            4.2291e-02, -7.7263e-01, -6.2886e-04],
          [-2.1597e-01,  1.3037e+00, -7.6288e-01,  4.0708e-02, -8.4818e-01,
           -4.7461e-02, -4.1469e-01

## Surface Encoding

In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ScaledResidualNorm(nn.Module):
    def __init__(self, d_embedding):
        super(ScaledResidualNorm, self).__init__()
        self.scale = nn.Parameter(torch.tensor(1.0))
        self.norm = nn.LayerNorm(d_embedding)

    def forward(self, x, sublayer_output):
        return self.norm(x + self.scale * sublayer_output)

class FeedForwardNetwork(nn.Module):
    def __init__(self, d_embedding, hidden_dim, dropout):
        super(FeedForwardNetwork, self).__init__()
        self.feedforward = nn.Sequential(
            nn.Linear(d_embedding, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, d_embedding),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.feedforward(x)

class Encoder(nn.Module):
    def __init__(self, d_embedding, n_heads, hidden_dim, dropout, external_dim):
        super(Encoder, self).__init__()
        self.self_attention = nn.MultiheadAttention(embed_dim=d_embedding, num_heads=n_heads, batch_first=True)
        self.scaled_residual_norm1 = ScaledResidualNorm(d_embedding)
        self.feedforward1 = FeedForwardNetwork(d_embedding, hidden_dim, dropout)
        self.scaled_residual_norm2 = ScaledResidualNorm(d_embedding)
        
        self.external_attention = nn.MultiheadAttention(embed_dim=d_embedding, num_heads=n_heads, kdim=external_dim, vdim=external_dim, batch_first=True)
        self.scaled_residual_norm3 = ScaledResidualNorm(d_embedding)
        self.feedforward2 = FeedForwardNetwork(d_embedding, hidden_dim, dropout)
        self.scaled_residual_norm4 = ScaledResidualNorm(d_embedding)

    def forward(self, surface_tokens, external_features):
        # Self-attention block
        self_attn_output, _ = self.self_attention(surface_tokens, surface_tokens, surface_tokens)
        x = self.scaled_residual_norm1(surface_tokens, self_attn_output)
        
        # Feedforward network block 1
        ff_output = self.feedforward1(x)
        x = self.scaled_residual_norm2(x, ff_output)
        
        # External attention block
        ext_attn_output, _ = self.external_attention(x, external_features, external_features)
        x = self.scaled_residual_norm3(x, ext_attn_output)
        
        # Feedforward network block 2
        ff_output = self.feedforward2(x)
        x = self.scaled_residual_norm4(x, ff_output)
        
        return x

class SurfaceEncoding(nn.Module):
    def __init__(self, d_embedding, n_heads, hidden_dim, dropout, external_dim, num_encoder_blocks):
        super(SurfaceEncoding, self).__init__()
        self.encoders = nn.ModuleList([
            Encoder(d_embedding, n_heads, hidden_dim, dropout, external_dim) for _ in range(num_encoder_blocks)
        ])

    def forward(self, tokenized_positional_embedded_batch, processed_batch):
        # Extract market features from processed batch and create external_features tensor
        market_features = processed_batch['Market Features']
        external_features = torch.stack([
            market_features['Market Return'],
            market_features['Market Volatility'],
            market_features['Treasury Rate']
        ], dim=-1).unsqueeze(1)  # (batch, 1, features)
        
        # Pass the tokenized positional embeddings and external features through each encoder block
        x = tokenized_positional_embedded_batch
        for encoder in self.encoders:
            x = encoder(x, external_features)
        
        return x
    
# Example usage
torch.manual_seed(RANDOM_STATE)
n_heads = HYPERPARAMETERS['Surface Encoding']['Encoder']['Number of Heads']
hidden_dim = HYPERPARAMETERS['Surface Encoding']['Encoder']['Hidden Dimension']
dropout = HYPERPARAMETERS['Surface Encoding']['Encoder']['Dropout']
num_encoder_blocks = HYPERPARAMETERS['Surface Encoding']['Encoder']['Number of Blocks']
external_dim = 3  # Assuming 3 market features

surface_encoding = SurfaceEncoding(d_embedding, n_heads, hidden_dim, dropout, external_dim, num_encoder_blocks)
encoded_surface = surface_encoding(tokenized_positional_embedded_batch, processed_batch)    
encoded_surface

tensor([[[-1.1987e+00,  1.3675e+00, -7.7992e-01,  5.9965e-01,  2.7499e-01,
           1.0144e+00, -1.5887e+00,  3.1089e-01],
         [-1.6126e+00,  1.3097e+00, -6.0457e-01,  7.1560e-01, -1.0897e-01,
           6.6147e-01, -1.2508e+00,  8.9028e-01],
         [-1.4569e+00,  1.1110e+00, -3.7648e-01, -5.4781e-01,  2.2596e-01,
           9.8066e-01, -1.2304e+00,  1.2940e+00],
         [-5.9283e-01,  1.9033e+00, -1.0383e+00, -4.7112e-01, -7.3180e-02,
           7.3081e-01, -1.2495e+00,  7.9080e-01],
         [-6.6071e-01,  1.3028e+00,  6.0464e-01, -5.0919e-01, -6.5181e-01,
           1.3764e+00, -1.6936e+00,  2.3139e-01],
         [-5.0388e-01, -3.7909e-01,  7.5516e-01, -6.0257e-02, -5.5426e-01,
           5.4803e-01, -1.6920e+00,  1.8863e+00],
         [ 1.6501e+00, -2.7555e-01, -7.4444e-01,  3.6414e-01, -4.6963e-01,
           5.4506e-01, -1.8434e+00,  7.7365e-01],
         [-5.7687e-01,  1.8216e+00, -7.6990e-01, -4.9892e-01, -7.4013e-01,
           3.1825e-01, -9.5147e-01,  1.3974e+00],


## Query Embedding

### Point Embedding

In [27]:
import torch
import torch.nn as nn
import numpy as np

class PointEmbedding(nn.Module):
    def __init__(self, d_embedding, surface_embedding):
        super(PointEmbedding, self).__init__()
        self.d_embedding = d_embedding
        self.log_scale = surface_embedding.positional_embedding.log_scale  # Shared log_scale parameter from SurfacePositionalEmbedding
        self.factor = nn.Parameter(torch.tensor(1.0))  # Learnable scale for the positional embedding contribution
        self.learnable_embedding = nn.Parameter(torch.randn(d_embedding))  # Learnable embedding vector
        self.layer_norm = nn.LayerNorm(d_embedding)  # Normalize across the embedding dimension
        self.scaled_residual_norm = ScaledResidualNorm(d_embedding)  # Scaled residual normalization

    def forward(self, query_point_batch):
        log_moneyness = query_point_batch['Log Moneyness']
        time_to_maturity = query_point_batch['Time to Maturity']

        # Stack the query point coordinates
        query_coords = torch.stack([log_moneyness, time_to_maturity], dim=-1)  # Shape: (batch_size, 2)

        # Positional embedding calculation
        scale = torch.exp(self.log_scale)
        pos_enc = torch.zeros(query_coords.size(0), self.d_embedding, device=query_coords.device)

        for i in range(self.d_embedding // 4):
            div_factor = scale ** (4 * i / self.d_embedding)
            pos_enc[:, 4 * i] = torch.sin(query_coords[:, 0] / div_factor)
            pos_enc[:, 4 * i + 1] = torch.cos(query_coords[:, 0] / div_factor)
            pos_enc[:, 4 * i + 2] = torch.sin(query_coords[:, 1] / div_factor)
            pos_enc[:, 4 * i + 3] = torch.cos(query_coords[:, 1] / div_factor)

        # Use ScaledResidualNorm to combine learnable embedding and positional encoding
        point_embedded = self.scaled_residual_norm(self.learnable_embedding, pos_enc)

        return point_embedded

# Example usage:
torch.manual_seed(RANDOM_STATE)
point_embedding = PointEmbedding(d_embedding, surface_embedding)
point_embedded = point_embedding(processed_batch['Query Point'])
point_embedded

tensor([[ 0.4560,  0.0788, -2.0527,  0.9809, -0.8013, -0.2920,  0.2887,  1.3414],
        [ 1.6366, -0.4008, -1.3905,  0.6827, -1.0895, -0.5814,  0.0316,  1.1113],
        [ 0.9350,  0.3038, -1.7222,  0.9410, -1.1513, -0.5932,  0.0620,  1.2250],
        [ 0.4503, -0.1364, -2.0469,  0.9685, -0.7391, -0.2309,  0.3445,  1.3901]],
       grad_fn=<NativeLayerNormBackward0>)

### Pre-Decoder

In [28]:
class PreDecoder(nn.Module):
    def __init__(self, d_embedding, hidden_dim, dropout):
        super(PreDecoder, self).__init__()
        # Initialize the feedforward network
        self.feedforward = FeedForwardNetwork(d_embedding, hidden_dim, dropout)
        # Initialize the scaled residual normalization module
        self.scaled_residual_norm = ScaledResidualNorm(d_embedding)

    def forward(self, x):
        # Process the input through the feedforward network
        feedforward_output = self.feedforward(x)
        # Apply the scaled residual connection
        output = self.scaled_residual_norm(x, feedforward_output)
        
        return output
    
# Example usage
torch.manual_seed(RANDOM_STATE)
hidden_dim = HYPERPARAMETERS['Query Embedding']['Pre-Decoder']['Hidden Dimension']
dropout = HYPERPARAMETERS['Query Embedding']['Pre-Decoder']['Dropout']

pre_decoder = PreDecoder(d_embedding, hidden_dim, dropout)
pre_decoded_output = pre_decoder(point_embedded)  # Example input
pre_decoded_output

tensor([[ 0.0180,  0.3673, -2.0930,  0.9128, -0.2409, -0.6894,  0.3017,  1.4235],
        [ 1.3776, -0.0990, -1.6046,  0.7119, -0.8454, -0.7839, -0.0523,  1.2957],
        [ 0.6001,  0.4164, -1.8311,  1.0283, -0.7235, -0.8874,  0.0784,  1.3187],
        [ 0.0971,  0.1185, -2.1002,  0.8977, -0.2964, -0.5036,  0.2526,  1.5344]],
       grad_fn=<NativeLayerNormBackward0>)

### Final BLock

In [29]:
class QueryEmbedding(nn.Module):
    def __init__(self, d_embedding, surface_embedding, num_pre_decoder_blocks, hidden_dim, dropout):
        super(QueryEmbedding, self).__init__()
        # Initialize the PointEmbedding
        self.point_embedding = PointEmbedding(d_embedding, surface_embedding)

        # Initialize the PreDecoder blocks
        self.pre_decoders = nn.ModuleList([
            PreDecoder(d_embedding, hidden_dim, dropout) for _ in range(num_pre_decoder_blocks)
        ])

    def forward(self, processed_batch):
        query_point_batch = processed_batch['Query Point']

        # First apply the point embedding
        query_embedded = self.point_embedding(query_point_batch)
        
        # Sequentially apply each PreDecoder block
        for pre_decoder in self.pre_decoders:
            query_embedded = pre_decoder(query_embedded)
        
        # Reshape the output to (batch, 1, embedding) to make it a sequence of length 1
        query_embedded = query_embedded.unsqueeze(1)  # Add the sequence length dimension
        
        return query_embedded
    
# Example usage
torch.manual_seed(RANDOM_STATE)
hidden_dim = HYPERPARAMETERS['Query Embedding']['Pre-Decoder']['Hidden Dimension']
dropout = HYPERPARAMETERS['Query Embedding']['Pre-Decoder']['Dropout']
num_pre_decoder_blocks = HYPERPARAMETERS['Query Embedding']['Pre-Decoder']['Number of Blocks']    

query_embedding = QueryEmbedding(d_embedding, surface_embedding, num_pre_decoder_blocks, hidden_dim, dropout)
query_embedded = query_embedding(processed_batch)
query_embedded

tensor([[[ 0.6811,  0.3073, -1.8038,  1.0033, -1.1785, -0.3943,  0.1102,
           1.2747]],

        [[ 1.4415, -0.4677, -0.8475,  0.9473, -1.1058, -1.1960,  0.0308,
           1.1974]],

        [[ 1.1658,  0.1414, -1.2126,  0.9689, -1.2106, -1.1096,  0.0261,
           1.2305]],

        [[ 0.9947,  0.0524, -1.7013,  1.0016, -0.9316, -0.9155,  0.3760,
           1.1237]]], grad_fn=<UnsqueezeBackward0>)

## Surface Decoding

In [30]:
class DecoderBlock(nn.Module):
    def __init__(self, d_embedding, n_heads, hidden_dim, dropout):
        super(DecoderBlock, self).__init__()
        self.cross_attention = nn.MultiheadAttention(embed_dim=d_embedding, num_heads=n_heads, batch_first=True)
        self.scaled_residual_norm1 = ScaledResidualNorm(d_embedding)
        self.feedforward = FeedForwardNetwork(d_embedding, hidden_dim, dropout)
        self.scaled_residual_norm2 = ScaledResidualNorm(d_embedding)

    def forward(self, query, encoded_surface):
        # Cross-attention
        cross_attn_output, _ = self.cross_attention(query, encoded_surface, encoded_surface)
        x = self.scaled_residual_norm1(query, cross_attn_output)

        # Feedforward network
        ff_output = self.feedforward(x)
        x = self.scaled_residual_norm2(x, ff_output)

        return x

class SurfaceDecoder(nn.Module):
    def __init__(self, d_embedding, n_heads, hidden_dim, dropout, num_decoder_blocks):
        super(SurfaceDecoder, self).__init__()
        self.decoder_blocks = nn.ModuleList([
            DecoderBlock(d_embedding, n_heads, hidden_dim, dropout) for _ in range(num_decoder_blocks)
        ])

    def forward(self, query_embedded, encoded_surface):
        x = query_embedded

        for decoder_block in self.decoder_blocks:
            x = decoder_block(x, encoded_surface)

        return x
    
# Example usage:
torch.manual_seed(RANDOM_STATE)
n_heads = HYPERPARAMETERS['Surface Decoding']['Decoder']['Number of Heads']
hidden_dim = HYPERPARAMETERS['Surface Decoding']['Decoder']['Hidden Dimension']
dropout = HYPERPARAMETERS['Surface Decoding']['Decoder']['Dropout']
num_decoder_blocks = HYPERPARAMETERS['Surface Decoding']['Decoder']['Number of Blocks']

surface_decoder = SurfaceDecoder(d_embedding, n_heads, hidden_dim, dropout, num_decoder_blocks)
decoded_surface = surface_decoder(query_embedded, encoded_surface)
decoded_surface

tensor([[[ 1.0697, -0.2544, -1.3467,  0.9685, -1.7464,  0.7781, -0.0829,
           0.6141]],

        [[ 1.6291, -1.0379, -0.9156,  1.2129, -1.1923,  0.1436, -0.4220,
           0.5822]],

        [[ 1.2781, -0.4840, -1.2519,  1.1615, -1.4681,  0.4846, -0.4871,
           0.7669]],

        [[ 1.1862,  0.3340, -1.5831,  1.0233, -1.4032, -0.5205,  0.1472,
           0.8163]]], grad_fn=<NativeLayerNormBackward0>)

## IvySPT

In [31]:
class IvySPT(nn.Module):
    def __init__(self, hyperparameters):
        super(IvySPT, self).__init__()
        # Extract hyperparameters for each component
        input_embedding_hp = hyperparameters['Input Embedding']
        surface_encoding_hp = hyperparameters['Surface Encoding']['Encoder']
        query_embedding_hp = hyperparameters['Query Embedding']['Pre-Decoder']
        surface_decoding_hp = hyperparameters['Surface Decoding']['Decoder']

        # Initialize modules
        self.surface_embedding = SurfaceEmbedding(
            grid_dim=input_embedding_hp['Surface Embedding']['Grid Dimension'],
            d_embedding=input_embedding_hp['Surface Embedding']['Channels Dimension'],
            branch_channels=input_embedding_hp['Pre-Encoder']['Branch Channels Dimension'],
            num_pre_encoder_blocks=input_embedding_hp['Pre-Encoder']['Number of Blocks']
        )

        self.surface_encoding = SurfaceEncoding(
            d_embedding=input_embedding_hp['Surface Embedding']['Channels Dimension'],
            n_heads=surface_encoding_hp['Number of Heads'],
            hidden_dim=surface_encoding_hp['Hidden Dimension'],
            dropout=surface_encoding_hp['Dropout'],
            external_dim=surface_encoding_hp['External Feature Dimension'],
            num_encoder_blocks=surface_encoding_hp['Number of Blocks']
        )

        self.query_embedding = QueryEmbedding(
            d_embedding=input_embedding_hp['Surface Embedding']['Channels Dimension'],
            surface_embedding=self.surface_embedding,
            num_pre_decoder_blocks=query_embedding_hp['Number of Blocks'],
            hidden_dim=query_embedding_hp['Hidden Dimension'],
            dropout=query_embedding_hp['Dropout']
        )

        self.surface_decoder = SurfaceDecoder(
            d_embedding=input_embedding_hp['Surface Embedding']['Channels Dimension'],
            n_heads=surface_decoding_hp['Number of Heads'],
            hidden_dim=surface_decoding_hp['Hidden Dimension'],
            dropout=surface_decoding_hp['Dropout'],
            num_decoder_blocks=surface_decoding_hp['Number of Blocks']
        )

        # Final fully connected layer to predict implied volatility
        self.final_layer = nn.Linear(input_embedding_hp['Surface Embedding']['Channels Dimension'], 1)

    def forward(self, batch):
        # Pass batch through Surface Embedding
        tokenized_positional_embedded_batch, processed_batch = self.surface_embedding(batch)

        # Pass tokenized embeddings and market features to Surface Encoding
        encoded_output = self.surface_encoding(tokenized_positional_embedded_batch, processed_batch)

        # Pass query points through Query Embedding
        query_embedded = self.query_embedding(processed_batch)

        # Decode the query embeddings with the encoded surface data
        decoded_output = self.surface_decoder(query_embedded, encoded_output)

        # Apply the final fully connected layer
        final_output = self.final_layer(decoded_output.squeeze(1)).squeeze(1)  # Removing sequence length dimension and flattening

        return final_output

torch.manual_seed(RANDOM_STATE)    
ivy_spt = IvySPT(HYPERPARAMETERS)
iv_estimates = ivy_spt(batch)  
iv_estimates  

tensor([0.9235, 0.2264, 0.3986, 0.6405], grad_fn=<SqueezeBackward1>)

## Surface Arbitrage Free Loss

In [32]:
class SurfaceArbitrageFreeLoss(nn.Module):
    def __init__(self, hyperparameters):
        super(SurfaceArbitrageFreeLoss, self).__init__()
        self.calendar_coeff = hyperparameters['No-Arbitrage']['Calendar']
        self.butterfly_coeff = hyperparameters['No-Arbitrage']['Butterfly']

    def forward(self, iv_estimates, batch):
        target_volatility = batch['Target Volatility']

        # Calculate mean squared error between model estimates and target volatilities
        mse_loss = F.mse_loss(iv_estimates, target_volatility)

        # Calculate the total implied variance
        time_to_maturity = batch['Query Point']['Time to Maturity']
        log_moneyness = batch['Query Point']['Log Moneyness']
        total_implied_variance = time_to_maturity * iv_estimates.pow(2)

        # Compute gradients needed for arbitrage conditions
        w_t = torch.autograd.grad(total_implied_variance.sum(), time_to_maturity, create_graph=True)[0] 
        w_x = torch.autograd.grad(total_implied_variance.sum(), log_moneyness, create_graph=True)[0]
        w_xx = torch.autograd.grad(w_x.sum(), log_moneyness, create_graph=True)[0]

        # Calculate Calendar Arbitrage Loss
        calendar_arbitrage_loss = torch.mean(torch.clamp(-w_t, min=0) ** 2)

        # Calculate Butterfly Arbitrage Loss
        w = total_implied_variance
        g = (1 - log_moneyness * w_x / (2 * w)) ** 2 - w_x / 4 * (1 / w + 1 / 4) + w_xx / 2
        butterfly_arbitrage_loss = torch.mean(torch.clamp(-g, min=0) ** 2)

        # Combine all losses with coefficients
        total_loss = mse_loss + self.calendar_coeff * calendar_arbitrage_loss + self.butterfly_coeff * butterfly_arbitrage_loss

        return total_loss
    
surface_arbitrage_free_loss = SurfaceArbitrageFreeLoss(HYPERPARAMETERS)    
total_loss = surface_arbitrage_free_loss(iv_estimates, batch)
total_loss

tensor(0.1195, grad_fn=<AddBackward0>)