In [1]:
import numpy as np
import pandas as pd
import random
import torch

In [2]:
# Notebook-wide configuration constants.
RANDOM_STATE = 0  # single seed shared by every random number generator below
N_JOBS = 8  # NOTE(review): not referenced in the cells shown here; presumably a worker count

# Seed PyTorch, NumPy and the standard-library generator (in that order) with
# the same value so stochastic cells reproduce on a fresh kernel.
for _seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    _seed_rng(RANDOM_STATE)
del _seed_rng  # keep the notebook namespace free of the loop variable

In [3]:
# Central configuration for the notebook, grouped by pipeline stage.
HYPERPARAMETERS = {
    # Data preparation.
    'Input Preprocessing': {
        # Candidate masking fractions (not referenced in the cells shown here).
        'Mask Proportions': [0.1, 0.3, 0.5, 0.7],
        # Number of surfaces per DataLoader batch.
        'Batch Size': 4,
    },
    # Embedding stage.
    'Surface Embedding': {
        # Width of every per-point embedding vector (d_embedding).
        'Embedding Dimension': 8,
    },
    # Encoder stack.
    'Surface Encoding': {
        'Number of Heads': 4,
        'FFN Hidden Dimension': 16,
        'Attention Dropout': 0.1,
        'Gate Dropout': 0.1,
        'FFN Dropout': 0.1,
        'Number of Blocks': 2,
        # Matches the three market features stacked as external features.
        'External Feature Dimension': 3,
    },
    # NOTE(review): presumably weights of the no-arbitrage penalty terms;
    # they are not used in the cells shown here - confirm against the loss.
    'No-Arbitrage': {
        'Butterfly': 1,
        'Calendar': 1,
    },
}

## Dataset

In [4]:
# Load the option quotes; the first two CSV columns (Datetime, Symbol) become
# a two-level row index and the dates are parsed as ISO 8601.
aapl_googl_data = pd.read_csv(
    'volatility_surface_AAPL_GOOGL_2013_01_2013_06.csv',
    index_col=[0, 1],
    parse_dates=True,
    date_format="ISO8601",
)
aapl_googl_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Log Moneyness,Time to Maturity,Implied Volatility,Market Return,Market Volatility,Treasury Rate
Datetime,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-02,AAPL,-0.316688,0.007937,0.3726,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.316688,0.007937,0.6095,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.304266,0.007937,0.3726,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.304266,0.007937,0.6095,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.291996,0.007937,0.3726,0.025086,14.680000,0.055
...,...,...,...,...,...,...,...
2013-06-28,GOOGL,0.427518,2.253968,0.2430,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.434898,2.253968,0.2383,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.434898,2.253968,0.2426,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.442224,2.253968,0.2402,-0.004299,16.860001,0.030


In [5]:
def implied_volatility_surfaces(options_market_data):
    """Split a long table of option quotes into one record per surface.

    Args:
        options_market_data: DataFrame whose row index has the levels
            'Datetime' and 'Symbol' and whose columns include
            'Log Moneyness', 'Time to Maturity', 'Implied Volatility',
            'Market Return', 'Market Volatility' and 'Treasury Rate'.

    Returns:
        A list with one dict per (Datetime, Symbol) group, in groupby order.
        Each dict holds the group key under 'Datetime' / 'Symbol', the first
        row's value of each market column under 'Market Features', and the
        full per-quote arrays under 'Surface'.
    """
    market_columns = ('Market Return', 'Market Volatility', 'Treasury Rate')
    surface_columns = ('Log Moneyness', 'Time to Maturity', 'Implied Volatility')

    def build_record(group_key, quotes):
        quote_date, ticker = group_key
        return {
            'Datetime': quote_date,
            'Symbol': ticker,
            # Market columns are read from the first row of the group only.
            'Market Features': {name: quotes[name].values[0] for name in market_columns},
            'Surface': {name: quotes[name].values for name in surface_columns},
        }

    per_surface_groups = options_market_data.groupby(level=['Datetime', 'Symbol'])
    return [build_record(group_key, quotes) for group_key, quotes in per_surface_groups]

# Build one record per (Datetime, Symbol) group and preview the first record.
surfaces = implied_volatility_surfaces(aapl_googl_data)
surfaces[0]

{'Datetime': Timestamp('2013-01-02 00:00:00'),
 'Symbol': 'AAPL',
 'Market Features': {'Market Return': 0.0250861159586972,
  'Market Volatility': 14.68000030517578,
  'Treasury Rate': 0.0549999997019767},
 'Surface': {'Log Moneyness': array([-0.31668849, -0.31668849, -0.30426597, ...,  0.63882295,
          0.6483924 ,  0.6483924 ]),
  'Time to Maturity': array([0.00793651, 0.00793651, 0.00793651, ..., 2.95634921, 2.95634921,
         2.95634921]),
  'Implied Volatility': array([0.3726, 0.6095, 0.3726, ..., 0.3387, 0.3342, 0.3389])}}

In [6]:
import torch
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from torch.utils.data import Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
import numpy as np

class IVSurfaceDataset(Dataset):
    """Dataset that splits each volatility surface into input and query points.

    Every item is one surface record (a dict with 'Datetime', 'Symbol',
    'Market Features' and 'Surface' entries). The surface points are clustered
    on their standardized (log moneyness, time to maturity) coordinates and a
    fraction of every cluster is held out as 'Query Points'; the remaining
    points form the 'Input Surface'.

    Args:
        data: Sequence of surface records.
        proportion: Fraction of each cluster to hold out as query points. The
            number of clusters is ``ceil(1 / proportion)``.
        random_state: Seed for both the clustering and the sampling. A fresh
            generator is created from it on every ``__getitem__`` call, so the
            same index always produces the same split.
    """

    def __init__(
        self,
        data,
        proportion,
        random_state=0
    ):
        self.data = data
        self.proportion = proportion
        self.random_state = random_state

    def __len__(self):
        """Return the number of surfaces."""
        return len(self.data)

    def _sample_masked_indices(self, labels, n_clusters):
        """Draw ``ceil(cluster_size * proportion)`` point indices from each cluster."""
        rng = np.random.default_rng(self.random_state)
        sampled_per_cluster = []
        for cluster in range(n_clusters):
            cluster_indices = np.where(labels == cluster)[0]
            # Use the instance's own proportion, never a module-level variable.
            num_to_mask = int(np.ceil(len(cluster_indices) * self.proportion))
            sampled_per_cluster.append(
                rng.choice(cluster_indices, size=num_to_mask, replace=False)
            )
        return np.concatenate(sampled_per_cluster).astype(int, copy=False)

    def __getitem__(self, idx):
        """Return the surface at ``idx`` split into input and query tensors.

        Returns:
            A dict with 'Datetime', 'Symbol', 'Market Features' (float32
            scalar tensors), and 'Input Surface' / 'Query Points', each
            holding float32 1-D tensors for 'Log Moneyness',
            'Time to Maturity' and 'Implied Volatility'.
        """
        surface_data = self.data[idx]
        surface = surface_data['Surface']
        market = surface_data['Market Features']

        # Extract the surface coordinates and volatilities
        points_coordinates = np.stack([
            surface['Log Moneyness'],
            surface['Time to Maturity']
        ], axis=1)
        points_volatilities = surface['Implied Volatility']

        # Cluster the standardized coordinates so the held-out points are
        # spread over the whole surface.
        n_clusters = int(np.ceil(1 / self.proportion))
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('kmeans', KMeans(n_clusters=n_clusters, random_state=self.random_state, n_init='auto'))
        ])
        labels = pipeline.fit_predict(points_coordinates)

        masked_indices = self._sample_masked_indices(labels, n_clusters)
        unmasked_indices = np.setdiff1d(np.arange(len(labels)), masked_indices)

        def as_float_tensor(values):
            return torch.tensor(values, dtype=torch.float32)

        def select_points(indices):
            return {
                'Log Moneyness': as_float_tensor(points_coordinates[indices, 0]),
                'Time to Maturity': as_float_tensor(points_coordinates[indices, 1]),
                'Implied Volatility': as_float_tensor(points_volatilities[indices])
            }

        data_item = {
            'Datetime': surface_data['Datetime'],
            'Symbol': surface_data['Symbol'],
            'Market Features': {
                'Market Return': as_float_tensor(market['Market Return']),
                'Market Volatility': as_float_tensor(market['Market Volatility']),
                'Treasury Rate': as_float_tensor(market['Treasury Rate']),
            },
            'Input Surface': select_points(unmasked_indices),
            'Query Points': select_points(masked_indices)
        }

        return data_item

    @staticmethod
    def collate_fn(batch):
        """Combine dataset items into a batch without padding.

        Market features are stacked into one tensor per feature. Surfaces
        have different numbers of points, so 'Input Surface' and
        'Query Points' stay lists with one detached tensor copy per item.
        Query coordinates are marked ``requires_grad`` so derivatives with
        respect to them can be taken downstream.
        """
        def copies(section, feature, requires_grad=False):
            tensors = [item[section][feature].clone().detach() for item in batch]
            if requires_grad:
                tensors = [tensor.requires_grad_(True) for tensor in tensors]
            return tensors

        batched_data = {
            'Datetime': [item['Datetime'] for item in batch],
            'Symbol': [item['Symbol'] for item in batch],
            'Market Features': {
                'Market Return': default_collate([item['Market Features']['Market Return'] for item in batch]),
                'Market Volatility': default_collate([item['Market Features']['Market Volatility'] for item in batch]),
                'Treasury Rate': default_collate([item['Market Features']['Treasury Rate'] for item in batch]),
            },
            'Input Surface': {
                'Log Moneyness': copies('Input Surface', 'Log Moneyness'),
                'Time to Maturity': copies('Input Surface', 'Time to Maturity'),
                'Implied Volatility': copies('Input Surface', 'Implied Volatility'),
            },
            'Query Points': {
                'Log Moneyness': copies('Query Points', 'Log Moneyness', requires_grad=True),
                'Time to Maturity': copies('Query Points', 'Time to Maturity', requires_grad=True),
                'Implied Volatility': copies('Query Points', 'Implied Volatility'),
            }
        }

        return batched_data


# `surfaces` is the list of records returned by implied_volatility_surfaces.
proportion = 0.2  # example masking proportion; ceil(1 / 0.2) = 5 clusters per surface
dataset = IVSurfaceDataset(surfaces, proportion)
# Surfaces have different numbers of points, so the dataset's own collate_fn
# is used; it keeps the per-surface tensors in lists instead of stacking them.
data_loader = DataLoader(
    dataset, 
    batch_size=HYPERPARAMETERS['Input Preprocessing']['Batch Size'], 
    shuffle=True, 
    num_workers=0, 
    collate_fn=IVSurfaceDataset.collate_fn
)

# Fetch one batch from the DataLoader; later cells reuse `batch` as example input.
batch = next(iter(data_loader))
batch

{'Datetime': [Timestamp('2013-06-10 00:00:00'),
  Timestamp('2013-01-28 00:00:00'),
  Timestamp('2013-05-20 00:00:00'),
  Timestamp('2013-03-07 00:00:00')],
 'Symbol': ['AAPL', 'AAPL', 'GOOGL', 'AAPL'],
 'Market Features': {'Market Return': tensor([-0.0003, -0.0019, -0.0007,  0.0018]),
  'Market Volatility': tensor([15.4400, 13.5700, 13.0200, 13.0600]),
  'Treasury Rate': tensor([0.0400, 0.0600, 0.0350, 0.0900])},
 'Input Surface': {'Log Moneyness': [tensor([-0.3159, -0.2852, -0.2553,  ...,  0.8627,  0.8723,  0.8723]),
   tensor([-0.2509, -0.2509, -0.2368,  ...,  0.8381,  0.8477,  0.8477]),
   tensor([-0.3741, -0.3661, -0.3582,  ...,  0.3886,  0.4034,  0.4107]),
   tensor([-0.1790, -0.1790, -0.1652,  ...,  0.8818,  0.8914,  0.8914])],
  'Time to Maturity': [tensor([0.0159, 0.0159, 0.0159,  ..., 2.3254, 2.3254, 2.3254]),
   tensor([0.0159, 0.0159, 0.0159,  ..., 2.8532, 2.8532, 2.8532]),
   tensor([0.0159, 0.0159, 0.0159,  ..., 2.4087, 2.4087, 2.4087]),
   tensor([0.0040, 0.0040, 0.0040,

## Surface Embedding

### Components

In [7]:
import torch
import torch.nn as nn
from torch.utils.data._utils.collate import default_collate

class SurfaceBatchNorm(nn.Module):
    """Batch-normalizes point coordinates and market features of a batch.

    Each of the five quantities (log moneyness, time to maturity, market
    return, market volatility, treasury rate) has its own ``nn.BatchNorm1d``.
    Coordinates are normalized over all input-surface and query points of the
    batch pooled together; market features are normalized across the surfaces
    of the batch. Implied volatilities are passed through unchanged.

    Args:
        num_features: Feature count given to every ``BatchNorm1d``. ``forward``
            feeds each quantity as a single column.
        momentum: Running-statistics momentum given to every ``BatchNorm1d``.
    """

    def __init__(
        self,
        num_features=1,
        momentum=0.1
    ):
        super().__init__()
        self.log_moneyness_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.time_to_maturity_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.market_return_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.market_volatility_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.treasury_rate_bn = nn.BatchNorm1d(num_features, momentum=momentum)

    @staticmethod
    def _standardize(batch_norm, values):
        """Run a 1-D tensor through ``batch_norm`` as a single-column matrix."""
        return batch_norm(values.unsqueeze(1)).squeeze(1)

    def _pooled_coordinate(self, batch_norm, batch, feature):
        """Normalize all input and query values of ``feature`` in one pass."""
        pooled_input = torch.cat(list(batch['Input Surface'][feature]))
        pooled = torch.cat([pooled_input, *batch['Query Points'][feature]])
        return self._standardize(batch_norm, pooled)

    def forward(self, batch):
        """Return a copy of ``batch`` with normalized coordinates and features.

        Args:
            batch: Dict with 'Datetime', 'Symbol', 'Market Features' (one
                tensor per feature) and 'Input Surface' / 'Query Points'
                (lists of 1-D tensors per feature).

        Returns:
            A dict with the same layout. 'Datetime', 'Symbol' and both
            'Implied Volatility' entries are the objects from ``batch``.
        """
        norm_log_moneyness = self._pooled_coordinate(
            self.log_moneyness_bn, batch, 'Log Moneyness')
        norm_time_to_maturity = self._pooled_coordinate(
            self.time_to_maturity_bn, batch, 'Time to Maturity')

        # Segment lengths used to undo the pooling; both coordinates are split
        # using the lengths of the log-moneyness tensors.
        input_surface_sizes = [len(points) for points in batch['Input Surface']['Log Moneyness']]
        query_points_sizes = [len(points) for points in batch['Query Points']['Log Moneyness']]
        boundary = sum(input_surface_sizes)

        def split_input(pooled):
            return list(torch.split(pooled[:boundary], input_surface_sizes))

        def split_query(pooled):
            return list(torch.split(pooled[boundary:], query_points_sizes))

        market_features = batch['Market Features']
        output = {
            'Datetime': batch['Datetime'],
            'Symbol': batch['Symbol'],
            'Market Features': {
                'Market Return': self._standardize(
                    self.market_return_bn, market_features['Market Return']),
                'Market Volatility': self._standardize(
                    self.market_volatility_bn, market_features['Market Volatility']),
                'Treasury Rate': self._standardize(
                    self.treasury_rate_bn, market_features['Treasury Rate'])
            },
            'Input Surface': {
                'Log Moneyness': split_input(norm_log_moneyness),
                'Time to Maturity': split_input(norm_time_to_maturity),
                'Implied Volatility': batch['Input Surface']['Implied Volatility']
            },
            'Query Points': {
                'Log Moneyness': split_query(norm_log_moneyness),
                'Time to Maturity': split_query(norm_time_to_maturity),
                'Implied Volatility': batch['Query Points']['Implied Volatility']
            }
        }

        # Flag the normalized query coordinates (not the volatilities) as
        # requiring gradients.
        for feature in ('Log Moneyness', 'Time to Maturity'):
            for coordinate in output['Query Points'][feature]:
                coordinate.requires_grad_()

        return output

# Usage: normalize the example batch fetched from the DataLoader above.
surfacebatchnorm = SurfaceBatchNorm()
processed_batch = surfacebatchnorm(batch)
processed_batch

{'Datetime': [Timestamp('2013-06-10 00:00:00'),
  Timestamp('2013-01-28 00:00:00'),
  Timestamp('2013-05-20 00:00:00'),
  Timestamp('2013-03-07 00:00:00')],
 'Symbol': ['AAPL', 'AAPL', 'GOOGL', 'AAPL'],
 'Market Features': {'Market Return': tensor([-0.0216, -0.4603, -0.1269,  0.6087], grad_fn=<SqueezeBackward1>),
  'Market Volatility': tensor([ 1.6897, -0.2052, -0.7625, -0.7220], grad_fn=<SqueezeBackward1>),
  'Treasury Rate': tensor([-0.7439,  0.1717, -0.9728,  1.5450], grad_fn=<SqueezeBackward1>)},
 'Input Surface': {'Log Moneyness': [tensor([-1.0316, -0.9561, -0.8829,  ...,  1.8599,  1.8834,  1.8834],
          grad_fn=<SplitWithSizesBackward0>),
   tensor([-0.8722, -0.8722, -0.8374,  ...,  1.7995,  1.8230,  1.8230],
          grad_fn=<SplitWithSizesBackward0>),
   tensor([-1.1743, -1.1547, -1.1353,  ...,  0.6968,  0.7331,  0.7511],
          grad_fn=<SplitWithSizesBackward0>),
   tensor([-0.6958, -0.6958, -0.6619,  ...,  1.9068,  1.9303,  1.9303],
          grad_fn=<SplitWithSizesB

In [8]:
import torch
import torch.nn as nn
import numpy as np

class EllipticalRBFKernel(nn.Module):
    """RBF kernel with a learnable per-axis scale and a fixed bandwidth.

    For a difference vector ``d`` the kernel value is
    ``exp(-(d^T S d / trace(S)) / (2 * bandwidth**2))`` where ``S`` is the
    diagonal matrix ``diag(exp(log_scale))``.

    Args:
        input_dim: Length of the difference vectors (size of ``log_scale``).
        bandwidth: Kernel bandwidth; stored as given and not trained.
    """

    def __init__(
        self,
        input_dim,
        bandwidth
    ):
        super().__init__()
        self.bandwidth = bandwidth
        # Zero log-scales correspond to unit scale on every axis.
        self.log_scale = nn.Parameter(torch.zeros(input_dim))

    def forward(self, distances):
        """Evaluate the kernel.

        Args:
            distances: A single difference vector of length ``input_dim``
                (this is how the embedding module calls it, one vector at a
                time through ``torch.vmap``). A 2-D input is multiplied with
                its own transpose, so it produces a square matrix rather than
                one value per row.

        Returns:
            The kernel value(s) as a tensor.
        """
        axis_weights = self.log_scale.exp()
        weight_matrix = axis_weights.diag()

        # Quadratic form d^T S d, divided by trace(S).
        quadratic_form = torch.matmul(torch.matmul(distances, weight_matrix), distances.t())
        quadratic_form = quadratic_form / weight_matrix.trace()

        return torch.exp(-quadratic_form / (2 * self.bandwidth ** 2))

class SurfaceContinuousKernelPositionalEmbedding(nn.Module):
    """Embeds surface and query points with kernel smoothing plus positions.

    For every point, channel ``k`` of the embedding is the kernel-weighted
    average of the input surface's implied volatilities, using the ``k``-th
    ``EllipticalRBFKernel`` (each has a different fixed bandwidth). The result
    is layer-normalized (separately for input and query points) and a
    sinusoidal positional embedding of the coordinates, scaled by sqrt(2), is
    added.

    Args:
        d_embedding: Number of embedding channels (one kernel per channel).
    """

    def __init__(self, d_embedding):
        super(SurfaceContinuousKernelPositionalEmbedding, self).__init__()
        self.d_embedding = d_embedding

        # One RBF kernel per channel; bandwidth i is sqrt(2) * erfinv(i / (d + 1)).
        self.kernels = nn.ModuleList()
        for i in range(1, d_embedding + 1):
            bandwidth_value = torch.erfinv(torch.tensor(i / (d_embedding + 1))) * np.sqrt(2)
            self.kernels.append(EllipticalRBFKernel(bandwidth=bandwidth_value, input_dim=2))

        self.input_surface_layer_norm = nn.LayerNorm(d_embedding)
        self.query_points_layer_norm = nn.LayerNorm(d_embedding)

        # Learnable log of the positional-embedding base (initialised to 10000).
        self.log_scale = nn.Parameter(torch.log(torch.tensor(10000.0)))

    def _kernel_smoothed_ivs(self, all_coords, surface_coords, surface_ivs):
        """Return the (num_points, d_embedding) kernel-weighted IV averages."""
        num_points = all_coords.shape[0]
        num_surface_points = surface_coords.shape[0]

        # Differences between every point and every input surface point: (n+m, n, 2).
        point_differences = all_coords.unsqueeze(1) - surface_coords.unsqueeze(0)
        flat_differences = point_differences.reshape(-1, point_differences.shape[-1])

        all_embedded = torch.zeros(
            (num_points, self.d_embedding), dtype=torch.float32, device=surface_coords.device
        )

        for kernel_idx, kernel in enumerate(self.kernels):
            # The kernel takes one difference vector at a time; vmap maps it over rows.
            kernel_outputs = torch.vmap(kernel, in_dims=(0,))(flat_differences)
            kernel_outputs = kernel_outputs.reshape(num_points, num_surface_points)  # (n+m, n)

            weighted_sum = (kernel_outputs * surface_ivs.unsqueeze(0)).sum(dim=1)
            normalization_factor = kernel_outputs.sum(dim=1)

            # If every weight of a point underflows to zero the plain ratio is
            # 0 / 0 = NaN. Divide by one there instead, so the channel becomes 0.
            safe_normalization = torch.where(
                normalization_factor > 0,
                normalization_factor,
                torch.ones_like(normalization_factor),
            )

            all_embedded[:, kernel_idx] = weighted_sum / safe_normalization

        return all_embedded

    def forward(
        self,
        input_surface_batch,
        query_points_batch
    ):
        """Embed every surface of a batch.

        Args:
            input_surface_batch: Dict with lists of 1-D tensors under
                'Log Moneyness', 'Time to Maturity' and 'Implied Volatility'.
            query_points_batch: Dict with lists of 1-D tensors under
                'Log Moneyness' and 'Time to Maturity'.

        Returns:
            Dict with 'Input Surface' and 'Query Points', each a list holding
            one (num_points, d_embedding) tensor per surface.
        """
        batch_size = len(input_surface_batch['Log Moneyness'])

        input_surface_embeddings = []
        query_points_embeddings = []

        for i in range(batch_size):
            # Coordinates and implied volatilities of the i-th surface.
            surface_coords = torch.stack([
                input_surface_batch['Log Moneyness'][i],
                input_surface_batch['Time to Maturity'][i]
            ], dim=-1)
            surface_ivs = input_surface_batch['Implied Volatility'][i]

            query_coords = torch.stack([
                query_points_batch['Log Moneyness'][i],
                query_points_batch['Time to Maturity'][i]
            ], dim=-1)

            all_coords = torch.cat((surface_coords, query_coords), dim=0)
            all_embedded = self._kernel_smoothed_ivs(all_coords, surface_coords, surface_ivs)

            # The first n rows belong to the input surface, the rest to the query points.
            num_surface_points = surface_coords.shape[0]
            input_surface_embedded = self.input_surface_layer_norm(all_embedded[:num_surface_points, :])
            query_points_embedded = self.query_points_layer_norm(all_embedded[num_surface_points:, :])

            # Add positional embeddings with a factor of sqrt(2)
            input_surface_final = input_surface_embedded + self._compute_positional_embedding(surface_coords) * np.sqrt(2)
            query_points_final = query_points_embedded + self._compute_positional_embedding(query_coords) * np.sqrt(2)

            input_surface_embeddings.append(input_surface_final)
            query_points_embeddings.append(query_points_final)

        # Keep all encoded surfaces as lists to handle variable lengths
        return {
            'Input Surface': input_surface_embeddings,
            'Query Points': query_points_embeddings
        }

    def _compute_positional_embedding(
        self,
        coords,
    ):
        """Return the sinusoidal embedding of ``coords`` (shape (N, 2)).

        Channels ``4i .. 4i+3`` hold sin/cos of the first coordinate and
        sin/cos of the second coordinate, each divided by
        ``base ** (4i / d_embedding)``. When ``d_embedding`` is not a multiple
        of four the trailing channels stay zero.
        """
        positional_embedding = torch.zeros(coords.size(0), self.d_embedding, device=coords.device)
        base = torch.exp(self.log_scale)

        for i in range(self.d_embedding // 4):
            div_factor = base ** (4 * i / self.d_embedding)
            positional_embedding[:, 4 * i] = torch.sin(coords[:, 0] / div_factor)
            positional_embedding[:, 4 * i + 1] = torch.cos(coords[:, 0] / div_factor)
            positional_embedding[:, 4 * i + 2] = torch.sin(coords[:, 1] / div_factor)
            positional_embedding[:, 4 * i + 3] = torch.cos(coords[:, 1] / div_factor)

        return positional_embedding

# Example of initializing and using this module on the normalized example batch
d_embedding = HYPERPARAMETERS['Surface Embedding']['Embedding Dimension']  # Desired number of output channels

continuous_kernel_positional_embedding = SurfaceContinuousKernelPositionalEmbedding(d_embedding=d_embedding)
# The module takes the 'Input Surface' and 'Query Points' sections separately.
kernel_positional_embedded_batch = continuous_kernel_positional_embedding(processed_batch['Input Surface'], processed_batch['Query Points'])
kernel_positional_embedded_batch

{'Input Surface': [tensor([[ 0.2954,  2.0830, -0.5693,  ...,  0.6434, -0.9860,  0.2933],
          [ 0.5820,  2.0090, -0.7119,  ...,  0.6715, -0.9329,  0.3644],
          [ 0.8457,  1.9012, -0.8559,  ...,  0.7113, -0.8666,  0.4503],
          ...,
          [ 2.7806,  0.6177,  1.5776,  ...,  0.3637, -0.9704,  0.8234],
          [ 2.7725,  0.5866,  1.5793,  ...,  0.3655, -0.9779,  0.8089],
          [ 2.7725,  0.5866,  1.5793,  ...,  0.3655, -0.9779,  0.8089]],
         grad_fn=<AddBackward0>),
  tensor([[ 1.3062,  1.5115, -1.1618,  ...,  0.7953, -0.7169,  0.6365],
          [ 1.3062,  1.5115, -1.1618,  ...,  0.7953, -0.7169,  0.6365],
          [ 1.3474,  1.5246, -1.1666,  ...,  0.8006, -0.7135,  0.6377],
          ...,
          [ 1.2324, -0.5439, -0.4405,  ...,  0.8758,  0.6794,  3.5103],
          [ 1.2221, -0.5732, -0.4372,  ...,  0.8731,  0.6769,  3.5101],
          [ 1.2221, -0.5732, -0.4372,  ...,  0.8731,  0.6769,  3.5101]],
         grad_fn=<AddBackward0>),
  tensor([[ 0.3913,

### Block

In [9]:
import torch
import torch.nn as nn
import numpy as np

class SurfaceEmbedding(nn.Module):
    """Turns a raw batch into per-surface token sequences plus market features.

    Pipeline: ``SurfaceBatchNorm`` -> ``SurfaceContinuousKernelPositionalEmbedding``
    -> add a learnable mask token to the query-point embeddings -> concatenate
    input and query embeddings -> ``LayerNorm``.

    Args:
        d_embedding: Width of every token embedding.
        momentum: Momentum forwarded to ``SurfaceBatchNorm``.
    """

    def __init__(
        self,
        d_embedding,
        momentum=0.1
    ):
        super().__init__()
        self.batch_norm = SurfaceBatchNorm(num_features=1, momentum=momentum)
        self.kernel_positional_embedding = SurfaceContinuousKernelPositionalEmbedding(d_embedding)
        self.layer_norm = nn.LayerNorm(d_embedding)
        # Learnable vector added to every query-point embedding.
        self.mask_token = nn.Parameter(torch.randn(d_embedding))

    def forward(self, batch):
        """Embed a collated batch.

        Args:
            batch: Batch dict in the layout accepted by ``SurfaceBatchNorm``.

        Returns:
            A tuple ``(embedded_sequences, external_features_batch)``:
            a list with one tensor per surface, input-surface tokens first and
            query tokens last, and the normalized market features stacked
            along the last dimension in the order return, volatility, rate.
        """
        norm_batch = self.batch_norm(batch)

        # Stack the normalized market features: (batch, features).
        market_features = norm_batch['Market Features']
        external_features_batch = torch.stack(
            [
                market_features[name]
                for name in ('Market Return', 'Market Volatility', 'Treasury Rate')
            ],
            dim=-1,
        )

        embeddings = self.kernel_positional_embedding(
            norm_batch['Input Surface'], norm_batch['Query Points']
        )

        # Per surface: tag the query tokens with the mask token, append them
        # to the input tokens and layer-normalize the combined sequence.
        embedded_sequences = [
            self.layer_norm(
                torch.cat((surface_tokens, query_tokens + self.mask_token), dim=0)
            )
            for surface_tokens, query_tokens in zip(
                embeddings['Input Surface'], embeddings['Query Points']
            )
        ]

        return embedded_sequences, external_features_batch


# Example of initializing and using this module
d_embedding = HYPERPARAMETERS['Surface Embedding']['Embedding Dimension']  # Desired number of output channels
surface_embedding = SurfaceEmbedding(d_embedding=d_embedding)
# SurfaceEmbedding applies its own SurfaceBatchNorm, so it takes the raw `batch`.
embedded_sequences_batch, external_features_batch = surface_embedding(batch)
embedded_sequences_batch

[tensor([[ 0.0401,  2.0104, -0.9130,  ...,  0.4237, -1.3722,  0.0378],
         [ 0.3371,  1.9172, -1.0956,  ...,  0.4362, -1.3403,  0.0962],
         [ 0.6080,  1.7753, -1.2737,  ...,  0.4594, -1.2855,  0.1708],
         ...,
         [-1.0275,  0.5390, -1.3384,  ...,  1.3468, -0.1811,  1.5858],
         [-0.9579,  0.6055, -1.4113,  ...,  1.3381, -0.1463,  1.5341],
         [ 0.2283,  0.9342, -1.0442,  ...,  1.2355, -0.8853,  1.3790]],
        grad_fn=<NativeLayerNormBackward0>),
 tensor([[ 1.0984,  1.3221, -1.5918,  ...,  0.5415, -1.1069,  0.3683],
         [ 1.0984,  1.3221, -1.5918,  ...,  0.5415, -1.1069,  0.3683],
         [ 1.1237,  1.3151, -1.5924,  ...,  0.5329, -1.1028,  0.3570],
         ...,
         [-2.3258,  0.3800, -0.4917,  ...,  1.2400, -0.0622,  0.6894],
         [-1.2907,  1.9219, -0.6732,  ...,  1.0791, -1.0044,  0.0715],
         [-2.2406,  0.1332, -0.4394,  ...,  1.3251,  0.0881,  0.9311]],
        grad_fn=<NativeLayerNormBackward0>),
 tensor([[ 0.1924,  1.9208, 

# Surface Encoding

## Encoder

In [10]:
import torch
import torch.nn as nn

class ResidualNorm(nn.Module):
    """Residual connection followed by layer normalization.

    Args:
        d_embedding: Size of the last dimension that is normalized.
    """

    def __init__(self, d_embedding):
        super().__init__()
        self.norm = nn.LayerNorm(d_embedding)

    def forward(
        self,
        x,
        sublayer_output
    ):
        """Return ``LayerNorm(x + sublayer_output)``."""
        residual_sum = torch.add(x, sublayer_output)
        return self.norm(residual_sum)

class GatedAttentionFusion(nn.Module):
    """Blends self-attention and external-attention outputs with a learned gate.

    The gate is ``Dropout(Sigmoid(Linear([self_attn, ext_attn])))``. Because
    dropout follows the sigmoid, in training mode gate entries are either
    zeroed or rescaled by ``1 / (1 - gate_dropout)``.

    Args:
        d_embedding: Width of both attention outputs and of the gate.
        gate_dropout: Dropout probability applied to the gate values.
    """

    def __init__(
        self,
        d_embedding,
        gate_dropout
    ):
        super().__init__()
        self.gate_layer = nn.Sequential(
            nn.Linear(d_embedding * 2, d_embedding),
            nn.Sigmoid(),
            nn.Dropout(gate_dropout)
        )

    def forward(
        self,
        self_attn_output,
        ext_attn_output
    ):
        """Return ``gate * self_attn_output + (1 - gate) * ext_attn_output``."""
        # The gate sees both attention outputs side by side.
        gate = self.gate_layer(torch.cat([self_attn_output, ext_attn_output], dim=-1))
        return gate * self_attn_output + (1 - gate) * ext_attn_output
    
class FeedForwardNetwork(nn.Module):
    """Position-wise feed-forward block: Linear -> GELU -> Dropout -> Linear -> Dropout.

    Args:
        d_embedding: Input and output width.
        ffn_hidden_dim: Width of the hidden layer.
        ffn_dropout: Dropout probability used after the activation and after
            the output projection.
    """

    def __init__(
        self,
        d_embedding,
        ffn_hidden_dim,
        ffn_dropout
    ):
        super().__init__()
        expand = nn.Linear(d_embedding, ffn_hidden_dim)
        activation = nn.GELU()
        hidden_dropout = nn.Dropout(ffn_dropout)
        project = nn.Linear(ffn_hidden_dim, d_embedding)
        output_dropout = nn.Dropout(ffn_dropout)
        self.feedforward = nn.Sequential(
            expand, activation, hidden_dropout, project, output_dropout
        )

    def forward(self, x):
        """Apply the feed-forward stack to ``x``."""
        return self.feedforward(x)

class Encoder(nn.Module):
    """One encoder block: self-attention and external attention, fused by a gate.

    The surface tokens attend to themselves and, separately, to the external
    features (used as keys and values). The two results are blended by
    ``GatedAttentionFusion``; a residual + layer norm, a feed-forward network
    and a second residual + layer norm follow.

    Args:
        d_embedding: Token embedding width.
        n_heads: Number of attention heads in both attention modules.
        ffn_hidden_dim: Hidden width of the feed-forward network.
        attention_dropout: Dropout of both attention modules.
        gate_dropout: Dropout applied to the fusion gate.
        ffn_dropout: Dropout inside the feed-forward network.
        external_dim: Feature width of the external keys/values.
    """

    def __init__(
        self,
        d_embedding,
        n_heads,
        ffn_hidden_dim,
        attention_dropout,
        gate_dropout,
        ffn_dropout,
        external_dim
    ):
        super().__init__()
        self.self_attention = nn.MultiheadAttention(
            embed_dim=d_embedding,
            num_heads=n_heads,
            dropout=attention_dropout
        )
        # Keys and values come from the external features, hence kdim/vdim.
        self.external_attention = nn.MultiheadAttention(
            embed_dim=d_embedding,
            num_heads=n_heads,
            kdim=external_dim,
            vdim=external_dim,
            dropout=attention_dropout
        )
        self.gated_attention_fusion = GatedAttentionFusion(d_embedding, gate_dropout)
        self.residual_norm1 = ResidualNorm(d_embedding)
        self.feed_forward = FeedForwardNetwork(d_embedding, ffn_hidden_dim, ffn_dropout)
        self.residual_norm2 = ResidualNorm(d_embedding)

    def forward(
        self,
        surface_embeddings,
        external_features
    ):
        """Encode one sequence of surface tokens.

        Args:
            surface_embeddings: Token tensor used as query (and as key/value
                for self-attention). ``SurfaceEncoder`` passes it with shape
                (sequence, 1, d_embedding).
            external_features: Key/value tensor for the external attention.
                ``SurfaceEncoder`` passes it with shape (1, 1, external_dim).

        Returns:
            Tensor with the same shape as ``surface_embeddings``.
        """
        # Attention weights (second return value) are not needed.
        attended_self = self.self_attention(
            surface_embeddings, surface_embeddings, surface_embeddings
        )[0]
        attended_external = self.external_attention(
            surface_embeddings, external_features, external_features
        )[0]

        fused = self.gated_attention_fusion(attended_self, attended_external)
        hidden = self.residual_norm1(surface_embeddings, fused)

        # Feed-forward sub-layer with its own residual connection and norm.
        return self.residual_norm2(hidden, self.feed_forward(hidden))

class SurfaceEncoder(nn.Module):
    """Stack of ``Encoder`` blocks applied to each surface independently.

    Args:
        d_embedding: Token embedding width.
        num_encoder_blocks: Number of ``Encoder`` blocks in the stack.
        n_heads: Attention heads per block.
        ffn_hidden_dim: Feed-forward hidden width per block.
        attention_dropout: Attention dropout per block.
        gate_dropout: Gate dropout per block.
        ffn_dropout: Feed-forward dropout per block.
        external_dim: Width of the external feature vector.
    """

    def __init__(
        self,
        d_embedding,
        num_encoder_blocks,
        n_heads,
        ffn_hidden_dim,
        attention_dropout,
        gate_dropout,
        ffn_dropout,
        external_dim
    ):
        super().__init__()
        blocks = []
        for _ in range(num_encoder_blocks):
            blocks.append(
                Encoder(
                    d_embedding,
                    n_heads,
                    ffn_hidden_dim,
                    attention_dropout,
                    gate_dropout,
                    ffn_dropout,
                    external_dim
                )
            )
        self.encoders = nn.ModuleList(blocks)

    def forward(
        self,
        embedded_sequences_batch,
        external_features_batch
    ):
        """Encode every sequence of the batch.

        Args:
            embedded_sequences_batch: List of (sequence, d_embedding) tensors.
            external_features_batch: Indexable by surface position; entry
                ``i`` is the external feature vector of sequence ``i``.

        Returns:
            List of encoded (sequence, d_embedding) tensors, one per input.
        """
        encoded_sequences_batch = []

        for position, sequence in enumerate(embedded_sequences_batch):
            # Sequences differ in length, so each one is run as a batch of
            # one: tokens (sequence, 1, d), external features (1, 1, features).
            hidden = sequence.unsqueeze(1)
            external_features = external_features_batch[position][None, None, :]

            for encoder in self.encoders:
                hidden = encoder(hidden, external_features)

            encoded_sequences_batch.append(hidden.squeeze(1))

        return encoded_sequences_batch

# Example of initializing and using these modules
torch.manual_seed(RANDOM_STATE)  # re-seed so the encoder's initial weights are reproducible
n_heads = HYPERPARAMETERS['Surface Encoding']['Number of Heads']
ffn_hidden_dim = HYPERPARAMETERS['Surface Encoding']['FFN Hidden Dimension']
attention_dropout = HYPERPARAMETERS['Surface Encoding']['Attention Dropout']
gate_dropout = HYPERPARAMETERS['Surface Encoding']['Gate Dropout']
ffn_dropout = HYPERPARAMETERS['Surface Encoding']['FFN Dropout']
num_encoder_blocks = HYPERPARAMETERS['Surface Encoding']['Number of Blocks']
# Read from the shared configuration like every other hyperparameter instead
# of repeating the literal value here.
external_dim = HYPERPARAMETERS['Surface Encoding']['External Feature Dimension']

surface_encoder = SurfaceEncoder(
    d_embedding,
    num_encoder_blocks,
    n_heads,
    ffn_hidden_dim,
    attention_dropout,
    gate_dropout,
    ffn_dropout,
    external_dim,
)

# embedded_sequences_batch and external_features_batch are the two outputs of
# the SurfaceEmbedding example above.
encoded_sequences_batch = surface_encoder(embedded_sequences_batch, external_features_batch)
encoded_sequences_batch

[tensor([[ 0.3129,  1.4653, -1.2204,  ...,  1.2530, -1.6073,  0.0768],
         [ 0.7833,  1.2334, -1.1345,  ...,  0.9228, -1.9027,  0.2909],
         [ 1.0559,  1.0879, -1.2278,  ...,  1.2601, -1.4105,  0.1809],
         ...,
         [-0.2479, -0.0269, -1.6364,  ...,  1.4322, -0.4094,  1.5634],
         [-0.4852,  0.2739, -1.4158,  ...,  1.4431, -0.0627,  1.4292],
         [ 0.4488, -0.0256, -0.9882,  ...,  1.4369, -0.9499,  1.5133]],
        grad_fn=<SqueezeBackward1>),
 tensor([[ 1.6099,  0.6789, -0.8598,  ...,  1.1429, -1.2232,  0.2780],
         [ 1.5252,  0.7215, -0.7823,  ...,  1.1578, -1.3290,  0.2657],
         [ 1.3831,  0.8062, -0.7033,  ...,  1.0236, -1.6254,  0.5153],
         ...,
         [-2.2622,  0.6225, -0.3726,  ...,  1.3094, -0.0448,  0.7411],
         [-1.3039,  1.6454, -0.1095,  ...,  1.4177, -0.9369, -0.0400],
         [-2.1987,  0.4399, -0.2732,  ...,  1.1775,  0.3401,  0.9361]],
        grad_fn=<SqueezeBackward1>),
 tensor([[ 0.4655,  1.2916,  0.0053,  ...,  

In [11]:
# Re-display the encoder output computed in the previous cell.
encoded_sequences_batch

[tensor([[ 0.3129,  1.4653, -1.2204,  ...,  1.2530, -1.6073,  0.0768],
         [ 0.7833,  1.2334, -1.1345,  ...,  0.9228, -1.9027,  0.2909],
         [ 1.0559,  1.0879, -1.2278,  ...,  1.2601, -1.4105,  0.1809],
         ...,
         [-0.2479, -0.0269, -1.6364,  ...,  1.4322, -0.4094,  1.5634],
         [-0.4852,  0.2739, -1.4158,  ...,  1.4431, -0.0627,  1.4292],
         [ 0.4488, -0.0256, -0.9882,  ...,  1.4369, -0.9499,  1.5133]],
        grad_fn=<SqueezeBackward1>),
 tensor([[ 1.6099,  0.6789, -0.8598,  ...,  1.1429, -1.2232,  0.2780],
         [ 1.5252,  0.7215, -0.7823,  ...,  1.1578, -1.3290,  0.2657],
         [ 1.3831,  0.8062, -0.7033,  ...,  1.0236, -1.6254,  0.5153],
         ...,
         [-2.2622,  0.6225, -0.3726,  ...,  1.3094, -0.0448,  0.7411],
         [-1.3039,  1.6454, -0.1095,  ...,  1.4177, -0.9369, -0.0400],
         [-2.1987,  0.4399, -0.2732,  ...,  1.1775,  0.3401,  0.9361]],
        grad_fn=<SqueezeBackward1>),
 tensor([[ 0.4655,  1.2916,  0.0053,  ...,  

# IvySPT

In [12]:
import torch
import torch.nn as nn

class IvySPT(nn.Module):
    """End-to-end model: surface embedding -> surface encoder -> linear IV head.

    The batch is embedded by ``SurfaceEmbedding``, encoded by ``SurfaceEncoder``,
    and the encoded query points of every sequence are projected to one scalar
    each by a final linear layer.
    """

    def __init__(
        self,
        d_embedding,
        num_encoder_blocks,
        n_heads,
        ffn_hidden_dim,
        attention_dropout,
        gate_dropout,
        ffn_dropout,
        external_dim
    ):
        """Build the embedding, encoder and output head.

        Args:
            d_embedding: Embedding width; also the input width of the final
                linear layer.
            num_encoder_blocks: Forwarded to ``SurfaceEncoder``.
            n_heads: Forwarded to ``SurfaceEncoder``.
            ffn_hidden_dim: Forwarded to ``SurfaceEncoder``.
            attention_dropout: Forwarded to ``SurfaceEncoder``.
            gate_dropout: Forwarded to ``SurfaceEncoder``.
            ffn_dropout: Forwarded to ``SurfaceEncoder``.
            external_dim: Forwarded to ``SurfaceEncoder``.
        """
        super().__init__()
        self.surface_embedding = SurfaceEmbedding(d_embedding)
        self.surface_encoder = SurfaceEncoder(
            d_embedding,
            num_encoder_blocks,
            n_heads,
            ffn_hidden_dim,
            attention_dropout,
            gate_dropout,
            ffn_dropout,
            external_dim
        )
        # Maps each encoded point (width d_embedding) to a single scalar estimate
        self.final_layer = nn.Linear(d_embedding, 1)

    def forward(self, batch):
        """Estimate the implied volatility at every query point of every sample.

        Args:
            batch: Mapping passed unchanged to ``SurfaceEmbedding``. It must also
                provide ``batch['Query Points']['Log Moneyness']``, whose i-th
                entry has one element per query point of sample i.

        Returns:
            A list with one tensor per encoded sequence. Entry i holds one
            estimate per query point of sample i (an empty tensor when the
            sample has no query points).
        """
        # Obtain the embedded sequences and external features from the SurfaceEmbedding module
        embedded_sequences_batch, external_features_batch = self.surface_embedding(batch)

        # Encode the sequences using the SurfaceEncoder module
        encoded_sequences_batch = self.surface_encoder(embedded_sequences_batch, external_features_batch)

        query_log_moneyness = batch['Query Points']['Log Moneyness']

        # List to hold the implied volatility estimates for each sample in the batch
        iv_estimates_batch = []

        for i, encoded_sequence in enumerate(encoded_sequences_batch):
            # Determine the number of query points for this sequence
            num_query_points = len(query_log_moneyness[i])

            # The query points are taken from the tail of the encoded sequence.
            # Guard the zero case explicitly: `seq[-0:]` is `seq[0:]`, i.e. the
            # WHOLE sequence, which would emit estimates for non-query points.
            if num_query_points:
                encoded_query_points = encoded_sequence[-num_query_points:]
            else:
                encoded_query_points = encoded_sequence[:0]

            # Project each encoded query point to a scalar and drop the trailing unit axis
            iv_estimates = self.final_layer(encoded_query_points).squeeze(-1)

            iv_estimates_batch.append(iv_estimates)

        return iv_estimates_batch

# Example of initializing and using this module
# Re-seed so that parameter initialization is repeatable when this cell is re-run
torch.manual_seed(RANDOM_STATE)
n_heads = HYPERPARAMETERS['Surface Encoding']['Number of Heads']
ffn_hidden_dim = HYPERPARAMETERS['Surface Encoding']['FFN Hidden Dimension']
attention_dropout = HYPERPARAMETERS['Surface Encoding']['Attention Dropout']
gate_dropout = HYPERPARAMETERS['Surface Encoding']['Gate Dropout']
ffn_dropout = HYPERPARAMETERS['Surface Encoding']['FFN Dropout']
num_encoder_blocks = HYPERPARAMETERS['Surface Encoding']['Number of Blocks']
d_embedding = HYPERPARAMETERS['Surface Embedding']['Embedding Dimension']  # Desired number of output channels
# Read from the shared configuration instead of repeating the literal here,
# so the example cannot drift from the HYPERPARAMETERS cell
external_dim = HYPERPARAMETERS['Surface Encoding']['External Feature Dimension']

# Keyword arguments keep the long, same-typed argument list from being mis-ordered
ivy_spt = IvySPT(
    d_embedding=d_embedding,
    num_encoder_blocks=num_encoder_blocks,
    n_heads=n_heads,
    ffn_hidden_dim=ffn_hidden_dim,
    attention_dropout=attention_dropout,
    gate_dropout=gate_dropout,
    ffn_dropout=ffn_dropout,
    external_dim=external_dim,
)

# Pass the batch through the IvySPT model to get implied volatility estimates
iv_estimates_batch = ivy_spt(batch)
iv_estimates_batch

[tensor([0.7148, 0.4146, 0.4608, 0.4135, 0.6143, 0.5572, 0.6718, 0.3357, 0.4871,
         0.4813, 0.5920, 0.4468, 0.5046, 0.5533, 0.5604, 0.4832, 0.6962, 0.3562,
         0.2858, 0.5461, 0.7195, 0.5420, 0.5661, 0.4552, 0.4555, 0.4688, 0.3553,
         0.6810, 0.6298, 0.6140, 0.4203, 0.5709, 0.4766, 0.4945, 0.5639, 0.7052,
         0.4569, 0.4950, 0.4856, 0.5100, 0.7992, 0.5749, 0.6080, 0.4550, 0.3951,
         0.3539, 0.6736, 0.4504, 0.5283, 0.5641, 0.5780, 0.5578, 0.6364, 0.5014,
         0.4514, 0.6482, 0.5365, 0.4957, 0.6052, 0.3540, 0.5467, 0.6059, 0.3948,
         0.4564, 0.6553, 0.6239, 0.3600, 0.5871, 0.5362, 0.5015, 0.4932, 0.5445,
         0.5707, 0.4843, 0.6191, 0.9255, 0.5064, 0.5571, 0.6231, 0.7034, 0.4300,
         0.4782, 0.5911, 0.4556, 0.4863, 0.6253, 0.5715, 0.5712, 0.6181, 0.6538,
         0.7799, 0.4797, 0.4087, 0.4617, 0.5657, 0.6397, 0.4104, 0.6409, 0.4158,
         0.5921, 0.5534, 0.4717, 0.6326, 0.8433, 0.4839, 0.5885, 0.4143, 0.6252,
         0.4918, 0.4806, 0.3

In [13]:
# Show the 'Implied Volatility' values stored with the batch's query points
# (presumably the targets the model estimates are compared against — TODO confirm).
batch['Query Points']['Implied Volatility']

[tensor([0.2695, 0.2695, 0.3415, 0.3091, 0.3302, 0.2849, 0.2695, 0.3302, 0.2561,
         0.2912, 0.3295, 0.3461, 0.2658, 0.3302, 0.3133, 0.3065, 0.3302, 0.2950,
         0.3461, 0.2847, 0.3088, 0.2847, 0.2711, 0.2940, 0.2561, 0.2887, 0.2561,
         0.2683, 0.2662, 0.3295, 0.2817, 0.3133, 0.3271, 0.2548, 0.3302, 0.3295,
         0.2849, 0.3302, 0.3302, 0.2695, 0.3302, 0.2702, 0.3302, 0.3716, 0.2707,
         0.3295, 0.3302, 0.2604, 0.3133, 0.3104, 0.2695, 0.3302, 0.2631, 0.2702,
         0.2654, 0.3295, 0.2561, 0.3138, 0.2702, 0.3153, 0.3128, 0.2695, 0.2561,
         0.2665, 0.3024, 0.2849, 0.3461, 0.2849, 0.2661, 0.3066, 0.2564, 0.2844,
         0.2561, 0.2702, 0.3295, 0.2745, 0.2695, 0.3000, 0.2636, 0.3302, 0.2585,
         0.3356, 0.2849, 0.3216, 0.2785, 0.3190, 0.3302, 0.2695, 0.3302, 0.2676,
         0.2737, 0.2695, 0.3356, 0.2661, 0.3295, 0.3302, 0.2601, 0.2695, 0.2717,
         0.2695, 0.2859, 0.3133, 0.2695, 0.2561, 0.3295, 0.3302, 0.2695, 0.2695,
         0.3302, 0.3133, 0.3