In [1]:
import numpy as np
import pandas as pd
import random
import torch

In [2]:
# Set the random seed for reproducibility
RANDOM_STATE = 0
N_JOBS = 8
torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

In [19]:
HYPERPARAMETERS = {
    'Input Preprocessing' : {
        'Mask Proportions' : [0.1, 0.3, 0.5, 0.7],
        'Batch Size' : 4
    },
    'Surface Embedding' : {
        'Embedding Dimension' : 8,
    },
    'Surface Encoding' : {
        'Encoder' : {
            'Number of Heads' : 4,
            'Hidden Dimension' : 16,
            'Dropout' : 0.1,
            'Number of Blocks' : 2,
            'External Feature Dimension' : 3,
        }
    },
    'Query Embedding' : {
        'Pre-Decoder' : {
            'Hidden Dimension' : 16,
            'Dropout' : 0.1,
            'Number of Blocks' : 2,
        }
    },
    'Surface Decoding' : {
        'Decoder' : {
            'Number of Heads' : 4,
            'Hidden Dimension' : 16,
            'Dropout' : 0.1,
            'Number of Blocks' : 2,
        }
    },
    'No-Arbitrage' : {
        'Butterfly' : 1,
        'Calendar' : 1,
    }
}

## Dataset

In [4]:
aapl_googl_data = pd.read_csv('volatility_surface_AAPL_GOOGL_2013_01_2013_06.csv', parse_dates=True, index_col=[0, 1], date_format="ISO8601")
aapl_googl_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Log Moneyness,Time to Maturity,Implied Volatility,Market Return,Market Volatility,Treasury Rate
Datetime,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-02,AAPL,-0.316688,0.007937,0.3726,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.316688,0.007937,0.6095,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.304266,0.007937,0.3726,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.304266,0.007937,0.6095,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.291996,0.007937,0.3726,0.025086,14.680000,0.055
...,...,...,...,...,...,...,...
2013-06-28,GOOGL,0.427518,2.253968,0.2430,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.434898,2.253968,0.2383,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.434898,2.253968,0.2426,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.442224,2.253968,0.2402,-0.004299,16.860001,0.030


In [5]:
def implied_volatility_surfaces(options_market_data):
    # Group the data by Datetime and Symbol
    grouped_data = options_market_data.groupby(level=['Datetime', 'Symbol'])

    surfaces = []
    for (date, symbol), surface in grouped_data:
        surface_dict = {
            'Datetime': date,
            'Symbol': symbol,
            'Market Features': {
                'Market Return': surface['Market Return'].values[0],
                'Market Volatility': surface['Market Volatility'].values[0],
                'Treasury Rate': surface['Treasury Rate'].values[0],
            },
            'Surface': {
                'Log Moneyness': surface['Log Moneyness'].values,
                'Time to Maturity': surface['Time to Maturity'].values,
                'Implied Volatility': surface['Implied Volatility'].values,
            }
        }
        surfaces.append(surface_dict)

    return surfaces

surfaces = implied_volatility_surfaces(aapl_googl_data)
surfaces[0]

{'Datetime': Timestamp('2013-01-02 00:00:00'),
 'Symbol': 'AAPL',
 'Market Features': {'Market Return': 0.0250861159586972,
  'Market Volatility': 14.68000030517578,
  'Treasury Rate': 0.0549999997019767},
 'Surface': {'Log Moneyness': array([-0.31668849, -0.31668849, -0.30426597, ...,  0.63882295,
          0.6483924 ,  0.6483924 ]),
  'Time to Maturity': array([0.00793651, 0.00793651, 0.00793651, ..., 2.95634921, 2.95634921,
         2.95634921]),
  'Implied Volatility': array([0.3726, 0.6095, 0.3726, ..., 0.3387, 0.3342, 0.3389])}}

In [6]:
import torch
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from torch.utils.data import Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
import numpy as np

class IVSurfaceDataset(Dataset):
    def __init__(
        self, 
        data, 
        proportion, 
        random_state=0
    ):
        self.data = data
        self.proportion = proportion
        self.random_state = random_state

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        surface_data = self.data[idx]
        
        # Extract the surface coordinates and volatilities
        points_coordinates = np.stack([
            surface_data['Surface']['Log Moneyness'], 
            surface_data['Surface']['Time to Maturity']
        ], axis=1)
        points_volatilities = surface_data['Surface']['Implied Volatility']

        # Perform clustering
        n_clusters = int(np.ceil(1 / self.proportion))
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('kmeans', KMeans(n_clusters=n_clusters, random_state=self.random_state, n_init='auto'))
        ])
        labels = pipeline.fit_predict(points_coordinates)

        rng = np.random.default_rng(self.random_state)
        cluster_indices = np.where(labels == rng.integers(n_clusters))[0]
        num_to_mask = int(np.ceil(len(cluster_indices) * self.proportion))
        masked_indices = rng.choice(cluster_indices, size=num_to_mask, replace=False)
        
        unmasked_indices = np.setdiff1d(cluster_indices, masked_indices)


        data_item = {
            'Datetime': surface_data['Datetime'],
            'Symbol': surface_data['Symbol'],
            'Market Features': {
                'Market Return': torch.tensor(surface_data['Market Features']['Market Return'], dtype=torch.float32),
                'Market Volatility': torch.tensor(surface_data['Market Features']['Market Volatility'], dtype=torch.float32),
                'Treasury Rate': torch.tensor(surface_data['Market Features']['Treasury Rate'], dtype=torch.float32),
            },
            'Input Surface': {
                'Log Moneyness': torch.tensor(points_coordinates[unmasked_indices, 0], dtype=torch.float32),
                'Time to Maturity': torch.tensor(points_coordinates[unmasked_indices, 1], dtype=torch.float32),
                'Implied Volatility': torch.tensor(points_volatilities[unmasked_indices], dtype=torch.float32)
            },
            'Query Points': {
                'Log Moneyness': torch.tensor(points_coordinates[masked_indices, 0], dtype=torch.float32),
                'Time to Maturity': torch.tensor(points_coordinates[masked_indices, 1], dtype=torch.float32),
                'Implied Volatility': torch.tensor(points_volatilities[masked_indices], dtype=torch.float32)
            }
        }

        return data_item

    @staticmethod
    def collate_fn(batch):
        batched_data = {
            'Datetime': [item['Datetime'] for item in batch],
            'Symbol': [item['Symbol'] for item in batch],
            'Market Features': {
                'Market Return': default_collate([item['Market Features']['Market Return'] for item in batch]),
                'Market Volatility': default_collate([item['Market Features']['Market Volatility'] for item in batch]),
                'Treasury Rate': default_collate([item['Market Features']['Treasury Rate'] for item in batch]),
            },
            'Input Surface': {
                'Log Moneyness': [item['Input Surface']['Log Moneyness'].clone().detach() for item in batch],
                'Time to Maturity': [item['Input Surface']['Time to Maturity'].clone().detach() for item in batch],
                'Implied Volatility': [item['Input Surface']['Implied Volatility'].clone().detach() for item in batch],
            },
            'Query Points': {
                'Log Moneyness': [item['Query Points']['Log Moneyness'].clone().detach().requires_grad_(True) for item in batch],
                'Time to Maturity': [item['Query Points']['Time to Maturity'].clone().detach().requires_grad_(True) for item in batch],
                'Implied Volatility': [item['Query Points']['Implied Volatility'].clone().detach() for item in batch],
            }
        }

        return batched_data


# Assuming surfaces is the output from the implied_volatility_surfaces function
proportion = 0.2  # example proportion
dataset = IVSurfaceDataset(surfaces, proportion)
data_loader = DataLoader(
    dataset, 
    batch_size=HYPERPARAMETERS['Input Preprocessing']['Batch Size'], 
    shuffle=True, 
    num_workers=0, 
    collate_fn=IVSurfaceDataset.collate_fn
)

# Fetch one batch from the DataLoader
batch = next(iter(data_loader))
batch

{'Datetime': [Timestamp('2013-06-10 00:00:00'),
  Timestamp('2013-01-28 00:00:00'),
  Timestamp('2013-05-20 00:00:00'),
  Timestamp('2013-03-07 00:00:00')],
 'Symbol': ['AAPL', 'AAPL', 'GOOGL', 'AAPL'],
 'Market Features': {'Market Return': tensor([-0.0003, -0.0019, -0.0007,  0.0018]),
  'Market Volatility': tensor([15.4400, 13.5700, 13.0200, 13.0600]),
  'Treasury Rate': tensor([0.0400, 0.0600, 0.0350, 0.0900])},
 'Input Surface': {'Log Moneyness': [tensor([0.4524, 0.4597, 0.4668, 0.4739, 0.4739, 0.4880, 0.4950, 0.4950, 0.5019,
           0.5019, 0.5088, 0.5088, 0.5156, 0.5156, 0.5224, 0.5224, 0.5291, 0.5358,
           0.5358, 0.5425, 0.5491, 0.5556, 0.5556, 0.5621, 0.5621, 0.5686, 0.5686,
           0.5750, 0.5750, 0.5814, 0.5814, 0.5878, 0.5878, 0.5941, 0.6004, 0.6004,
           0.6066, 0.6066, 0.6128, 0.6128, 0.6189, 0.6251, 0.6251, 0.6311, 0.6372,
           0.6372, 0.6432, 0.6492, 0.6492, 0.6551, 0.6551, 0.6610, 0.6610, 0.6669,
           0.6669, 0.6727, 0.6727, 0.6785, 0.6785,

## Surface Embedding

### Components

In [7]:
import torch
import torch.nn as nn
from torch.utils.data._utils.collate import default_collate

class SurfaceBatchNorm(nn.Module):
    def __init__(
        self, 
        num_features=1, 
        momentum=0.1
    ):
        super(SurfaceBatchNorm, self).__init__()
        self.log_moneyness_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.time_to_maturity_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.market_return_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.market_volatility_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.treasury_rate_bn = nn.BatchNorm1d(num_features, momentum=momentum)

    def forward(self, batch):
        # Concatenate all tensors from the Input Surface into one tensor for each feature
        input_surface_log_moneyness = torch.cat([x for x in batch['Input Surface']['Log Moneyness']])
        input_surface_time_to_maturity = torch.cat([x for x in batch['Input Surface']['Time to Maturity']])

        # Concatenate Input Surface tensors with Query Points tensors
        total_log_moneyness = torch.cat([input_surface_log_moneyness] + [x for x in batch['Query Points']['Log Moneyness']])
        total_time_to_maturity = torch.cat([input_surface_time_to_maturity] + [x for x in batch['Query Points']['Time to Maturity']])

        # Normalize Log Moneyness and Time to Maturity
        norm_log_moneyness = self.log_moneyness_bn(total_log_moneyness.unsqueeze(1)).squeeze(1)
        norm_time_to_maturity = self.time_to_maturity_bn(total_time_to_maturity.unsqueeze(1)).squeeze(1)

        # Split the normalized results back to corresponding structures
        input_surface_sizes = [len(x) for x in batch['Input Surface']['Log Moneyness']]
        query_points_sizes = [len(x) for x in batch['Query Points']['Log Moneyness']]
        total_input_size = sum(input_surface_sizes)

        # Normalizing Market Features
        market_features = batch['Market Features']
        norm_market_return = self.market_return_bn(market_features['Market Return'].unsqueeze(1)).squeeze(1)
        norm_market_volatility = self.market_volatility_bn(market_features['Market Volatility'].unsqueeze(1)).squeeze(1)
        norm_treasury_rate = self.treasury_rate_bn(market_features['Treasury Rate'].unsqueeze(1)).squeeze(1)

        # Reconstructing the batch with normalized data
        output = {
            'Datetime': batch['Datetime'],
            'Symbol': batch['Symbol'],
            'Market Features': {
                'Market Return': norm_market_return,
                'Market Volatility': norm_market_volatility,
                'Treasury Rate': norm_treasury_rate
            },
            'Input Surface': {
                'Log Moneyness': list(torch.split(norm_log_moneyness[:total_input_size], input_surface_sizes)),
                'Time to Maturity': list(torch.split(norm_time_to_maturity[:total_input_size], input_surface_sizes)),
                'Implied Volatility': batch['Input Surface']['Implied Volatility']
            },
            'Query Points': {
                'Log Moneyness': list(torch.split(norm_log_moneyness[total_input_size:], query_points_sizes)),
                'Time to Maturity': list(torch.split(norm_time_to_maturity[total_input_size:], query_points_sizes)),
                'Implied Volatility': batch['Query Points']['Implied Volatility']
            }
        }

        # Ensure requires_grad is True for query point values
        for key in output['Query Points']:
            if key != 'Implied Volatility':  # We only set requires_grad for Log Moneyness and Time to Maturity
                for tensor in output['Query Points'][key]:
                    tensor.requires_grad_()

        return output

# Usage
surfacebatchnorm = SurfaceBatchNorm()
processed_batch = surfacebatchnorm(batch)
processed_batch

{'Datetime': [Timestamp('2013-06-10 00:00:00'),
  Timestamp('2013-01-28 00:00:00'),
  Timestamp('2013-05-20 00:00:00'),
  Timestamp('2013-03-07 00:00:00')],
 'Symbol': ['AAPL', 'AAPL', 'GOOGL', 'AAPL'],
 'Market Features': {'Market Return': tensor([-0.0216, -0.4603, -0.1269,  0.6087], grad_fn=<SqueezeBackward1>),
  'Market Volatility': tensor([ 1.6897, -0.2052, -0.7625, -0.7220], grad_fn=<SqueezeBackward1>),
  'Treasury Rate': tensor([-0.7439,  0.1717, -0.9728,  1.5450], grad_fn=<SqueezeBackward1>)},
 'Input Surface': {'Log Moneyness': [tensor([1.2476, 1.2676, 1.2873, 1.3070, 1.3070, 1.3458, 1.3650, 1.3650, 1.3841,
           1.3841, 1.4030, 1.4030, 1.4219, 1.4219, 1.4406, 1.4406, 1.4591, 1.4776,
           1.4776, 1.4959, 1.5141, 1.5322, 1.5322, 1.5501, 1.5501, 1.5680, 1.5680,
           1.5857, 1.5857, 1.6033, 1.6033, 1.6209, 1.6209, 1.6383, 1.6555, 1.6555,
           1.6727, 1.6727, 1.6898, 1.6898, 1.7068, 1.7236, 1.7236, 1.7404, 1.7571,
           1.7571, 1.7736, 1.7901, 1.7901, 1.

In [11]:
import torch
import torch.nn as nn
import numpy as np

class EllipticalRBFKernel(nn.Module):
    def __init__(
        self, 
        input_dim, 
        bandwidth
    ):
        super(EllipticalRBFKernel, self).__init__()
        self.bandwidth = bandwidth
        # Initialize the log of the scale vector to zero, which corresponds to scale factors of one
        self.log_scale = nn.Parameter(torch.zeros(input_dim))

    def forward(self, distances):
        # Convert log scale to actual scale values
        scale = torch.exp(self.log_scale)
        
        # Create a diagonal scale matrix
        scale_matrix = torch.diag(scale)

        # Calculate the scaled distances
        scaled_distances = distances @ scale_matrix @ distances.t()
        
        # Normalize by the trace of the scale matrix
        trace_scale_matrix = torch.trace(scale_matrix)
        normalized_distances = scaled_distances / trace_scale_matrix

        # Compute the RBF kernel output using the normalized distances
        kernel_values = torch.exp(-normalized_distances / (2 * self.bandwidth ** 2))

        return kernel_values

class SurfaceContinuousKernelPositionalEmbedding(nn.Module):
    def __init__(self, d_embedding):
        super(SurfaceContinuousKernelPositionalEmbedding, self).__init__()
        self.d_embedding = d_embedding

        # Initialize multiple RBF kernels, each with a different fixed bandwidth
        self.kernels = nn.ModuleList()
        for i in range(1, d_embedding + 1):
            bandwidth_value = torch.erfinv(torch.tensor(i / (d_embedding + 1))) * np.sqrt(2)
            self.kernels.append(EllipticalRBFKernel(bandwidth=bandwidth_value, input_dim=2))

        self.input_surface_layer_norm = nn.LayerNorm(d_embedding)
        self.query_points_layer_norm = nn.LayerNorm(d_embedding)

        # Initialize learnable scaling parameter (the base for positional embedding)
        self.log_scale = nn.Parameter(torch.log(torch.tensor(10000.0)))

    def forward(
        self, 
        input_surface_batch, 
        query_points_batch
    ):
        batch_size = len(input_surface_batch['Log Moneyness'])

        input_surface_embeddings = []
        query_points_embeddings = []

        for i in range(batch_size):
            # Extract the coordinates and implied volatilities for each surface in the batch
            surface_coords = torch.stack([
                input_surface_batch['Log Moneyness'][i], 
                input_surface_batch['Time to Maturity'][i]
            ], dim=-1)
            surface_ivs = input_surface_batch['Implied Volatility'][i]

            query_coords = torch.stack([
                query_points_batch['Log Moneyness'][i], 
                query_points_batch['Time to Maturity'][i]
            ], dim=-1)

            all_coords = torch.cat((surface_coords, query_coords), dim=0)

            # Compute the pairwise differences between all points and the input surface points
            point_differences = all_coords.unsqueeze(1) - surface_coords.unsqueeze(0)  # (n+m, n, 2)

            # Initialize the output embeddings for the current surface with d_embedding channels
            all_embedded = torch.zeros((all_coords.shape[0], self.d_embedding), dtype=torch.float32, device=surface_coords.device)

            for kernel_idx, kernel in enumerate(self.kernels):
                # Apply the RBF kernel to each distance vector using torch.vmap
                vmap_kernel = torch.vmap(kernel, in_dims=(0,))
                kernel_outputs = vmap_kernel(point_differences.view(-1, point_differences.shape[-1]))  # ((n+m) * n)
                kernel_outputs = kernel_outputs.view(all_coords.shape[0], surface_coords.shape[0])  # (n+m, n)

                # Compute the weighted sum of IVs based on the kernel outputs
                weighted_sum = (kernel_outputs * surface_ivs.unsqueeze(0)).sum(dim=1)
                normalization_factor = kernel_outputs.sum(dim=1)

                all_embedded[:, kernel_idx] = weighted_sum / normalization_factor

            # Split the embeddings into input surface and query points embeddings
            input_surface_embedded = all_embedded[:surface_coords.shape[0], :]
            query_points_embedded = all_embedded[surface_coords.shape[0]:, :]

            # Normalize the embedded surfaces
            input_surface_embedded = self.input_surface_layer_norm(input_surface_embedded)
            query_points_embedded = self.query_points_layer_norm(query_points_embedded)

            # Positional embedding for input surface points
            input_surface_pe = self._compute_positional_embedding(surface_coords)

            # Positional embedding for query points
            query_points_pe = self._compute_positional_embedding(query_coords)

            # Add positional embeddings with a factor of sqrt(2)
            input_surface_final = input_surface_embedded + input_surface_pe * np.sqrt(2)
            query_points_final = query_points_embedded + query_points_pe * np.sqrt(2)

            # Append the encoded surface for this input surface to the batch list
            input_surface_embeddings.append(input_surface_final)
            query_points_embeddings.append(query_points_final)

        # Keep all encoded surfaces as lists to handle variable lengths
        return {
            'Input Surface': input_surface_embeddings,
            'Query Points': query_points_embeddings
        }

    def _compute_positional_embedding(
        self, 
        coords, 
    ):
        positional_embedding = torch.zeros(coords.size(0), self.d_embedding, device=coords.device)

        for i in range(self.d_embedding // 4):
            div_factor = torch.exp(self.log_scale) ** (4 * i / self.d_embedding)
            positional_embedding[:, 4 * i] = torch.sin(coords[:, 0] / div_factor)
            positional_embedding[:, 4 * i + 1] = torch.cos(coords[:, 0] / div_factor)
            positional_embedding[:, 4 * i + 2] = torch.sin(coords[:, 1] / div_factor)
            positional_embedding[:, 4 * i + 3] = torch.cos(coords[:, 1] / div_factor)

        return positional_embedding

# Example of initializing and using this module
d_embedding = HYPERPARAMETERS['Surface Embedding']['Embedding Dimension']  # Desired number of output channels

continuous_kernel_positional_embedding = SurfaceContinuousKernelPositionalEmbedding(d_embedding=d_embedding)
kernel_positional_embedded_batch = continuous_kernel_positional_embedding(processed_batch['Input Surface'], processed_batch['Query Points'])
kernel_positional_embedded_batch

{'Input Surface': [tensor([[ 2.5797,  1.1175, -0.6514,  ...,  0.9242, -0.6169,  0.7202],
          [ 2.5106,  1.1143, -0.6118,  ...,  0.9240, -0.6249,  0.7063],
          [ 2.4414,  1.1081, -0.5732,  ...,  0.9248, -0.6319,  0.6934],
          ...,
          [ 0.6927, -0.2153,  0.6305,  ...,  1.8009,  0.4823,  1.9477],
          [ 0.6927, -0.2153,  0.6305,  ...,  1.8009,  0.4823,  1.9477],
          [ 0.7100, -0.1950,  0.6664,  ...,  1.7790,  0.4526,  1.9125]],
         grad_fn=<AddBackward0>),
  tensor([[ 1.3965,  1.8080, -1.3322,  ...,  0.7615, -0.8103,  0.5183],
          [ 0.6581,  1.0516, -1.0983,  ...,  0.7090, -0.9440,  0.3138],
          [ 0.6581,  1.0516, -1.0983,  ...,  0.7090, -0.9440,  0.3138],
          ...,
          [-1.5078,  0.2822,  0.4336,  ...,  2.1350,  1.1476,  2.8897],
          [-1.4543,  0.2870,  0.4280,  ...,  2.1351,  1.1496,  2.8930],
          [-1.4543,  0.2870,  0.4280,  ...,  2.1351,  1.1496,  2.8930]],
         grad_fn=<AddBackward0>),
  tensor([[-1.7151,

## Block

In [25]:
import torch
import torch.nn as nn
import numpy as np

class SurfaceEmbedding(nn.Module):
    def __init__(self, d_embedding, momentum=0.1):
        super(SurfaceEmbedding, self).__init__()
        self.batch_norm = SurfaceBatchNorm(num_features=1, momentum=momentum)
        self.kernel_positional_embedding = SurfaceContinuousKernelPositionalEmbedding(d_embedding)
        self.layer_norm = nn.LayerNorm(d_embedding)
        self.mask_token = nn.Parameter(torch.randn(d_embedding))

    def forward(self, batch):
        # Apply batch normalization
        norm_batch = self.batch_norm(batch)

        # Extract market features from processed batch and create external_features tensor
        market_features = norm_batch['Market Features']
        external_features = torch.stack([
            market_features['Market Return'],
            market_features['Market Volatility'],
            market_features['Treasury Rate']
        ], dim=-1)  # (batch, features)

        # Compute kernel and positional embeddings
        embeddings = self.kernel_positional_embedding(norm_batch['Input Surface'], norm_batch['Query Points'])

        input_surface_embeddings = embeddings['Input Surface']
        query_points_embeddings = embeddings['Query Points']

        embedded_sequences = []

        for input_surface_embedding, query_points_embedding in zip(input_surface_embeddings, query_points_embeddings):
            # Add mask token to the query point embeddings
            masked_query_points_embedding = query_points_embedding + self.mask_token

            # Combine input surface embeddings and masked query points embeddings
            combined_sequence = torch.cat((input_surface_embedding, masked_query_points_embedding), dim=0)

            # Apply layer normalization
            combined_sequence = self.layer_norm(combined_sequence)

            embedded_sequences.append(combined_sequence)

        return embedded_sequences, external_features


# Example of initializing and using this module
d_embedding = HYPERPARAMETERS['Surface Embedding']['Embedding Dimension']  # Desired number of output channels
surface_embedding = SurfaceEmbedding(d_embedding=d_embedding)
embedded_sequences_batch, external_features = surface_embedding(batch)
embedded_sequences_batch

[tensor([[ 1.9445,  0.5113, -1.2225,  ...,  0.3218, -1.1887,  0.1219],
         [ 1.9201,  0.5216, -1.2073,  ...,  0.3310, -1.2205,  0.1129],
         [ 1.8941,  0.5291, -1.1922,  ...,  0.3414, -1.2523,  0.1045],
         ...,
         [ 1.6405, -0.1604, -0.8763,  ...,  1.4830, -1.4023, -0.5574],
         [ 1.5223, -0.4795, -0.5474,  ...,  1.5152, -1.5193, -0.7181],
         [-0.1815, -0.9409, -0.8970,  ...,  2.2985, -0.7164,  0.3199]],
        grad_fn=<NativeLayerNormBackward0>),
 tensor([[ 1.1722,  1.5662, -1.4404,  ...,  0.5642, -0.9407,  0.3314],
         [ 0.8662,  1.3928, -1.4840,  ...,  0.9343, -1.2776,  0.4055],
         [ 0.8662,  1.3928, -1.4840,  ...,  0.9343, -1.2776,  0.4055],
         ...,
         [-1.2376, -0.6706, -0.8891,  ...,  2.1195, -0.3624,  0.5512],
         [-1.1061, -0.4833, -1.2120,  ...,  2.0781, -0.4209,  0.4089],
         [-1.2496, -0.6847, -0.8856,  ...,  2.1084, -0.3361,  0.5780]],
        grad_fn=<NativeLayerNormBackward0>),
 tensor([[-1.7979, -0.0045, 

# Surface Encoding

## Encoder

In [27]:
import torch
import torch.nn as nn

class ResidualNorm(nn.Module):
    def __init__(self, d_embedding):
        super(ResidualNorm, self).__init__()
        self.norm = nn.LayerNorm(d_embedding)

    def forward(self, x, sublayer_output):
        return self.norm(x + sublayer_output)

class FeedForwardNetwork(nn.Module):
    def __init__(self, d_embedding, hidden_dim, dropout):
        super(FeedForwardNetwork, self).__init__()
        self.feedforward = nn.Sequential(
            nn.Linear(d_embedding, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, d_embedding),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.feedforward(x)

class GatedAttentionFusion(nn.Module):
    def __init__(self, d_embedding):
        super(GatedAttentionFusion, self).__init__()
        self.gate_layer = nn.Sequential(
            nn.Linear(d_embedding * 2, d_embedding),
            nn.Sigmoid()
        )

    def forward(self, self_attn_output, ext_attn_output):
        # Concatenate self-attention and external attention outputs
        concatenated_output = torch.cat((self_attn_output, ext_attn_output), dim=-1)
        # Compute gate values
        gate_values = self.gate_layer(concatenated_output)
        # Calculate gated embedding
        gated_embedding = gate_values * self_attn_output + (1 - gate_values) * ext_attn_output
        return gated_embedding

import torch
import torch.nn as nn
import torch.nn.functional as F

class SelfAttention(nn.Module):
    def __init__(self, d_embedding, num_heads, dropout):
        super(SelfAttention, self).__init__()
        self.attention = nn.MultiheadAttention(d_embedding, num_heads, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        attn_output, _ = self.attention(x, x, x)
        return self.dropout(attn_output)

class CrossAttention(nn.Module):
    def __init__(self, d_embedding, num_heads, dropout):
        super(CrossAttention, self).__init__()
        self.attention = nn.MultiheadAttention(d_embedding, num_heads, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, external_features):
        attn_output, _ = self.attention(x, external_features, external_features)
        return self.dropout(attn_output)

class EncoderBlock(nn.Module):
    def __init__(self, d_embedding, num_heads, hidden_dim, dropout):
        super(EncoderBlock, self).__init__()
        self.self_attention = SelfAttention(d_embedding, num_heads, dropout)
        self.cross_attention = CrossAttention(d_embedding, num_heads, dropout)
        self.gated_attention_fusion = GatedAttentionFusion(d_embedding)
        self.residual_norm1 = ResidualNorm(d_embedding)
        self.feed_forward = FeedForwardNetwork(d_embedding, hidden_dim, dropout)
        self.residual_norm2 = ResidualNorm(d_embedding)

    def forward(self, x, external_features):
        # Self-Attention
        self_attn_output = self.self_attention(x)
        
        # Cross-Attention
        ext_attn_output = self.cross_attention(x, external_features)
        
        # Gated Attention Fusion
        gated_embedding = self.gated_attention_fusion(self_attn_output, ext_attn_output)
        
        # Residual Connection and Layer Normalization
        x = self.residual_norm1(x, gated_embedding)
        
        # Feed-Forward Network
        ffn_output = self.feed_forward(x)
        
        # Final Residual Connection and Layer Normalization
        x = self.residual_norm2(x, ffn_output)
        
        return x

class SurfaceEncoder(nn.Module):
    def __init__(self, num_layers, d_embedding, num_heads, hidden_dim, dropout):
        super(SurfaceEncoder, self).__init__()
        self.layers = nn.ModuleList([
            EncoderBlock(d_embedding, num_heads, hidden_dim, dropout)
            for _ in range(num_layers)
        ])

    def forward(self, embedded_sequences_batch, external_features):
        batch_size = len(embedded_sequences_batch)
        encoded_sequences_batch = []

        for i in range(batch_size):
            x = embedded_sequences_batch[i]
            for layer in self.layers:
                x = layer(x, external_features[i])
            encoded_sequences_batch.append(x)
        
        return encoded_sequences_batch

# Example of initializing and using these modules
# d_embedding = HYPERPARAMETERS['Surface Encoding']['Embedding Dimension']
num_heads = HYPERPARAMETERS['Surface Encoding']['Num Heads']
hidden_dim = HYPERPARAMETERS['Surface Encoding']['Hidden Dimension']
dropout = HYPERPARAMETERS['Surface Encoding']['Dropout']
num_layers = HYPERPARAMETERS['Surface Encoding']['Num Layers']

surface_encoder = SurfaceEncoder(num_layers, d_embedding, num_heads, hidden_dim, dropout)

# Assume embedded_sequences_batch is the output of the SurfaceEmbedding module and
# external_features is the formatted external market features batch
encoded_sequences_batch = surface_encoder(embedded_sequences_batch, external_features)
encoded_sequences_batch

KeyError: 'Num Heads'