In [1]:
import numpy as np
import pandas as pd
import random
import torch

In [2]:
# Set the random seed for reproducibility
RANDOM_STATE = 0
N_JOBS = 8
torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

In [21]:
HYPERPARAMETERS = {
    'Input Preprocessing' : {
        'Mask Proportions' : [0.1, 0.2, 0.4, 0.8],
        'Batch Size' : 4
    },
    'Input Embedding' : {
        'Surface Embedding' : {
            'Grid Dimension' : 3,
            'Channels Dimension' : 8,
        },
        'Pre-Encoder' : {
            'Branch Channels Dimension' : 4,
            'Number of Blocks' : 2,
        }
    },
    'Surface Encoding' : {
        'Number of Heads' : 4,
        'Hidden Dimension' : 16,
        'Dropout' : 0.1,
        'Number of Blocks' : 2,
    }
}

## Dataset

In [4]:
aapl_googl_data = pd.read_csv('volatility_surface_AAPL_GOOGL_2013_01_2013_06.csv', parse_dates=True, index_col=[0, 1], date_format="ISO8601")
aapl_googl_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Log Moneyness,Time to Maturity,Implied Volatility,Market Return,Market Volatility,Treasury Rate
Datetime,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-02,AAPL,-0.316688,0.007937,0.3726,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.316688,0.007937,0.6095,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.304266,0.007937,0.3726,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.304266,0.007937,0.6095,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.291996,0.007937,0.3726,0.025086,14.680000,0.055
...,...,...,...,...,...,...,...
2013-06-28,GOOGL,0.427518,2.253968,0.2430,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.434898,2.253968,0.2383,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.434898,2.253968,0.2426,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.442224,2.253968,0.2402,-0.004299,16.860001,0.030


In [5]:
import gc
from joblib_progress import joblib_progress
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from joblib import Parallel, delayed

def implied_volatility_surface_datasets(
    options_market_data, 
    proportions, 
    n_jobs=1,
    random_state=0,
    n_chunks=1
):
    def mask_surface(
        date, 
        symbol, 
        surface, 
        rng
    ):
        def mask_surface_with_proportion(
            surface_data, 
            proportion, 
        ):
            n_clusters = int(np.ceil(1 / proportion))
            points_coordinates = surface_data['points_coordinates']
            points_volatilities = surface_data['points_volatilities']

            # Create the clustering pipeline
            pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('kmeans', KMeans(n_clusters=n_clusters, random_state=random_state, n_init='auto'))
            ])
            
            # Fit the pipeline to the data points
            labels = pipeline.fit_predict(points_coordinates)
            
            single_surface_datasets = []
            for cluster in range(n_clusters):
                cluster_indices = np.where(labels == cluster)[0]
                num_to_mask = int(np.ceil(len(cluster_indices) * proportion))
                masked_indices = rng.choice(cluster_indices, size=num_to_mask, replace=False)
                
                for idx in masked_indices:
                    unmasked_indices = np.setdiff1d(cluster_indices, masked_indices)

                    single_surface_datasets.append({
                        'Datetime': surface_data['datetime'],
                        'Symbol': surface_data['symbol'],
                        'Market Features': surface_data['market_features'],
                        'Input Surface': {
                            'Log Moneyness': points_coordinates[unmasked_indices, 0],
                            'Time to Maturity': points_coordinates[unmasked_indices, 1],
                            'Implied Volatility': points_volatilities[unmasked_indices]
                        },
                        'Query Point': {
                            'Log Moneyness': points_coordinates[idx, 0],
                            'Time to Maturity': points_coordinates[idx, 1]
                        },
                        'Target Volatility': points_volatilities[idx]
                    })

            return single_surface_datasets
        
        surface_data = {
            'datetime': date,
            'symbol': symbol,
            'points_coordinates': surface[['Log Moneyness', 'Time to Maturity']].values,
            'points_volatilities': surface['Implied Volatility'].values,
            'market_features': {
                'Market Return': surface['Market Return'].values[0],
                'Market Volatility': surface['Market Volatility'].values[0],
                'Treasury Rate': surface['Treasury Rate'].values[0]
            }
        }
        
        datasets = []
        for proportion in proportions:
            datasets.extend(mask_surface_with_proportion(surface_data, proportion))

        return datasets

    rng = np.random.default_rng(random_state)
    all_surfaces = list(options_market_data.groupby(level=['Datetime', 'Symbol']))
    n_surfaces = len(all_surfaces)
    
    # Split the array into 'n_chunks' chunks
    chunks = np.array_split(range(n_surfaces), n_chunks)
    # Initialize the list to hold all results
    surface_datasets = []
    # Process each chunk sequentially
    with joblib_progress("Surfaces...", total=n_surfaces): 
        for chunk in chunks:
            # Process the current chunk in parallel
            output = Parallel(n_jobs=n_jobs)(
                delayed(mask_surface)(date, symbol, surface, rng)
                for (date, symbol), surface in [all_surfaces[i] for i in chunk]
            )
            # Extend the overall results with the current chunk's results
            surface_datasets.extend(output)
            gc.collect()  

    # Flatten the list of lists into a single list of datasets
    return [item for sublist in surface_datasets for item in sublist]

aapl_googl_dataset = implied_volatility_surface_datasets(
    aapl_googl_data,
    HYPERPARAMETERS['Input Preprocessing']['Mask Proportions'],
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
    n_chunks=4
)

Output()

In [6]:
# import pickle

# with open('aapl_googl_dataset.pickle', 'wb') as handle:
#     pickle.dump(aapl_googl_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('aapl_googl_dataset.pickle', 'rb') as handle:
#     aapl_googl_dataset_ = pickle.load(handle)


In [7]:
len(aapl_googl_dataset)

863509

In [8]:
aapl_googl_dataset[0]

{'Datetime': Timestamp('2013-01-02 00:00:00'),
 'Symbol': 'AAPL',
 'Market Features': {'Market Return': 0.0250861159586972,
  'Market Volatility': 14.68000030517578,
  'Treasury Rate': 0.0549999997019767},
 'Input Surface': {'Log Moneyness': array([-0.74747141, -0.72842322, -0.72842322, -0.70973108, -0.69138194,
         -0.69138194, -0.67336344, -0.67336344, -0.63827212, -0.63827212,
         -0.62117768, -0.62117768, -0.60437057, -0.60437057, -0.58784126,
         -0.58784126, -0.57158074, -0.5555804 , -0.5555804 , -0.53983205,
         -0.53983205, -0.52432786, -0.52432786, -0.50906039, -0.50906039,
         -0.49402251, -0.49402251, -0.47920742, -0.47920742, -0.46460862,
         -0.46460862, -0.45021989, -0.45021989, -0.43603525, -0.43603525,
         -0.42204901, -0.42204901, -0.40825569, -0.40825569, -0.39465004,
         -0.39465004, -0.74747141, -0.74747141, -0.72842322, -0.70973108,
         -0.70973108, -0.69138194, -0.69138194, -0.67336344, -0.67336344,
         -0.65566386

In [9]:
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from torch.utils.data import Dataset

class IVSurfaceDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        data_point = self.data[idx]

        # Convert each component of the data point into tensors as appropriate
        return {
            'Datetime': data_point['Datetime'],
            'Symbol': data_point['Symbol'],
            'Market Features': {
                'Market Return': torch.tensor(data_point['Market Features']['Market Return'], dtype=torch.float32),
                'Market Volatility': torch.tensor(data_point['Market Features']['Market Volatility'], dtype=torch.float32),
                'Treasury Rate': torch.tensor(data_point['Market Features']['Treasury Rate'], dtype=torch.float32),
            },
            'Input Surface': {
                'Log Moneyness': torch.tensor(data_point['Input Surface']['Log Moneyness'], dtype=torch.float32),
                'Time to Maturity': torch.tensor(data_point['Input Surface']['Time to Maturity'], dtype=torch.float32),
                'Implied Volatility': torch.tensor(data_point['Input Surface']['Implied Volatility'], dtype=torch.float32),
            },
            'Query Point': {
                'Log Moneyness': torch.tensor(data_point['Query Point']['Log Moneyness'], dtype=torch.float32),
                'Time to Maturity': torch.tensor(data_point['Query Point']['Time to Maturity'], dtype=torch.float32),
            },
            'Target Volatility': torch.tensor(data_point['Target Volatility'], dtype=torch.float32),
        }

    def collate_fn(batch):
        # Organize batch data by structuring as a dictionary with batched components
        batched_data = {
            'Datetime': [item['Datetime'] for item in batch],
            'Symbol': [item['Symbol'] for item in batch],
            'Market Features': {
                'Market Return': default_collate([item['Market Features']['Market Return'] for item in batch]),
                'Market Volatility': default_collate([item['Market Features']['Market Volatility'] for item in batch]),
                'Treasury Rate': default_collate([item['Market Features']['Treasury Rate'] for item in batch]),
            },
            'Input Surface': {
                'Log Moneyness': [item['Input Surface']['Log Moneyness'] for item in batch],
                'Time to Maturity': [item['Input Surface']['Time to Maturity'] for item in batch],
                'Implied Volatility': [item['Input Surface']['Implied Volatility'] for item in batch],
            },
            'Query Point': {
                'Log Moneyness': default_collate([item['Query Point']['Log Moneyness'] for item in batch]),
                'Time to Maturity': default_collate([item['Query Point']['Time to Maturity'] for item in batch]),
            },
            'Target Volatility': default_collate([item['Target Volatility'] for item in batch]),
        }

        return batched_data



aapl_googl_data_loader = DataLoader(
    IVSurfaceDataset(aapl_googl_dataset), 
    batch_size=HYPERPARAMETERS['Input Preprocessing']['Batch Size'], 
    shuffle=True, 
    num_workers=0, 
    collate_fn=IVSurfaceDataset.collate_fn
)

# Fetch one batch from the DataLoader
batch = next(iter(aapl_googl_data_loader))
batch

{'Datetime': [Timestamp('2013-03-20 00:00:00'),
  Timestamp('2013-03-20 00:00:00'),
  Timestamp('2013-04-09 00:00:00'),
  Timestamp('2013-03-28 00:00:00')],
 'Symbol': ['GOOGL', 'GOOGL', 'GOOGL', 'AAPL'],
 'Market Features': {'Market Return': tensor([0.0067, 0.0067, 0.0035, 0.0040]),
  'Market Volatility': tensor([12.6700, 12.6700, 12.8400, 12.7000]),
  'Treasury Rate': tensor([0.0630, 0.0630, 0.0630, 0.0650])},
 'Input Surface': {'Log Moneyness': [tensor([-1.8074e-01, -1.8074e-01, -1.6614e-01, -1.3757e-01, -1.2358e-01,
           -1.2358e-01, -1.0979e-01, -1.0979e-01, -8.9448e-02, -8.2759e-02,
           -6.9514e-02, -6.9514e-02, -6.2956e-02, -6.2956e-02, -5.6442e-02,
           -4.9969e-02, -4.3538e-02, -4.3538e-02, -3.7149e-02, -3.7149e-02,
           -3.0799e-02, -3.0799e-02, -2.4490e-02, -1.8220e-02, -1.1990e-02,
           -1.1990e-02, -5.7980e-03, -5.7980e-03,  6.4721e-03,  6.4721e-03,
            1.2551e-02,  1.8593e-02,  1.8593e-02,  2.4600e-02,  2.4600e-02,
            3.0570

## Surface Embedding

### Input Embedding

#### Components

In [10]:
import torch
import torch.nn as nn

class SurfaceBatchNorm(nn.Module):
    def __init__(self, num_features=1, momentum=0.1):
        super(SurfaceBatchNorm, self).__init__()
        self.log_moneyness_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.time_to_maturity_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.implied_volatility_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.market_return_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.market_volatility_bn = nn.BatchNorm1d(num_features, momentum=momentum)
        self.treasury_rate_bn = nn.BatchNorm1d(num_features, momentum=momentum)

    def forward(self, batch):
        # Concatenate all tensors from the Input Surface into one tensor for each feature
        input_surface_log_moneyness = torch.cat([x for x in batch['Input Surface']['Log Moneyness']])
        input_surface_time_to_maturity = torch.cat([x for x in batch['Input Surface']['Time to Maturity']])
        input_surface_implied_volatility = torch.cat([x for x in batch['Input Surface']['Implied Volatility']])

        # Concatenate Input Surface tensors with Query Point tensors
        total_log_moneyness = torch.cat([input_surface_log_moneyness, batch['Query Point']['Log Moneyness']])
        total_time_to_maturity = torch.cat([input_surface_time_to_maturity, batch['Query Point']['Time to Maturity']])

        # Normalize Log Moneyness and Time to Maturity
        norm_log_moneyness = self.log_moneyness_bn(total_log_moneyness.unsqueeze(1)).squeeze(1)
        norm_time_to_maturity = self.time_to_maturity_bn(total_time_to_maturity.unsqueeze(1)).squeeze(1)

        # Normalize Implied Volatility (only from Input Surface)
        norm_implied_volatility = self.implied_volatility_bn(input_surface_implied_volatility.unsqueeze(1)).squeeze(1)

        # Split the normalized results back to corresponding structures
        input_surface_sizes = [len(x) for x in batch['Input Surface']['Log Moneyness']]
        total_input_size = sum(input_surface_sizes)

        # Normalizing Market Features
        market_features = batch['Market Features']
        norm_market_return = self.market_return_bn(market_features['Market Return'].unsqueeze(1)).squeeze(1)
        norm_market_volatility = self.market_volatility_bn(market_features['Market Volatility'].unsqueeze(1)).squeeze(1)
        norm_treasury_rate = self.treasury_rate_bn(market_features['Treasury Rate'].unsqueeze(1)).squeeze(1)

        # Reconstructing the batch with normalized data
        output = {
            'Datetime': batch['Datetime'],
            'Symbol': batch['Symbol'],
            'Market Features': {
                'Market Return': norm_market_return,
                'Market Volatility': norm_market_volatility,
                'Treasury Rate': norm_treasury_rate
            },
            'Input Surface': {
                'Log Moneyness': list(torch.split(norm_log_moneyness[:total_input_size], input_surface_sizes)),
                'Time to Maturity': list(torch.split(norm_time_to_maturity[:total_input_size], input_surface_sizes)),
                'Implied Volatility': list(torch.split(norm_implied_volatility, input_surface_sizes))
            },
            'Query Point': {
                'Log Moneyness': norm_log_moneyness[total_input_size:],
                'Time to Maturity': norm_time_to_maturity[total_input_size:]
            },
            'Target Volatility': batch['Target Volatility']
        }

        return output

# Usage
surfacebatchnorm = SurfaceBatchNorm()
processed_batch = surfacebatchnorm(batch)
processed_batch

{'Datetime': [Timestamp('2013-03-20 00:00:00'),
  Timestamp('2013-03-20 00:00:00'),
  Timestamp('2013-04-09 00:00:00'),
  Timestamp('2013-03-28 00:00:00')],
 'Symbol': ['GOOGL', 'GOOGL', 'GOOGL', 'AAPL'],
 'Market Features': {'Market Return': tensor([ 0.4141,  0.4141, -0.4874, -0.3408], grad_fn=<SqueezeBackward1>),
  'Market Volatility': tensor([-0.7100, -0.7100,  1.7039, -0.2840], grad_fn=<SqueezeBackward1>),
  'Treasury Rate': tensor([-0.1525, -0.1525, -0.1525,  0.4575], grad_fn=<SqueezeBackward1>)},
 'Input Surface': {'Log Moneyness': [tensor([-0.9756, -0.9756, -0.9307, -0.8429, -0.8000, -0.8000, -0.7576, -0.7576,
           -0.6951, -0.6745, -0.6338, -0.6338, -0.6136, -0.6136, -0.5936, -0.5737,
           -0.5540, -0.5540, -0.5343, -0.5343, -0.5148, -0.5148, -0.4954, -0.4762,
           -0.4570, -0.4570, -0.4380, -0.4380, -0.4003, -0.4003, -0.3816, -0.3630,
           -0.3630, -0.3446, -0.3446, -0.3262, -0.3262, -0.3080, -0.3080, -0.2718,
           -0.2539, -0.2361, -0.2184, -0.20

In [11]:
import torch
import torch.nn as nn
import numpy as np

# class ParametricContinuousKernel(nn.Module):
#     def __init__(self, input_dim, hidden_dim, hidden_layers, output_dim=1, dropout_prob=0.1):
#         super(ParametricContinuousKernel, self).__init__()
#         layers = []
#         current_dim = input_dim
#         for _ in range(hidden_layers):
#             layers.append(nn.Linear(current_dim, hidden_dim))
#             layers.append(nn.GELU())
#             layers.append(nn.Dropout(dropout_prob))
#             current_dim = hidden_dim
#         layers.append(nn.Linear(hidden_dim, output_dim))
#         self.net = nn.Sequential(*layers)

#     def forward(self, x):
#         return self.net(x)

class EllipticalRBFKernel(nn.Module):
    def __init__(self, input_dim):
        super(EllipticalRBFKernel, self).__init__()
        # Initialize the bandwidth parameters for each dimension
        # We use log-space parameterization for stability in optimization (exp to ensure positivity)
        self.log_bandwidth = nn.Parameter(torch.zeros(input_dim))  # Initialized to exp(0) = 1

    def forward(self, distances):
        # Scale the distances by the bandwidths
        # torch.exp(self.log_bandwidth) converts log bandwidth back to the standard scale
        scaled_distances = distances / torch.exp(self.log_bandwidth)

        # Compute the RBF kernel output using the scaled distances
        # The RBF kernel formula exp(-0.5 * (scaled distance)^2)
        kernel_values = torch.exp(-0.5 * torch.sum(scaled_distances ** 2, dim=-1))

        return kernel_values

class SurfaceContinuousKernelEmbedding(nn.Module):
    def __init__(self, grid_dim):
        super(SurfaceContinuousKernelEmbedding, self).__init__()
        self.grid_dim = grid_dim
        self.kernel = EllipticalRBFKernel(input_dim=2)
        self.layer_norm = nn.LayerNorm([self.grid_dim, self.grid_dim])  # Normalizing across each image's dimensions

        # Create a regular grid in (0, 1)x(0, 1), excluding 0 and 1
        grid_points = torch.linspace(1 / (grid_dim + 1), 1 - 1 / (grid_dim + 1), grid_dim)
        mesh_x, mesh_y = torch.meshgrid(grid_points, grid_points, indexing='ij')
        self.grid_points = torch.stack([mesh_x.flatten(), mesh_y.flatten()], dim=-1)
        self.grid_points = torch.erfinv(2 * self.grid_points - 1) * np.sqrt(2)  # inverse CDF of normal

    def forward(self, input_surface_batch):
        batch_size = len(input_surface_batch['Log Moneyness'])
        batch_embedded_surfaces = []

        for i in range(batch_size):
            # Extract the coordinates and implied volatilities for each surface in the batch
            surface_coords = torch.stack([
                input_surface_batch['Log Moneyness'][i], 
                input_surface_batch['Time to Maturity'][i]
            ], dim=-1)
            surface_ivs = input_surface_batch['Implied Volatility'][i]

            # Initialize the output grid for the current surface
            embedded_surface = torch.zeros((self.grid_dim, self.grid_dim), dtype=torch.float32, device=surface_coords.device)

            # Compute the convolution for each point on the output grid
            for idx, grid_point in enumerate(self.grid_points):
                # Calculate the distance from each input point to the current grid point
                point_differences = surface_coords - grid_point

                # Apply the parametric kernel to these differences
                kernel_outputs = self.kernel(point_differences)

                # Compute the weighted sum of IVs based on the kernel outputs
                embedded_surface[idx // self.grid_dim, idx % self.grid_dim] = (kernel_outputs * surface_ivs).sum()

            # Normalize the embedded surface
            embedded_surface = self.layer_norm(embedded_surface)
            # Append the encoded surface for this input surface to the batch list
            batch_embedded_surfaces.append(embedded_surface)

        # Stack all encoded surfaces to form a batch tensor
        return torch.stack(batch_embedded_surfaces)


# Example of initializing and using this module
grid_dim = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Grid Dimension']
# kernel_hidden_dim = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Kernel Hidden Layer Dimension']
# kernel_hidden_layers = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Kernel Hidden Layer Count']
# kernel_dropout_prob = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Kernel Dropout Probability']

continuous_kernel_embedding = SurfaceContinuousKernelEmbedding(grid_dim=grid_dim)
continuous_kernel_embedding_batch = continuous_kernel_embedding(processed_batch['Input Surface'])
continuous_kernel_embedding_batch

tensor([[[ 1.7747,  0.5131,  0.7984],
         [-0.4966, -1.7778, -0.9971],
         [ 0.4416, -0.5284,  0.2723]],

        [[ 1.8824,  1.0967,  0.9512],
         [-0.4256, -1.2481, -0.8633],
         [-0.2180, -0.8392, -0.3362]],

        [[ 1.6058,  0.4209, -0.8797],
         [ 1.3254,  0.0682, -1.2262],
         [ 0.4718, -0.4527, -1.3334]],

        [[-1.0073, -1.0003, -1.1151],
         [-0.1092, -0.0864, -0.5016],
         [ 1.5675,  1.6145,  0.6379]]], grad_fn=<StackBackward0>)

In [12]:
import torch
import torch.nn as nn

class SurfaceProjectionEmbedding(nn.Module):
    def __init__(self, in_channels, d_embedding, grid_dim):
        super(SurfaceProjectionEmbedding, self).__init__()
        # Initialize the 1x1 convolution layer
        self.conv1x1 = nn.Conv2d(in_channels, d_embedding, kernel_size=1)
        # Initialize layer normalization across the channel, height, and width dimensions
        self.layer_norm = nn.LayerNorm([d_embedding, grid_dim, grid_dim])  # Normalizes across (channels, height, width)

    def forward(self, x):
        # Ensure x has dimensions: (batch_size, channels, height, width)
        # Add a channel dimension if necessary
        if x.dim() == 3:  # assuming x has dimensions (batch_size, height, width)
            x = x.unsqueeze(1)  # add channel dimension
        # Apply the 1x1 convolution to project the input to a higher dimensional space
        x = self.conv1x1(x)
        # Normalize the features across each channel, maintaining the spatial dimensions
        x = self.layer_norm(x)
        return x
    
d_embedding = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Channels Dimension']  # Desired number of output channels
torch.manual_seed(RANDOM_STATE)
# Create the module
projection_embedding = SurfaceProjectionEmbedding(1, d_embedding, grid_dim)   
projection_embedding_batch = projection_embedding(continuous_kernel_embedding_batch)
projection_embedding_batch

tensor([[[[ 0.2868,  0.3013,  0.2980],
          [ 0.3128,  0.3275,  0.3186],
          [ 0.3021,  0.3132,  0.3040]],

         [[ 2.3047,  1.2691,  1.5033],
          [ 0.4402, -0.6115,  0.0293],
          [ 1.2104,  0.4141,  1.0714]],

         [[-2.2546, -0.6657, -1.0250],
          [ 0.6060,  2.2196,  1.2364],
          [-0.5757,  0.6460, -0.3624]],

         [[-1.8564, -0.4356, -0.7570],
          [ 0.7015,  2.1443,  1.2651],
          [-0.3552,  0.7372, -0.1645]],

         [[-2.0649, -1.3214, -1.4895],
          [-0.7263,  0.0288, -0.4313],
          [-1.2792, -0.7075, -1.1794]],

         [[ 0.1577, -0.3600, -0.2429],
          [-0.7743, -1.3000, -0.9797],
          [-0.3893, -0.7873, -0.4588]],

         [[-0.2417, -0.2034, -0.2121],
          [-0.1728, -0.1339, -0.1576],
          [-0.2012, -0.1718, -0.1961]],

         [[ 2.6529,  1.1222,  1.4683],
          [-0.1029, -1.6574, -0.7102],
          [ 1.0355, -0.1415,  0.8300]]],


        [[[ 0.2856,  0.2946,  0.2963],
       

#### Block

In [13]:
class InputEmbedding(nn.Module):
    def __init__(self, grid_dim, d_embedding, momentum=0.1):
        super(InputEmbedding, self).__init__()
        # Initialize all sub-modules
        self.surface_batchnorm = SurfaceBatchNorm(1, momentum)
        self.surface_continuous_kernel_embedding = SurfaceContinuousKernelEmbedding(grid_dim)
        self.surface_projection_embedding = SurfaceProjectionEmbedding(1, d_embedding, grid_dim)

    def forward(self, batch):
        # Process the batch with SurfaceBatchNorm
        processed_batch = self.surface_batchnorm(batch)
        
        # Generate continuous kernel embeddings from the processed 'Input Surface'
        continuous_kernel_embedding_batch = self.surface_continuous_kernel_embedding(processed_batch['Input Surface'])
        
        # Project the embeddings using 1x1 convolution
        projection_embedding_batch = self.surface_projection_embedding(continuous_kernel_embedding_batch)

        # Return both the positionally encoded embeddings and the processed batch
        return projection_embedding_batch, processed_batch

torch.manual_seed(RANDOM_STATE)
grid_dim = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Grid Dimension']
d_embedding = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Channels Dimension']  # Desired number of output channels
input_embedding = InputEmbedding(grid_dim, d_embedding)
projection_embedding_batch, processed_batch = input_embedding(batch)
projection_embedding_batch, processed_batch

(tensor([[[[ 0.2868,  0.3013,  0.2980],
           [ 0.3128,  0.3275,  0.3186],
           [ 0.3021,  0.3132,  0.3040]],
 
          [[ 2.3047,  1.2691,  1.5033],
           [ 0.4402, -0.6115,  0.0293],
           [ 1.2104,  0.4141,  1.0714]],
 
          [[-2.2546, -0.6657, -1.0250],
           [ 0.6060,  2.2196,  1.2364],
           [-0.5757,  0.6460, -0.3624]],
 
          [[-1.8564, -0.4356, -0.7570],
           [ 0.7015,  2.1443,  1.2651],
           [-0.3552,  0.7372, -0.1645]],
 
          [[-2.0649, -1.3214, -1.4895],
           [-0.7263,  0.0288, -0.4313],
           [-1.2792, -0.7075, -1.1794]],
 
          [[ 0.1577, -0.3600, -0.2429],
           [-0.7743, -1.3000, -0.9797],
           [-0.3893, -0.7873, -0.4588]],
 
          [[-0.2417, -0.2034, -0.2121],
           [-0.1728, -0.1339, -0.1576],
           [-0.2012, -0.1718, -0.1961]],
 
          [[ 2.6529,  1.1222,  1.4683],
           [-0.1029, -1.6574, -0.7102],
           [ 1.0355, -0.1415,  0.8300]]],
 
 
         [[[ 

### Pre-Encoder

#### Block

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PreEncoder(nn.Module):
    def __init__(self, d_embedding, branch_channels, grid_dim):
        super(PreEncoder, self).__init__()
        # Initial channel configuration is common to all branches
        self.branch1 = nn.Sequential(
            nn.Conv2d(d_embedding, branch_channels, kernel_size=1),
            nn.BatchNorm2d(branch_channels),
            nn.GELU()
        )
        
        self.branch2 = nn.Sequential(
            nn.Conv2d(d_embedding, branch_channels, kernel_size=1),
            nn.BatchNorm2d(branch_channels),
            nn.GELU(),
            nn.Conv2d(branch_channels, branch_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(branch_channels),
            nn.GELU()
        )
        
        self.branch3 = nn.Sequential(
            nn.Conv2d(d_embedding, branch_channels, kernel_size=1),
            nn.BatchNorm2d(branch_channels),
            nn.GELU(),
            nn.Conv2d(branch_channels, branch_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(branch_channels),
            nn.GELU(),
            nn.Conv2d(branch_channels, branch_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(branch_channels),
            nn.GELU()
        )
        
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            nn.Conv2d(d_embedding, branch_channels, kernel_size=1),
            nn.BatchNorm2d(branch_channels),
            nn.GELU()
        )
        
        # Reduce the concatenated channels back to the original number of channels
        self.conv_reduce = nn.Conv2d(branch_channels * 4, d_embedding, kernel_size=1)
        self.bn_reduce = nn.BatchNorm2d(d_embedding)
        self.scale = nn.Parameter(torch.tensor(1.0))  # Learnable scale for residual connection
        self.layer_norm = nn.LayerNorm([d_embedding, grid_dim, grid_dim])  # Normalize across (C, H, W)

    def forward(self, x):
        # Apply each branch to the input
        out1 = self.branch1(x)
        out2 = self.branch2(x)
        out3 = self.branch3(x)
        out4 = self.branch4(x)
        
        # Concatenate the outputs from each branch
        concatenated = torch.cat([out1, out2, out3, out4], dim=1)
        
        # Reduce back to the initial number of channels
        reduced = self.conv_reduce(concatenated)
        reduced = self.bn_reduce(reduced)
        
        # Add the residual connection with scale
        residual = x + self.scale * reduced
        residual = F.gelu(residual)  # Apply GELU after adding the residual
        
        # Normalize the output
        output = self.layer_norm(residual)
        
        return output

torch.manual_seed(RANDOM_STATE)
branch_channels = HYPERPARAMETERS['Input Embedding']['Pre-Encoder']['Branch Channels Dimension']
pre_encoder = PreEncoder(d_embedding, branch_channels, grid_dim)
pre_encoded_batch = pre_encoder(projection_embedding_batch)
pre_encoded_batch

tensor([[[[ 0.1920, -0.7922, -0.0076],
          [ 1.2240,  0.7556, -0.0998],
          [-0.3658, -0.6377, -0.8224]],

         [[ 1.2388,  1.6315,  1.1239],
          [ 1.3735, -0.7240, -0.8243],
          [-0.8110, -0.2190,  0.2282]],

         [[-0.7408, -0.6581, -0.8097],
          [ 1.2993,  3.1171,  0.7897],
          [-0.8254, -0.6587, -0.8243]],

         [[-0.6763, -0.5276, -0.8217],
          [ 2.1927,  2.6753,  1.2197],
          [-0.8162, -0.8065, -0.6918]],

         [[-0.6583, -0.6992, -0.6875],
          [-0.7815, -0.4518, -0.7391],
          [-0.8153,  0.3843, -0.5089]],

         [[-0.7702, -0.7463, -0.7929],
          [-0.8257, -0.8057, -0.8185],
          [ 0.5970,  1.6791, -0.0630]],

         [[-0.3010, -0.8016, -0.8242],
          [-0.3160,  0.8639,  0.9188],
          [-0.7943, -0.0254,  0.6932]],

         [[ 2.5638,  1.3952,  0.6005],
          [ 0.5694, -0.4064, -0.6243],
          [ 0.7223, -0.8131, -0.8175]]],


        [[[ 0.6071, -0.7135,  0.1417],
       

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class SurfacePositionalEmbedding(nn.Module):
    def __init__(self, grid_dim, d_embedding):
        super(SurfacePositionalEmbedding, self).__init__()
        self.grid_dim = grid_dim
        self.d_embedding = d_embedding
        
        # Create a regular grid in (0, 1)x(0, 1), excluding 0 and 1
        grid_points = torch.linspace(1 / (grid_dim + 1), 1 - 1 / (grid_dim + 1), grid_dim)
        mesh_x, mesh_y = torch.meshgrid(grid_points, grid_points, indexing='ij')
        self.grid_points = torch.stack([mesh_x.flatten(), mesh_y.flatten()], dim=-1)
        self.grid_points = torch.erfinv(2 * self.grid_points - 1) * np.sqrt(2)  # inverse CDF of normal

        # Initialize learnable scaling parameter (the base for positional embedding)
        self.log_scale = nn.Parameter(torch.log(torch.tensor(10000.0)))
        self.factor = nn.Parameter(torch.tensor(1.0))  # Learnable scale for the positional embedding contribution

        # Layer normalization for final output
        self.layer_norm = nn.LayerNorm([d_embedding, grid_dim, grid_dim])  # Normalizes across (channels, height, width)


    def forward(self, x):
        # x is the output from the 1x1 convolution layer with shape (batch_size, d_embedding, grid_dim, grid_dim)
        scale = torch.exp(self.log_scale)
        pos_enc = torch.zeros_like(x)

        # Repeat grid_points to match the batch size and reshape for broadcasting
        batch_grid_points = self.grid_points.repeat(x.shape[0], 1, 1).view(x.shape[0], self.grid_dim*self.grid_dim, 2)
        
        for i in range(self.d_embedding // 4):
            # Calculate positional embeddings for both dimensions
            div_factor = scale ** (4 * i / self.d_embedding)
            pos_enc[:, 4 * i, :, :] = torch.sin(batch_grid_points[:, :, 0].view(x.shape[0], self.grid_dim, self.grid_dim) / div_factor)
            pos_enc[:, 4 * i + 1, :, :] = torch.cos(batch_grid_points[:, :, 0].view(x.shape[0], self.grid_dim, self.grid_dim) / div_factor)
            pos_enc[:, 4 * i + 2, :, :] = torch.sin(batch_grid_points[:, :, 1].view(x.shape[0], self.grid_dim, self.grid_dim) / div_factor)
            pos_enc[:, 4 * i + 3, :, :] = torch.cos(batch_grid_points[:, :, 1].view(x.shape[0], self.grid_dim, self.grid_dim) / div_factor)

        # Apply the learned scale to positional embedding and add to the input
        x = x + self.factor * pos_enc
        # Normalize the final output
        x = self.layer_norm(x) 

        return x

# Create the SurfacePositionalEmbedding module
positional_encoder = SurfacePositionalEmbedding(grid_dim, d_embedding)

# Apply positional embedding
positional_embedded_batch = positional_encoder(pre_encoded_batch)
positional_embedded_batch

tensor([[[[-0.7619, -1.5988, -0.9316],
          [ 0.6467,  0.2484, -0.4790],
          [-0.1742, -0.4053, -0.5624]],

         [[ 1.3234,  1.6573,  1.2257],
          [ 1.6241, -0.1595, -0.2448],
          [-0.4196,  0.0838,  0.4641]],

         [[-1.5551, -0.9538, -0.5516],
          [ 0.1797,  2.2564,  0.8084],
          [-1.6270, -0.9543, -0.5641]],

         [[-0.3051,  0.0076, -0.4287],
          [ 2.1346,  2.7311,  1.3072],
          [-0.4241, -0.2296, -0.3183]],

         [[-0.9596, -0.9944, -0.9845],
          [-1.0587, -0.7784, -1.0226],
          [-1.0817, -0.0616, -0.8211]],

         [[-0.1987, -0.1784, -0.2181],
          [-0.2459, -0.2289, -0.2398],
          [ 0.9638,  1.8840,  0.4026]],

         [[-0.6559, -1.0758, -1.0892],
          [-0.6686,  0.3405,  0.3929],
          [-1.0753, -0.4158,  0.2011]],

         [[ 2.6363,  1.6426,  0.9668],
          [ 0.9404,  0.1106, -0.0747],
          [ 1.0704, -0.2352, -0.2390]]],


        [[[-0.4128, -1.5466, -0.8124],
       

### Final Block

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SurfaceEmbedding(nn.Module):
    def __init__(self, grid_dim, d_embedding, branch_channels, num_pre_encoder_blocks, momentum=0.1):
        super(SurfaceEmbedding, self).__init__()
        # Initialize the InputEmbedding module
        self.input_embedding = InputEmbedding(grid_dim, d_embedding, momentum)
        
        # Initialize multiple PreEncoder blocks
        self.pre_encoders = nn.ModuleList([
            PreEncoder(d_embedding, branch_channels, grid_dim) for _ in range(num_pre_encoder_blocks)
        ])
        
        # Initialize positional embedding, adjusted to apply before flattening
        self.positional_embedding = SurfacePositionalEmbedding(grid_dim, d_embedding)

    def forward(self, batch):
        # Process batch through InputEmbedding to get initial embeddings and the processed batch
        embedding_batch, processed_batch = self.input_embedding(batch)
        
        # Sequentially pass the output through each PreEncoder block
        pre_encoded_batch = embedding_batch
        for pre_encoder in self.pre_encoders:
            pre_encoded_batch = pre_encoder(pre_encoded_batch)
        
        # Apply positional embedding to the output of the last PreEncoder block
        positional_embedded_batch = self.positional_embedding(pre_encoded_batch)
        
        # Flatten the 2D spatial structure into a sequence of tokens
        batch_size, num_channels, height, width = positional_embedded_batch.shape
        tokenized_positional_embedded_batch = positional_embedded_batch.view(batch_size, num_channels, height * width).transpose(1, 2)
        
        # The final output is now suitable for processing by transformer encoders
        return tokenized_positional_embedded_batch, processed_batch


# Example usage
torch.manual_seed(RANDOM_STATE)
grid_dim = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Grid Dimension']
d_embedding = HYPERPARAMETERS['Input Embedding']['Surface Embedding']['Channels Dimension']
branch_channels = HYPERPARAMETERS['Input Embedding']['Pre-Encoder']['Branch Channels Dimension']
num_pre_encoder_blocks = HYPERPARAMETERS['Input Embedding']['Pre-Encoder']['Number of Blocks']

surface_embedding = SurfaceEmbedding(grid_dim, d_embedding, branch_channels, num_pre_encoder_blocks)
tokenized_positional_embedded_batch, processed_batch = surface_embedding(batch)
tokenized_positional_embedded_batch, processed_batch

(tensor([[[-1.3303e+00,  2.1044e+00, -1.3362e+00, -2.8019e-01, -3.2145e-01,
           -1.0643e-01, -6.2448e-01,  3.4260e+00],
          [-1.4510e+00,  7.4142e-01,  1.1684e+00,  2.3995e-01, -8.0646e-01,
            3.6007e-02, -8.4416e-01, -1.0753e-01],
          [-8.4866e-01,  2.0403e+00, -3.5992e-01, -1.1717e-01, -8.6191e-01,
           -7.0662e-02, -9.3144e-01,  2.0138e+00],
          [-8.0920e-01, -3.4787e-02, -1.4279e+00,  9.8281e-01, -8.8960e-01,
           -7.2083e-02, -9.4286e-01, -5.2162e-02],
          [-9.2581e-01,  1.1054e+00,  1.7250e+00,  1.7216e+00,  9.0015e-01,
            7.7737e-01, -8.0029e-01,  7.9503e-03],
          [-1.4221e-01,  1.3855e+00, -1.6876e-01,  2.6878e+00, -9.2091e-01,
           -7.3199e-03, -9.2603e-01, -4.1719e-02],
          [-4.1094e-02,  3.2664e-01, -1.3561e+00,  3.9013e-01, -8.6537e-01,
            4.2438e-03, -8.3648e-01, -9.6690e-02],
          [-1.0968e-01, -2.0998e-01, -2.1824e-02,  8.1026e-01, -9.3181e-01,
           -7.8111e-02, -7.9945e-01

## Surface Encoding

In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ScaledResidualNorm(nn.Module):
    def __init__(self, d_embedding):
        super(ScaledResidualNorm, self).__init__()
        self.scale = nn.Parameter(torch.tensor(1.0))
        self.norm = nn.LayerNorm(d_embedding)

    def forward(self, x, sublayer_output):
        return self.norm(x + self.scale * sublayer_output)

class FeedForwardNetwork(nn.Module):
    def __init__(self, d_embedding, hidden_dim, dropout):
        super(FeedForwardNetwork, self).__init__()
        self.feedforward = nn.Sequential(
            nn.Linear(d_embedding, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, d_embedding),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.feedforward(x)

class Encoder(nn.Module):
    def __init__(self, d_embedding, n_heads, hidden_dim, dropout, external_dim):
        super(Encoder, self).__init__()
        self.self_attention = nn.MultiheadAttention(embed_dim=d_embedding, num_heads=n_heads, batch_first=True)
        self.scaled_residual_norm1 = ScaledResidualNorm(d_embedding)
        self.feedforward1 = FeedForwardNetwork(d_embedding, hidden_dim, dropout)
        self.scaled_residual_norm2 = ScaledResidualNorm(d_embedding)
        
        self.external_attention = nn.MultiheadAttention(embed_dim=d_embedding, num_heads=n_heads, kdim=external_dim, vdim=external_dim, batch_first=True)
        self.scaled_residual_norm3 = ScaledResidualNorm(d_embedding)
        self.feedforward2 = FeedForwardNetwork(d_embedding, hidden_dim, dropout)
        self.scaled_residual_norm4 = ScaledResidualNorm(d_embedding)

    def forward(self, surface_tokens, external_features):
        # Self-attention block
        self_attn_output, _ = self.self_attention(surface_tokens, surface_tokens, surface_tokens)
        x = self.scaled_residual_norm1(surface_tokens, self_attn_output)
        
        # Feedforward network block 1
        ff_output = self.feedforward1(x)
        x = self.scaled_residual_norm2(x, ff_output)
        
        # External attention block
        ext_attn_output, _ = self.external_attention(x, external_features, external_features)
        x = self.scaled_residual_norm3(x, ext_attn_output)
        
        # Feedforward network block 2
        ff_output = self.feedforward2(x)
        x = self.scaled_residual_norm4(x, ff_output)
        
        return x

class SurfaceEncoding(nn.Module):
    def __init__(self, d_embedding, n_heads, hidden_dim, dropout, external_dim, num_encoder_blocks):
        super(SurfaceEncoding, self).__init__()
        self.encoders = nn.ModuleList([
            Encoder(d_embedding, n_heads, hidden_dim, dropout, external_dim) for _ in range(num_encoder_blocks)
        ])

    def forward(self, tokenized_positional_embedded_batch, processed_batch):
        # Extract market features from processed batch and create external_features tensor
        market_features = processed_batch['Market Features']
        external_features = torch.stack([
            market_features['Market Return'],
            market_features['Market Volatility'],
            market_features['Treasury Rate']
        ], dim=-1).unsqueeze(1)  # (batch, 1, features)
        
        # Pass the tokenized positional embeddings and external features through each encoder block
        x = tokenized_positional_embedded_batch
        for encoder in self.encoders:
            x = encoder(x, external_features)
        
        return x
    
# Example usage
torch.manual_seed(RANDOM_STATE)
n_heads = HYPERPARAMETERS['Surface Encoding']['Number of Heads']
hidden_dim = HYPERPARAMETERS['Surface Encoding']['Hidden Dimension']
dropout = HYPERPARAMETERS['Surface Encoding']['Dropout']
num_encoder_blocks = HYPERPARAMETERS['Surface Encoding']['Number of Blocks']
external_dim = 3  # Assuming 3 market features

surface_encoding = SurfaceEncoding(d_embedding, n_heads, hidden_dim, dropout, external_dim, num_encoder_blocks)
encoded_output = surface_encoding(tokenized_positional_embedded_batch, processed_batch)    
encoded_output

tensor([[[-0.8822,  0.1151, -0.8951,  0.7154, -0.6311,  0.1693, -0.7919,
           2.2003],
         [-0.8297, -0.1905,  1.2735,  0.1447, -1.4698,  1.2920, -1.0531,
           0.8330],
         [-0.9984,  0.1709, -0.6545,  0.0408, -0.5886,  0.6638, -0.8700,
           2.2360],
         [-0.1867,  0.0286, -1.3779,  1.4677, -0.7915,  0.6790, -1.0943,
           1.2753],
         [-1.1471, -0.3448,  1.0720,  1.0847, -0.7983,  1.4733, -1.1867,
          -0.1531],
         [ 0.4575, -0.1852, -0.0937,  2.0028, -1.2875,  0.2381, -1.3971,
           0.2651],
         [ 0.8239, -0.2179, -0.9775,  1.4546, -1.3959,  0.3660, -1.0557,
           1.0027],
         [ 0.0816, -1.1054,  0.8193,  1.2764, -1.3974,  0.7353, -1.1778,
           0.7682],
         [ 1.5322, -0.9234,  0.0721,  1.1776, -1.2744, -0.0393, -1.1762,
           0.6313]],

        [[-1.0578, -0.3708, -0.8959,  0.1147, -0.2538,  0.6546, -0.4736,
           2.2826],
         [-1.1895, -0.3655, -0.7321,  0.3443, -0.0715, -0.0128, -0.3

## Query Embedding

### Point Embedding

In [30]:
import torch
import torch.nn as nn
import numpy as np

class PointEmbedding(nn.Module):
    def __init__(self, d_embedding, surface_embedding):
        super(PointEmbedding, self).__init__()
        self.d_embedding = d_embedding
        self.log_scale = surface_embedding.positional_embedding.log_scale  # Shared log_scale parameter from SurfacePositionalEmbedding
        self.factor = nn.Parameter(torch.tensor(1.0))  # Learnable scale for the positional embedding contribution
        self.learnable_embedding = nn.Parameter(torch.randn(d_embedding))  # Learnable embedding vector
        self.layer_norm = nn.LayerNorm(d_embedding)  # Normalize across the embedding dimension

    def forward(self, query_point_batch):
        log_moneyness = query_point_batch['Log Moneyness']
        time_to_maturity = query_point_batch['Time to Maturity']

        # Stack the query point coordinates
        query_coords = torch.stack([log_moneyness, time_to_maturity], dim=-1)  # Shape: (batch_size, 2)

        # Positional embedding calculation
        scale = torch.exp(self.log_scale)
        pos_enc = torch.zeros(query_coords.size(0), self.d_embedding, device=query_coords.device)

        for i in range(self.d_embedding // 4):
            div_factor = scale ** (4 * i / self.d_embedding)
            pos_enc[:, 4 * i] = torch.sin(query_coords[:, 0] / div_factor)
            pos_enc[:, 4 * i + 1] = torch.cos(query_coords[:, 0] / div_factor)
            pos_enc[:, 4 * i + 2] = torch.sin(query_coords[:, 1] / div_factor)
            pos_enc[:, 4 * i + 3] = torch.cos(query_coords[:, 1] / div_factor)

        # Apply the learned scale to positional embedding
        pos_enc = self.factor * pos_enc

        # Add learnable embedding vector
        pos_enc += self.learnable_embedding

        # Apply layer normalization
        pos_enc = self.layer_norm(pos_enc)

        return pos_enc

# Example usage:
torch.manual_seed(RANDOM_STATE)
point_embedding = PointEmbedding(d_embedding, surface_embedding)
point_embedded = point_embedding(processed_batch['Query Point'])
point_embedded

tensor([[-0.3089, -0.2000,  2.1540, -0.9069, -0.4450,  0.6728, -1.2846,  0.3185],
        [-0.3695, -0.5821,  2.1679, -0.8032, -0.3598,  0.7301, -1.1697,  0.3863],
        [ 0.2782,  0.0869,  1.7491, -1.0090, -0.5919,  0.7914, -1.6548,  0.3502],
        [ 1.1863, -0.9844,  1.3338, -0.9864, -0.4044,  0.8092, -1.3710,  0.4168]],
       grad_fn=<NativeLayerNormBackward0>)

### Pre-Decoder