In [1]:
import numpy as np
import pandas as pd
import random
import torch
import gc

In [2]:
# Set the random seed for reproducibility: the same seed is pushed into every
# random number generator this notebook touches (PyTorch, NumPy, stdlib).
RANDOM_STATE = 0
# NOTE(review): N_JOBS is not referenced in the cells visible here; presumably a
# worker count used further down — confirm or remove.
N_JOBS = 8
torch.manual_seed(RANDOM_STATE)  # PyTorch default generator
np.random.seed(RANDOM_STATE)  # NumPy legacy global generator
random.seed(RANDOM_STATE)  # Python standard-library generator

In [3]:
# Central configuration dictionary; cells below read their settings from here.
HYPERPARAMETERS = {
    'Input Preprocessing' : {
        # Candidate masking fractions; IVSurfaceDataset draws one per item.
        'Mask Proportions' : [0.1, 0.3, 0.5, 0.7],
        # Number of masked points returned as query points per surface.
        'Number of Query Points' : 1,
        # Number of surfaces per DataLoader batch.
        'Batch Size' : 4
    },
    'Surface Embedding' : {
        # Number of embedding channels (one RBF kernel per channel).
        'Embedding Dimension' : 8,
    },
    # NOTE(review): the keys below are not read in the cells visible here;
    # presumably they feed the Encoder constructor arguments — confirm.
    'Surface Encoding' : {
        'Number of Heads' : 4,
        'FFN Hidden Dimension' : 16,
        'Attention Dropout' : 0.1,
        'Gate Dropout' : 0.1,
        'FFN Dropout' : 0.1,
        'Number of Blocks' : 2,
        # NOTE(review): SurfaceEmbedding.forward stacks five market features
        # (return, volatility, rate, IV mean, IV std) — confirm that 3 is the
        # intended external dimension.
        'External Feature Dimension' : 3,
    },
    # NOTE(review): not read in the visible cells; presumably weights of the
    # no-arbitrage penalty terms — confirm.
    'No-Arbitrage' : {
        'Butterfly' : 1,
        'Calendar' : 1,
    }
}

## Dataset

In [4]:
# Load the option quotes. The first two CSV columns form the row MultiIndex
# (shown as Datetime / Symbol in the output) and index values are parsed as
# ISO 8601 dates.
aapl_googl_data = pd.read_csv('volatility_surface_AAPL_GOOGL_2013_01_2013_06.csv', parse_dates=True, index_col=[0, 1], date_format="ISO8601")
aapl_googl_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Log Moneyness,Time to Maturity,Implied Volatility,Market Return,Market Volatility,Treasury Rate
Datetime,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-02,AAPL,-0.316688,0.007937,0.3726,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.316688,0.007937,0.6095,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.304266,0.007937,0.3726,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.304266,0.007937,0.6095,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.291996,0.007937,0.3726,0.025086,14.680000,0.055
...,...,...,...,...,...,...,...
2013-06-28,GOOGL,0.427518,2.253968,0.2430,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.434898,2.253968,0.2383,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.434898,2.253968,0.2426,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.442224,2.253968,0.2402,-0.004299,16.860001,0.030


In [5]:
def implied_volatility_surfaces(options_market_data):
    """Split a long table of option quotes into one record per (Datetime, Symbol).

    Args:
        options_market_data: DataFrame whose index has the levels 'Datetime'
            and 'Symbol' and which carries the columns 'Log Moneyness',
            'Time to Maturity', 'Implied Volatility', 'Market Return',
            'Market Volatility' and 'Treasury Rate'.

    Returns:
        A list of dicts, one per group in groupby order, each with the keys
        'Datetime', 'Symbol', 'Market Features' (first value of each market
        column within the group) and 'Surface' (the per-quote value arrays).
    """
    market_columns = ('Market Return', 'Market Volatility', 'Treasury Rate')
    surface_columns = ('Log Moneyness', 'Time to Maturity', 'Implied Volatility')

    def build_record(group_key, quotes):
        quote_date, ticker = group_key
        return {
            'Datetime': quote_date,
            'Symbol': ticker,
            # Market columns: only the first row's value of the group is kept.
            'Market Features': {name: quotes[name].values[0] for name in market_columns},
            # Surface columns: every quote of the group is kept.
            'Surface': {name: quotes[name].values for name in surface_columns},
        }

    per_date_and_symbol = options_market_data.groupby(level=['Datetime', 'Symbol'])
    return [build_record(group_key, quotes) for group_key, quotes in per_date_and_symbol]

# Build one record per (date, symbol) pair and preview the first record.
surfaces = implied_volatility_surfaces(aapl_googl_data)
surfaces[0]

{'Datetime': Timestamp('2013-01-02 00:00:00'),
 'Symbol': 'AAPL',
 'Market Features': {'Market Return': 0.0250861159586972,
  'Market Volatility': 14.68000030517578,
  'Treasury Rate': 0.0549999997019767},
 'Surface': {'Log Moneyness': array([-0.31668849, -0.31668849, -0.30426597, ...,  0.63882295,
          0.6483924 ,  0.6483924 ]),
  'Time to Maturity': array([0.00793651, 0.00793651, 0.00793651, ..., 2.95634921, 2.95634921,
         2.95634921]),
  'Implied Volatility': array([0.3726, 0.6095, 0.3726, ..., 0.3387, 0.3342, 0.3389])}}

In [6]:
import torch
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from torch.utils.data import Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
import numpy as np

class IVSurfaceDataset(Dataset):
    """Implied-volatility surfaces with random, cluster-stratified masking.

    Every access to an item draws a fresh mask from the dataset's own random
    generator, so the same index can yield different masks on repeated access.

    Args:
        data: Sequence of surface records as produced by
            ``implied_volatility_surfaces`` (keys 'Datetime', 'Symbol',
            'Market Features', 'Surface').
        mask_proportions: Candidate masking fractions; one is drawn per access.
        random_state: Seed for the dataset's generator and for KMeans.
        n_query_points: Number of masked points returned as query points;
            ``None`` returns every masked point.
    """

    def __init__(
        self,
        data,
        mask_proportions,
        random_state=0,
        n_query_points=None
    ):
        self.data = data
        self.mask_proportions = mask_proportions
        self.random_state = random_state
        self.n_query_points = n_query_points
        self.rng = np.random.default_rng(random_state)

    def __len__(self):
        return len(self.data)

    def _draw_masked_indices(self, coords, proportion):
        """Cluster the points and pick ``ceil(size * proportion)`` indices in each cluster."""
        n_clusters = int(np.ceil(1 / proportion))
        clusterer = Pipeline([
            ('scaler', StandardScaler()),
            ('kmeans', KMeans(n_clusters=n_clusters, random_state=self.random_state, n_init='auto'))
        ])
        labels = clusterer.fit_predict(coords)

        # The leading empty integer array keeps the result dtype and the
        # zero-pick case identical to appending onto an empty int array.
        picks = [np.array([], dtype=int)]
        for cluster_id in range(n_clusters):
            members = np.where(labels == cluster_id)[0]
            n_masked = int(np.ceil(len(members) * proportion))
            picks.append(self.rng.choice(members, size=n_masked, replace=False))
        return np.concatenate(picks)

    def __getitem__(self, idx):
        record = self.data[idx]
        surface = record['Surface']

        # (n_points, 2) array of (log moneyness, time to maturity).
        coords = np.stack([surface['Log Moneyness'], surface['Time to Maturity']], axis=1)
        vols = surface['Implied Volatility']

        # RNG draw order matters for reproducibility:
        # proportion -> per-cluster picks -> query pick.
        proportion = self.rng.choice(self.mask_proportions)
        masked = self._draw_masked_indices(coords, proportion)
        visible = np.setdiff1d(range(len(coords)), masked)

        # Summary statistics use only the points the model is allowed to see.
        iv_mean = np.mean(vols[visible])
        iv_std = np.std(vols[visible])

        if self.n_query_points is not None:
            queried = self.rng.choice(masked, size=self.n_query_points, replace=False)
        else:
            queried = masked

        def as_float(values):
            return torch.tensor(values, dtype=torch.float32)

        def point_set(rows):
            return {
                'Log Moneyness': as_float(coords[rows, 0]),
                'Time to Maturity': as_float(coords[rows, 1]),
                'Implied Volatility': as_float(vols[rows])
            }

        market = {
            name: as_float(record['Market Features'][name])
            for name in ('Market Return', 'Market Volatility', 'Treasury Rate')
        }
        market['IV Mean'] = as_float(iv_mean)
        market['IV Std.'] = as_float(iv_std)

        return {
            'Datetime': record['Datetime'],
            'Symbol': record['Symbol'],
            'Mask Proportion': proportion,
            'Market Features': market,
            'Input Surface': point_set(visible),
            'Query Points': point_set(queried)
        }

    @staticmethod
    def collate_fn(batch):
        """Merge items into a batch, keeping variable-length point sets as lists.

        Scalar market features are stacked with ``default_collate``. Point-set
        tensors are cloned and detached; query coordinates are additionally
        flagged ``requires_grad`` while query volatilities are not.
        """
        def column(key):
            return [item[key] for item in batch]

        def ragged(section, field, track_grad=False):
            copies = [item[section][field].clone().detach() for item in batch]
            if track_grad:
                copies = [tensor.requires_grad_(True) for tensor in copies]
            return copies

        market_names = ('Market Return', 'Market Volatility', 'Treasury Rate', 'IV Mean', 'IV Std.')
        surface_names = ('Log Moneyness', 'Time to Maturity', 'Implied Volatility')

        return {
            'Datetime': column('Datetime'),
            'Symbol': column('Symbol'),
            'Mask Proportion': column('Mask Proportion'),
            'Market Features': {
                name: default_collate([item['Market Features'][name] for item in batch])
                for name in market_names
            },
            'Input Surface': {name: ragged('Input Surface', name) for name in surface_names},
            'Query Points': {
                'Log Moneyness': ragged('Query Points', 'Log Moneyness', track_grad=True),
                'Time to Maturity': ragged('Query Points', 'Time to Maturity', track_grad=True),
                'Implied Volatility': ragged('Query Points', 'Implied Volatility'),
            }
        }


# Assuming surfaces is the output from the implied_volatility_surfaces function.
# Masking settings and batch size come from HYPERPARAMETERS; the dataset is
# seeded with RANDOM_STATE.
mask_proportions = HYPERPARAMETERS['Input Preprocessing']['Mask Proportions']  
n_query_points = HYPERPARAMETERS['Input Preprocessing']['Number of Query Points']  
dataset = IVSurfaceDataset(surfaces, mask_proportions, RANDOM_STATE, n_query_points)
# The custom collate function keeps the variable-length point sets as lists.
data_loader = DataLoader(
    dataset, 
    batch_size=HYPERPARAMETERS['Input Preprocessing']['Batch Size'], 
    shuffle=True, 
    num_workers=0, 
    collate_fn=IVSurfaceDataset.collate_fn
)

# Fetch one batch from the DataLoader and display it as a sanity check
batch = next(iter(data_loader))
batch



{'Datetime': [Timestamp('2013-06-10 00:00:00'),
  Timestamp('2013-01-28 00:00:00'),
  Timestamp('2013-05-20 00:00:00'),
  Timestamp('2013-03-07 00:00:00')],
 'Symbol': ['AAPL', 'AAPL', 'GOOGL', 'AAPL'],
 'Mask Proportion': [0.7, 0.7, 0.1, 0.3],
 'Market Features': {'Market Return': tensor([-0.0003, -0.0019, -0.0007,  0.0018]),
  'Market Volatility': tensor([15.4400, 13.5700, 13.0200, 13.0600]),
  'Treasury Rate': tensor([0.0400, 0.0600, 0.0350, 0.0900]),
  'IV Mean': tensor([0.3143, 0.3340, 0.2723, 0.3624]),
  'IV Std.': tensor([0.0559, 0.0639, 0.0500, 0.1235])},
 'Input Surface': {'Log Moneyness': [tensor([-0.2852, -0.2553, -0.1573, -0.1310, -0.1181, -0.0928, -0.0804, -0.0804,
           -0.0681, -0.0560, -0.0440, -0.0205,  0.0025,  0.0138,  0.0470,  0.0578,
            0.0791,  0.0999,  0.1102,  0.1203,  0.1304,  0.1304,  0.1403,  0.1696,
            0.1886,  0.1886,  0.1980,  0.2073, -0.7859, -0.7859, -0.7371, -0.6906,
           -0.6906, -0.6462, -0.6247, -0.6036, -0.5430, -0.5236,

## Surface Embedding

### Components

In [7]:
import torch
import torch.nn as nn
from torch.utils.data._utils.collate import default_collate

class SurfaceBatchNorm(nn.Module):
    """Batch-normalise point coordinates and market features of a collated batch.

    Log moneyness and time to maturity are each normalised over all points of
    the batch (input-surface points and query points pooled together). Every
    market feature is normalised across the surfaces of the batch. Implied
    volatilities and metadata are passed through untouched.

    Args:
        num_features: ``num_features`` of every internal ``nn.BatchNorm1d``.
        momentum: ``momentum`` of every internal ``nn.BatchNorm1d``.
    """

    def __init__(
        self,
        num_features=1,
        momentum=0.1
    ):
        super().__init__()
        # One independent layer per quantity. The attribute names define the
        # state_dict keys, so they are spelled out explicitly.
        for layer_name in (
            'log_moneyness_bn',
            'time_to_maturity_bn',
            'market_return_bn',
            'market_volatility_bn',
            'treasury_rate_bn',
            'iv_mean_bn',
            'iv_std_bn',
        ):
            setattr(self, layer_name, nn.BatchNorm1d(num_features, momentum=momentum))

    def forward(self, batch):
        """Return a new batch dict with normalised coordinates and market features."""
        surface = batch['Input Surface']
        queries = batch['Query Points']

        # Per-surface point counts, needed to undo the pooling afterwards.
        surface_lengths = [len(points) for points in surface['Log Moneyness']]
        query_lengths = [len(points) for points in queries['Log Moneyness']]
        n_surface_points = sum(surface_lengths)

        def normalise_coordinate(layer, field):
            # Pool: all input-surface points first, then all query points.
            surface_pooled = torch.cat(list(surface[field]))
            pooled = torch.cat([surface_pooled, *queries[field]])
            normalised = layer(pooled.unsqueeze(1)).squeeze(1)
            surface_part = list(torch.split(normalised[:n_surface_points], surface_lengths))
            query_part = list(torch.split(normalised[n_surface_points:], query_lengths))
            return surface_part, query_part

        surface_lm, query_lm = normalise_coordinate(self.log_moneyness_bn, 'Log Moneyness')
        surface_ttm, query_ttm = normalise_coordinate(self.time_to_maturity_bn, 'Time to Maturity')

        market = batch['Market Features']
        market_layers = (
            ('Market Return', self.market_return_bn),
            ('Market Volatility', self.market_volatility_bn),
            ('Treasury Rate', self.treasury_rate_bn),
            ('IV Mean', self.iv_mean_bn),
            ('IV Std.', self.iv_std_bn),
        )
        normalised_market = {
            name: layer(market[name].unsqueeze(1)).squeeze(1)
            for name, layer in market_layers
        }

        # Query coordinates must carry gradients; volatilities are left as is.
        for coordinate_list in (query_lm, query_ttm):
            for tensor in coordinate_list:
                tensor.requires_grad_()

        return {
            'Datetime': batch['Datetime'],
            'Symbol': batch['Symbol'],
            'Mask Proportion': batch['Mask Proportion'],
            'Market Features': normalised_market,
            'Input Surface': {
                'Log Moneyness': surface_lm,
                'Time to Maturity': surface_ttm,
                'Implied Volatility': surface['Implied Volatility']
            },
            'Query Points': {
                'Log Moneyness': query_lm,
                'Time to Maturity': query_ttm,
                'Implied Volatility': queries['Implied Volatility']
            }
        }

# Usage
surfacebatchnorm = SurfaceBatchNorm()
processed_batch = surfacebatchnorm(batch)
processed_batch

{'Datetime': [Timestamp('2013-06-10 00:00:00'),
  Timestamp('2013-01-28 00:00:00'),
  Timestamp('2013-05-20 00:00:00'),
  Timestamp('2013-03-07 00:00:00')],
 'Symbol': ['AAPL', 'AAPL', 'GOOGL', 'AAPL'],
 'Mask Proportion': [0.7, 0.7, 0.1, 0.3],
 'Market Features': {'Market Return': tensor([-0.0216, -0.4603, -0.1269,  0.6087], grad_fn=<SqueezeBackward1>),
  'Market Volatility': tensor([ 1.6897, -0.2052, -0.7625, -0.7220], grad_fn=<SqueezeBackward1>),
  'Treasury Rate': tensor([-0.7439,  0.1717, -0.9728,  1.5450], grad_fn=<SqueezeBackward1>),
  'IV Mean': tensor([-0.1952,  0.4019, -1.4712,  1.2645], grad_fn=<SqueezeBackward1>),
  'IV Std.': tensor([-0.5892, -0.3177, -0.7905,  1.6974], grad_fn=<SqueezeBackward1>)},
 'Input Surface': {'Log Moneyness': [tensor([-0.8178, -0.7456, -0.5087, -0.4450, -0.4138, -0.3526, -0.3225, -0.3225,
           -0.2929, -0.2636, -0.2346, -0.1777, -0.1221, -0.0948, -0.0146,  0.0115,
            0.0630,  0.1134,  0.1382,  0.1627,  0.1870,  0.1870,  0.2111,  0.2

In [8]:
import torch
import torch.nn as nn
import numpy as np

class EllipticalRBFKernel(nn.Module):
    """Gaussian RBF kernel with a learnable per-axis scale and a fixed bandwidth.

    Args:
        input_dim: Size of the last axis of the difference vectors.
        bandwidth: Fixed kernel bandwidth.
        remove_kernel: Ablation switch. When set, the kernel returns 1.0 for
            an all-zero difference vector and 1e-10 otherwise.
    """

    def __init__(
        self,
        input_dim,
        bandwidth,
        remove_kernel=False
    ):
        super().__init__()
        self.bandwidth = bandwidth
        self.remove_kernel = remove_kernel
        # Log of the per-axis scale; zeros correspond to unit scale on every axis.
        self.log_scale = nn.Parameter(torch.zeros(input_dim))

    def forward(self, distances):
        """Evaluate the kernel on difference vectors stored along the last axis."""
        if self.remove_kernel:
            leading_shape = distances.shape[:-1]
            coincident = (distances == 0.0).all(dim=-1)
            on_value = torch.full(leading_shape, 1.0, device=distances.device)
            off_value = torch.full(leading_shape, 1e-10, device=distances.device)
            return torch.where(coincident, on_value, off_value)

        axis_scale = self.log_scale.exp()

        # Scale-weighted squared distance, normalised by the sum of the scales
        # (the trace of the diagonal scale matrix).
        weighted_squares = distances.pow(2) * axis_scale
        squared_norm = weighted_squares.sum(dim=-1) / axis_scale.sum()

        return torch.exp(-squared_norm / (2 * self.bandwidth ** 2))

class SurfaceContinuousKernelPositionalEmbedding(nn.Module):
    """Embed surface and query points via kernel-smoothed IVs plus a positional code.

    For every point (observed or queried) and every kernel, the embedding
    channel is the kernel-weighted average of the observed implied
    volatilities. Each channel uses its own fixed bandwidth,
    ``erfinv(i / (d_embedding + 1)) * sqrt(2)`` for ``i = 1..d_embedding``.
    The channels are layer-normalised (separate LayerNorms for input-surface
    and query points) and a sinusoidal positional embedding of the two
    coordinates, scaled by ``sqrt(2)``, is added.

    Args:
        d_embedding: Number of embedding channels (and of RBF kernels).
        remove_kernel: Forwarded to every ``EllipticalRBFKernel`` (ablation).
        remove_positional_embedding: When set, the positional embedding is all zeros.
    """

    def __init__(
        self, 
        d_embedding,
        remove_kernel=False,
        remove_positional_embedding=False
    ):
        super(SurfaceContinuousKernelPositionalEmbedding, self).__init__()
        self.d_embedding = d_embedding
        self.remove_positional_embedding = remove_positional_embedding

        # Initialize multiple RBF kernels, each with a different fixed bandwidth
        self.kernels = nn.ModuleList()
        for i in range(1, d_embedding + 1):
            bandwidth_value = torch.erfinv(torch.tensor(i / (d_embedding + 1))) * np.sqrt(2)
            self.kernels.append(
                EllipticalRBFKernel(
                    bandwidth=bandwidth_value, 
                    input_dim=2, 
                    remove_kernel=remove_kernel
                )
            )

        self.input_surface_layer_norm = nn.LayerNorm(d_embedding)
        self.query_points_layer_norm = nn.LayerNorm(d_embedding)

        # Initialize learnable scaling parameter (the base for positional embedding)
        self.log_scale = nn.Parameter(torch.log(torch.tensor(10000.0)))

    @staticmethod
    def _kernel_weighted_average(weighted_sum, normalization_factor, surface_ivs):
        """Divide ``weighted_sum`` by ``normalization_factor`` without producing NaN.

        When every kernel weight of a row underflows to zero (a point far from
        all observed points relative to the bandwidth) the plain ratio is 0/0.
        Such rows fall back to the unweighted mean of the observed IVs; rows
        with kernel support are computed exactly as a plain division.
        """
        smallest_normal = torch.finfo(normalization_factor.dtype).tiny
        has_support = normalization_factor > smallest_normal
        # Replace the denominator before dividing so that neither the forward
        # value nor the backward pass of unsupported rows involves 0/0.
        safe_normalization = torch.where(
            has_support, normalization_factor, torch.ones_like(normalization_factor)
        )
        ratio = weighted_sum / safe_normalization
        fallback = surface_ivs.mean().to(ratio.dtype)
        return torch.where(has_support, ratio, fallback)

    def forward(
        self, 
        input_surface_batch, 
        query_points_batch
    ):
        """Embed every surface of the batch.

        Args:
            input_surface_batch: Dict with lists 'Log Moneyness',
                'Time to Maturity' and 'Implied Volatility' (one 1-D tensor
                per surface).
            query_points_batch: Dict with lists 'Log Moneyness' and
                'Time to Maturity' (one 1-D tensor per surface).

        Returns:
            Dict with 'Input Surface' and 'Query Points', each a list holding
            one ``(n_points, d_embedding)`` tensor per surface.
        """
        batch_size = len(input_surface_batch['Log Moneyness'])

        input_surface_embeddings = []
        query_points_embeddings = []

        for i in range(batch_size):
            # Extract the coordinates and implied volatilities for each surface in the batch
            surface_coords = torch.stack([
                input_surface_batch['Log Moneyness'][i], 
                input_surface_batch['Time to Maturity'][i]
            ], dim=-1)
            surface_ivs = input_surface_batch['Implied Volatility'][i]

            query_coords = torch.stack([
                query_points_batch['Log Moneyness'][i], 
                query_points_batch['Time to Maturity'][i]
            ], dim=-1)

            all_coords = torch.cat((surface_coords, query_coords), dim=0)

            # Compute the pairwise differences between all points and the input surface points
            point_differences = all_coords.unsqueeze(1) - surface_coords.unsqueeze(0)  # (n+m, n, 2)

            # Initialize the output embeddings for the current surface with d_embedding channels
            all_embedded = torch.zeros((all_coords.shape[0], self.d_embedding), dtype=torch.float32, device=surface_coords.device)

            for kernel_idx, kernel in enumerate(self.kernels):
                # Apply the RBF kernel to each distance vector 
                kernel_outputs = kernel(point_differences)

                # Compute the weighted sum of IVs based on the kernel outputs
                weighted_sum = (kernel_outputs * surface_ivs.unsqueeze(0)).sum(dim=1)
                normalization_factor = kernel_outputs.sum(dim=1)

                # Guarded division: rows without any kernel support would
                # otherwise become NaN and poison the whole sequence.
                all_embedded[:, kernel_idx] = self._kernel_weighted_average(
                    weighted_sum, normalization_factor, surface_ivs
                )

            # Split the embeddings into input surface and query points embeddings
            input_surface_embedded = all_embedded[:surface_coords.shape[0], :]
            query_points_embedded = all_embedded[surface_coords.shape[0]:, :]

            # Normalize the embedded surfaces
            input_surface_embedded = self.input_surface_layer_norm(input_surface_embedded)
            query_points_embedded = self.query_points_layer_norm(query_points_embedded)

            # Positional embedding for input surface points
            input_surface_pe = self._compute_positional_embedding(surface_coords)

            # Positional embedding for query points
            query_points_pe = self._compute_positional_embedding(query_coords)

            # Add positional embeddings with a factor of sqrt(2)
            input_surface_final = input_surface_embedded + input_surface_pe * np.sqrt(2)
            query_points_final = query_points_embedded + query_points_pe * np.sqrt(2)

            # Append the encoded surface for this input surface to the batch list
            input_surface_embeddings.append(input_surface_final)
            query_points_embeddings.append(query_points_final)

        # Keep all encoded surfaces as lists to handle variable lengths
        return {
            'Input Surface': input_surface_embeddings,
            'Query Points': query_points_embeddings
        }

    def _compute_positional_embedding(
        self, 
        coords, 
    ):
        """Sinusoidal embedding of the two coordinates.

        Channels are filled in groups of four (sin/cos of coordinate 0, then
        sin/cos of coordinate 1); when ``d_embedding`` is not a multiple of
        four the remaining channels stay zero.
        """
        positional_embedding = torch.zeros(coords.size(0), self.d_embedding, device=coords.device)

        if not self.remove_positional_embedding:
            for i in range(self.d_embedding // 4):
                # Learnable base raised to a group-dependent exponent.
                div_factor = torch.exp(self.log_scale) ** (4 * i / self.d_embedding)
                positional_embedding[:, 4 * i] = torch.sin(coords[:, 0] / div_factor)
                positional_embedding[:, 4 * i + 1] = torch.cos(coords[:, 0] / div_factor)
                positional_embedding[:, 4 * i + 2] = torch.sin(coords[:, 1] / div_factor)
                positional_embedding[:, 4 * i + 3] = torch.cos(coords[:, 1] / div_factor)

        return positional_embedding

# Embedding width read from HYPERPARAMETERS for the example usage of this module
d_embedding = HYPERPARAMETERS['Surface Embedding']['Embedding Dimension']  # Desired number of output channels

# continuous_kernel_positional_embedding = SurfaceContinuousKernelPositionalEmbedding(d_embedding=d_embedding)
# kernel_positional_embedded_batch = continuous_kernel_positional_embedding(processed_batch['Input Surface'], processed_batch['Query Points'])
# kernel_positional_embedded_batch

In [9]:
# import gc
# import psutil
# import os
# def print_memory_usage(stage):
#     process = psutil.Process(os.getpid())
#     print(f"[{stage}] Memory Usage: {process.memory_info().rss / 1024 ** 2:.2f} MB")

# def initialize_model(d_embedding):
#     model = SurfaceContinuousKernelPositionalEmbedding(d_embedding=d_embedding)
#     return model

# def run_model(model, processed_batch):
#     return model(processed_batch['Input Surface'], processed_batch['Query Points'])

# print_memory_usage("Before initialization")
# model = initialize_model(d_embedding)
# print_memory_usage("After initialization")

# kernel_positional_embedded_batch = run_model(model, processed_batch)
# print_memory_usage("After model run")

# # Clear references and collect garbage
# del model, kernel_positional_embedded_batch
# gc.collect()
# print_memory_usage("After cleanup")

In [10]:
# import psutil
# import os

# def print_memory_usage():
#     process = psutil.Process(os.getpid())
#     mem_info = process.memory_info()
#     print(f"RSS: {mem_info.rss / (1024 * 1024):.2f} MB")

# print_memory_usage()

# # Cleanup to free up RAM
# del continuous_kernel_positional_embedding
# del kernel_positional_embedded_batch
# del processed_batch

# print_memory_usage()

In [11]:

# def run_model(processed_batch, HYPERPARAMETERS):
#     d_embedding = HYPERPARAMETERS['Surface Embedding']['Embedding Dimension']
#     continuous_kernel_positional_embedding = SurfaceContinuousKernelPositionalEmbedding(d_embedding=d_embedding)
#     kernel_positional_embedded_batch = continuous_kernel_positional_embedding(
#         processed_batch['Input Surface'], processed_batch['Query Points']
#     )
#     return kernel_positional_embedded_batch

# import torch

# def hook(module, input, output):
#     print(f"Memory allocated: {torch.cuda.memory_allocated()}")
#     print(f"Max memory allocated: {torch.cuda.max_memory_allocated()}")
#     print(f"Memory reserved: {torch.cuda.memory_reserved()}")
#     print(f"Max memory reserved: {torch.cuda.max_memory_reserved()}")

# continuous_kernel_positional_embedding = SurfaceContinuousKernelPositionalEmbedding(d_embedding=d_embedding)

# # Register the hook to your model layers
# for name, module in continuous_kernel_positional_embedding.named_modules():
#     module.register_forward_hook(hook)
    
# kernel_positional_embedded_batch = continuous_kernel_positional_embedding(processed_batch['Input Surface'], processed_batch['Query Points'])



### Block

In [12]:
import torch
import torch.nn as nn
import numpy as np

class SurfaceEmbedding(nn.Module):
    """Full embedding stage: batch norm, kernel/positional embedding, mask token, layer norm.

    Args:
        d_embedding: Number of embedding channels.
        momentum: Momentum forwarded to ``SurfaceBatchNorm``.
        remove_kernel: Forwarded to the kernel/positional embedding (ablation).
        remove_positional_embedding: Forwarded to the kernel/positional
            embedding (ablation).
    """

    def __init__(
        self,
        d_embedding,
        momentum=0.1,
        remove_kernel=False,
        remove_positional_embedding=False
    ):
        super().__init__()
        self.batch_norm = SurfaceBatchNorm(num_features=1, momentum=momentum)
        self.kernel_positional_embedding = SurfaceContinuousKernelPositionalEmbedding(
            d_embedding,
            remove_kernel,
            remove_positional_embedding,
        )
        self.layer_norm = nn.LayerNorm(d_embedding)
        # Learnable vector added to every query-point embedding.
        self.mask_token = nn.Parameter(torch.randn(d_embedding))

    def forward(self, batch):
        """Embed a collated batch.

        Returns:
            A tuple ``(embedded_sequences, external_features_batch)``: a list
            with one tensor per surface (input-surface rows followed by query
            rows), and the normalised market features stacked along the last
            axis in the order return, volatility, rate, IV mean, IV std.
        """
        normalised = self.batch_norm(batch)

        feature_order = (
            'Market Return',
            'Market Volatility',
            'Treasury Rate',
            'IV Mean',
            'IV Std.',
        )
        external_features_batch = torch.stack(
            [normalised['Market Features'][name] for name in feature_order],
            dim=-1,
        )  # (batch, features)

        embedded = self.kernel_positional_embedding(
            normalised['Input Surface'],
            normalised['Query Points'],
        )

        # Per surface: observed rows, then query rows shifted by the mask
        # token, layer-normalised as one sequence.
        embedded_sequences = [
            self.layer_norm(torch.cat((surface_rows, query_rows + self.mask_token), dim=0))
            for surface_rows, query_rows in zip(embedded['Input Surface'], embedded['Query Points'])
        ]

        return embedded_sequences, external_features_batch


# # Example of initializing and using this module
# d_embedding = HYPERPARAMETERS['Surface Embedding']['Embedding Dimension']  # Desired number of output channels
# surface_embedding = SurfaceEmbedding(d_embedding=d_embedding)
# embedded_sequences_batch, external_features_batch = surface_embedding(batch)
# embedded_sequences_batch

# Surface Encoding

## Encoder

In [13]:
import torch
import torch.nn as nn

class ResidualNorm(nn.Module):
    """Residual connection followed by layer normalisation: ``LayerNorm(x + sublayer(x))``."""

    def __init__(self, d_embedding):
        super().__init__()
        self.norm = nn.LayerNorm(d_embedding)

    def forward(
        self,
        x,
        sublayer_output
    ):
        """Add the sublayer output onto its input and normalise the sum."""
        residual_sum = x + sublayer_output
        return self.norm(residual_sum)
    

class GatedAttentionFusion(nn.Module):
    """Blend self-attention and external-attention outputs with a learned gate.

    The gate is ``Dropout(Sigmoid(Linear([self, external])))`` and the fused
    output is ``gate * self + (1 - gate) * external``.

    Args:
        d_embedding: Embedding width of both inputs.
        gate_dropout: Dropout probability applied to the gate values.
        weight_initializer_std: Std of the normal init of the gate weights.
        bias_initializer_value: Constant the gate bias is initialised to.
        remove_external_attention: Ablation; return the self-attention output as is.
        remove_gate: Ablation; return the plain sum of both inputs.
    """

    def __init__(
        self,
        d_embedding,
        gate_dropout,
        weight_initializer_std=0.02,
        bias_initializer_value=10.0,
        remove_external_attention=False,
        remove_gate=False
    ):
        super().__init__()
        self.gate_layer = nn.Sequential(
            nn.Linear(d_embedding * 2, d_embedding),
            nn.Sigmoid(),
            nn.Dropout(gate_dropout)
        )
        self.remove_external_attention = remove_external_attention
        self.remove_gate = remove_gate

        self._initialize_weights(weight_initializer_std, bias_initializer_value)

    def _initialize_weights(
        self,
        std,
        bias_value
    ):
        """Normal-initialise the weights and constant-initialise the biases of the gate's linear layers."""
        linear_layers = [layer for layer in self.gate_layer if isinstance(layer, nn.Linear)]
        for linear in linear_layers:
            nn.init.normal_(linear.weight, mean=0.0, std=std)
            nn.init.constant_(linear.bias, bias_value)

    def forward(
        self,
        self_attention_output,
        external_attention_output
    ):
        """Fuse the two attention outputs according to the configured mode."""
        # Ablation: ignore the external branch entirely.
        if self.remove_external_attention:
            return self_attention_output

        # Ablation: ungated sum of both branches.
        if self.remove_gate:
            return self_attention_output + external_attention_output

        gate_input = torch.cat((self_attention_output, external_attention_output), dim=-1)
        gate_values = self.gate_layer(gate_input)

        return gate_values * self_attention_output + (1 - gate_values) * external_attention_output
    
    
class FeedForwardNetwork(nn.Module):
    """Position-wise feed-forward block: Linear -> GELU -> Dropout -> Linear -> Dropout.

    Args:
        d_embedding: Input and output width.
        ffn_hidden_dim: Width of the hidden layer.
        ffn_dropout: Dropout probability used after the activation and after
            the output projection.
        layer_depth: Depth value used to rescale the output projection by
            ``1 / sqrt(2 * layer_depth)`` at initialisation.
        weight_initializer_std: Std of the normal weight initialisation.
        bias_initializer_value: Constant all biases are initialised to.
    """

    def __init__(
        self,
        d_embedding,
        ffn_hidden_dim,
        ffn_dropout,
        layer_depth,
        weight_initializer_std=0.02,
        bias_initializer_value=0,
    ):
        super().__init__()
        expand = nn.Linear(d_embedding, ffn_hidden_dim)
        project = nn.Linear(ffn_hidden_dim, d_embedding)
        self.feedforward = nn.Sequential(
            expand,
            nn.GELU(),
            nn.Dropout(ffn_dropout),
            project,
            nn.Dropout(ffn_dropout)
        )

        self.layer_depth = layer_depth
        self._initialize_weights(weight_initializer_std, bias_initializer_value)

    def forward(self, x):
        return self.feedforward(x)

    def _initialize_weights(
        self,
        std,
        bias_value
    ):
        """Initialise every linear layer and down-scale the output projection."""
        # The output projection sits second to last, right before the final dropout.
        output_projection_index = len(self.feedforward) - 2

        for position, layer in enumerate(self.feedforward):
            if not isinstance(layer, nn.Linear):
                continue

            nn.init.normal_(layer.weight, mean=0.0, std=std)
            nn.init.constant_(layer.bias, bias_value)

            if position == output_projection_index:
                depth_scale = 1 / (2 * self.layer_depth) ** 0.5
                with torch.no_grad():
                    layer.weight.mul_(depth_scale)


class Encoder(nn.Module):
    """One encoder block: self-attention and external attention, fused, then an FFN.

    The surface embeddings attend to themselves and, separately, act as
    queries over the external features (keys/values). The two branches are
    merged by ``GatedAttentionFusion`` and passed through a
    ``FeedForwardNetwork``. Each stage output goes through a ``ResidualNorm``
    (defined elsewhere; presumably residual-add plus normalisation — confirm).

    Both attention modules use the default ``batch_first=False`` layout.

    Args:
        d_embedding: Embedding size of the surface tokens.
        n_heads: Number of attention heads for both attention modules.
        ffn_hidden_dim: Hidden size of the feed-forward network.
        attention_dropout: Dropout probability inside both attention modules.
        gate_dropout: Dropout value forwarded to the gated fusion.
        ffn_dropout: Dropout probability inside the feed-forward network.
        external_dim: Feature size of the external keys/values.
        layer_depth: Depth used for the ``1 / sqrt(2 * layer_depth)``
            rescaling of output projections at initialisation.
        weight_initializer_std: Std of the normal weight initialiser.
        linear_bias_initializer_value: Constant for attention/FFN biases.
        gate_bias_initializer_value: Constant forwarded to the gated fusion.
        remove_external_attention: Ablation flag forwarded to the fusion.
        remove_gate: Ablation flag forwarded to the fusion.
    """

    def __init__(
        self,
        d_embedding,
        n_heads,
        ffn_hidden_dim,
        attention_dropout,
        gate_dropout,
        ffn_dropout,
        external_dim,
        layer_depth,
        weight_initializer_std=0.02,
        linear_bias_initializer_value=0.0,
        gate_bias_initializer_value=10.0,
        remove_external_attention=False,
        remove_gate=False
    ):
        super().__init__()

        # Sub-modules are registered in a fixed order; their constructors draw
        # random numbers, so the order matters for reproducibility.
        self.self_attention = nn.MultiheadAttention(
            d_embedding,
            n_heads,
            dropout=attention_dropout,
        )
        self.residual_norm_self_attention = ResidualNorm(d_embedding)

        self.external_attention = nn.MultiheadAttention(
            d_embedding,
            n_heads,
            dropout=attention_dropout,
            kdim=external_dim,
            vdim=external_dim,
        )
        self.residual_norm_external_attention = ResidualNorm(d_embedding)

        self.gated_attention_fusion = GatedAttentionFusion(
            d_embedding,
            gate_dropout,
            weight_initializer_std,
            gate_bias_initializer_value,
            remove_external_attention,
            remove_gate,
        )
        self.residual_norm_fusion = ResidualNorm(d_embedding)

        self.feed_forward = FeedForwardNetwork(
            d_embedding=d_embedding,
            ffn_hidden_dim=ffn_hidden_dim,
            ffn_dropout=ffn_dropout,
            layer_depth=layer_depth,
            weight_initializer_std=weight_initializer_std,
            bias_initializer_value=linear_bias_initializer_value,
        )
        self.residual_norm_ffn = ResidualNorm(d_embedding)

        # Custom initialisation: self-attention first, then external attention
        for attention in (self.self_attention, self.external_attention):
            self._initialize_attention_weights(
                attention,
                weight_initializer_std,
                linear_bias_initializer_value,
                layer_depth,
            )

    def _initialize_attention_weights(
        self,
        attention_module,
        weight_initializer_std,
        linear_bias_initializer_value,
        layer_depth
    ):
        """Re-initialise the projections and biases of one attention module.

        Input projections are drawn from N(0, std^2); the output projection
        uses a std reduced by ``1 / sqrt(2 * layer_depth)``. Biases that exist
        are set to ``linear_bias_initializer_value``.
        """
        # A packed in_proj_weight exists only when q, k and v share one size
        if attention_module._qkv_same_embed_dim:
            input_projections = (attention_module.in_proj_weight,)
        else:
            input_projections = (
                attention_module.q_proj_weight,
                attention_module.k_proj_weight,
                attention_module.v_proj_weight,
            )
        for projection in input_projections:
            nn.init.normal_(projection, mean=0.0, std=weight_initializer_std)

        if attention_module.in_proj_bias is not None:
            for bias in (attention_module.in_proj_bias, attention_module.out_proj.bias):
                nn.init.constant_(bias, linear_bias_initializer_value)

        # Optional learned key/value bias vectors
        for extra_bias in (attention_module.bias_k, attention_module.bias_v):
            if extra_bias is not None:
                nn.init.constant_(extra_bias, linear_bias_initializer_value)

        # Depth-dependent rescaling of the output projection
        residual_scale = 1 / (2 * layer_depth) ** 0.5
        nn.init.normal_(
            attention_module.out_proj.weight,
            mean=0.0,
            std=weight_initializer_std * residual_scale,
        )

    def forward(
        self,
        surface_embeddings,
        external_features,
        output_attention_map=False
    ):
        """Run the block on one sequence of surface embeddings.

        Args:
            surface_embeddings: Query/key/value input of the self-attention and
                query input of the external attention.
            external_features: Key/value input of the external attention.
            output_attention_map: When truthy, also return both attention
                weight tensors with dimension 0 squeezed out.

        Returns:
            ``(encoded, self_attention_weights, external_attention_weights)``;
            the two weight entries are ``None`` unless maps were requested.
        """
        block_input = surface_embeddings

        # Branch 1: self-attention over the surface tokens
        attended, self_attention_weights = self.self_attention(block_input, block_input, block_input)
        self_branch = self.residual_norm_self_attention(block_input, attended)

        # Branch 2: surface tokens attend to the external features
        attended_external, external_attention_weights = self.external_attention(
            block_input, external_features, external_features
        )
        external_branch = self.residual_norm_external_attention(block_input, attended_external)

        # Merge both branches, then apply the feed-forward network
        fused = self.gated_attention_fusion(self_branch, external_branch)
        fused = self.residual_norm_fusion(block_input, fused)
        encoded = self.residual_norm_ffn(fused, self.feed_forward(fused))

        if not output_attention_map:
            return encoded, None, None

        # Remove the batch dimension for attention weights
        return encoded, self_attention_weights.squeeze(0), external_attention_weights.squeeze(0)

class SurfaceEncoder(nn.Module):
    """Stack of ``Encoder`` blocks applied to each surface of a batch in turn.

    Block number ``k`` (1-based) is built with ``layer_depth=k``. All other
    constructor arguments are forwarded unchanged to every ``Encoder``.
    """

    def __init__(
        self,
        d_embedding,
        num_encoder_blocks,
        n_heads,
        ffn_hidden_dim,
        attention_dropout,
        gate_dropout,
        ffn_dropout,
        external_dim,
        weight_initializer_std=0.02,
        linear_bias_initializer_value=0.0,
        gate_bias_initializer_value=10.0,
        remove_external_attention=False,
        remove_gate=False
    ):
        super().__init__()
        blocks = []
        for depth in range(1, num_encoder_blocks + 1):
            blocks.append(
                Encoder(
                    d_embedding,
                    n_heads,
                    ffn_hidden_dim,
                    attention_dropout,
                    gate_dropout,
                    ffn_dropout,
                    external_dim,
                    depth,
                    weight_initializer_std,
                    linear_bias_initializer_value,
                    gate_bias_initializer_value,
                    remove_external_attention,
                    remove_gate,
                )
            )
        self.encoders = nn.ModuleList(blocks)

    def forward(
        self,
        embedded_sequences_batch,
        external_features_batch,
        output_attention_map=False
    ):
        """Encode every sequence of the batch independently.

        Args:
            embedded_sequences_batch: Indexable collection of per-surface
                embeddings (assumed ``(seq_len, d_embedding)`` each — TODO confirm).
            external_features_batch: Indexable collection of per-surface
                external features (assumed 1-D ``(external_dim,)`` each — TODO confirm).
            output_attention_map: When truthy, collect the attention maps of
                the last encoder block for every surface.

        Returns:
            ``(encoded_sequences, self_attention_maps, external_attention_maps)``
            where the first item is a list with one tensor per surface and the
            map entries are lists, or ``None`` when maps were not requested.
        """
        final_block = len(self.encoders) - 1
        encoded_sequences_batch = []
        self_attention_maps = []
        external_attention_maps = []

        for index, sequence in enumerate(embedded_sequences_batch):
            # Each surface is processed as a batch of one: insert a singleton
            # axis at position 1 of the tokens and two leading singleton axes
            # on the external features.
            hidden = sequence.unsqueeze(1)
            context = external_features_batch[index].unsqueeze(0).unsqueeze(0)

            for depth, block in enumerate(self.encoders):
                # Only the last block is asked for its attention weights
                if output_attention_map and depth == final_block:
                    hidden, last_self_map, last_external_map = block(hidden, context, output_attention_map)
                else:
                    hidden, _, _ = block(hidden, context)

            encoded_sequences_batch.append(hidden.squeeze(1))
            if output_attention_map:
                self_attention_maps.append(last_self_map)
                external_attention_maps.append(last_external_map)

        if not output_attention_map:
            return encoded_sequences_batch, None, None

        return encoded_sequences_batch, self_attention_maps, external_attention_maps

# Example: build a stand-alone SurfaceEncoder from the notebook hyperparameters.
# `d_embedding` must already be defined before this point.
torch.manual_seed(RANDOM_STATE)
encoding_hyperparameters = HYPERPARAMETERS['Surface Encoding']
n_heads = encoding_hyperparameters['Number of Heads']
ffn_hidden_dim = encoding_hyperparameters['FFN Hidden Dimension']
attention_dropout = encoding_hyperparameters['Attention Dropout']
gate_dropout = encoding_hyperparameters['Gate Dropout']
ffn_dropout = encoding_hyperparameters['FFN Dropout']
num_encoder_blocks = encoding_hyperparameters['Number of Blocks']
external_dim = 5

surface_encoder = SurfaceEncoder(
    d_embedding=d_embedding,
    num_encoder_blocks=num_encoder_blocks,
    n_heads=n_heads,
    ffn_hidden_dim=ffn_hidden_dim,
    attention_dropout=attention_dropout,
    gate_dropout=gate_dropout,
    ffn_dropout=ffn_dropout,
    external_dim=external_dim,
)

# Usage sketch: `embedded_sequences_batch` is the output of the SurfaceEmbedding
# module and `external_features_batch` the formatted external market features.
# encoded_sequences_batch, self_attention_map_batch, external_attention_map_batch = surface_encoder(embedded_sequences_batch, external_features_batch)
# encoded_sequences_batch

# IvySPT

In [14]:
import torch
import torch.nn as nn

class IvySPT(nn.Module):
    """End-to-end model: surface embedding -> encoder stack -> linear IV head.

    Args:
        d_embedding: Embedding size shared by the embedding, encoder and head.
        num_encoder_blocks: Number of encoder blocks in the ``SurfaceEncoder``.
        n_heads: Number of attention heads per encoder block.
        ffn_hidden_dim: Hidden size of each block's feed-forward network.
        attention_dropout: Dropout forwarded to the encoder's attention modules.
        gate_dropout: Dropout forwarded to the encoder's gated fusion.
        ffn_dropout: Dropout forwarded to the encoder's feed-forward networks.
        external_dim: Feature size of the external features.
        weight_initializer_std: Base std of the normal weight initialisers.
        linear_bias_initializer_value: Constant used for linear biases.
        gate_bias_initializer_value: Constant forwarded to the gated fusion.
        remove_kernel: Ablation flag forwarded to ``SurfaceEmbedding``.
        remove_positional_embedding: Ablation flag forwarded to ``SurfaceEmbedding``.
        remove_external_attention: Ablation flag forwarded to ``SurfaceEncoder``.
        remove_gate: Ablation flag forwarded to ``SurfaceEncoder``.
    """

    def __init__(
        self, 
        d_embedding, 
        num_encoder_blocks,
        n_heads, 
        ffn_hidden_dim,
        attention_dropout, 
        gate_dropout,
        ffn_dropout,
        external_dim,
        weight_initializer_std=0.02,
        linear_bias_initializer_value=0.0,
        gate_bias_initializer_value=10.0,
        remove_kernel=False,
        remove_positional_embedding=False,
        remove_external_attention=False,
        remove_gate=False
    ):
        super(IvySPT, self).__init__()
        self.surface_embedding = SurfaceEmbedding(
            d_embedding, 
            remove_kernel, 
            remove_positional_embedding
        )
        self.surface_encoder = SurfaceEncoder(
            d_embedding, 
            num_encoder_blocks,
            n_heads, 
            ffn_hidden_dim,
            attention_dropout, 
            gate_dropout,
            ffn_dropout,
            external_dim,
            weight_initializer_std,
            linear_bias_initializer_value,
            gate_bias_initializer_value,
            remove_external_attention,
            remove_gate
        )
        self.final_layer = nn.Linear(d_embedding, 1)
        # The head is initialised as if it were one layer deeper than the last
        # encoder block: std is scaled by 1 / sqrt(2 * (num_encoder_blocks + 1)).
        head_scale = 1 / (2 * (num_encoder_blocks + 1)) ** 0.5
        nn.init.normal_(self.final_layer.weight, mean=0.0, std=weight_initializer_std * head_scale)
        nn.init.constant_(self.final_layer.bias, linear_bias_initializer_value)

    def forward(
        self, 
        batch,
        output_attention_map=False
    ):
        """Estimate the implied volatility at every query point of every surface.

        Args:
            batch: Mapping consumed by ``SurfaceEmbedding``; this method itself
                reads ``batch['Query Points']['Log Moneyness'][i]`` to learn how
                many query points surface ``i`` has.
            output_attention_map: When truthy, also return the attention maps
                produced by the encoder.

        Returns:
            ``(iv_estimates_batch, self_attention_maps, external_attention_maps)``.
            ``iv_estimates_batch`` is a list with one 1-D tensor per surface
            (empty for a surface without query points). The attention maps are
            returned exactly as produced by ``SurfaceEncoder`` (they are not
            restricted to the query rows) or are ``None`` when not requested.
        """
        # Obtain the embedded sequences and external features from the SurfaceEmbedding module
        embedded_sequences_batch, external_features_batch = self.surface_embedding(batch)

        # Encode the sequences using the SurfaceEncoder module
        encoded_sequences_batch, self_attention_maps, external_attention_maps = self.surface_encoder(
            embedded_sequences_batch, 
            external_features_batch, 
            output_attention_map
        )

        # List to hold the implied volatility estimates for each query point in the batch
        iv_estimates_batch = []

        for i, encoded_sequence in enumerate(encoded_sequences_batch):
            # Determine the number of query points for this sequence
            num_query_points = len(batch['Query Points']['Log Moneyness'][i])

            # The encoded query points are the last num_query_points elements of
            # the sequence. A plain `[-num_query_points:]` would select the WHOLE
            # sequence when there are no query points (`[-0:]` == `[0:]`), so the
            # empty case is handled explicitly.
            if num_query_points > 0:
                encoded_query_points = encoded_sequence[-num_query_points:]
            else:
                encoded_query_points = encoded_sequence[:0]

            # Estimate the implied volatility for each query point using the fully connected layer
            iv_estimates_batch.append(self.final_layer(encoded_query_points).squeeze(-1))

        if output_attention_map:
            return iv_estimates_batch, self_attention_maps, external_attention_maps
        
        return iv_estimates_batch, None, None

# Example: build IvySPT from the notebook hyperparameters and run it on `batch`
torch.manual_seed(RANDOM_STATE)
encoding_hyperparameters = HYPERPARAMETERS['Surface Encoding']
n_heads = encoding_hyperparameters['Number of Heads']
ffn_hidden_dim = encoding_hyperparameters['FFN Hidden Dimension']
attention_dropout = encoding_hyperparameters['Attention Dropout']
gate_dropout = encoding_hyperparameters['Gate Dropout']
ffn_dropout = encoding_hyperparameters['FFN Dropout']
num_encoder_blocks = encoding_hyperparameters['Number of Blocks']
# Desired number of output channels of the embedding
d_embedding = HYPERPARAMETERS['Surface Embedding']['Embedding Dimension']
external_dim = 5

ivy_spt = IvySPT(
    d_embedding=d_embedding,
    num_encoder_blocks=num_encoder_blocks,
    n_heads=n_heads,
    ffn_hidden_dim=ffn_hidden_dim,
    attention_dropout=attention_dropout,
    gate_dropout=gate_dropout,
    ffn_dropout=ffn_dropout,
    external_dim=external_dim,
)

# Forward pass: implied volatility estimates for the query points of `batch`
# (attention maps are not requested, so both map outputs are None)
iv_estimates_batch, self_attention_maps, external_attention_maps = ivy_spt(batch, output_attention_map=False)
gc.collect()
iv_estimates_batch

[tensor([0.0168], grad_fn=<SqueezeBackward1>),
 tensor([0.0115], grad_fn=<SqueezeBackward1>),
 tensor([0.0085], grad_fn=<SqueezeBackward1>),
 tensor([0.0136], grad_fn=<SqueezeBackward1>)]

In [15]:
# Target implied volatilities of the batch's query points, shown for comparison
# with the estimates of the freshly initialised model above
batch['Query Points']['Implied Volatility']

[tensor([0.2849]), tensor([0.6361]), tensor([0.2944]), tensor([0.2965])]

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SurfaceArbitrageFreeLoss(nn.Module):
    """Fitting error plus soft calendar- and butterfly-arbitrage penalties.

    With total implied variance ``w = T * sigma^2`` (``T`` time to maturity,
    ``sigma`` the estimated implied volatility, ``x`` log moneyness):

    * calendar penalty: ``max(-dw/dT, 0)^2`` — ``w`` must not decrease in ``T``;
    * butterfly penalty: ``max(-g, 0)^2`` with
      ``g = (1 - x*w_x/(2w))^2 - (w_x^2/4) * (1/w + 1/4) + w_xx/2``.

    Args:
        mse_coeff: Weight of the mean squared error term.
        calendar_coeff: Weight of the calendar-arbitrage penalty.
        butterfly_coeff: Weight of the butterfly-arbitrage penalty.
    """

    def __init__(
        self, 
        mse_coeff=1,
        calendar_coeff=1,
        butterfly_coeff=1
    ):
        super(SurfaceArbitrageFreeLoss, self).__init__()
        self.mse_coeff = mse_coeff
        self.calendar_coeff = calendar_coeff
        self.butterfly_coeff = butterfly_coeff

    def forward(self, iv_estimates_batch, batch):
        """Compute the combined loss and its three components.

        Args:
            iv_estimates_batch: Iterable with one tensor of estimates per
                surface. The estimates must be differentiable functions of the
                query-point tensors in ``batch`` (the derivatives are taken
                with ``torch.autograd.grad``).
            batch: Mapping whose ``'Query Points'`` entry holds the lists
                ``'Implied Volatility'`` (targets), ``'Time to Maturity'`` and
                ``'Log Moneyness'``, aligned with ``iv_estimates_batch``.

        Returns:
            ``(total_loss, mse_loss, calendar_arbitrage_loss,
            butterfly_arbitrage_loss)``; each component is the mean over all
            query points of the batch, ``total_loss`` their weighted sum.
        """
        mse_losses = []
        calendar_arbitrage_losses = []
        butterfly_arbitrage_losses = []

        for iv_estimates, target_volatility, time_to_maturity, log_moneyness in zip(
            iv_estimates_batch, 
            batch['Query Points']['Implied Volatility'], 
            batch['Query Points']['Time to Maturity'], 
            batch['Query Points']['Log Moneyness']
        ):
            # Squared error per query point between estimates and targets
            mse_losses.append(F.mse_loss(iv_estimates, target_volatility, reduction='none'))

            # Total implied variance w = T * sigma^2
            w = time_to_maturity * iv_estimates.pow(2)

            # Derivatives of w w.r.t. the query coordinates. create_graph=True keeps
            # them differentiable so the penalties can be back-propagated.
            # NOTE(review): differentiating the sum gives per-point derivatives only
            # if each estimate depends solely on its own query coordinates — confirm
            # when a surface carries more than one query point.
            w_t = torch.autograd.grad(w.sum(), time_to_maturity, create_graph=True)[0]
            w_x = torch.autograd.grad(w.sum(), log_moneyness, create_graph=True)[0]
            w_xx = torch.autograd.grad(w_x.sum(), log_moneyness, create_graph=True)[0]

            # Calendar arbitrage: penalise dw/dT < 0
            calendar_arbitrage_losses.append(torch.clamp(-w_t, min=0) ** 2)

            # Butterfly arbitrage: penalise g < 0. The middle term uses the SQUARED
            # first derivative, w_x^2 / 4 * (1/w + 1/4).
            g = (1 - log_moneyness * w_x / (2 * w)) ** 2 - w_x.pow(2) / 4 * (1 / w + 1 / 4) + w_xx / 2
            butterfly_arbitrage_losses.append(torch.clamp(-g, min=0) ** 2)

        # Combine all losses
        mse_loss = torch.cat(mse_losses).mean()
        calendar_arbitrage_loss = torch.cat(calendar_arbitrage_losses).mean()
        butterfly_arbitrage_loss = torch.cat(butterfly_arbitrage_losses).mean()

        total_loss = self.mse_coeff * mse_loss + self.calendar_coeff * calendar_arbitrage_loss + \
            self.butterfly_coeff * butterfly_arbitrage_loss

        return total_loss, mse_loss, calendar_arbitrage_loss, butterfly_arbitrage_loss

# Evaluate the loss (unit coefficients) on the estimates computed above
surface_arbitrage_free_loss = SurfaceArbitrageFreeLoss()
loss_terms = surface_arbitrage_free_loss(iv_estimates_batch, batch)
total_loss, mse_loss, calendar_arbitrage_loss, butterfly_arbitrage_loss = loss_terms
loss_terms

(tensor(0.1559, grad_fn=<AddBackward0>),
 tensor(0.1559, grad_fn=<MeanBackward0>),
 tensor(0., grad_fn=<MeanBackward0>),
 tensor(0., grad_fn=<MeanBackward0>))