In [1]:
import numpy as np
import pandas as pd
import random
import torch

In [2]:
# Set the random seed for reproducibility
RANDOM_STATE = 0
BATCH_SIZE = 32
N_JOBS = 8
torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

In [3]:
aapl_googl_data = pd.read_csv('volatility_surface_AAPL_GOOGL_2013_01_2013_06.csv', parse_dates=True, index_col=[0, 1], date_format="ISO8601")
aapl_googl_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Log Moneyness,Time to Maturity,Implied Volatility,Market Return,Market Volatility,Treasury Rate
Datetime,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-02,AAPL,-0.316688,0.007937,0.3726,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.316688,0.007937,0.6095,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.304266,0.007937,0.3726,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.304266,0.007937,0.6095,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.291996,0.007937,0.3726,0.025086,14.680000,0.055
...,...,...,...,...,...,...,...
2013-06-28,GOOGL,0.427518,2.253968,0.2430,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.434898,2.253968,0.2383,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.434898,2.253968,0.2426,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.442224,2.253968,0.2402,-0.004299,16.860001,0.030


In [4]:
import gc
from joblib_progress import joblib_progress
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from joblib import Parallel, delayed

def implied_volatility_surface_datasets(
    options_market_data, 
    proportions, 
    n_jobs=1,
    random_state=0,
    n_chunks=1
):
    def mask_surface(
        date, 
        symbol, 
        surface, 
        rng
    ):
        def mask_surface_with_proportion(
            surface_data, 
            proportion, 
        ):
            n_clusters = int(np.ceil(1 / proportion))
            points_coordinates = surface_data['points_coordinates']
            points_volatilities = surface_data['points_volatilities']

            # Create the clustering pipeline
            pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('kmeans', KMeans(n_clusters=n_clusters, random_state=random_state, n_init='auto'))
            ])
            
            # Fit the pipeline to the data points
            labels = pipeline.fit_predict(points_coordinates)
            
            single_surface_datasets = []
            for cluster in range(n_clusters):
                cluster_indices = np.where(labels == cluster)[0]
                num_to_mask = int(np.ceil(len(cluster_indices) * proportion))
                masked_indices = rng.choice(cluster_indices, size=num_to_mask, replace=False)
                
                for idx in masked_indices:
                    unmasked_indices = np.setdiff1d(cluster_indices, masked_indices)

                    single_surface_datasets.append({
                        'Datetime': surface_data['datetime'],
                        'Symbol': surface_data['symbol'],
                        'Market Features': surface_data['market_features'],
                        'Input Surface': {
                            'Log Moneyness': points_coordinates[unmasked_indices, 0],
                            'Time to Maturity': points_coordinates[unmasked_indices, 1],
                            'Implied Volatility': points_volatilities[unmasked_indices]
                        },
                        'Query Point': {
                            'Log Moneyness': points_coordinates[idx, 0],
                            'Time to Maturity': points_coordinates[idx, 1]
                        },
                        'Target Volatility': points_volatilities[idx]
                    })

            return single_surface_datasets
        
        surface_data = {
            'datetime': date,
            'symbol': symbol,
            'points_coordinates': surface[['Log Moneyness', 'Time to Maturity']].values,
            'points_volatilities': surface['Implied Volatility'].values,
            'market_features': {
                'Market Return': surface['Market Return'].values[0],
                'Market Volatility': surface['Market Volatility'].values[0],
                'Treasury Rate': surface['Treasury Rate'].values[0]
            }
        }
        
        datasets = []
        for proportion in proportions:
            datasets.extend(mask_surface_with_proportion(surface_data, proportion))

        return datasets

    rng = np.random.default_rng(random_state)
    all_surfaces = list(options_market_data.groupby(level=['Datetime', 'Symbol']))
    n_surfaces = len(all_surfaces)
    
    # Split the array into 'n_chunks' chunks
    chunks = np.array_split(range(n_surfaces), n_chunks)
    # Initialize the list to hold all results
    surface_datasets = []
    # Process each chunk sequentially
    with joblib_progress("Surfaces...", total=n_surfaces): 
        for chunk in chunks:
            # Process the current chunk in parallel
            output = Parallel(n_jobs=n_jobs)(
                delayed(mask_surface)(date, symbol, surface, rng)
                for (date, symbol), surface in [all_surfaces[i] for i in chunk]
            )
            # Extend the overall results with the current chunk's results
            surface_datasets.extend(output)
            gc.collect()  

    # Flatten the list of lists into a single list of datasets
    return [item for sublist in surface_datasets for item in sublist]

aapl_googl_dataset = implied_volatility_surface_datasets(
    aapl_googl_data,
    [0.1, 0.2, 0.4, 0.8],
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
    n_chunks=4
)

Output()

In [None]:
import pickle

with open('aapl_googl_dataset.pickle', 'wb') as handle:
    pickle.dump(aapl_googl_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('aapl_googl_dataset.pickle', 'rb') as handle:
    aapl_googl_dataset_ = pickle.load(handle)


In [5]:
len(aapl_googl_dataset)

863509

In [6]:
aapl_googl_dataset[0]

{'Datetime': Timestamp('2013-01-02 00:00:00'),
 'Symbol': 'AAPL',
 'Market Features': {'Market Return': 0.0250861159586972,
  'Market Volatility': 14.68000030517578,
  'Treasury Rate': 0.0549999997019767},
 'Input Surface': {'Log Moneyness': array([-0.74747141, -0.72842322, -0.72842322, -0.70973108, -0.69138194,
         -0.69138194, -0.67336344, -0.67336344, -0.63827212, -0.63827212,
         -0.62117768, -0.62117768, -0.60437057, -0.60437057, -0.58784126,
         -0.58784126, -0.57158074, -0.5555804 , -0.5555804 , -0.53983205,
         -0.53983205, -0.52432786, -0.52432786, -0.50906039, -0.50906039,
         -0.49402251, -0.49402251, -0.47920742, -0.47920742, -0.46460862,
         -0.46460862, -0.45021989, -0.45021989, -0.43603525, -0.43603525,
         -0.42204901, -0.42204901, -0.40825569, -0.40825569, -0.39465004,
         -0.39465004, -0.74747141, -0.74747141, -0.72842322, -0.70973108,
         -0.70973108, -0.69138194, -0.69138194, -0.67336344, -0.67336344,
         -0.65566386

In [15]:
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from torch.utils.data import Dataset

class IVSurfaceDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        data_point = self.data[idx]

        # Convert each component of the data point into tensors as appropriate
        return {
            'Datetime': data_point['Datetime'],
            'Symbol': data_point['Symbol'],
            'Market Features': {
                'Market Return': torch.tensor(data_point['Market Features']['Market Return'], dtype=torch.float32),
                'Market Volatility': torch.tensor(data_point['Market Features']['Market Volatility'], dtype=torch.float32),
                'Treasury Rate': torch.tensor(data_point['Market Features']['Treasury Rate'], dtype=torch.float32),
            },
            'Input Surface': {
                'Log Moneyness': torch.tensor(data_point['Input Surface']['Log Moneyness'], dtype=torch.float32),
                'Time to Maturity': torch.tensor(data_point['Input Surface']['Time to Maturity'], dtype=torch.float32),
                'Implied Volatility': torch.tensor(data_point['Input Surface']['Implied Volatility'], dtype=torch.float32),
            },
            'Query Point': {
                'Log Moneyness': torch.tensor(data_point['Query Point']['Log Moneyness'], dtype=torch.float32),
                'Time to Maturity': torch.tensor(data_point['Query Point']['Time to Maturity'], dtype=torch.float32),
            },
            'Target Volatility': torch.tensor(data_point['Target Volatility'], dtype=torch.float32),
        }

    def collate_fn(batch):
        # Organize batch data by structuring as a dictionary with batched components
        batched_data = {
            'Datetime': [item['Datetime'] for item in batch],
            'Symbol': [item['Symbol'] for item in batch],
            'Market Features': {
                'Market Return': default_collate([item['Market Features']['Market Return'] for item in batch]),
                'Market Volatility': default_collate([item['Market Features']['Market Volatility'] for item in batch]),
                'Treasury Rate': default_collate([item['Market Features']['Treasury Rate'] for item in batch]),
            },
            'Input Surface': {
                'Log Moneyness': [item['Input Surface']['Log Moneyness'] for item in batch],
                'Time to Maturity': [item['Input Surface']['Time to Maturity'] for item in batch],
                'Implied Volatility': [item['Input Surface']['Implied Volatility'] for item in batch],
            },
            'Query Point': {
                'Log Moneyness': default_collate([item['Query Point']['Log Moneyness'] for item in batch]),
                'Time to Maturity': default_collate([item['Query Point']['Time to Maturity'] for item in batch]),
            },
            'Target Volatility': default_collate([item['Target Volatility'] for item in batch]),
        }

        return batched_data



aapl_googl_data_loader = DataLoader(
    IVSurfaceDataset(aapl_googl_dataset), 
    batch_size=4, 
    shuffle=True, 
    num_workers=0, 
    collate_fn=IVSurfaceDataset.collate_fn
)

# Fetch one batch from the DataLoader
batch = next(iter(aapl_googl_data_loader))
batch

{'Datetime': [Timestamp('2013-01-29 00:00:00'),
  Timestamp('2013-04-16 00:00:00'),
  Timestamp('2013-01-29 00:00:00'),
  Timestamp('2013-06-11 00:00:00')],
 'Symbol': ['AAPL', 'GOOGL', 'AAPL', 'AAPL'],
 'Market Features': {'Market Return': tensor([ 0.0051,  0.0142,  0.0051, -0.0102]),
  'Market Volatility': tensor([13.3100, 13.9600, 13.3100, 17.0700]),
  'Treasury Rate': tensor([0.0630, 0.0520, 0.0630, 0.0450])},
 'Input Surface': {'Log Moneyness': [tensor([-0.2553, -0.2414, -0.2276, -0.1873, -0.1742, -0.1613, -0.1613, -0.1486,
           -0.1360, -0.1236, -0.1113, -0.0992, -0.0992, -0.0872, -0.0754, -0.0637,
           -0.0637, -0.0521, -0.0521, -0.0294, -0.0182, -0.0072,  0.0038,  0.0038,
            0.0146,  0.0146,  0.0253,  0.0359,  0.0359,  0.0463,  0.0463,  0.0567,
            0.0567,  0.0669,  0.0669,  0.0771,  0.0771,  0.0871,  0.0971,  0.1070,
            0.1167, -0.1613, -0.1486, -0.1360, -0.1113, -0.0992, -0.0754, -0.0754,
           -0.0637, -0.0637, -0.0521, -0.0521, -0.

In [27]:
import torch
import torch.nn as nn

class SurfaceBatchNorm(nn.Module):
    def __init__(self, num_features=1, eps=1e-5, momentum=0.1):
        super(SurfaceBatchNorm, self).__init__()
        self.log_moneyness_bn = nn.BatchNorm1d(num_features, eps, momentum)
        self.time_to_maturity_bn = nn.BatchNorm1d(num_features, eps, momentum)
        self.implied_volatility_bn = nn.BatchNorm1d(num_features, eps, momentum)
        self.market_return_bn = nn.BatchNorm1d(num_features, eps, momentum)
        self.market_volatility_bn = nn.BatchNorm1d(num_features, eps, momentum)
        self.treasury_rate_bn = nn.BatchNorm1d(num_features, eps, momentum)

    def forward(self, batch):
        # Concatenate all tensors from the Input Surface into one tensor for each feature
        input_surface_log_moneyness = torch.cat([x for x in batch['Input Surface']['Log Moneyness']])
        input_surface_time_to_maturity = torch.cat([x for x in batch['Input Surface']['Time to Maturity']])
        input_surface_implied_volatility = torch.cat([x for x in batch['Input Surface']['Implied Volatility']])

        # Concatenate Input Surface tensors with Query Point tensors
        total_log_moneyness = torch.cat([input_surface_log_moneyness, batch['Query Point']['Log Moneyness']])
        total_time_to_maturity = torch.cat([input_surface_time_to_maturity, batch['Query Point']['Time to Maturity']])

        # Normalize Log Moneyness and Time to Maturity
        norm_log_moneyness = self.log_moneyness_bn(total_log_moneyness.unsqueeze(1)).squeeze(1)
        norm_time_to_maturity = self.time_to_maturity_bn(total_time_to_maturity.unsqueeze(1)).squeeze(1)

        # Normalize Implied Volatility (only from Input Surface)
        norm_implied_volatility = self.implied_volatility_bn(input_surface_implied_volatility.unsqueeze(1)).squeeze(1)

        # Split the normalized results back to corresponding structures
        input_surface_sizes = [len(x) for x in batch['Input Surface']['Log Moneyness']]
        total_input_size = sum(input_surface_sizes)

        # Normalizing Market Features
        market_features = batch['Market Features']
        norm_market_return = self.market_return_bn(market_features['Market Return'].unsqueeze(1)).squeeze(1)
        norm_market_volatility = self.market_volatility_bn(market_features['Market Volatility'].unsqueeze(1)).squeeze(1)
        norm_treasury_rate = self.treasury_rate_bn(market_features['Treasury Rate'].unsqueeze(1)).squeeze(1)

        # Reconstructing the batch with normalized data
        output = {
            'Datetime': batch['Datetime'],
            'Symbol': batch['Symbol'],
            'Market Features': {
                'Market Return': norm_market_return,
                'Market Volatility': norm_market_volatility,
                'Treasury Rate': norm_treasury_rate
            },
            'Input Surface': {
                'Log Moneyness': torch.split(norm_log_moneyness[:total_input_size], input_surface_sizes),
                'Time to Maturity': torch.split(norm_time_to_maturity[:total_input_size], input_surface_sizes),
                'Implied Volatility': torch.split(norm_implied_volatility, input_surface_sizes)
            },
            'Query Point': {
                'Log Moneyness': norm_log_moneyness[total_input_size:],
                'Time to Maturity': norm_time_to_maturity[total_input_size:]
            },
            'Target Volatility': batch['Target Volatility']
        }

        return output

# Usage
surfacebatchnorm = SurfaceBatchNorm()
surfacebatchnorm(batch)

{'Datetime': [Timestamp('2013-01-29 00:00:00'),
  Timestamp('2013-04-16 00:00:00'),
  Timestamp('2013-01-29 00:00:00'),
  Timestamp('2013-06-11 00:00:00')],
 'Symbol': ['AAPL', 'GOOGL', 'AAPL', 'AAPL'],
 'Market Features': {'Market Return': tensor([ 0.1659,  1.1436,  0.1659, -1.4754], grad_fn=<SqueezeBackward1>),
  'Market Volatility': tensor([-0.7081, -0.2906, -0.7081,  1.7067], grad_fn=<SqueezeBackward1>),
  'Treasury Rate': tensor([ 0.8748, -0.4525,  0.8748, -1.2971], grad_fn=<SqueezeBackward1>)},
 'Input Surface': {'Log Moneyness': (tensor([-1.1849, -1.1448, -1.1052, -0.9897, -0.9522, -0.9152, -0.9152, -0.8787,
           -0.8426, -0.8070, -0.7718, -0.7370, -0.7370, -0.7027, -0.6688, -0.6352,
           -0.6352, -0.6021, -0.6021, -0.5369, -0.5048, -0.4731, -0.4418, -0.4418,
           -0.4108, -0.4108, -0.3801, -0.3498, -0.3498, -0.3198, -0.3198, -0.2900,
           -0.2900, -0.2606, -0.2606, -0.2315, -0.2315, -0.2027, -0.1741, -0.1459,
           -0.1179, -0.9152, -0.8787, -0.8426