In [2]:
import numpy as np
import pandas as pd
import random
import torch

In [None]:
# Single seed value shared by every random number generator seeded below,
# so it only has to be changed in one place.
SEED = 0

# Set the random seed for reproducibility
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [13]:
# Load the volatility-surface CSV. The first two columns become a two-level
# row index, and pandas is asked to parse the index as ISO 8601 dates.
aapl_googl_data = pd.read_csv(
    'volatility_surface_AAPL_GOOGL_2013_01_2013_06.csv',
    index_col=[0, 1],
    parse_dates=True,
    date_format="ISO8601",
)
aapl_googl_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Log Moneyness,Time to Maturity,Implied Volatility,Market Return,Market Volatility,Treasury Rate
Datetime,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-02,AAPL,-0.316688,0.007937,0.3726,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.316688,0.007937,0.6095,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.304266,0.007937,0.3726,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.304266,0.007937,0.6095,0.025086,14.680000,0.055
2013-01-02,AAPL,-0.291996,0.007937,0.3726,0.025086,14.680000,0.055
...,...,...,...,...,...,...,...
2013-06-28,GOOGL,0.427518,2.253968,0.2430,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.434898,2.253968,0.2383,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.434898,2.253968,0.2426,-0.004299,16.860001,0.030
2013-06-28,GOOGL,0.442224,2.253968,0.2402,-0.004299,16.860001,0.030


In [15]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline

def generate_datasets(df, proportions, random_state=42):
    """Build masked-surface samples for every (Datetime, Symbol) group of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index has levels named 'Datetime' and 'Symbol' and which
        has the columns 'Market Return', 'Market Volatility' and
        'Treasury Rate' (further columns are read by
        ``generate_single_dataset``).
    proportions : iterable of float
        Masking proportions; each one is applied to every group.
    random_state : int, default 42
        Seeds the NumPy generator created here and is also forwarded
        unchanged to ``generate_single_dataset``.

    Returns
    -------
    list
        Concatenation of the lists returned by ``generate_single_dataset``
        for every (group, proportion) pair, in iteration order.
    """
    market_columns = ('Market Return', 'Market Volatility', 'Treasury Rate')
    rng = np.random.default_rng(random_state)

    all_samples = []
    for _, surface in df.groupby(level=['Datetime', 'Symbol']):
        # The market-level values are taken from the first row of the group.
        market_features = {
            column: surface[column].iloc[0] for column in market_columns
        }
        for proportion in proportions:
            all_samples += generate_single_dataset(
                surface, proportion, market_features, rng, random_state
            )

    return all_samples

def generate_single_dataset(group, proportion, market_features, rng, random_state):
    """Create masked-surface training samples from one group of surface points.

    The points are clustered on standardized ('Log Moneyness',
    'Time to Maturity') with KMeans. For each cluster, a random subset of
    ``ceil(cluster_size * proportion)`` points is masked; every masked point
    becomes one sample whose input surface is the group with that cluster's
    masked points removed.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group. Must have index levels 'Datetime' and
        'Symbol' and the columns 'Log Moneyness', 'Time to Maturity' and
        'Implied Volatility'. Index labels may be duplicated; rows are
        addressed by position only.
    proportion : float
        Fraction of each cluster to mask; must satisfy ``0 < proportion <= 1``.
        The number of clusters is ``ceil(1 / proportion)``, capped at the
        number of rows in ``group``.
    market_features : dict
        Stored as-is (same object, not copied) in every returned sample.
    rng : numpy.random.Generator
        Used to draw the masked points of each cluster without replacement.
    random_state : int
        Passed to ``KMeans`` as its ``random_state``.

    Returns
    -------
    list of dict
        One dict per masked point with keys 'Datetime', 'Symbol',
        'Input Surface', 'Market Features', 'Query Point' and
        'Target Volatility'. Samples that come from the same cluster share
        one 'Input Surface' DataFrame object.

    Raises
    ------
    ValueError
        If ``proportion`` is not in the interval (0, 1].
    """
    if not 0 < proportion <= 1:
        raise ValueError(f"proportion must be in (0, 1], got {proportion!r}")

    n_points = len(group)
    if n_points == 0:
        return []

    # KMeans cannot fit more clusters than there are samples, so cap k.
    k = min(int(np.ceil(1 / proportion)), n_points)
    data_points = group[['Log Moneyness', 'Time to Maturity']]

    # Create the clustering pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('kmeans', KMeans(n_clusters=k, random_state=random_state))
    ])

    # Fit the pipeline to the data points
    labels = pipeline.fit_predict(data_points)

    surface_columns = ['Log Moneyness', 'Time to Maturity', 'Implied Volatility']
    datetime_value = group.index.get_level_values('Datetime')[0]
    symbol_value = group.index.get_level_values('Symbol')[0]

    # Cluster-based masking
    datasets = []

    for cluster in range(k):
        cluster_indices = np.flatnonzero(labels == cluster)
        num_to_mask = int(np.ceil(len(cluster_indices) * proportion))
        masked_indices = rng.choice(cluster_indices, size=num_to_mask, replace=False)

        # Remove the masked rows by position. Dropping by index label would
        # remove every row carrying the same label, and rows of one group can
        # share a single (Datetime, Symbol) label.
        keep = np.ones(n_points, dtype=bool)
        keep[masked_indices] = False
        # The surface depends only on the cluster's masked set, so it is
        # built once per cluster rather than once per masked point.
        input_surface = group.iloc[keep][surface_columns]

        for idx in masked_indices:
            row = group.iloc[idx]

            datasets.append({
                'Datetime': datetime_value,
                'Symbol': symbol_value,
                'Input Surface': input_surface,
                'Market Features': market_features,
                'Query Point': row[['Log Moneyness', 'Time to Maturity']].to_dict(),
                'Target Volatility': row['Implied Volatility']
            })

    return datasets


Index(['Log Moneyness', 'Time to Maturity', 'Implied Volatility',
       'Market Return', 'Market Volatility', 'Treasury Rate'],
      dtype='object')