In [None]:
!pip install PyALE

In [None]:
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dateutil.relativedelta import relativedelta
from PyALE import ale
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import load_model

In [None]:
# --- Notebook configuration -------------------------------------------------
# True: run on Google Colab (data and checkpoints live on Google Drive);
# False: use the relative local paths instead.
google_colab = True

# Number of lagged-return features fed to the sequence input of the model.
lags = 140

# NOTE(review): not referenced anywhere in the visible notebook — confirm it is still needed.
training_size = 991232

# Seed used before shuffling the training set.
random_seed = 0

# Option-level input columns.
bs_vars = ['Underlying_last', 'Strike', 'TTM', 'R']

# Return-history columns ordered from the oldest lag (lags - 1) down to lag 1,
# followed by the return of the quote date itself: `lags` names in total.
underlying_lags = ['Underlying_%d' % lag for lag in reversed(range(1, lags))]
underlying_lags.append('Underlying_return')

### Setup

In [None]:
if google_colab:
    import tensorflow as tf
    # Print info
    gpu_info = !nvidia-smi
    gpu_info = '\n'.join(gpu_info)
    if gpu_info.find('failed') >= 0:
        print('Not connected to a GPU')
    else:
        print(gpu_info)
    
    from psutil import virtual_memory
    ram_gb = virtual_memory().total / 1e9
    print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

    if ram_gb < 20:
        print('Not using a high-RAM runtime')
    else:
        print('You are using a high-RAM runtime!')

    # Code to read csv file into Colaboratory:
    !pip install -U -q PyDrive
    from pydrive.auth import GoogleAuth
    from pydrive.drive import GoogleDrive
    from google.colab import auth
    from oauth2client.client import GoogleCredentials
    # Authenticate and create the PyDrive client.
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(gauth)
    id = "1Doyuyo_VDOmJf0CLo5kl9XzMTfhGtxiR"
    downloaded = drive.CreateFile({'id':id}) 
    downloaded.GetContentFile('2010-2023_NSS_filtered_vF.csv')  
    df_read = pd.read_csv('2010-2023_NSS_filtered_vF.csv')
else:
    file = "../data/processed_data/2010-2023_NSS_filtered_vF.csv"
    df_read = pd.read_csv(file)

display(df_read)

In [None]:
# Mount Google Drive so checkpoints and outputs under /content/drive are reachable.
# `google.colab` only exists on Colab, so skip the mount for local runs instead of
# failing with an ImportError.
if google_colab:
    from google.colab import drive
    drive.mount('/content/drive')

In [None]:
df = df_read
# Drop the extra reference to the raw frame.
# NOTE: because of this `del`, re-running this cell requires re-running the load cell first.
del df_read

# Keep quotes from 2019-06-01 onwards; the string comparison assumes `Quote_date`
# holds ISO-formatted (YYYY-MM-DD) strings — TODO confirm.
# Presumably the months before the final cut-off serve as warm-up for the lagged returns.
df = df[(df["Quote_date"] >= "2019-06-01")]

# Mean underlying price per quote date. Selecting the column *before* aggregating
# avoids averaging every other column (which is wasteful and raises on non-numeric
# columns in pandas >= 2.0).
df_agg = df.groupby('Quote_date', as_index=False)['Underlying_last'].mean()

# Values to returns
df_agg["Underlying_return"] = df_agg["Underlying_last"].pct_change()

# Lagged returns Underlying_1 ... Underlying_<lags>, built in one concat instead of
# `lags` separate column insertions (which fragments the frame).
lagged_returns = pd.concat(
    {f'Underlying_{i}': df_agg['Underlying_return'].shift(i) for i in range(1, lags + 1)},
    axis=1,
)
df_agg = pd.concat([df_agg, lagged_returns], axis=1)

# Attach the per-date return features to every option quote of that date
return_columns = ['Quote_date', 'Underlying_return'] + list(lagged_returns.columns)
df = pd.merge(df, df_agg[return_columns], on='Quote_date', how='left')
del df_agg, lagged_returns

# Keep quotes from 2020-02-01 onwards
df = df[(df["Quote_date"] >= "2020-02-01")]

print(df)

In [None]:
# Format settings
# NOTE(review): `max_timesteps` is not referenced elsewhere in the visible notebook.
max_timesteps = lags

def create_rw_dataset(window_number = 0, df = None, shap = False):
    '''Create the scaled, shuffled training inputs for one rolling-window period.

    The training window starts at 2011-12-01 shifted forward by `window_number`
    months and spans 36 months (start inclusive, end exclusive).

    Parameters
    ----------
    window_number : int
        Offset of the rolling window, in months, from 2011-12-01.
    df : pandas.DataFrame
        Must contain 'Quote_date' plus the columns listed in the module-level
        `underlying_lags` and `bs_vars`. 'Quote_date' is compared against
        'YYYY-MM-DD' strings, so it is assumed to hold ISO date strings — TODO confirm.
    shap : bool
        Unused; kept so existing calls that pass it keep working.

    Returns
    -------
    (train_x, train_start)
        `train_x` is a two-element list of arrays sharing the same row order:
        the lagged-return matrix (min-max scaled with one min/max over all of its
        values) and the `bs_vars` matrix (min-max scaled per column).
        `train_start` is the window start as a 'YYYY-MM-DD' string.

    Raises
    ------
    ValueError
        If `df` is not supplied.
    '''
    if df is None:
        raise ValueError("create_rw_dataset requires a DataFrame via the 'df' argument")

    # Window boundaries as ISO date strings
    test_months = 1
    window_start = datetime(2011, 12, 1) + relativedelta(months=window_number * test_months)
    window_end = window_start + relativedelta(months=3 * 12)
    train_start = str(window_start.date())
    val_start = str(window_end.date())

    # Rows belonging to the training window
    in_window = (df['Quote_date'] >= train_start) & (df['Quote_date'] < val_start)
    df_train = df[in_window]

    # Print earliest and latest date actually present in the training frame
    print("--------------Dataframe dates--------------")
    print(f"Train: {df_train['Quote_date'].min()} - {df_train['Quote_date'].max()}")
    print("-------------------------------------------")

    # Convert to numpy and release the frame slice
    lag_features = df_train[underlying_lags].to_numpy()
    bs_features = df_train[bs_vars].to_numpy()
    del df_train

    # All lag columns hold the same quantity (returns), so they share one scaler:
    # collapse to a single column for fitting, then restore the original shape.
    underlying_scaler = MinMaxScaler()
    lag_features = underlying_scaler.fit_transform(
        lag_features.reshape(-1, 1)
    ).reshape(lag_features.shape)

    # The option-level variables are scaled column by column
    bs_scaler = MinMaxScaler()
    bs_features = bs_scaler.fit_transform(bs_features)

    # Shuffle both inputs with the same seeded permutation so rows stay aligned
    np.random.seed(random_seed)
    shuffle = np.random.permutation(len(lag_features))
    train_x = [lag_features[shuffle], bs_features[shuffle]]

    return train_x, train_start

### ALE (Accumulated Local Effects)

In [None]:
class ModelClass:
    """Adapter giving a two-input model the `predict(DataFrame)` interface.

    The wrapped model is called with a list of two arrays: the first `n_lags`
    columns of the frame reshaped to (rows, n_lags, 1), and the remaining columns
    as a 2-D array.

    Parameters
    ----------
    model : callable
        Called as `model([sequence_input, static_input])`.
    n_lags : int, optional
        Number of leading columns that form the sequence input. When omitted,
        the module-level `lags` is read at prediction time.
    """

    def __init__(self, model, n_lags=None):
        self.model = model
        self.n_lags = n_lags

    def predict(self, x_2d):
        """Return the model output for the rows of DataFrame `x_2d` as a numpy array."""
        n_lags = lags if self.n_lags is None else self.n_lags
        values = x_2d.to_numpy()
        sequence_input = values[:, :n_lags].reshape(len(values), n_lags, 1)
        static_input = values[:, n_lags:]
        return np.array(self.model([sequence_input, static_input]))

def create_ale(x_2d, model_object):
    """Compute one ALE-based importance score per column of `x_2d`.

    For every column, `ale` is run on that single feature and the mean absolute
    value of the returned 'eff' column is stored. Progress is printed per feature.

    Returns a 1-D numpy array with one score per column, in column order.
    """
    feature_names = x_2d.columns
    ale_values = np.zeros(len(feature_names))
    for position, name in enumerate(feature_names):
        effects = ale(x_2d, model_object, feature=[name])
        ale_values[position] = abs(effects['eff']).mean()
        print(f'Finished {name}')
    return ale_values

In [None]:
# Build the training inputs for the selected rolling window and load the model
# checkpoint stored under that window's start date.
window = 96
train_x, train_start = create_rw_dataset(window_number=window, df=df, shap=True)

checkpoint_time = '11.05 1 mnd test sett full model run'
# Checkpoints live on the mounted Drive when on Colab, otherwise in the local folder
checkpoint_path = (
    f'/content/drive/MyDrive/01. Masters Thesis - Shared/05. Checkpoints/{checkpoint_time}/{train_start}/'
    if google_colab
    else f'../checkpoints/{checkpoint_time}/{train_start}/'
)

c_model = load_model(checkpoint_path + ".h5")

In [None]:
# Compute ALE-based feature importance for each rolling window, store the raw
# per-feature scores, and plot the importance shares per window and on average.
checkpoint_time = '11.05 1 mnd test sett full model run'
sample = 100  # number of (already shuffled) training rows passed to ALE
feature_labels = ['Returns', 'S&P500', 'Strike', 'TTM', 'R']
ale_columns = underlying_lags + bs_vars  # flat column names: lag features first, then bs_vars

if google_colab:
    checkpoint_root = '/content/drive/MyDrive/01. Masters Thesis - Shared/05. Checkpoints'
    ale_root = '/content/drive/MyDrive/01. Masters Thesis - Shared/05. Predictions/ALE'
else:
    checkpoint_root = '../checkpoints'
    # NOTE(review): local output folder chosen to mirror '../checkpoints' — adjust if needed
    ale_root = '../predictions/ALE'

# `train_x`, `train_start` and `c_model` for `window` were loaded by the previous cell;
# they are reused for that window and reloaded only when the loop moves to another one.
loaded_window = window

all_ale_agg_pct = []  # one importance-share vector per window

for window in range(96, 97, 1):
    if window != loaded_window:
        train_x, train_start = create_rw_dataset(df=df, window_number=window, shap = True)
        checkpoint_path = f'{checkpoint_root}/{checkpoint_time}/{train_start}/'
        c_model = load_model(checkpoint_path + ".h5")
        loaded_window = window
    print(f'Train start: {train_start}')

    # Flatten both model inputs into one 2-D frame: lag columns followed by bs_vars
    train_x_2d = np.hstack((train_x[0][:sample], train_x[1][:sample]))
    df_ale_train = pd.DataFrame(train_x_2d, columns = ale_columns)

    model = ModelClass(c_model)

    ale_values = create_ale(df_ale_train, model)

    # Persist the per-feature scores as a single-row CSV (one column per feature)
    save_dir = os.path.join(ale_root, checkpoint_time, train_start)
    os.makedirs(save_dir, exist_ok=True)
    df_ale_values = pd.DataFrame([ale_values], columns = ale_columns)
    df_ale_values.to_csv(os.path.join(save_dir, 'ale_values.csv'), index=False)

    # Sum all lag features into one 'Returns' score, keep the bs_vars scores as they are,
    # then normalise to shares that sum to 1
    ale_agg = np.append(np.sum(ale_values[:lags]), ale_values[lags:])
    ale_agg_pct = ale_agg / np.sum(ale_agg)
    all_ale_agg_pct.append(ale_agg_pct)
    print(f'Ale agg: {ale_agg}')
    print(f'Ale pct: {ale_agg_pct}')

    df_pct = pd.DataFrame({'Feature': feature_labels, 'Importance': ale_agg_pct})
    df_pct = df_pct.sort_values('Importance', ascending = False)

    sns.barplot(x='Feature', y='Importance', data = df_pct)
    plt.show()

print('Combined for all models:')
# Average the shares feature-wise across windows, then renormalise
avg_pct = np.mean(np.vstack(all_ale_agg_pct), axis = 0)
print(f'Avg pre scale {avg_pct}')
avg_pct = avg_pct / np.sum(avg_pct)
print(f'Avg post scale {avg_pct}')
df_pct = pd.DataFrame({'Feature': feature_labels, 'Importance': avg_pct})
df_pct = df_pct.sort_values('Importance', ascending = False)

sns.barplot(x='Feature', y='Importance', data = df_pct)
plt.show()