In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
from datetime import datetime
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import MinMaxScaler

In [None]:
# --- Run configuration -------------------------------------------------------
# True when running on Google Colab (enables the Drive download and Colab paths).
google_colab = False
# Number of return observations fed to the sequence input of the model.
lags = 140
# Upper bound on the number of (shuffled) training rows kept per window.
training_size = 991232
# Seed applied to NumPy's global RNG before shuffling the training rows.
random_seed = 0

# Tabular input columns.
bs_vars = ['Underlying_last', 'Strike', 'TTM', 'R']
# Sequence input columns, oldest lag first and the current return last.
underlying_lags = [f'Underlying_{k}' for k in reversed(range(1, lags))]
underlying_lags.append('Underlying_return')

### Setup

In [None]:
# Environment setup and data loading. On Colab this reports the runtime's GPU and
# RAM and downloads the dataset from Google Drive through PyDrive; otherwise the
# CSV is read from the local data folder. Both branches produce `df_read`.
if google_colab:
    # NOTE(review): `tf` is not referenced in the visible part of the notebook - confirm it is still needed.
    import tensorflow as tf
    # Print info
    gpu_info = !nvidia-smi
    gpu_info = '\n'.join(gpu_info)
    # Output containing 'failed' is treated as "no GPU attached".
    if gpu_info.find('failed') >= 0:
        print('Not connected to a GPU')
    else:
        print(gpu_info)
    
    from psutil import virtual_memory
    # Total RAM in decimal gigabytes (bytes / 1e9).
    ram_gb = virtual_memory().total / 1e9
    print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

    # 20 GB is the threshold used here to call a runtime "high-RAM".
    if ram_gb < 20:
        print('Not using a high-RAM runtime')
    else:
        print('You are using a high-RAM runtime!')

    # Code to read csv file into Colaboratory:
    # NOTE(review): unpinned install; consider `%pip install` with a fixed version for reproducibility.
    !pip install -U -q PyDrive
    from pydrive.auth import GoogleAuth
    from pydrive.drive import GoogleDrive
    from google.colab import auth
    from oauth2client.client import GoogleCredentials
    # Authenticate and create the PyDrive client.
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(gauth)
    # Google Drive file id of the dataset.
    # NOTE(review): the name `id` shadows the Python builtin of the same name.
    id = "1Doyuyo_VDOmJf0CLo5kl9XzMTfhGtxiR"
    downloaded = drive.CreateFile({'id':id}) 
    # Save the Drive file into the working directory, then load it.
    downloaded.GetContentFile('2010-2023_NSS_filtered_vF.csv')  
    df_read = pd.read_csv('2010-2023_NSS_filtered_vF.csv')
else:
    # Local run: path is relative to the notebook's working directory.
    file = "../data/processed_data/2010-2023_NSS_filtered_vF.csv"
    df_read = pd.read_csv(file)

# Rich display of the loaded frame (runs for both branches).
display(df_read)

In [None]:
# Take over the raw frame and drop the second reference so only one name keeps it alive.
df = df_read
del df_read

# Only quotes from 2019-06-01 onwards are kept (ISO date strings compare chronologically).
df = df[(df["Quote_date"] >= "2019-06-01")]


# Group the data by Quote Date and calculate the mean for Underlying Price.
# The column is selected *before* aggregating: averaging the whole frame is
# wasteful and raises a TypeError on non-numeric columns with pandas >= 2.0.
df_agg = df.groupby('Quote_date')['Underlying_last'].mean().reset_index()

# Values to returns
df_agg["Underlying_return"] = df_agg["Underlying_last"].pct_change()

# Add the lagged-return columns Underlying_1 ... Underlying_<lags>.
# They are built in one concat instead of inserting them one at a time, which
# would fragment the frame and trigger pandas' PerformanceWarning.
lag_columns = ['Underlying_' + str(i) for i in range(1, lags + 1)]
lagged_returns = pd.concat(
    [df_agg['Underlying_return'].shift(i) for i in range(1, lags + 1)],
    axis=1,
    keys=lag_columns,
)
df_agg = pd.concat([df_agg, lagged_returns], axis=1)

# Attach the per-date return and its lags to every option row of that date.
df = pd.merge(df, df_agg[['Quote_date', 'Underlying_return'] + lag_columns], on='Quote_date', how='left')
del df_agg, lagged_returns

# Keep quotes from 2011-12-01 onwards. With the 2019-06-01 cut above this is
# already satisfied; it only matters if that earlier cut is relaxed.
df = df[(df["Quote_date"] >= "2011-12-01")]

print(df)

In [None]:
# Format settings
# Length of the sequence axis the lag array is reshaped to.
max_timesteps = lags

def create_rw_dataset(window_number = 0, df = None, shap = False):
    '''Assemble scaled train/validation/test arrays for one rolling window.

    The window opens at 2011-12-01 shifted forward by `window_number` months and
    covers 36 months of training data, then one month of validation data and one
    month of test data. Rows are assigned to a split by comparing `Quote_date`
    with the window's date strings.

    Args:
        window_number: Number of one-month steps the window is shifted by.
        df: Frame containing `Quote_date`, `Price` and the columns named in the
            module-level `underlying_lags` and `bs_vars` lists.
        shap: Accepted but not used by this function.

    Returns:
        The 10-tuple ``(train_x, train_y, val_x, val_y, test_x, test_y,
        train_start, val_start, test_start, df_test)``. Every ``*_x`` is a
        two-element list ``[lag_array, bs_array]`` with ``lag_array`` shaped
        ``(rows, max_timesteps, 1)``; the ``*_start`` values are date strings.

    Side effects:
        Reseeds NumPy's global RNG with `random_seed` and prints the date range
        and the array shapes of each split.
    '''

    # Window boundaries, turned into strings for comparison with Quote_date
    test_months = 1
    window_open = datetime(2011, 12, 1) + relativedelta(months=window_number * test_months)
    val_open = window_open + relativedelta(months=3 * 12)
    test_open = val_open + relativedelta(months=1)
    test_close = test_open + relativedelta(months=test_months)
    train_start, val_start, test_start, test_end = (
        str(boundary.date()) for boundary in (window_open, val_open, test_open, test_close)
    )

    # Half-open date intervals [start, stop) for the three splits
    quote_date = df['Quote_date']
    df_train = df[(quote_date >= train_start) & (quote_date < val_start)]
    df_val = df[(quote_date >= val_start) & (quote_date < test_start)]
    df_test = df[(quote_date >= test_start) & (quote_date < test_end)]
    del df, quote_date

    # Targets
    train_y, val_y, test_y = (
        part['Price'].to_numpy() for part in (df_train, df_val, df_test)
    )

    # Report the date span covered by each split
    print("--------------Dataframe dates--------------")
    for label, part in (("Train", df_train), ("Val", df_val), ("Test", df_test)):
        print(f"{label}: {part['Quote_date'].min()} - {part['Quote_date'].max()}")
    print("-------------------------------------------")

    def split_inputs(part):
        # [lagged-return matrix, tabular matrix] for one split
        return [part[underlying_lags].to_numpy(), part[bs_vars].to_numpy()]

    train_x = split_inputs(df_train)
    val_x = split_inputs(df_val)
    test_x = split_inputs(df_test)

    # df_test is part of the return value, so only these two are released
    del df_train, df_val

    # Lagged returns: a single min/max pair is fitted on all training values
    # (the matrix is collapsed to one column) and applied to every split.
    underlying_scaler = MinMaxScaler()
    underlying_scaler.fit(train_x[0].reshape(-1, 1))
    for inputs in (train_x, val_x, test_x):
        lag_block = inputs[0]
        inputs[0] = underlying_scaler.transform(lag_block.reshape(-1, 1)).reshape(lag_block.shape)

    # Tabular variables: column-wise scaling fitted on the training rows.
    bs_scaler = MinMaxScaler()
    bs_scaler.fit(train_x[1])
    for inputs in (train_x, val_x, test_x):
        inputs[1] = bs_scaler.transform(inputs[1])

    # Seeded shuffle of the training rows, keeping at most `training_size` of them
    np.random.seed(random_seed)
    keep = np.random.permutation(len(train_x[0]))[:training_size]
    train_x = [block[keep] for block in train_x]
    train_y = train_y[keep]

    def as_sequences(inputs):
        # Add a trailing feature axis of size 1 to the lag matrix
        return [inputs[0].reshape(len(inputs[0]), max_timesteps, 1), inputs[1]]

    train_x = as_sequences(train_x)
    val_x = as_sequences(val_x)
    test_x = as_sequences(test_x)

    for label, inputs in (("Train", train_x), ("Val", val_x), ("Test", test_x)):
        print(f'{label} shape: {inputs[0].shape}, {inputs[1].shape}')

    return train_x, train_y, val_x, val_y, test_x, test_y, train_start, val_start, test_start, df_test

### ALE

In [None]:
class ModelClass:
    """Adapter that lets a two-input model be called with one flat 2-D matrix."""

    def __init__(self, model):
        # Wrapped callable; it is invoked with a two-element list of arrays.
        self.model = model

    def predict(self, x_2d):
        """Split `x_2d` into the model's two inputs and return its output.

        The first `lags` columns are reshaped to ``(rows, lags, 1)``; all
        remaining columns are passed on unchanged as the second input.

        Args:
            x_2d: 2-D array with at least `lags` columns.

        Returns:
            The model output converted with ``np.array``.
        """
        n_rows = len(x_2d)
        sequence_part = x_2d[:, :lags].reshape(n_rows, lags, 1)
        tabular_part = x_2d[:, lags:]
        outputs = self.model([sequence_part, tabular_part])
        return np.array(outputs)

def create_ale(x_2d, model_object):
    """Predict once per feature with that feature's column randomly permuted.

    For every column ``i`` of `x_2d`, the column is shuffled (using NumPy's
    global RNG) while all other columns keep their original values, and the
    model's predictions on that matrix are stored in column ``i`` of the result.
    Despite the name, no binning or accumulation of local effects is performed.

    Args:
        x_2d: 2-D NumPy array of shape ``(rows, features)``. It is not modified.
        model_object: Object with a ``predict(matrix)`` method returning one
            value per row. ``predict`` must not modify the matrix it receives,
            because a single working copy is reused across features.

    Returns:
        Array of shape ``(rows, features)`` holding the predictions.

    Side effects:
        Prints the shape and contents of the result.
    """
    # Read both dimensions from .shape so zero-row input does not raise.
    n_rows, n_features = x_2d.shape
    ale_values = np.zeros((n_rows, n_features))

    # One working copy for the whole loop instead of one full copy per feature.
    working = x_2d.copy()
    for i in range(n_features):
        np.random.shuffle(working[:, i])
        ale_values[:, i] = model_object.predict(working).flatten()
        # Put the original column back so the next feature is permuted alone.
        working[:, i] = x_2d[:, i]

    print(ale_values.shape)
    print(ale_values)
    return ale_values

In [None]:
# Rolling windows to process: range(start_window - 1, windows, window_interval).
windows = 99
start_window = 99
window_interval = 1
# Name of the checkpoint run folder the trained models are loaded from.
checkpoint_time = '11.05 1 mnd test sett full model run'

df_shap_combined = pd.DataFrame()

model = None

timestamp = datetime.now()
timestamp = timestamp.strftime("%m-%d_%H-%M")

for window in range(start_window-1, windows, window_interval):
    # Load data. create_rw_dataset returns ten values:
    # (train_x, train_y, val_x, val_y, test_x, test_y, train_start, val_start,
    #  test_start, df_test). Only the ones needed here are picked out; unpacking
    # the result into six names raised "too many values to unpack".
    dataset = create_rw_dataset(df=df, window_number=window, shap = True)
    train_x, test_x = dataset[0], dataset[4]
    train_start, val_start, test_start = dataset[6:9]
    del dataset

    if google_colab:
        checkpoint_path = f'/content/drive/MyDrive/01. Masters Thesis - Shared/05. Checkpoints/{checkpoint_time}/{train_start}/'
    else:
        checkpoint_path = f'../checkpoints/{checkpoint_time}/{train_start}/'

    # NOTE(review): the path ends with '/', so this loads a file literally named
    # '.h5' inside the window folder - confirm that matches how it was saved.
    c_model = load_model(checkpoint_path + ".h5")

    # The sequence input comes back as (rows, lags, 1); drop the trailing axis so
    # it can be stacked next to the 2-D tabular input. ModelClass.predict splits
    # the flat matrix at column `lags` and restores the 3-D shape.
    train_x_2d_input = np.hstack((train_x[0].reshape(len(train_x[0]), -1), train_x[1]))
    test_x_2d_input = np.hstack((test_x[0].reshape(len(test_x[0]), -1), test_x[1]))
    
    model = ModelClass(c_model)

    ale_values = create_ale(test_x_2d_input, model)