### Imports

In [20]:
import pandas as pd
import datetime
import os
import sys
import math
import matplotlib.pyplot as plt
import numpy as np
import warnings
from pandas.errors import SettingWithCopyWarning
import dask.dataframe as dd


# Silence pandas' SettingWithCopyWarning for the whole notebook.
# NOTE(review): a global filter also hides genuine chained-assignment mistakes — confirm this is intended.
warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)

### Parameters

In [21]:
# Presumably toggles Google Colab specific setup.
# NOTE(review): this flag is not referenced anywhere else in the visible notebook — confirm it is still needed.
google_colab = False

### Data loading

In [22]:
# Set the path to the root directory (two levels above the current working directory)
path = os.path.abspath(os.path.join(os.getcwd(), '../..'))
# Read dataframes using Dask: the combined predictions file and the processed bid/ask file
df = dd.read_csv(path + '/data/predictions/combined_predictions_BS_v2.csv')
df_ba = dd.read_csv(path + '/data/processed_data/2011_11feb-2023mar_NSS_filtered_with_BID_ASK_IV_DELTA.csv')
# Parse Quote_date as datetime in both frames so the date filter below compares dates
df["Quote_date"] = dd.to_datetime(df["Quote_date"])
df_ba["Quote_date"] = dd.to_datetime(df_ba["Quote_date"])


# Keep only quotes on or after 2015-01-11 in both frames
df = df[df['Quote_date'] >= '2015-01-11']
df_ba = df_ba[df_ba['Quote_date'] >= '2015-01-11']

# In df, rename IV to BS-IV (presumably so it does not clash with an IV column in df_ba when merging)
df = df.rename(columns={"IV": "BS-IV"})
# NOTE(review): the original comment here read "In df_ba, drop" and was left unfinished;
# nothing is dropped from df_ba in this cell.

In [23]:
# Inner-join the bid/ask data (left) with the predictions (right) on quote date, TTM and strike.
# Non-key columns present in both frames keep their name for the df_ba copy; the copy from df gets a "_drop" suffix.
# NOTE(review): TTM looks like a float column — an equality join only matches bit-identical values; confirm both files store the same TTM values.
df = df_ba.merge(df, on=["Quote_date", "TTM", "Strike"], how="inner", suffixes=("", "_drop"))

### Data processing

In [24]:
# Turning the dask dataframe into a pandas dataframe
# (compute() runs the read, filter, rename and merge steps defined above and returns the result)
df = df.compute()

In [25]:
# Drop the columns that were duplicated
# NOTE(review): the frame displayed right after this cell still contains "Price_drop",
# which also carries the "_drop" merge suffix — confirm whether it should be removed here as well.
df = df.drop(columns=["Unnamed: 0", "Underlying_last_drop", "R_drop"])

In [26]:
# Inspect the merged frame after removing the duplicated columns
display(df)

Unnamed: 0,Quote_date,Expire_date,Price,Bid,Ask,Underlying_last,Strike,TTM,Delta,IV,R,Price_drop,Rolling,GARCH,BS-IV,Heston,LSTM-MLP
0,2015-01-12,2015-01-15,603.295,602.2,604.39,2028.56,1425.0,0.008219,1.00000,0.00000,0.000200,603.295,603.562342,603.562342,643.551645,605.290994,603.12946
1,2015-01-12,2015-01-15,483.900,482.4,485.40,2028.56,1545.0,0.008219,0.99564,1.02531,0.000200,483.900,483.562540,483.562540,496.670262,485.306680,483.69577
2,2015-01-12,2015-01-15,468.450,467.2,469.70,2028.56,1560.0,0.008219,1.00000,0.00000,0.000200,468.450,468.562564,468.562564,483.219698,467.614355,468.75870
3,2015-01-12,2015-01-15,363.550,362.4,364.70,2028.56,1665.0,0.008219,1.00000,0.00000,0.000200,363.550,363.562737,363.562737,374.802509,361.991271,364.29678
4,2015-01-12,2015-01-15,322.950,321.9,324.00,2028.56,1705.0,0.008219,1.00000,0.00000,0.000200,322.950,323.562803,323.562803,330.141073,323.068179,324.65866
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436709,2023-03-31,2024-04-19,422.450,420.0,424.90,4109.88,4075.0,1.054795,0.61661,0.20127,0.046174,422.450,413.842527,366.069425,460.903651,425.307917,472.97458
436710,2023-03-31,2024-04-19,360.050,357.7,362.40,4109.88,4175.0,1.054795,0.57119,0.19367,0.046174,360.050,360.096980,309.693752,396.952162,366.215057,411.19080
436711,2023-03-31,2024-06-21,1556.450,1544.9,1568.00,4109.88,2675.0,1.227397,0.92096,0.31054,0.045030,1556.450,1579.926061,1578.874712,1616.257310,1555.101104,1584.21610
436712,2023-03-31,2024-06-21,441.050,439.4,442.70,4109.88,4100.0,1.227397,0.60836,0.20096,0.045030,441.050,436.190941,384.288997,481.722637,443.870558,496.36075


In [27]:
# Dates
df["TTM"] = (df["TTM"] * 365).astype(int)
df['Expiry_date'] = pd.to_datetime(df['Expire_date'])
df.drop(columns=["Expire_date"], axis=1, inplace=True)
df['Quote_date'] = pd.to_datetime(df['Quote_date'])

# Adding option ID
df["Option_ID"] = df["Expiry_date"].astype(str) + "-" + df["Strike"].astype(str)

# Period to be traded on
df = df[(df["Expiry_date"] >= "2015-01-01") & (df["Expiry_date"] <= "2023-03-31")]
df = df[(df["Quote_date"] >= "2015-01-01") & (df["Quote_date"] <= "2023-03-31")]

Create a dictionary mapping each quote date to the underlying price

In [28]:
# Create dict with Quote_date as key and Underlying_last as value
# (the mean collapses all rows quoted on the same date into a single number per date)
underlying_last_dict = df.groupby("Quote_date")["Underlying_last"].mean().to_dict()

Add a TTM=0 (expiry-day) row for each option

In [29]:
def add_row(group):
    """Build the expiry-day (TTM = 0) row for one option.

    The last row of ``group`` is used as a template. If the module-level
    ``underlying_last_dict`` has an entry for the option's expiry date, a copy
    of the template is returned with ``Quote_date`` and ``Expiry_date`` set to
    the expiry date, ``TTM`` set to 0, ``Underlying_last`` set to the looked-up
    price and ``Price`` set to ``max(underlying - strike, 0)``. All other
    fields are carried over unchanged from the template row.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single option; the last row is taken as the most recent one,
        so the caller is expected to have sorted the data beforehand.

    Returns
    -------
    pandas.Series or None
        The new row, or ``None`` when no underlying price is known for the
        expiry date. In that case a message is printed; this function itself
        does not remove any of the option's existing rows.
    """
    template = group.iloc[-1]
    expiry = template['Expiry_date']

    if expiry in underlying_last_dict:
        spot_at_expiry = underlying_last_dict[expiry]
        # Fields that differ from the template on the expiry day
        overrides = {
            'Quote_date': expiry,
            'Expiry_date': expiry,
            'TTM': 0,
            'Underlying_last': spot_at_expiry,
            'Price': np.maximum(spot_at_expiry - template['Strike'], 0),
        }
        expiry_row = template.copy()
        for column, value in overrides.items():
            expiry_row[column] = value
        return expiry_row

    # No underlying price on the expiry date: report it and produce no row
    print(f"Marked option for deletion: {template['Option_ID']!s} on date: {template['Quote_date']!s} due to missing underlying price.")
    return None

In [30]:
# Sort chronologically so that the last row of each Option_ID group (the row add_row starts from) is its latest quote
df = df.sort_values(['Quote_date', 'Expiry_date'])

In [31]:
# Apply add_row to each option's rows and collect the results;
# reset_index(drop=True) discards the Option_ID group labels in favour of a fresh 0..n-1 index
new_rows = df.groupby("Option_ID").apply(add_row).reset_index(drop=True)

2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00
2015-04-01 00:00:00


In [32]:
# Remove new rows that contain missing values.
# NOTE(review): dropna() with default arguments drops a row if ANY column is NaN,
# so a new row with a NaN in an unrelated column is discarded as well — confirm this is intended.
new_rows = new_rows.dropna()

In [33]:
# Concatenate the original df with the new rows
# NOTE(review): new_rows carries a fresh 0..n-1 index and ignore_index is not set,
# so the combined frame may contain repeated index labels — presumably harmless; confirm.
df = pd.concat([df, new_rows], axis=0)

In [34]:
# Sort the dataframe by Quote_date and Expiry_date
# (the appended rows were added at the end of the frame; this puts them back in chronological order)
df = df.sort_values(['Quote_date', 'Expiry_date'])

In [35]:
# Inspect the frame after appending the TTM=0 rows and re-sorting
display(df)

Unnamed: 0,Quote_date,Expire_date,Price,Bid,Ask,Underlying_last,Strike,TTM,Delta,IV,R,Price_drop,Rolling,GARCH,BS-IV,Heston,LSTM-MLP,Expiry_date,Option_ID
0,2015-01-12,2015-01-15,603.295,602.2,604.39,2028.56,1425.0,2.0,1.00000,0.00000,0.0002,603.295,6.035623e+02,6.035623e+02,6.435516e+02,605.290994,603.12946,2015-01-15,2015-01-15-1425.0
1,2015-01-12,2015-01-15,483.900,482.4,485.40,2028.56,1545.0,2.0,0.99564,1.02531,0.0002,483.900,4.835625e+02,4.835625e+02,4.966703e+02,485.306680,483.69577,2015-01-15,2015-01-15-1545.0
2,2015-01-12,2015-01-15,468.450,467.2,469.70,2028.56,1560.0,2.0,1.00000,0.00000,0.0002,468.450,4.685626e+02,4.685626e+02,4.832197e+02,467.614355,468.75870,2015-01-15,2015-01-15-1560.0
3,2015-01-12,2015-01-15,363.550,362.4,364.70,2028.56,1665.0,2.0,1.00000,0.00000,0.0002,363.550,3.635627e+02,3.635627e+02,3.748025e+02,361.991271,364.29678,2015-01-15,2015-01-15-1665.0
4,2015-01-12,2015-01-15,322.950,321.9,324.00,2028.56,1705.0,2.0,1.00000,0.00000,0.0002,322.950,3.235628e+02,3.235628e+02,3.301411e+02,323.068179,324.65866,2015-01-15,2015-01-15-1705.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436184,2023-03-30,2023-03-31,466.450,465.9,467.00,4050.95,3585.0,0.0,1.00000,-0.00027,0.0474,466.450,4.664155e+02,4.664155e+02,4.664155e+02,458.415914,474.68167,2023-03-31,2023-03-31-3585.0
436185,2023-03-30,2023-03-31,461.450,460.9,462.00,4050.95,3590.0,0.0,1.00000,-0.00025,0.0474,461.450,4.614162e+02,4.614162e+02,4.614162e+02,453.898531,469.61115,2023-03-31,2023-03-31-3590.0
436186,2023-03-30,2023-03-31,180.650,180.0,181.30,4050.95,3870.0,0.0,1.00000,0.00000,0.0474,180.650,1.814525e+02,1.814525e+02,1.814525e+02,183.942814,192.88177,2023-03-31,2023-03-31-3870.0
436187,2023-03-30,2023-03-31,0.025,0.0,0.05,4050.95,4560.0,0.0,0.00030,0.52978,0.0474,0.025,1.499611e-38,1.276967e-45,7.073497e-07,-6.195841,0.00000,2023-03-31,2023-03-31-4560.0


In [36]:
# Validate each option's quote history; an option failing a check is dropped as a whole.
# All dates on which at least one row in df is quoted
unique_dates = df['Quote_date'].unique()

# Get first and last Quote_date for each Option_ID.
# 'first'/'last' are positional, so this relies on df already being sorted by Quote_date.
date_ranges = df.groupby("Option_ID")['Quote_date'].agg(['first', 'last'])

# Check 1: filter out Option_IDs where first or last date is not in unique_dates.
# NOTE(review): unique_dates is taken from this same df, so as far as visible here this
# check cannot flag anything — confirm whether a separate trading calendar was intended.
invalid_option_ids = date_ranges[
    ~date_ranges['first'].isin(unique_dates) |
    ~date_ranges['last'].isin(unique_dates)
].index

if len(invalid_option_ids) > 0:
    print(f"Dropped option groups: {invalid_option_ids.tolist()} because their first or last date is not in unique_dates")
df = df[~df['Option_ID'].isin(invalid_option_ids)]

# Check 2: every quote date between an option's first and last date must be present.
# Get count of unique dates for each Option_ID
date_counts = df.groupby("Option_ID")['Quote_date'].nunique()

# Get expected count of unique dates between first and last date for each Option_ID.
# Only the options still present in df are evaluated, so expected_counts carries exactly
# the labels of date_counts; comparing Series with different labels raises a ValueError.
expected_counts = date_ranges.loc[date_counts.index].apply(
    lambda x: np.sum((unique_dates >= x['first']) & (unique_dates <= x['last'])),
    axis=1,
)

# Filter out Option_IDs where actual count does not equal expected count
invalid_option_ids = date_counts[date_counts != expected_counts].index
if len(invalid_option_ids) > 0:
    print(f"Dropped option groups: {invalid_option_ids.tolist()} because they are missing dates between their first and last date")

# Keep only rows with valid Option_IDs
df = df[~df['Option_ID'].isin(invalid_option_ids)]

# Check 3: an option needs at least two rows.
# Get row counts for each Option_ID
row_counts = df.groupby("Option_ID").size()

# Filter out Option_IDs with less than 2 rows
invalid_option_ids = row_counts[row_counts < 2].index
if len(invalid_option_ids) > 0:
    print(f"Dropped option groups: {invalid_option_ids.tolist()} because they have less than 2 rows")
df = df[~df['Option_ID'].isin(invalid_option_ids)]


Dropped option groups: ['2015-01-23-1000.0', '2015-01-23-2500.0', '2015-01-30-1000.0', '2015-01-30-1025.0', '2015-01-30-2500.0', '2015-02-06-1000.0', '2015-02-13-1000.0', '2015-02-19-1000.0', '2015-02-19-1025.0', '2015-02-19-2500.0', '2015-02-27-1000.0', '2015-02-27-1025.0', '2015-02-27-1050.0', '2015-02-27-2500.0', '2015-03-06-1000.0', '2015-03-06-1025.0', '2015-03-06-1050.0', '2015-03-06-2500.0', '2015-03-13-1000.0', '2015-03-13-1025.0', '2015-03-13-1050.0', '2015-03-13-2500.0', '2015-03-19-1000.0', '2015-03-19-1050.0', '2015-03-19-2500.0', '2015-03-27-1000.0', '2015-03-27-1025.0', '2015-03-27-1050.0', '2015-03-27-2500.0', '2015-03-31-1000.0', '2015-03-31-1025.0', '2015-03-31-1050.0', '2015-03-31-2500.0', '2015-04-01-1000.0', '2015-04-01-1050.0', '2015-04-01-1100.0', '2015-04-01-1150.0', '2015-04-01-1200.0', '2015-04-01-1225.0', '2015-04-01-1250.0', '2015-04-01-1275.0', '2015-04-01-1300.0', '2015-04-01-1325.0', '2015-04-01-1350.0', '2015-04-01-1375.0', '2015-04-01-1400.0', '2015-04-0

In [37]:
# Inspect the frame after the invalid option groups have been removed
display(df)

Unnamed: 0,Quote_date,Expire_date,Price,Bid,Ask,Underlying_last,Strike,TTM,Delta,IV,R,Price_drop,Rolling,GARCH,BS-IV,Heston,LSTM-MLP,Expiry_date,Option_ID
0,2015-01-12,2015-01-15,603.295,602.2,604.39,2028.56,1425.0,2.0,1.00000,0.00000,0.0002,603.295,6.035623e+02,6.035623e+02,6.435516e+02,605.290994,603.12946,2015-01-15,2015-01-15-1425.0
1,2015-01-12,2015-01-15,483.900,482.4,485.40,2028.56,1545.0,2.0,0.99564,1.02531,0.0002,483.900,4.835625e+02,4.835625e+02,4.966703e+02,485.306680,483.69577,2015-01-15,2015-01-15-1545.0
2,2015-01-12,2015-01-15,468.450,467.2,469.70,2028.56,1560.0,2.0,1.00000,0.00000,0.0002,468.450,4.685626e+02,4.685626e+02,4.832197e+02,467.614355,468.75870,2015-01-15,2015-01-15-1560.0
3,2015-01-12,2015-01-15,363.550,362.4,364.70,2028.56,1665.0,2.0,1.00000,0.00000,0.0002,363.550,3.635627e+02,3.635627e+02,3.748025e+02,361.991271,364.29678,2015-01-15,2015-01-15-1665.0
4,2015-01-12,2015-01-15,322.950,321.9,324.00,2028.56,1705.0,2.0,1.00000,0.00000,0.0002,322.950,3.235628e+02,3.235628e+02,3.301411e+02,323.068179,324.65866,2015-01-15,2015-01-15-1705.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436183,2023-03-30,2023-03-31,595.250,593.7,596.80,4050.95,3455.0,0.0,1.00000,0.00000,0.0474,595.250,5.963986e+02,5.963986e+02,5.963986e+02,604.961845,607.60376,2023-03-31,2023-03-31-3455.0
436184,2023-03-30,2023-03-31,466.450,465.9,467.00,4050.95,3585.0,0.0,1.00000,-0.00027,0.0474,466.450,4.664155e+02,4.664155e+02,4.664155e+02,458.415914,474.68167,2023-03-31,2023-03-31-3585.0
436185,2023-03-30,2023-03-31,461.450,460.9,462.00,4050.95,3590.0,0.0,1.00000,-0.00025,0.0474,461.450,4.614162e+02,4.614162e+02,4.614162e+02,453.898531,469.61115,2023-03-31,2023-03-31-3590.0
436186,2023-03-30,2023-03-31,180.650,180.0,181.30,4050.95,3870.0,0.0,1.00000,0.00000,0.0474,180.650,1.814525e+02,1.814525e+02,1.814525e+02,183.942814,192.88177,2023-03-31,2023-03-31-3870.0


In [40]:
# Write to csv (index=False leaves the DataFrame index out of the file)
df.to_csv(path + '/data/trading/Full trading data - All models.csv', index=False)