### Imports

In [25]:
import pandas as pd
import datetime
import os
import sys
import math
import matplotlib.pyplot as plt
import numpy as np
import warnings
from pandas.errors import SettingWithCopyWarning
import dask.dataframe as dd


# Silence pandas' SettingWithCopyWarning for every cell in this notebook.
# NOTE(review): this also hides genuine chained-assignment mistakes — confirm it is still needed.
warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)

### Parameters

In [26]:
# Presumably toggles Google Colab-specific setup; it is not read by any of the
# cells visible here — TODO confirm it is still used.
google_colab = False

### Data loading

In [27]:
# Project root: two directory levels above the current working directory
path = os.path.abspath(os.path.join(os.getcwd(), '../..'))

# Model predictions (Dask dataframe): parse the quote date, keep quotes from
# 2015-01-11 onwards, and rename IV to BS-IV so it cannot be confused with the
# IV column of the bid/ask data loaded below
df = dd.read_csv(path + '/data/predictions/combined_predictions_BS_v2.csv')
df["Quote_date"] = dd.to_datetime(df["Quote_date"])
df = df[df['Quote_date'] >= '2015-01-11']
df = df.rename(columns={"IV": "BS-IV"})

# Bid/ask, IV and delta data (Dask dataframe): same date parsing and start date
df_ba = dd.read_csv(path + '/data/processed_data/2011_11feb-2023mar_NSS_filtered_with_BID_ASK_IV_DELTA.csv')
df_ba["Quote_date"] = dd.to_datetime(df_ba["Quote_date"])
df_ba = df_ba[df_ba['Quote_date'] >= '2015-01-11']

In [28]:
# Preview the first rows of the predictions and of the bid/ask data
display(df.head(), df_ba.head())

Unnamed: 0,Quote_date,Price,Underlying_last,Strike,TTM,Rolling,GARCH,BS-IV,Heston,LSTM-MLP,R
0,2015-08-06,518.15,2083.81,1560.0,0.150685,523.917073,523.917074,527.02496,521.561175,519.09094,0.000456
1,2015-08-06,537.6,2083.81,1540.0,0.175342,543.954987,543.954988,548.71488,541.018356,538.49286,0.000537
2,2015-08-06,300.505,2083.81,1780.0,0.175342,304.003048,304.066993,308.40148,303.9103,304.30002,0.000537
3,2015-08-06,167.045,2083.81,1930.0,0.252055,160.196684,163.01798,169.563981,161.147219,171.60628,0.000797
4,2015-08-06,215.045,2083.81,1900.0,0.613699,202.411426,208.172588,226.194027,204.498331,222.42206,0.002118




Unnamed: 0.1,Unnamed: 0,Quote_date,Expire_date,Price,Bid,Ask,Underlying_last,Strike,TTM,Delta,IV,R


In [29]:
# Attach the model predictions to the bid/ask rows of the same option quote.
# Columns present in both frames keep their name from df_ba; the copy coming
# from the predictions frame gets the "_drop" suffix.
df = df_ba.merge(
    df,
    how="inner",
    on=["Quote_date", "TTM", "Strike"],
    suffixes=("", "_drop"),
)

In [31]:
# df is still a Dask dataframe at this point (it is only computed further down),
# so this shows its column/dtype structure rather than actual rows
display(df)

Unnamed: 0_level_0,Unnamed: 0,Quote_date,Expire_date,Price,Bid,Ask,Underlying_last,Strike,TTM,Delta,IV,R,Price_drop,Underlying_last_drop,Rolling,GARCH,BS-IV,Heston,LSTM-MLP,R_drop
npartitions=24,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
,int64,datetime64[ns],object,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


### Data processing

In [32]:
# Materialise the merged Dask dataframe as an in-memory pandas dataframe;
# every cell below works on the pandas object
df = df.compute()

In [34]:
# Remove the stale "Unnamed: 0" column and the "_drop" copies created by the merge.
# NOTE(review): "Price_drop" is not removed here and is carried into the saved
# CSV — confirm whether that is intentional.
df = df.drop(["Unnamed: 0", "Underlying_last_drop", "R_drop"], axis="columns")

In [35]:
# Inspect the merged pandas dataframe after the duplicated columns were dropped
display(df)

Unnamed: 0,Quote_date,Expire_date,Price,Bid,Ask,Underlying_last,Strike,TTM,Delta,IV,R,Price_drop,Rolling,GARCH,BS-IV,Heston,LSTM-MLP
0,2015-01-12,2015-01-15,603.295,602.2,604.39,2028.56,1425.0,0.008219,1.00000,0.00000,0.000200,603.295,603.562342,603.562342,643.551645,605.290994,603.12946
1,2015-01-12,2015-01-15,483.900,482.4,485.40,2028.56,1545.0,0.008219,0.99564,1.02531,0.000200,483.900,483.562540,483.562540,496.670262,485.306680,483.69577
2,2015-01-12,2015-01-15,468.450,467.2,469.70,2028.56,1560.0,0.008219,1.00000,0.00000,0.000200,468.450,468.562564,468.562564,483.219698,467.614355,468.75870
3,2015-01-12,2015-01-15,363.550,362.4,364.70,2028.56,1665.0,0.008219,1.00000,0.00000,0.000200,363.550,363.562737,363.562737,374.802509,361.991271,364.29678
4,2015-01-12,2015-01-15,322.950,321.9,324.00,2028.56,1705.0,0.008219,1.00000,0.00000,0.000200,322.950,323.562803,323.562803,330.141073,323.068179,324.65866
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436709,2023-03-31,2024-04-19,422.450,420.0,424.90,4109.88,4075.0,1.054795,0.61661,0.20127,0.046174,422.450,413.842527,366.069425,460.903651,425.307917,472.97458
436710,2023-03-31,2024-04-19,360.050,357.7,362.40,4109.88,4175.0,1.054795,0.57119,0.19367,0.046174,360.050,360.096980,309.693752,396.952162,366.215057,411.19080
436711,2023-03-31,2024-06-21,1556.450,1544.9,1568.00,4109.88,2675.0,1.227397,0.92096,0.31054,0.045030,1556.450,1579.926061,1578.874712,1616.257310,1555.101104,1584.21610
436712,2023-03-31,2024-06-21,441.050,439.4,442.70,4109.88,4100.0,1.227397,0.60836,0.20096,0.045030,441.050,436.190941,384.288997,481.722637,443.870558,496.36075


In [37]:
# Dates
# Convert TTM from a year fraction to whole days. The stored fractions are
# rounded decimals (e.g. 0.008219 * 365 = 2.99994), so round to the nearest day
# before casting: a bare astype(int) truncates and would turn that 3-day
# maturity into 2 days.
# NOTE: this cell is not idempotent — running it twice rescales TTM again.
df["TTM"] = (df["TTM"] * 365).round().astype(int)
# Expiry_date is the parsed datetime version of the Expire_date column
df['Expiry_date'] = pd.to_datetime(df['Expire_date'])
df['Quote_date'] = pd.to_datetime(df['Quote_date'])

# Adding option ID: one identifier per (expiry date, strike) pair
df["Option_ID"] = df["Expiry_date"].astype(str) + "-" + df["Strike"].astype(str)

# Period to be traded on: both the quote and the expiry must fall inside it
df = df[(df["Expiry_date"] >= "2015-01-01") & (df["Expiry_date"] <= "2023-03-31")]
df = df[(df["Quote_date"] >= "2015-01-01") & (df["Quote_date"] <= "2023-03-31")]

Create underlying dict

In [38]:
# Lookup table: Quote_date (key) -> mean Underlying_last quoted on that date (value)
underlying_last_dict = (
    df.groupby("Quote_date")["Underlying_last"]
    .mean()
    .to_dict()
)

Adding TTM=0 row

In [40]:
# Sort the dataframe by Quote_date and Expiry_date
df = df.sort_values(['Quote_date', 'Expiry_date'])

# Template for the expiry-day row of every option: its last row in the sorted
# frame, i.e. its latest quote. Building all rows at once replaces a per-option
# pd.concat, which was quadratic and turned every column into object dtype
# (each appended row came from a transposed Series).
expiry_rows = df.drop_duplicates(subset="Option_ID", keep="last").copy()

# Every expiry date needs an underlying price; fail with the full list of
# offending dates instead of a bare KeyError on the first one.
has_underlying = expiry_rows["Expiry_date"].isin(list(underlying_last_dict.keys()))
if not has_underlying.all():
    missing_expiries = sorted(expiry_rows.loc[~has_underlying, "Expiry_date"].astype(str).unique())
    raise KeyError(f"No Underlying_last available for expiry date(s): {missing_expiries}")

# Get the underlying price on the day of expiry
underlying_on_expiry = expiry_rows["Expiry_date"].map(underlying_last_dict)

# Turn the template rows into TTM=0 rows priced at intrinsic value,
# max(underlying - strike, 0).
# NOTE(review): a row is added even when the option already has a quote on its
# expiry date — confirm that duplicate is acceptable downstream.
expiry_rows["Quote_date"] = expiry_rows["Expiry_date"]
expiry_rows["TTM"] = 0
expiry_rows["Underlying_last"] = underlying_on_expiry
expiry_rows["Price"] = (underlying_on_expiry - expiry_rows["Strike"]).clip(lower=0)

# Add the new rows to the dataframe with a single concat
df = pd.concat([df, expiry_rows], ignore_index=True)

# Sort the dataframe by Quote_date and Expiry_date
df = df.sort_values(['Quote_date', 'Expiry_date'])

In [None]:
unique_dates = df['Quote_date'].unique()

# Sorted calendar of every quote date present in the data
trading_days = pd.DatetimeIndex(unique_dates).sort_values()

# Per option: first and last quote date, number of distinct quote dates, row count.
# sort=False keeps the options in order of first appearance.
quote_dates = pd.to_datetime(df['Quote_date'])
per_option = quote_dates.groupby(df["Option_ID"], sort=False).agg(["min", "max", "nunique", "size"])

# Number of calendar dates between each option's first and last quote (inclusive)
first_pos = trading_days.searchsorted(per_option["min"].to_numpy(), side="left")
last_pos = trading_days.searchsorted(per_option["max"].to_numpy(), side="right")
expected_days = last_pos - first_pos

# An option is dropped if it misses any of those dates or has fewer than 2 rows.
# The former "first or last date is not in unique_dates" check is omitted: the
# calendar is built from df itself, so it could never be true.
missing_dates = expected_days != per_option["nunique"].to_numpy()
too_few_rows = per_option["size"].to_numpy() < 2

# Report one reason per dropped option and collect the IDs. Everything is dropped
# in a single pass afterwards: the previous per-condition df.drop calls raised
# KeyError when one group matched more than one condition.
dropped_ids = []
for option_id, has_gap, is_short in zip(per_option.index, missing_dates, too_few_rows):
    if has_gap:
        print("Dropped option group: ", option_id, " because it's missing dates between it's first and last date")
    elif is_short:
        print("Dropped option group: ", option_id, " because it has less than 2 rows")
    else:
        continue
    dropped_ids.append(option_id)

df = df[~df["Option_ID"].isin(dropped_ids)]


  for option_id, option_group in option_groups:


Dropped option group:  2018-01-19-1350.0  because it's missing dates between it's first and last date
Dropped option group:  2018-01-19-1375.0  because it's missing dates between it's first and last date
Dropped option group:  2018-01-19-1400.0  because it's missing dates between it's first and last date
Dropped option group:  2018-01-19-3500.0  because it's missing dates between it's first and last date
Dropped option group:  2018-01-26-1400.0  because it's missing dates between it's first and last date


In [None]:
# Inspect the filtered trading dataset before it is written to disk
display(df)

Unnamed: 0.1,Unnamed: 0,Quote_date,Price,Prediction,Underlying_last,Strike,TTM,R,Expiry_date,Option_ID
0,4119222,2018-01-02,1095.295,1102.66750,2695.87,1600.0,0.00274,0.0129,2018-01-03,2018-01-03-1600.0
1,4119223,2018-01-02,995.295,1002.39465,2695.87,1700.0,0.00274,0.0129,2018-01-03,2018-01-03-1700.0
2,4119224,2018-01-02,945.300,952.23970,2695.87,1750.0,0.00274,0.0129,2018-01-03,2018-01-03-1750.0
3,4119225,2018-01-02,895.305,902.30440,2695.87,1800.0,0.00274,0.0129,2018-01-03,2018-01-03-1800.0
4,4119226,2018-01-02,845.290,852.40234,2695.87,1850.0,0.00274,0.0129,2018-01-03,2018-01-03-1850.0
...,...,...,...,...,...,...,...,...,...,...
19206,4190191,2018-01-29,0.000,0.00000,2853.21,3010.0,0.00000,0.0124,2018-01-29,2018-01-29-3010.0
19207,4190192,2018-01-29,0.000,0.00000,2853.21,3020.0,0.00000,0.0124,2018-01-29,2018-01-29-3020.0
19208,4190193,2018-01-29,0.000,0.00000,2853.21,3050.0,0.00000,0.0124,2018-01-29,2018-01-29-3050.0
19209,4190194,2018-01-29,0.000,0.00000,2853.21,3100.0,0.00000,0.0124,2018-01-29,2018-01-29-3100.0


In [None]:
# Persist the trading dataset as CSV, without the dataframe index
output_file = path + '/data/trading/Full trading data - All models.csv'
df.to_csv(output_file, index=False)