### Imports

In [1]:
import pandas as pd
import datetime
import os
import sys
import math
import matplotlib.pyplot as plt
import numpy as np
import warnings
from pandas.errors import SettingWithCopyWarning
import dask.dataframe as dd


# Silence pandas' SettingWithCopyWarning for every cell executed after this one
warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)

### Parameters

In [2]:
# Environment switch. NOTE(review): none of the cells visible here read this flag — confirm it is still needed.
google_colab = False

### Data loading

In [3]:
# Repository root: two directory levels above the current working directory
path = os.path.abspath(os.path.join(os.getcwd(), '../..'))

# Combined model results (lazy Dask frame): parse the quote dates, keep
# quotes from 2015-01-11 onwards and expose the IV column as BS-IV
df = dd.read_csv(path + '/data/predictions/03.06_Combined_results.csv')
df["Quote_date"] = dd.to_datetime(df["Quote_date"])
df = df[df['Quote_date'] >= '2015-01-11'].rename(columns={"IV": "BS-IV"})

# Processed option data, presumably carrying the bid/ask quotes (lazy Dask
# frame): same date parsing and the same cut-off date
df_ba = dd.read_csv(path + '/data/processed_data/2011_11feb-2023mar_NSS_filtered_vF_wo_moneyness_filtering_B_A.csv')
df_ba["Quote_date"] = dd.to_datetime(df_ba["Quote_date"])
df_ba = df_ba[df_ba['Quote_date'] >= '2015-01-11']
# NOTE(review): an unfinished comment at this point referred to dropping
# something from df_ba; nothing is dropped from df_ba in this cell.

In [4]:
# Inner-join df_ba with the model results on the contract keys. Columns that
# exist on both sides keep their df_ba name; the copy coming from the model
# results is suffixed with "_drop".
df = df_ba.merge(
    df,
    how="inner",
    on=["Quote_date", "TTM", "Strike"],
    suffixes=("", "_drop"),
)

### Data processing

In [5]:
# Run the lazy Dask graph built so far and materialise the merged result as
# an in-memory pandas DataFrame
df = df.compute()

In [6]:
# Remove "Unnamed: 0" and the "_drop" copies of Underlying_last and R that
# the merge duplicated.
# NOTE(review): the frame displayed after this cell still lists
# "Unnamed: 0_drop" and "Price_drop" — confirm they are meant to stay.
df = df.drop(["Unnamed: 0", "Underlying_last_drop", "R_drop"], axis="columns")

In [7]:
# Preview the merged frame with the notebook's rich display
display(df)

Unnamed: 0,Quote_date,Expire_date,Price,Bid,Ask,Underlying_last,Strike,TTM,Delta,IV,...,Unnamed: 0_drop,Option_ID,Price_drop,Expiry_date,Rolling,GARCH,BS-IV,Heston,MLP,LSTM-MLP
0,2021-10-28,2021-11-22,2.40,2.3,2.5,4596.06,4770.0,0.068493,0.05234,0.09040,...,8024770,2021-11-22 00:00:00.000000000-4770.0,2.40,2021-11-22 00:00:00.000000000,14.081819,3.731827,32.459923,16.602708,3.063117,0.000000
1,2021-10-28,2021-11-24,471.15,465.1,477.2,4596.06,4125.0,0.073973,0.97148,0.22757,...,8024830,2021-11-24 00:00:00.000000000-4125.0,471.15,2021-11-24 00:00:00.000000000,471.356773,471.243560,472.982939,483.403958,497.956120,472.454200
2,2021-10-28,2021-11-24,447.35,441.3,453.4,4596.06,4150.0,0.073973,0.96024,0.23055,...,8024833,2021-11-24 00:00:00.000000000-4150.0,447.35,2021-11-24 00:00:00.000000000,446.436876,446.245529,448.610653,459.295221,474.198600,448.179100
3,2021-10-28,2021-11-24,278.95,277.6,280.3,4596.06,4330.0,0.073973,0.86316,0.20684,...,8024855,2021-11-24 00:00:00.000000000-4330.0,278.95,2021-11-24 00:00:00.000000000,270.579322,266.718558,281.597490,291.291243,298.791870,275.611360
4,2021-10-28,2021-11-24,27.90,27.6,28.2,4596.06,4655.0,0.073973,0.31650,0.11015,...,8024919,2021-11-24 00:00:00.000000000-4655.0,27.90,2021-11-24 00:00:00.000000000,44.978175,24.666053,71.101789,59.286527,25.695774,14.938367
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276384,2023-03-31,2024-06-21,1023.55,1016.8,1030.3,4109.88,3300.0,1.227397,0.83766,0.25839,...,10539408,2024-06-21 00:00:00.000000008-3300.0,1023.55,2024-06-21 00:00:00.000000008,1011.429569,999.079034,1013.919792,1023.155752,1026.013800,1054.239500
276385,2023-03-31,2024-06-21,441.05,439.4,442.7,4109.88,4100.0,1.227397,0.60836,0.20096,...,10539440,2024-06-21 00:00:00.000000008-4100.0,441.05,2024-06-21 00:00:00.000000008,436.190941,393.773176,443.319286,443.870558,445.750240,495.669250
276386,2023-03-31,2024-06-21,378.75,377.1,380.4,4109.88,4200.0,1.227397,0.56651,0.19304,...,10539444,2024-06-21 00:00:00.000000008-4200.0,378.75,2024-06-21 00:00:00.000000008,383.643065,339.205866,391.068229,385.906237,383.547600,436.614650
276387,2023-03-31,2024-06-21,241.50,240.2,242.8,4109.88,4450.0,1.227397,0.44877,0.17343,...,10539452,2024-06-21 00:00:00.000000008-4450.0,241.50,2024-06-21 00:00:00.000000008,272.306342,226.177016,280.003432,258.122649,249.474870,307.278440


In [8]:
# Dates
# Convert TTM from years to whole days. Round before casting: astype(int)
# truncates toward zero, so a float product such as 24.99999 would otherwise
# become 24 days instead of 25.
df["TTM"] = (df["TTM"] * 365).round().astype(int)
# Expiry_date is rebuilt from Expire_date, which is then no longer needed
df['Expiry_date'] = pd.to_datetime(df['Expire_date'])
df = df.drop(columns=["Expire_date"])
df['Quote_date'] = pd.to_datetime(df['Quote_date'])

# Adding option ID: "<expiry date>-<strike>" identifies one option contract
df["Option_ID"] = df["Expiry_date"].astype(str) + "-" + df["Strike"].astype(str)

# Period to be traded on: both the expiry and the quote date must fall
# inside 2015-01-01 .. 2023-03-31 (bounds inclusive)
expiry_in_period = df["Expiry_date"].between("2015-01-01", "2023-03-31")
quote_in_period = df["Quote_date"].between("2015-01-01", "2023-03-31")
df = df[expiry_in_period & quote_in_period]

Create underlying dict

In [9]:
# Create dict with Quote_date as key and the mean Underlying_last on that date as value
underlying_last_dict = df.groupby("Quote_date")["Underlying_last"].mean().to_dict()

Adding TTM=0 row

In [10]:
def add_row(group, underlying_prices=None):
    """Build the synthetic expiry-day (TTM = 0) row for one option.

    The last row of ``group`` is copied and rewritten so that it describes
    the option on its expiry date: ``Quote_date`` and ``Expiry_date`` are
    both the expiry date, ``TTM`` is 0, ``Underlying_last`` is the underlying
    price on the expiry date and ``Price`` is ``max(underlying - Strike, 0)``.
    All other columns keep the values of the last row.

    Parameters
    ----------
    group : pandas.DataFrame
        All rows of a single option. The caller must have ordered the rows
        so that the last one is the quote to copy, because ``iloc[-1]`` is
        used.
    underlying_prices : mapping, optional
        Maps a date to the underlying price on that date. When omitted, the
        notebook-level ``underlying_last_dict`` is used, so
        ``groupby(...).apply(add_row)`` keeps working unchanged.

    Returns
    -------
    pandas.Series or None
        The new row, or ``None`` (after printing a message) when no
        underlying price is known for the expiry date.

    Notes
    -----
    NOTE(review): the payoff ``max(underlying - Strike, 0)`` is the intrinsic
    value of a call option — confirm the data holds calls only.
    NOTE(review): if the last row is already quoted on the expiry date, a
    second row for that date is still produced.
    """
    # Fall back to the notebook-level lookup when no mapping is passed in
    if underlying_prices is None:
        underlying_prices = underlying_last_dict

    last_row = group.iloc[-1]
    expiry_date = last_row['Expiry_date']

    # Without an underlying price on the expiry date the payoff is unknown
    if expiry_date not in underlying_prices:
        print("Marked option for deletion: " + str(last_row['Option_ID']) + " on date: " + str(last_row['Quote_date']) + " due to missing underlying price.")
        return None

    underlying_last_on_expiry = underlying_prices[expiry_date]
    intrinsic_value = np.maximum(underlying_last_on_expiry - last_row['Strike'], 0)

    # Copy so the row taken from the group is not modified
    new_row = last_row.copy()
    new_row['Quote_date'] = expiry_date
    new_row['Expiry_date'] = expiry_date
    new_row['TTM'] = 0
    new_row['Underlying_last'] = underlying_last_on_expiry
    new_row['Price'] = intrinsic_value

    return new_row

In [11]:
# Order rows by quote date (then expiry) so that the last row of every
# Option_ID group, which add_row picks with iloc[-1], is the latest quote
df = df.sort_values(['Quote_date', 'Expiry_date'])

In [12]:
# Apply add_row once per Option_ID group, then replace the resulting index
# with a plain 0..n-1 range
new_rows = (
    df.groupby("Option_ID")
    .apply(add_row)
    .reset_index(drop=True)
)

Marked option for deletion: 2017-09-27-1500.0 on date: 2017-09-26 00:00:00 due to missing underlying price.
Marked option for deletion: 2017-09-27-1600.0 on date: 2017-09-26 00:00:00 due to missing underlying price.
Marked option for deletion: 2017-09-27-1650.0 on date: 2017-09-26 00:00:00 due to missing underlying price.
Marked option for deletion: 2017-09-27-1700.0 on date: 2017-09-26 00:00:00 due to missing underlying price.
Marked option for deletion: 2017-09-27-1750.0 on date: 2017-09-26 00:00:00 due to missing underlying price.
Marked option for deletion: 2017-09-27-1800.0 on date: 2017-09-26 00:00:00 due to missing underlying price.
Marked option for deletion: 2017-09-27-1850.0 on date: 2017-09-26 00:00:00 due to missing underlying price.
Marked option for deletion: 2017-09-27-1900.0 on date: 2017-09-26 00:00:00 due to missing underlying price.
Marked option for deletion: 2017-09-27-1950.0 on date: 2017-09-26 00:00:00 due to missing underlying price.
Marked option for deletion: 

In [None]:
# Remove only the placeholder rows left behind for options where add_row
# returned None (presumably all-NaN rows — verify with the installed pandas).
# how="all" keeps a real expiry row even when one of its columns is NaN;
# the default how="any" would silently discard such a row as well.
new_rows = new_rows.dropna(how="all")

In [None]:
# Stack the synthetic expiry rows underneath the original quotes
# (row-wise concatenation is pd.concat's default axis)
df = pd.concat([df, new_rows])

In [None]:
# Persist the combined trading data as CSV, leaving out the DataFrame index
df.to_csv(
    path + '/data/trading/Full trading data - All models - wo missing-filtering.csv',
    index=False,
)