In [134]:
import pandas as pd
import numpy as np

# Loading dataset

In [135]:
def read_files(path, filenames):
    """Load each CSV named in `filenames` from under `path` and stack them into one dataframe.

    `path` is prepended verbatim to every file name, so it must already end
    with a path separator. Whitespace following a delimiter is skipped while
    parsing. The per-file row indices are kept as-is in the result.
    """
    frames = []
    for name in filenames:
        frames.append(pd.read_csv(path + name, skipinitialspace=True))
    return pd.concat(frames)

In [136]:
path_opt = "./data/raw_data/"
# One raw end-of-day file per month of 2022, named spx_eod_YYYYMM.csv.
# (The former extra comprehension over range(2022, 2022) was empty and contributed no files.)
filenames_opt = [f"spx_eod_2022{month:02d}.csv" for month in range(1, 13)]
df = read_files(path_opt, filenames_opt)


# Cleaning raw data

In [137]:
def _clean_column_name(raw_name):
    """Return the text inside "[...]" of a raw header, lower-cased except for its first character."""
    start, end = raw_name.find("["), raw_name.find("]")
    # Headers without a bracket pair are used as-is (stripped) instead of being
    # sliced with find()'s -1 sentinel, which would silently drop a character.
    inner = raw_name[start + 1:end] if start != -1 and end > start else raw_name.strip()
    return inner[:1] + inner[1:].lower()


def process_options(df_opt, call = True):
    """Clean up column names, compute the mid price and add time to maturity (TTM).

    Parameters
    ----------
    df_opt : pandas.DataFrame
        Raw option quotes whose headers look like "[QUOTE_DATE]", "[C_BID]", ...
        The input frame is not modified.
    call : bool, default True
        If True the call quotes (C_ask / C_bid) are used for the price,
        otherwise the put quotes (P_ask / P_bid).

    Returns
    -------
    pandas.DataFrame
        Columns Quote_date, Expire_date, Price, Underlying_last, Strike, TTM,
        restricted to rows with TTM != 0 and TTM <= 3 * 365 days.
    """
    df_opt = df_opt.rename(columns={key: _clean_column_name(key) for key in df_opt.keys()})

    # Expose the chosen side's quotes under the generic names Ask / Bid.
    side = "C" if call else "P"
    df_opt = df_opt.rename(columns={side + "_ask": "Ask", side + "_bid": "Bid"})

    df_opt["Quote_date"] = pd.to_datetime(df_opt["Quote_date"])
    df_opt["Expire_date"] = pd.to_datetime(df_opt["Expire_date"])
    # Vectorised day difference; same values as a row-wise `.days`, without the Python loop.
    df_opt["TTM"] = (df_opt["Expire_date"] - df_opt["Quote_date"]).dt.days
    # Mid price between ask and bid.
    df_opt["Price"] = (df_opt["Ask"] + df_opt["Bid"])/2

    columns = ["Quote_date", "Expire_date", "Price", "Underlying_last", "Strike", "TTM"]
    keep = (df_opt["TTM"] != 0) & (df_opt["TTM"] <= 365*3)
    return df_opt.loc[keep, columns]

In [138]:
# Clean the raw quotes using the default call=True, i.e. prices come from the call bid/ask columns.
df = process_options(df)

# Adding lags

In [139]:
# process_options already parses Quote_date with pd.to_datetime; this re-parse is an idempotent safeguard.
df['Quote_date']= pd.to_datetime(df['Quote_date'])

In [141]:
# Group the data by Quote Date and calculate the per-day mean of the numeric columns.
# numeric_only=True is passed explicitly: the datetime column Expire_date is not wanted
# in the average, and relying on the implicit default emits a FutureWarning on pandas 1.5
# and changes meaning on pandas >= 2.0.
df_agg = df.groupby('Quote_date').mean(numeric_only=True).reset_index()

# Add the Underlying Price Lag columns: Underlying_i is the daily underlying price
# i quote dates earlier (df_agg is sorted by Quote_date by the groupby).
lag_columns = []
for i in range(1, 21):
    lag_columns.append('Underlying_' + str(i))
    df_agg[lag_columns[-1]] = df_agg['Underlying_last'].shift(i)

# Attach the lags to every option row of the same quote date.
df = pd.merge(df, df_agg[['Quote_date'] + lag_columns], on='Quote_date', how='left')

  df_agg = df.groupby('Quote_date').mean().reset_index()


# Matching rates

In [142]:
def process_rates(df_r):
    """Rename the rate columns to their duration in days and parse the date column.

    Parameters
    ----------
    df_r : pandas.DataFrame
        Rates table with a "Date" column and the tenor columns
        "1 Mo", "3 Mo", "6 Mo", "1 Yr", "2 Yr", "3 Yr", "5 Yr", "7 Yr", "10 Yr".
        The input frame is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with "Quote_date" (datetime) followed by the tenor columns,
        each labelled with an integer number of days (30, 90, ..., 3650).
        Any other input column is dropped.
    """
    keys = {  "Date" : "Quote_date",
                                    "1 Mo": 30,
                                    "3 Mo": 90,
                                    "6 Mo": 180,
                                    "1 Yr": 365,
                                    "2 Yr": 365*2,
                                    "3 Yr": 365*3,
                                    "5 Yr": 365*5,
                                    "7 Yr": 365*7,
                                    "10 Yr": 365*10}
    # rename() returns a new frame, so the date parsing below never writes
    # back into the caller's dataframe.
    renamed = df_r.rename(columns = keys)
    renamed["Quote_date"] = pd.to_datetime(renamed["Quote_date"])
    # Select with an explicit list rather than a dict_values view.
    return renamed[list(keys.values())]

def combine_opt_rates(df_opt, df_r):
    """Attach to every option row the rate whose duration is closest to its TTM.

    Parameters
    ----------
    df_opt : pandas.DataFrame
        Option rows with at least "Quote_date" and "TTM" (in days).
    df_r : pandas.DataFrame
        Rates with a "Quote_date" column; every other column label is a
        duration in days and holds the rate for that duration.

    Returns
    -------
    pandas.DataFrame
        `df_opt` plus a column "R" holding the matched rate, with the per-duration
        rate columns removed and every row containing a missing value dropped
        (this includes rows whose quote date has no entry in `df_r`).
    """
    df_opt = pd.merge(df_opt, df_r, on ="Quote_date", how = "left")
    rates = [col for col in df_r.columns if col != "Quote_date"]

    # For each row, position of the duration nearest to TTM (first one wins on ties),
    # computed in one broadcast instead of a Python-level apply per row.
    durations = np.array(rates)
    ttm = df_opt["TTM"].to_numpy()
    nearest = np.abs(ttm[:, None] - durations[None, :]).argmin(axis=1)

    # Pick, row by row, the rate sitting in the selected duration column.
    rate_matrix = df_opt[rates].to_numpy()
    df_opt["R"] = rate_matrix[np.arange(len(df_opt)), nearest]

    df_opt = df_opt.drop(rates, axis=1)
    return df_opt.dropna()

In [143]:
# Load the daily treasury rates, relabel the tenor columns to day counts, and give each
# option row the rate whose duration is closest to its TTM (new column "R").
df_r = pd.read_csv("./data/raw_data/daily-treasury-rates.csv")
df_r = process_rates(df_r)
# combine_opt_rates ends with dropna(), so rows with any missing value
# (e.g. lag columns left empty by shift, or no rate for the quote date) are removed here.
df = combine_opt_rates(df, df_r)
# Plain-text dump of the combined frame.
print(df)

        Quote_date Expire_date     Price  Underlying_last  Strike   TTM  \
173719  2022-01-31  2022-02-02  3312.500          4516.89  1200.0     2   
173720  2022-01-31  2022-02-02  3114.000          4516.89  1400.0     2   
173721  2022-01-31  2022-02-02  2913.550          4516.89  1600.0     2   
173722  2022-01-31  2022-02-02  2712.600          4516.89  1800.0     2   
173723  2022-01-31  2022-02-02  2512.600          4516.89  2000.0     2   
...            ...         ...       ...              ...     ...   ...   
2015244 2022-12-30  2025-12-19     6.950          3839.81  8400.0  1085   
2015245 2022-12-30  2025-12-19     5.950          3839.81  8600.0  1085   
2015246 2022-12-30  2025-12-19     5.450          3839.81  8800.0  1085   
2015247 2022-12-30  2025-12-19     4.500          3839.81  9000.0  1085   
2015248 2022-12-30  2025-12-19     4.025          3839.81  9200.0  1085   

         Underlying_1  Underlying_2  Underlying_3  Underlying_4  ...  \
173719        4431.80      

In [144]:
# Rich (HTML) rendering of the final frame: option fields, lagged underlying prices and matched rate R.
display(df)

Unnamed: 0,Quote_date,Expire_date,Price,Underlying_last,Strike,TTM,Underlying_1,Underlying_2,Underlying_3,Underlying_4,...,Underlying_12,Underlying_13,Underlying_14,Underlying_15,Underlying_16,Underlying_17,Underlying_18,Underlying_19,Underlying_20,R
173719,2022-01-31,2022-02-02,3312.500,4516.89,1200.0,2,4431.80,4325.89,4347.26,4358.03,...,4660.64,4726.55,4713.53,4669.85,4676.41,4696.25,4700.64,4793.19,4795.57,0.03
173720,2022-01-31,2022-02-02,3114.000,4516.89,1400.0,2,4431.80,4325.89,4347.26,4358.03,...,4660.64,4726.55,4713.53,4669.85,4676.41,4696.25,4700.64,4793.19,4795.57,0.03
173721,2022-01-31,2022-02-02,2913.550,4516.89,1600.0,2,4431.80,4325.89,4347.26,4358.03,...,4660.64,4726.55,4713.53,4669.85,4676.41,4696.25,4700.64,4793.19,4795.57,0.03
173722,2022-01-31,2022-02-02,2712.600,4516.89,1800.0,2,4431.80,4325.89,4347.26,4358.03,...,4660.64,4726.55,4713.53,4669.85,4676.41,4696.25,4700.64,4793.19,4795.57,0.03
173723,2022-01-31,2022-02-02,2512.600,4516.89,2000.0,2,4431.80,4325.89,4347.26,4358.03,...,4660.64,4726.55,4713.53,4669.85,4676.41,4696.25,4700.64,4793.19,4795.57,0.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015244,2022-12-30,2025-12-19,6.950,3839.81,8400.0,1085,3848.66,3783.05,3829.13,3844.82,...,4019.51,3990.29,3934.36,3963.94,3933.45,3941.50,3999.15,4071.55,4076.96,4.22
2015245,2022-12-30,2025-12-19,5.950,3839.81,8600.0,1085,3848.66,3783.05,3829.13,3844.82,...,4019.51,3990.29,3934.36,3963.94,3933.45,3941.50,3999.15,4071.55,4076.96,4.22
2015246,2022-12-30,2025-12-19,5.450,3839.81,8800.0,1085,3848.66,3783.05,3829.13,3844.82,...,4019.51,3990.29,3934.36,3963.94,3933.45,3941.50,3999.15,4071.55,4076.96,4.22
2015247,2022-12-30,2025-12-19,4.500,3839.81,9000.0,1085,3848.66,3783.05,3829.13,3844.82,...,4019.51,3990.29,3934.36,3963.94,3933.45,3941.50,3999.15,4071.55,4076.96,4.22


In [145]:
# Persist the processed frame. to_csv writes the DataFrame index by default, so the file
# gets a leading unnamed column — NOTE(review): confirm downstream readers expect it
# (otherwise pass index=False).
df.to_csv("./data/processed_data/2022.csv")