In [1]:
import pandas as pd
import numpy as np

# Loading dataset

In [2]:
def read_files(path, filenames):
    """Load every CSV listed in ``filenames`` from ``path`` into one dataframe.

    ``path`` is prepended to each file name by plain string concatenation, so
    it must already end with a path separator. Whitespace following a
    delimiter is skipped while parsing. The frames are stacked row-wise in the
    order given and each file keeps its own row index.
    """
    frames = []
    for filename in filenames:
        frames.append(pd.read_csv(path + filename, skipinitialspace=True))
    return pd.concat(frames)

In [3]:
path_opt = "./data/raw_data/"
# One end-of-day SPX file per calendar month, January 2010 through December 2022.
filenames_opt = [
    f"spx_eod_{year}{month:02d}.csv"
    for year in range(2010, 2023)
    for month in range(1, 13)
]
df = read_files(path_opt, filenames_opt)

In [4]:
# Row count straight after loading; baseline for the data-loss summary further down.
len_after_first_read = df.shape[0]

# Cleaning raw data

In [5]:
def _clean_column_name(name):
    """Strip a surrounding ``[...]`` and lower-case everything after the first character.

    ``"[QUOTE_DATE]"`` becomes ``"Quote_date"``. Names without a bracket pair
    are kept whole (only the case is normalised) instead of being sliced, and
    non-string labels are returned untouched.
    """
    if not isinstance(name, str):
        return name
    start, end = name.find("["), name.find("]")
    inner = name[start + 1:end] if 0 <= start < end else name
    if not inner:
        return name
    return inner[0] + inner[1:].lower()


def process_options(df_opt, call = True):
    """Clean the column names and add mid price and time to maturity (TTM).

    Parameters
    ----------
    df_opt : pandas.DataFrame
        Option quotes. After name clean-up the frame must provide
        ``Quote_date``, ``Expire_date``, ``Underlying_last``, ``Strike`` and
        the ask/bid pair of the chosen side (``C_ask``/``C_bid`` or
        ``P_ask``/``P_bid``).
    call : bool, default True
        Use the call quotes when true, the put quotes otherwise.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``Quote_date``, ``Expire_date``,
        ``Price`` (mean of ask and bid), ``Underlying_last``, ``Strike`` and
        ``TTM`` (whole days between quote and expiry). The input frame is not
        modified.
    """
    df_opt = df_opt.rename(columns=_clean_column_name)

    side = "C" if call else "P"
    df_opt = df_opt.rename(columns={side + "_ask": "Ask", side + "_bid": "Bid"})

    df_opt["Quote_date"] = pd.to_datetime(df_opt["Quote_date"])
    df_opt["Expire_date"] = pd.to_datetime(df_opt["Expire_date"])
    # Vectorised date arithmetic: same values as a row-wise apply, without
    # calling a Python function once per row.
    df_opt["TTM"] = (df_opt["Expire_date"] - df_opt["Quote_date"]).dt.days
    df_opt["Price"] = (df_opt["Ask"] + df_opt["Bid"]) / 2

    columns = ["Quote_date", "Expire_date", "Price", "Underlying_last", "Strike", "TTM"]
    return df_opt[columns]

In [6]:
# Normalise the raw quotes, recording the row count after each cleaning step.
df = process_options(df)
len_after_process = df.shape[0]

# Discard every row that still has a missing value in any column.
df = df.dropna()
len_after_nan = df.shape[0]

# Matching rates

In [8]:
def process_rates(df_r):
    """Rename the rate columns to their maturity in days and parse the date.

    Parameters
    ----------
    df_r : pandas.DataFrame
        Rate table with a ``Date`` column and the tenor columns ``1 Mo``,
        ``3 Mo``, ``6 Mo``, ``1 Yr``, ``2 Yr``, ``3 Yr``, ``5 Yr``, ``7 Yr``
        and ``10 Yr``. Any other column is dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame whose columns are ``Quote_date`` (datetime) followed by
        the integer day counts 30, 90, 180, 365, 730, 1095, 1825, 2555 and
        3650. The caller's frame is left untouched.
    """
    keys = {
        "Date": "Quote_date",
        "1 Mo": 30,
        "3 Mo": 90,
        "6 Mo": 180,
        "1 Yr": 365,
        "2 Yr": 365 * 2,
        "3 Yr": 365 * 3,
        "5 Yr": 365 * 5,
        "7 Yr": 365 * 7,
        "10 Yr": 365 * 10,
    }
    # Rename first so the datetime conversion lands on the renamed copy and
    # never overwrites a column of the frame that was passed in.
    renamed = df_r.rename(columns=keys)
    renamed["Quote_date"] = pd.to_datetime(renamed["Quote_date"])
    # An explicit list (not a dict view) is an unambiguous column selector.
    return renamed[list(keys.values())]

def combine_opt_rates(df_opt, df_r):
    """Attach to every option row the rate ``R`` of the tenor closest to its TTM.

    Parameters
    ----------
    df_opt : pandas.DataFrame
        Option rows with at least ``Quote_date`` and ``TTM`` (days).
    df_r : pandas.DataFrame
        Rate table with a ``Quote_date`` column; every other column label is
        a tenor expressed in days and holds the rate for that tenor.

    Returns
    -------
    pandas.DataFrame
        ``df_opt`` in its original row order plus a column ``R``. Rows that
        still contain a missing value in any column are removed, and the
        number of removed rows is printed.

    Notes
    -----
    Missing rates are forward-filled in the rate table, per tenor and in date
    order, before the merge. A quote date without a rate row therefore gets
    the latest earlier rate of the matching tenor. Forward-filling the merged
    frame row by row would instead copy the previous option row's rate, which
    generally belongs to a different tenor, and would also fabricate values
    in option columns.
    """
    tenors = [col for col in df_r.columns if col != "Quote_date"]

    # One row per date, in chronological order, so the forward fill carries
    # the most recent earlier observation.
    rate_table = (
        df_r.dropna(subset=["Quote_date"])
        .drop_duplicates(subset="Quote_date")
        .sort_values("Quote_date")
        .set_index("Quote_date")
    )
    # Add the option quote dates that have no rate row, then fill them.
    quote_dates = pd.Index(df_opt["Quote_date"].unique()).dropna()
    rate_table = (
        rate_table.reindex(rate_table.index.union(quote_dates))
        .ffill()
        .rename_axis("Quote_date")
        .reset_index()
    )

    merged = pd.merge(df_opt, rate_table, on="Quote_date", how="left")

    # Resolve the nearest tenor once per distinct TTM instead of once per row;
    # argmin keeps the first (shortest) tenor on ties.
    tenor_days = np.array(tenors)
    unique_ttm, inverse = np.unique(merged["TTM"].to_numpy(), return_inverse=True)
    nearest_by_ttm = np.abs(unique_ttm[:, None] - tenor_days[None, :]).argmin(axis=1)
    nearest = nearest_by_ttm[inverse.ravel()]

    rate_matrix = merged[tenors].to_numpy()
    merged["R"] = rate_matrix[np.arange(len(merged)), nearest]
    merged = merged.drop(columns=tenors)

    rows_before = len(merged)
    merged = merged.dropna()
    print("Dropped " + str(rows_before - len(merged)) + " rows in rate matching")
    return merged

In [9]:
# Two Treasury yield files are stacked, then matched to the option quotes.
rate_files = ["daily-treasury-rates.csv", "yield-curve-rates-1990-2021.csv"]
df_r = read_files("./data/raw_data/", rate_files)
df_r = process_rates(df_r)
df = combine_opt_rates(df, df_r)
len_after_rate_matching = df.shape[0]

Dropped 0 rows in rate matching


## Analysing lost data

In [12]:
# Overall loss first, then the contribution of each cleaning stage.
rows_lost = len_after_first_read - len_after_rate_matching
print(f"Starting with {len_after_first_read} rows, and ended up with {len_after_rate_matching} rows")
print(f"Dropped {rows_lost} rows in total")
print(f"Which is {round(rows_lost / len_after_first_read * 100, 2)}%")
print(f"{len_after_first_read - len_after_process} rows dropped in process_options")
print(f"{len_after_process - len_after_nan} rows dropped in dropna")
print(f"{len_after_nan - len_after_rate_matching} rows dropped in rate matching")


Starting with 13547394 rows, and ended up with 13536349 rows
Dropped 11045 rows in total
Which is 0.08%
0 rows dropped in process_options
11045 rows dropped in dropna
0 rows dropped in rate matching


## Write to file

In [11]:
# Persist the cleaned, rate-matched option data.
output_path = "./data/processed_data/2010-2022.csv"
df.to_csv(output_path)