In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pandas_ta as ta
import yfinance as yf

from matplotlib.pyplot import figure
from pandarallel import pandarallel
from scipy.stats import norm

%matplotlib widget
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [2]:
dfs = []
# directory_path = "data/qqq-options-data-2020-2022"
directory_path = "data/spy-options-data-2020-2022"

for dirname, _, filenames in os.walk(directory_path):
    for filename in filenames:
        data_path = os.path.join(dirname, filename)
        print(data_path)
        dfs.append(pd.read_csv(data_path, low_memory=False))

df_raw = pd.concat(dfs)
columns = df_raw.columns
columns = [s.replace("[", "") for s in columns]
columns = [s.replace("]", "") for s in columns]
columns = [s.replace(" ", "") for s in columns]
df_raw.columns = columns

date_columns = ["QUOTE_READTIME", "QUOTE_DATE", "EXPIRE_DATE"]
numeric_cols = df_raw.columns.to_list()
numeric_cols.remove("QUOTE_READTIME")
numeric_cols.remove("QUOTE_DATE")
numeric_cols.remove("EXPIRE_DATE")

df_numeric = df_raw.drop(columns=date_columns)

for i in numeric_cols:
    df_numeric[i] = pd.to_numeric(df_numeric[i], errors="coerce")

df_final = df_numeric.drop(columns=["C_SIZE", "P_SIZE"])
df_final["EXPIRE_UNIX"] = pd.to_datetime(df_final.EXPIRE_UNIX, unit="s", utc=True)
df_final["OPTION_ID"] = df_final["EXPIRE_UNIX"].astype(str) + df_final["STRIKE"].astype(str)
df_final["OPTION_ID"] = df_final["OPTION_ID"].apply(lambda x: hash(x))
df_final["QUOTE_UNIXTIME"] = pd.to_datetime(df_final.QUOTE_UNIXTIME, unit="s", utc=True).apply(
    lambda x: pd.Timestamp(x).round(freq="D")
)

df_final.set_index(pd.DatetimeIndex(df_final.QUOTE_UNIXTIME), inplace=True)
# df_final.drop(columns=["QUOTE_UNIXTIME"], inplace=True)


def get_df_cnn_fear_greed_index():
    print("Retrieving historical ETF fear and greed index")
    import json

    with open("data/cnn_fear_greed_index_data.json") as f:
        cnn_fear_greed_index_data = json.load(f)

    df_fear_greed_index = pd.DataFrame(
        data=cnn_fear_greed_index_data["data"]["c:50108"]["series"][0], columns=["x", "y"]
    )
    df_fear_greed_index.set_index(
        pd.DatetimeIndex([pd.Timestamp(x, unit="s", tz="UTC") for x in df_fear_greed_index.x]),
        inplace=True,
    )
    df_fear_greed_index.rename(columns={"y": "fear_greed_index"}, inplace=True)
    df_fear_greed_index["fear_greed_index"] = df_fear_greed_index.fear_greed_index.astype(float).round().shift()
    return df_fear_greed_index


df_fear_greed_index = get_df_cnn_fear_greed_index()
df_final = df_final.join(df_fear_greed_index, how="inner")
df_final.sort_index(inplace=True)

df_final

data/spy-options-data-2020-2022/spy_2020_2022.csv
Retrieving historical ETF fear and greed index


Unnamed: 0_level_0,QUOTE_UNIXTIME,QUOTE_TIME_HOURS,UNDERLYING_LAST,EXPIRE_UNIX,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,C_IV,C_VOLUME,C_LAST,C_BID,C_ASK,STRIKE,P_BID,P_ASK,P_LAST,P_DELTA,P_GAMMA,P_VEGA,P_THETA,P_RHO,P_IV,P_VOLUME,STRIKE_DISTANCE,STRIKE_DISTANCE_PCT,OPTION_ID,x,fear_greed_index
QUOTE_UNIXTIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2021-05-18 00:00:00+00:00,2021-05-18 00:00:00+00:00,16.0,415.52,2021-05-17 20:00:00+00:00,0.0,1.00000,0.00000,0.00000,-0.00040,0.00843,,,0.00,125.06,125.79,290.0,0.00,0.01,0.00,-0.00077,0.00007,0.00077,-0.00462,0.00000,2.08951,,125.5,0.302,3792060337129155469,2021-05-18,36.0
2021-05-18 00:00:00+00:00,2021-05-18 00:00:00+00:00,16.0,415.52,2021-05-17 20:00:00+00:00,0.0,1.00000,0.00000,0.00000,-0.00055,0.00804,,,0.00,115.06,115.79,300.0,0.00,0.01,0.03,-0.00026,0.00007,-0.00014,-0.00491,0.00000,1.90443,9.0,115.5,0.278,3401021022364756414,2021-05-18,36.0
2021-05-18 00:00:00+00:00,2021-05-18 00:00:00+00:00,16.0,415.52,2021-05-17 20:00:00+00:00,0.0,1.00000,0.00000,0.00000,0.00000,0.00789,,10.0,116.92,110.06,110.79,305.0,0.00,0.01,0.04,-0.00029,0.00001,0.00036,-0.00537,0.00000,1.81308,20.0,110.5,0.266,-4123517462413106355,2021-05-18,36.0
2021-05-18 00:00:00+00:00,2021-05-18 00:00:00+00:00,16.0,415.52,2021-05-17 20:00:00+00:00,0.0,1.00000,0.00000,0.00000,-0.00022,0.00835,,3.0,101.32,105.06,105.79,310.0,0.00,0.01,0.01,-0.00052,0.00002,0.00074,-0.00457,-0.00032,1.72345,500.0,105.5,0.254,-8038047175987578789,2021-05-18,36.0
2021-05-18 00:00:00+00:00,2021-05-18 00:00:00+00:00,16.0,415.52,2021-05-17 20:00:00+00:00,0.0,1.00000,0.00000,0.00000,-0.00024,0.00903,,0.0,98.59,100.06,100.79,315.0,0.00,0.01,0.01,-0.00077,0.00009,0.00023,-0.00533,0.00000,1.63556,4.0,100.5,0.242,3691768375929604803,2021-05-18,36.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-31 00:00:00+00:00,2022-12-31 00:00:00+00:00,16.0,382.44,2025-12-19 21:00:00+00:00,1085.0,0.10595,0.00150,1.19348,-0.01204,1.05129,0.17770,2.0,5.71,0.59,10.00,630.0,243.00,250.50,0.00,-1.00000,0.00000,0.00000,0.00000,0.00000,,,247.6,0.647,2747401659209165995,2022-12-31,37.0
2022-12-31 00:00:00+00:00,2022-12-31 00:00:00+00:00,16.0,382.44,2025-12-19 21:00:00+00:00,1085.0,0.09392,0.00140,1.09723,-0.01082,0.94207,0.17304,,0.00,2.00,7.00,635.0,247.66,257.50,0.00,-0.89431,0.00307,0.72687,-0.00434,-0.20330,0.29557,,252.6,0.660,6499954274878750791,2022-12-31,37.0
2022-12-31 00:00:00+00:00,2022-12-31 00:00:00+00:00,16.0,382.44,2025-12-19 21:00:00+00:00,1085.0,0.09279,0.00137,1.08956,-0.01114,0.93199,0.17558,,0.00,2.00,7.00,640.0,253.00,262.50,0.00,-0.88536,0.00298,0.78662,-0.00524,-1.78777,0.30289,,257.6,0.673,-7916591500203698588,2022-12-31,37.0
2022-12-31 00:00:00+00:00,2022-12-31 00:00:00+00:00,16.0,382.44,2025-12-19 21:00:00+00:00,1085.0,0.09938,0.00147,1.14502,-0.01128,0.99068,0.18239,3.0,4.33,0.12,10.00,645.0,258.00,267.50,0.00,-0.88264,0.00287,0.79928,-0.00538,-1.82985,0.30699,,262.6,0.687,5512922230975615022,2022-12-31,37.0


In [13]:
option_history = []

# tnx = yf.Ticker("^TNX")
# r = tnx.info["open"] / 100
r = 0.05
N = norm.cdf


def black_scholes_call(S, K, T, sigma):
    d1 = (np.log(S / K) + (r + sigma**2 / 2) * T) / (sigma * np.sqrt(T))
    d2 = d1 - sigma * np.sqrt(T)
    return S * N(d1) - K * np.exp(-r * T) * N(d2)


def black_scholes_put(S, K, T, sigma):
    d1 = (np.log(S / K) + (r + sigma**2 / 2) * T) / (sigma * np.sqrt(T))
    d2 = d1 - sigma * np.sqrt(T)
    return K * np.exp(-r * T) * N(-d2) - S * N(-d1)


num = 0
total = len(df_final)

for index, row in df_final.iterrows():
    num += 1

    if num % 100 == 0:
        print(f"{num}/{total} -- {num/total*100:.2f}%", end="\r")

    option_history.append(
        {
            "OPTION_ID": hash("CALL" + str(row.EXPIRE_UNIX) + str(row.STRIKE)),
            "OPTION_TYPE": "CALL",
            "QUOTE_UNIXTIME": row.QUOTE_UNIXTIME,
            "EXPIRE_UNIX": row.EXPIRE_UNIX,
            "UNDERLYING_LAST": row.UNDERLYING_LAST,
            "STRIKE": row.STRIKE,
            "STRIKE_DISTANCE": row.STRIKE_DISTANCE,
            "STRIKE_DISTANCE_PCT": row.STRIKE_DISTANCE_PCT,
            "DTE": row.DTE,
            "DELTA": row.C_DELTA,
            "GAMMA": row.C_GAMMA,
            "VEGA": row.C_VEGA,
            "THETA": row.C_THETA,
            "RHO": row.C_RHO,
            "IV": row.C_IV,
            "VOLUME": row.C_VOLUME,
            "LAST": row.C_LAST,
            "BID": row.C_BID,
            "ASK": row.C_ASK,
            "BLACK_SCHOLES": black_scholes_call(row.UNDERLYING_LAST, row.STRIKE, row.DTE, row.C_IV),
        }
    )

    option_history.append(
        {
            "OPTION_ID": hash("PUT" + str(row.EXPIRE_UNIX) + str(row.STRIKE)),
            "OPTION_TYPE": "PUT",
            "QUOTE_UNIXTIME": row.QUOTE_UNIXTIME,
            "EXPIRE_UNIX": row.EXPIRE_UNIX,
            "UNDERLYING_LAST": row.UNDERLYING_LAST,
            "STRIKE": row.STRIKE,
            "STRIKE_DISTANCE": row.STRIKE_DISTANCE,
            "STRIKE_DISTANCE_PCT": row.STRIKE_DISTANCE_PCT,
            "DTE": row.DTE,
            "DELTA": row.P_DELTA,
            "GAMMA": row.P_GAMMA,
            "VEGA": row.P_VEGA,
            "THETA": row.P_THETA,
            "RHO": row.P_RHO,
            "IV": row.P_IV,
            "VOLUME": row.P_VOLUME,
            "LAST": row.P_LAST,
            "BID": row.P_BID,
            "ASK": row.P_ASK,
            "BLACK_SCHOLES": black_scholes_put(row.UNDERLYING_LAST, row.STRIKE, row.DTE, row.P_IV),
        }
    )

df_option_history = pd.DataFrame(data=option_history)
df_option_history

  d1 = (np.log(S/K) + (r + sigma**2/2)*T) / (sigma*np.sqrt(T))
  d1 = (np.log(S/K) + (r + sigma**2/2)*T) / (sigma*np.sqrt(T))


394700/1672745 -- 23.60%

  d1 = (np.log(S/K) + (r + sigma**2/2)*T) / (sigma*np.sqrt(T))
  d1 = (np.log(S/K) + (r + sigma**2/2)*T) / (sigma*np.sqrt(T))


1672700/1672745 -- 100.00%

Unnamed: 0,OPTION_ID,OPTION_TYPE,QUOTE_UNIXTIME,EXPIRE_UNIX,UNDERLYING_LAST,STRIKE,STRIKE_DISTANCE,STRIKE_DISTANCE_PCT,DTE,DELTA,GAMMA,VEGA,THETA,RHO,IV,VOLUME,LAST,BID,ASK,BLACK_SCHOLES
0,-7322242243761756953,CALL,2021-05-18 00:00:00+00:00,2021-05-17 20:00:00+00:00,415.52,290.0,125.5,0.302,0.0,1.00000,0.00000,0.00000,-0.00040,0.00843,,,0.00,125.06,125.79,
1,5455096934374096829,PUT,2021-05-18 00:00:00+00:00,2021-05-17 20:00:00+00:00,415.52,290.0,125.5,0.302,0.0,-0.00077,0.00007,0.00077,-0.00462,0.00000,2.08951,,0.00,0.00,0.01,0.000000e+00
2,7283866516831300545,CALL,2021-05-18 00:00:00+00:00,2021-05-17 20:00:00+00:00,415.52,300.0,115.5,0.278,0.0,1.00000,0.00000,0.00000,-0.00055,0.00804,,,0.00,115.06,115.79,
3,-2371234746074998763,PUT,2021-05-18 00:00:00+00:00,2021-05-17 20:00:00+00:00,415.52,300.0,115.5,0.278,0.0,-0.00026,0.00007,-0.00014,-0.00491,0.00000,1.90443,9.0,0.03,0.00,0.01,0.000000e+00
4,5823950779946275706,CALL,2021-05-18 00:00:00+00:00,2021-05-17 20:00:00+00:00,415.52,305.0,110.5,0.266,0.0,1.00000,0.00000,0.00000,0.00000,0.00789,,10.0,116.92,110.06,110.79,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3345485,-6010632484508909765,PUT,2022-12-31 00:00:00+00:00,2025-12-19 21:00:00+00:00,382.44,640.0,257.6,0.673,1085.0,-0.88536,0.00298,0.78662,-0.00524,-1.78777,0.30289,,0.00,253.00,262.50,5.464081e-22
3345486,-8954592756181917103,CALL,2022-12-31 00:00:00+00:00,2025-12-19 21:00:00+00:00,382.44,645.0,262.6,0.687,1085.0,0.09938,0.00147,1.14502,-0.01128,0.99068,0.18239,3.0,4.33,0.12,10.00,3.824400e+02
3345487,2765500859453546109,PUT,2022-12-31 00:00:00+00:00,2025-12-19 21:00:00+00:00,382.44,645.0,262.6,0.687,1085.0,-0.88264,0.00287,0.79928,-0.00538,-1.82985,0.30699,,0.00,258.00,267.50,6.417358e-22
3345488,460786814213622546,CALL,2022-12-31 00:00:00+00:00,2025-12-19 21:00:00+00:00,382.44,650.0,267.6,0.700,1085.0,0.11433,0.00149,1.25917,-0.01324,1.11689,0.19561,2.0,4.15,2.60,10.00,3.824400e+02


In [24]:
# df_option_history.groupby("OPTION_ID", as_index=False).size().sort_values(by="size")
# df_option_history.loc[df_option_history.OPTION_ID == 7567807300680646149]
df_option_history.describe().astype(str)

Unnamed: 0,OPTION_ID,UNDERLYING_LAST,STRIKE,STRIKE_DISTANCE,STRIKE_DISTANCE_PCT,DTE,DELTA,GAMMA,VEGA,THETA,RHO,IV,VOLUME,LAST,BID,ASK,BLACK_SCHOLES
count,3345490.0,3345490.0,3345490.0,3345490.0,3345490.0,3345490.0,3345264.0,3345264.0,3345264.0,3345264.0,3345264.0,3150560.0,2661380.0,3343174.0,3343174.0,3343174.0,3150558.0
mean,-1.867895758668993e+16,422.4817242735743,410.63111890933766,75.33245037348784,0.1790713473960466,147.1184683140589,0.034387096572946,-0.5288097618693175,-1.143036924329438,-0.0776872490422281,-0.0710304322409232,0.309125141733533,352.13847890943794,24.35027838814251,42.28535490225756,43.32169549954625,181.53715288608916
std,5.333752009624958e+18,29.910959614987416,104.01964219225547,75.17768208739193,0.1781094845371424,201.0141208180822,0.6270397443469435,673.1088231930211,35.22754673890763,0.4943370371657205,5.820357973573033,0.3213016782655849,3693.740872243663,46.83231480347753,61.59514692336674,62.38893931069413,185.23809766270904
min,-9.223013021143044e+18,356.58,25.0,0.0,0.0,0.0,-1.0,-1021263.30425,-2815.39969,-41.14461,-2878.27141,-0.0005,0.0,0.0,0.0,0.01,-477.479999843628
25%,-4.654388431224253e+18,396.39,355.0,23.3,0.055,17.0,-0.38263,0.0005,0.04639,-0.08124,-0.13922,0.18272,0.0,0.02,1.29,1.44,0.5680430154344775
50%,-5707620811033872.0,426.05,414.0,50.5,0.12,51.0,0.0,0.00262,0.26222,-0.04272,0.0,0.24391,2.0,3.42,16.09,16.97,97.48393684328616
75%,4.59396603132675e+18,446.58,466.0,106.6,0.253,199.96,0.61978,0.00629,0.69951,-0.01416,0.13801,0.3309,29.0,26.47,55.57,57.24,382.257623261438
max,9.222572201009666e+18,477.77,4898.0,4469.3,10.425,1096.0,1.0,9.46681,147.32807,0.0,252.84124,41.39908,282571.0,444.2,4451.19,4455.5,675.091595051081


In [None]:
print(len(df_temp))
df_temp = df_option_history.copy()
df_temp["ASK_BLACK_SCHOLES_COMPARE"] = df_temp.ASK > df_temp.BLACK_SCHOLES
df_temp.groupby(["OPTION_TYPE", "ASK_BLACK_SCHOLES_COMPARE"], as_index=False).size().sort_values(by="size").sort_values(
    by=["OPTION_TYPE", "ASK_BLACK_SCHOLES_COMPARE"]
)

In [None]:
df_rolling_ask_max = (
    df_temp.groupby("OPTION_ID", as_index=False)["ASK"]
    .rolling(window=20, min_periods=1)
    .max()
    .rename(columns={"ASK": "MAX_ASK_20"})
)
df_rolling_ask_max

In [None]:
# df_temp.merge(df_rolling_ask_max, left_on="OPTION_ID", right_on="OPTION_ID", how="inner")