#2 Run
This file takes in the file written as a .csv from naive.ipynb. The output is used by compare.ipynb

In [1]:
from math import log, sqrt, pi, exp
from scipy.stats import norm
import pandas as pd
import tensorflow as tf
import numpy as np
from arch import arch_model

2023-04-28 12:49:38.419456: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def read_file(file):
    """Read a single file and return a dataframe"""
    return pd.read_csv(file, skipinitialspace=True)

In [3]:
import pandas as pd

file = '../data/processed_data/2010-2022_filtered.csv'
df = pd.read_csv(file, skipinitialspace=True)

In [4]:
display(df)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Quote_date,Expire_date,Price,Underlying_last,Strike,TTM,R,Moneyness
0,0,0,2010-01-04,2010-01-07,207.490,1132.99,925.0,3,0.05,1.224854
1,1,1,2010-01-04,2010-01-07,182.500,1132.99,950.0,3,0.05,1.192621
2,2,2,2010-01-04,2010-01-07,157.500,1132.99,975.0,3,0.05,1.162041
3,3,3,2010-01-04,2010-01-07,132.600,1132.99,1000.0,3,0.05,1.132990
4,4,4,2010-01-04,2010-01-07,107.705,1132.99,1025.0,3,0.05,1.105356
...,...,...,...,...,...,...,...,...,...,...
11895723,13536141,13536141,2022-12-30,2024-12-20,362.600,3839.81,4300.0,721,4.41,0.892979
11895724,13536142,13536142,2022-12-30,2024-12-20,319.150,3839.81,4400.0,721,4.41,0.872684
11895725,13536143,13536143,2022-12-30,2024-12-20,279.000,3839.81,4500.0,721,4.41,0.853291
11895726,13536144,13536144,2022-12-30,2024-12-20,241.950,3839.81,4600.0,721,4.41,0.834741


In [5]:
garch = False
rolling_avg = False
implied_vol = True
test_pq = False

# Volatility

### GARCH

In [6]:
if test_pq:
    # Assuming your dataframe is named `df`
    df_unique_dates = df[['Quote_date', 'Underlying_last']].drop_duplicates()

    df_unique_dates['log_returns'] = np.log(df_unique_dates['Underlying_last']) - np.log(df_unique_dates['Underlying_last'].shift(1))
    df_unique_dates = df_unique_dates[['Quote_date', 'log_returns']].dropna()

    # Define a function to fit GARCH models and return AIC and BIC
    def fit_garch_aic_bic(log_returns, p, q):
        model = arch_model(log_returns, vol='Garch', p=p, q=q, dist='Normal')
        results = model.fit(disp='off')
        return results.aic, results.bic

    # Test GARCH models with different p and q values
    pq_values = [(1, 1), (1, 2), (2, 1), (2, 2), (1, 3), (3, 1), (2, 3), (3, 2), (3, 3), (4, 1), (1, 4), (4, 2), (2, 4), (4, 4)]

    aic_bic_values = [fit_garch_aic_bic(df_unique_dates['log_returns'], p, q) for p, q in pq_values]
    aic_values, bic_values = zip(*aic_bic_values)

    # Find the best p and q values based on AIC and BIC
    best_pq_aic = pq_values[np.argmin(aic_values)]
    best_pq_bic = pq_values[np.argmin(bic_values)]
    print(f"Best GARCH model based on AIC: GARCH({best_pq_aic[0]},{best_pq_aic[1]})")
    print(f"Best GARCH model based on BIC: GARCH({best_pq_bic[0]},{best_pq_bic[1]})")


In [7]:
if garch:
    df_unique_dates = df[['Quote_date', 'Underlying_last']].drop_duplicates()

    df_unique_dates['log_returns'] = np.log(df_unique_dates['Underlying_last']) - np.log(df_unique_dates['Underlying_last'].shift(1))
    df_unique_dates = df_unique_dates[['Quote_date', 'log_returns']].dropna()

    # Define GARCH(1,1) model
    model = arch_model(df_unique_dates['log_returns'], vol='Garch', p=1, q=1, dist='Normal')

    # Fit the model
    results = model.fit(update_freq=5)

    # Predict the volatility (annualized)
    df_unique_dates['predicted_volatility'] =  results.conditional_volatility * np.sqrt(252)

    # Merge the predicted volatility with the original dataframe
    df = df.merge(df_unique_dates[['Quote_date', 'predicted_volatility']], on='Quote_date', how='left')
    
    # Drop rows with NaN values
    df = df.dropna()

    print(df.head())


### Rolling average

In [8]:
if rolling_avg:    
    # Add volatility column with 30 day rolling standard deviation of Underlying_last

    # New dataframe without duplicate Quote_dates
    df2 = df.drop_duplicates(subset=['Quote_date'])

    # Calculate volatility
    df2['rolling_volatility'] = np.log(df2["Underlying_last"] / df2["Underlying_last"].shift()).rolling(30).std()*(252**0.5)

    # Matching volatility in df2 to df
    df['rolling_volatility'] = df['Quote_date'].map(df2.set_index('Quote_date')['rolling_volatility'])

    # Drop rows with NaN values
    df = df.dropna()

### Implied vol

In [9]:
if implied_vol:
    import numpy as np
    from scipy.stats import norm
    from scipy.optimize import brentq

    def implied_volatility(target_value, S, K, T, r):
        def option_price(vol):
            d1 = (np.log(S/K) + (r + 0.5 * vol**2) * T) / (vol * np.sqrt(T))
            d2 = d1 - vol * np.sqrt(T)
            return S * norm.cdf(d1) - K * np.exp(-r * T) * norm.cdf(d2) - target_value
        try:
            return brentq(option_price, 1e-6, 1)
        except ValueError:
            return np.nan

    # Apply the function to the DataFrame
    df['Implied_volatility'] = df.apply(lambda row: implied_volatility(row['Price'], row['Underlying_last'], row['Strike'], row['TTM'], row['R']), axis=1)
    len_before = len(df)
    df = df.dropna()
    len_after = len(df)
    print(f"Number of rows dropped: {len_before - len_after}, which is {round((len_before - len_after)/len_before*100, 2)}% of the original dataframe")

    # Calculate the average implied volatility for each date
    avg_implied_vol_df = df.groupby('Quote_date')['Implied_volatility'].mean().reset_index()

    # Add a new column with the average implied volatility shifted by one row
    avg_implied_vol_df['avg_implied_vol_t-1'] = avg_implied_vol_df['Implied_volatility'].shift(1)

    # Merge the avg_implied_vol_df DataFrame back to the original df DataFrame
    df = df.merge(avg_implied_vol_df[['Quote_date', 'avg_implied_vol_t-1']], on='Quote_date', how='left')

  d1 = (np.log(S/K) + (r + 0.5 * vol**2) * T) / (vol * np.sqrt(T))
  d1 = (np.log(S/K) + (r + 0.5 * vol**2) * T) / (vol * np.sqrt(T))


# BS

In [None]:
# Black-Scholes formula for call options
def d1(S,K,T,r,sigma):
    x1 = S.apply(lambda x : log(x)) - K.apply(lambda x : log(x))
    x2 = (r + ((sigma.apply(lambda x : x**2)) / 2)) * T
    x3 = sigma * T.apply(lambda x: sqrt(x))
    return  (x1 + x2) / x3

def d2(S,K,T,r,sigma):
    return d1(S,K,T,r,sigma) - sigma * T.apply(lambda x : sqrt(x))  

def bs_call(S,K,T,r,sigma):
    T = T/365
    r = r/100
    return S * d1(S,K,T,r,sigma).apply(lambda x : norm.cdf(x)) - K * (-r*T).apply(lambda x : exp(x)) * d2(S,K,T,r,sigma).apply(lambda x : norm.cdf(x))

In [None]:
if rolling_avg:
    df["Volatility"] = df["rolling_volatility"]
if garch:
    df["Volatility"] = df["predicted_volatility"]
if implied_vol:
    df["Volatility"] = df["avg_implied_vol_t-1"]
    
df["BS"] = bs_call(df["Underlying_last"], df["Strike"], df["TTM"], df["R"], df["Volatility"])


In [None]:
print("RMSE for full period", np.sqrt(np.mean((df['BS'] - df['Price'])**2)))

RMSE for full period 66.07943242950758


In [None]:
# Write to file
df.to_csv('../data/predictions/BS_IV_avg.csv', encoding='utf-8', index=False)