#2 Run
This file takes in the file written as a .csv from naive.ipynb. The output is used by compare.ipynb

In [86]:
from math import log, sqrt, pi, exp
from scipy.stats import norm
import pandas as pd
import tensorflow as tf
import numpy as np
from arch import arch_model

In [87]:
def read_file(file):
    """Read a single file and return a dataframe"""
    return pd.read_csv(file, skipinitialspace=True)

In [88]:
garch = False
rolling_avg = False
implied_vol = True
test_pq = False

In [89]:
import pandas as pd

if implied_vol:
    file = '../data/processed_data/2010-2023_NSS_filtered_with_IV.csv'
    df_IV = pd.read_csv(file, skipinitialspace=True)
    df_IV = df_IV[df_IV["Quote_date"] >= "2015-01-01"]
else:
    file = '../data/processed_data/2010-2023_NSS_filtered_vF.csv'
    df = pd.read_csv(file, skipinitialspace=True)
    df = df[df["Quote_date"] >= "2014-09-01"]

# Volatility

### GARCH

In [90]:
if test_pq:
    # Assuming your dataframe is named `df`
    df_unique_dates = df[['Quote_date', 'Underlying_last']].drop_duplicates()

    df_unique_dates['log_returns'] = np.log(df_unique_dates['Underlying_last']) - np.log(df_unique_dates['Underlying_last'].shift(1))
    df_unique_dates = df_unique_dates[['Quote_date', 'log_returns']].dropna()

    # Define a function to fit GARCH models and return AIC and BIC
    def fit_garch_aic_bic(log_returns, p, q):
        model = arch_model(log_returns, vol='Garch', p=p, q=q, dist='Normal')
        results = model.fit(disp='off')
        return results.aic, results.bic

    # Test GARCH models with different p and q values
    pq_values = [(1, 1), (1, 2), (2, 1), (2, 2), (1, 3), (3, 1), (2, 3), (3, 2), (3, 3), (4, 1), (1, 4), (4, 2), (2, 4), (4, 4)]

    aic_bic_values = [fit_garch_aic_bic(df_unique_dates['log_returns'], p, q) for p, q in pq_values]
    aic_values, bic_values = zip(*aic_bic_values)

    # Find the best p and q values based on AIC and BIC
    best_pq_aic = pq_values[np.argmin(aic_values)]
    best_pq_bic = pq_values[np.argmin(bic_values)]
    print(f"Best GARCH model based on AIC: GARCH({best_pq_aic[0]},{best_pq_aic[1]})")
    print(f"Best GARCH model based on BIC: GARCH({best_pq_bic[0]},{best_pq_bic[1]})")


In [91]:
if garch:
    df_unique_dates = df[['Quote_date', 'Underlying_last']].drop_duplicates()

    df_unique_dates['log_returns'] = np.log(df_unique_dates['Underlying_last']) - np.log(df_unique_dates['Underlying_last'].shift(1))
    df_unique_dates = df_unique_dates[['Quote_date', 'log_returns']].dropna()

    # Define GARCH(1,1) model
    model = arch_model(df_unique_dates['log_returns'], vol='Garch', p=1, q=1, dist='Normal')

    # Fit the model
    results = model.fit(update_freq=5)

    # Predict the volatility (annualized)
    df_unique_dates['predicted_volatility'] =  results.conditional_volatility * np.sqrt(252)

    # Merge the predicted volatility with the original dataframe
    df = df.merge(df_unique_dates[['Quote_date', 'predicted_volatility']], on='Quote_date', how='left')

    df = df[df["Quote_date"] >= "2015-01-01"]
    # Drop rows with NaN values
    df = df.dropna()



### Rolling average

In [92]:
if rolling_avg:    
    # Add volatility column with 30 day rolling standard deviation of Underlying_last

    # New dataframe without duplicate Quote_dates
    df2 = df.drop_duplicates(subset=['Quote_date'])

    # Calculate volatility
    df2['rolling_volatility'] = np.log(df2["Underlying_last"] / df2["Underlying_last"].shift()).rolling(30).std()*(252**0.5)

    # Matching volatility in df2 to df
    df['rolling_volatility'] = df['Quote_date'].map(df2.set_index('Quote_date')['rolling_volatility'])

    df = df[df["Quote_date"] >= "2015-01-01"]

    # Drop rows with NaN values
    df = df.dropna()

### Implied vol

In [93]:
# Black-Scholes formula for call options
def d1(S,K,T,r,sigma):
    x1 = S.apply(lambda x : log(x)) - K.apply(lambda x : log(x))
    x2 = (r + ((sigma.apply(lambda x : x**2)) / 2)) * T
    x3 = sigma * T.apply(lambda x: sqrt(x))
    return  (x1 + x2) / x3

def d2(S,K,T,r,sigma):
    return d1(S,K,T,r,sigma) - sigma * T.apply(lambda x : sqrt(x))  

def bs_call(S,K,T,r,sigma):
    return S * d1(S,K,T,r,sigma).apply(lambda x : norm.cdf(x)) - K * (-r*T).apply(lambda x : exp(x)) * d2(S,K,T,r,sigma).apply(lambda x : norm.cdf(x))

In [94]:
if implied_vol:    
    # Calculate the average implied volatility for each date
    avg_implied_vol_df = df_IV.groupby(
        'Quote_date')['IV'].median().reset_index()

    # Add a new column with the average implied volatility shifted by one row
    avg_implied_vol_df['avg_implied_vol_t-1'] = avg_implied_vol_df['IV'].shift(1)

    # Merge the avg_implied_vol_df DataFrame back to the original df DataFrame
    df = df.merge(avg_implied_vol_df[[
        'Quote_date', 'avg_implied_vol_t-1']], on='Quote_date', how='left')

    # Drop rows with NaN values
    df = df.dropna()


In [95]:
display(df)

Unnamed: 0.1,Unnamed: 0,Quote_date,Price,Underlying_last,Strike,TTM,R,predicted_volatility,Volatility,BS,avg_implied_vol_t-1
2776,1812133,2015-01-05,970.00,2021.05,1050.0,0.010959,0.00020,0.125301,0.125301,971.052301,0.187670
2777,1812134,2015-01-05,920.05,2021.05,1100.0,0.010959,0.00020,0.125301,0.125301,921.052411,0.187670
2778,1812135,2015-01-05,870.05,2021.05,1150.0,0.010959,0.00020,0.125301,0.125301,871.052521,0.187670
2779,1812136,2015-01-05,845.05,2021.05,1175.0,0.010959,0.00020,0.125301,0.125301,846.052575,0.187670
2780,1812137,2015-01-05,820.05,2021.05,1200.0,0.010959,0.00020,0.125301,0.125301,821.052630,0.187670
...,...,...,...,...,...,...,...,...,...,...,...
10539482,13739049,2023-03-31,217.75,4109.88,4700.0,1.726027,0.04198,0.148997,0.148997,216.072717,0.178325
10539483,13739050,2023-03-31,180.00,4109.88,4800.0,1.726027,0.04198,0.148997,0.148997,186.281498,0.178325
10539484,13739051,2023-03-31,146.55,4109.88,4900.0,1.726027,0.04198,0.148997,0.148997,159.937644,0.178325
10539485,13739052,2023-03-31,118.20,4109.88,5000.0,1.726027,0.04198,0.148997,0.148997,136.773457,0.178325


In [96]:
if False:
    df = df_read.copy()
    # Filter out options with a difference in strike price of more than 4% from the underlying price and a TTM of less than 20 days
    df = df[(df['Underlying_last'] > df['Strike'] * 0.94) & (df['Underlying_last'] < df['Strike'] * 1.06) & (df['TTM'] > 15)]

    from py_vollib.black_scholes import black_scholes as bs
    from py_vollib.black_scholes.greeks.analytical import vega
    import matplotlib.pyplot as plt

    def implied_vol(S0, K, T, r, market_price, flag='c', tol=0.000001):
        """Compute the implied volatility of a European Option
            S0: initial stock price
            K:  strike price
            T:  maturity
            r:  risk-free rate
            market_price: market observed price
            tol: user choosen tolerance
        """
        T = T/365 #converting to years
        r = r/100
        max_iter = 200 #max number of iterations
        vol_old = 0.20 #initial guess
        for k in range(max_iter):
            bs_price = bs(flag, S0, K, T, r, vol_old)
            Cprime =  vega(flag, S0, K, T, r, vol_old)*100
            C = bs_price - market_price
            vol_new = vol_old - C/Cprime
            bs_new = bs(flag, S0, K, T, r, vol_new)
            if (abs(vol_old - vol_new) < tol or abs(bs_new - market_price) < tol):
                break
            vol_old = vol_new
        implied_vol = vol_old
        return implied_vol
    

    # Add implied volatility column
    df['Implied_volatility'] = df.apply(lambda x: implied_vol(x['Underlying_last'], x['Strike'], x['TTM'], x['R'], x['Price'], 'c'), axis=1)

    display(df)
    print("Number of rows with NaN values: ", df.isna().sum().sum())
    df = df.dropna()

    # Calculate the average implied volatility for each date
    avg_implied_vol_df = df.groupby('Quote_date')['Implied_volatility'].mean().reset_index()

    # Add a new column with the average implied volatility shifted by one row
    avg_implied_vol_df['avg_implied_vol_t-1'] = avg_implied_vol_df['Implied_volatility'].shift(1)

    # Merge the avg_implied_vol_df DataFrame back to the original df DataFrame
    df = df.merge(avg_implied_vol_df[['Quote_date', 'avg_implied_vol_t-1']], on='Quote_date', how='left')
    

In [97]:
if False:
    df = df_read.copy()
    import numpy as np
    from scipy.stats import norm

    def bs_call(S, K, T, r, vol):
        eps = 1e-8
        d1 = np.divide((np.log(S/K) + (r + 0.5*vol**2)*T), (vol*np.sqrt(T) + eps))
        d2 = d1 - vol * np.sqrt(T)
        return S * norm.cdf(d1) - np.exp(-r * T) * K * norm.cdf(d2)

    def bs_vega(S, K, T, r, sigma):
        eps = 1e-8
        d1 = np.divide((np.log(S/K) + (r + 0.5*sigma**2)*T), (sigma*np.sqrt(T) + eps))
        return S * norm.pdf(d1) * np.sqrt(T)

    def find_vol(target_values, S, K, T, r):
        MAX_ITERATIONS = 200
        PRECISION = 1.0e-5
        sigmas = np.full_like(target_values, 0.5)

        for _ in range(MAX_ITERATIONS):
            prices = bs_call(S, K, T, r, sigmas)
            vegas = bs_vega(S, K, T, r, sigmas)
            diffs = target_values - prices

            mask = np.abs(diffs) >= PRECISION
            if not np.any(mask):
                break

            sigmas[mask] += np.divide(diffs[mask], vegas[mask], out=np.full_like(diffs[mask], np.nan), where=(vegas[mask] != 0))

        return sigmas


    df['Implied_volatility'] = find_vol(df['Price'].values, df['Underlying_last'].values, df['Strike'].values, df['TTM'].values, df['R'].values)


    len_before = len(df)
    df = df.dropna()
    len_after = len(df)
    print(f"Number of rows dropped: {len_before - len_after}, which is {round((len_before - len_after)/len_before*100, 2)}% of the original dataframe")

    # Calculate the average implied volatility for each date
    avg_implied_vol_df = df.groupby('Quote_date')['Implied_volatility'].mean().reset_index()

    # Add a new column with the average implied volatility shifted by one row
    avg_implied_vol_df['avg_implied_vol_t-1'] = avg_implied_vol_df['Implied_volatility'].shift(1)

    # Merge the avg_implied_vol_df DataFrame back to the original df DataFrame
    df = df.merge(avg_implied_vol_df[['Quote_date', 'avg_implied_vol_t-1']], on='Quote_date', how='left')

# BS

In [98]:
df = df[df["Quote_date"] >= "2015-01-01"]

In [99]:
if rolling_avg:
    df["Volatility"] = df["rolling_volatility"]
if garch:
    df["Volatility"] = df["predicted_volatility"]
if implied_vol:
    df["Volatility"] = df["avg_implied_vol_t-1"]
    
df["BS"] = bs_call(df["Underlying_last"], df["Strike"], df["TTM"], df["R"], df["Volatility"])


In [100]:
print("RMSE for full period", np.sqrt(np.mean((df['BS'] - df['Price'])**2)))

RMSE for full period 26.71753177110124


In [101]:
# Write to file
df.to_csv('../data/predictions/BS_IV_median.csv', encoding='utf-8', index=False)