In [31]:
import pandas as pd
import yfinance as yf
import re
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

In [32]:
def initializeUnderlyingDataAndDicts():
    global underlyingData, underlyingPriceDict, underlying15DVolDict, underlying30DVolDict, underlying60DVolDict, underlyingYZVolDict

    # Opening underlying file
    underlyingData = pd.read_csv("data/AAPL.csv")
    
    # Prepare all the underlying volatility dicts
    underlyingData["PriceChange"] = underlyingData["Close"] - underlyingData["Close"].shift(-1)
    underlyingData["PercentChange"] = underlyingData["PriceChange"] / (underlyingData["Close"] + underlyingData["PriceChange"])
    underlyingData['15DayVol'] = underlyingData["PercentChange"].rolling(15).std() * (252 ** 0.5)
    underlyingData['30DayVol'] = underlyingData["PercentChange"].rolling(30).std() * (252 ** 0.5)
    underlyingData['60DayVol'] = underlyingData["PercentChange"].rolling(60).std() * (252 ** 0.5)
    
    # Code from here: https://harbourfronts.com/garman-klass-yang-zhang-historical-volatility-calculation-volatility-analysis-python/
    underlyingData['YZVol'] = np.sqrt(252 / 30 * pd.DataFrame.rolling(np.log(underlyingData.loc[:, "Open"] / underlyingData.loc[:, "Close"].shift(1)) ** 2 +
                         0.5 * np.log(underlyingData.loc[:, "High"] / underlyingData.loc[:, "Low"]) ** 2 -
                         (2 * np.log(2) - 1) *
                         np.log(underlyingData.loc[:, "Close"] / underlyingData.loc[:, "Open"]) ** 2,
                         window=30).sum())
    
    # Remove NaN
    underlyingData = underlyingData.dropna()
    underlyingData.reset_index(drop=True)
    
    # Populate global dictionaries
    underlyingPriceDict = dict(zip(underlyingData.Date, underlyingData.Close))
    underlying15DVolDict = dict(zip(underlyingData.Date, underlyingData["15DayVol"]))
    underlying30DVolDict = dict(zip(underlyingData.Date, underlyingData["30DayVol"]))
    underlying60DVolDict = dict(zip(underlyingData.Date, underlyingData["60DayVol"]))
    underlyingYZVolDict = dict(zip(underlyingData.Date, underlyingData.YZVol))



def getObjectiveClosePrice(underlyingData, date):
    # Gets the close price on a date given objective data. None if date DNE
    # Consider using a hash map.
    try:
        return float(underlyingData.loc[underlyingData['Date'] == date]['Close'])
    except:
        return None

def getOptionProfit(expireDate, strike):
    # Gets the profit of an option given its expiration date and strike price
    endPrice = underlyingPriceDict[expireDate]
    if endPrice == None:
        return 0
    return max(endPrice - strike, 0)    
    
def getAndProcessOneYearOptionData():
    initializeUnderlyingDataAndDicts()
    finalDay = '2022-04-27'
    
    # Creating masterDf 
    masterDf = pd.read_csv("data/aapl_eod_202101.txt")
    for i in range(2, 13):
        masterDf = masterDf.append(pd.read_csv("data/aapl_eod_2021" + str(i).zfill(2) + ".txt"))
    
    # Dropping irrelevant columns
    cols_to_drop = [0, 1, 3, 6, 14, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
    masterDf.drop(masterDf.columns[cols_to_drop],axis=1,inplace=True)
    
    # Modifying column names for readability
    col_names = []
    for x in list(masterDf.columns):
        x = x.replace(' [', '').replace(']', '')
        col_names.append(x)

    masterDf.columns = col_names

    # Converting all but a few columns to numeric and removing NaN
    for i in range(len(col_names)):
        if i == 0 or i == 2 or i == 11:
            continue
        masterDf[col_names[i]] = pd.to_numeric(masterDf[col_names[i]], errors='coerce')
        masterDf = masterDf[masterDf[col_names[i]].notna()]
    
    # Stripping strings in problematic columns
    masterDf['QUOTE_DATE'] = masterDf['QUOTE_DATE'].str.strip()
    masterDf['C_SIZE'] = masterDf['C_SIZE'].str.strip()
    masterDf['EXPIRE_DATE'] = masterDf['EXPIRE_DATE'].str.strip()

    # Risk free rate column
    riskFreeRateDf = pd.read_csv("data/RiskFreeRate.csv")
    riskFreeRateDict = dict(zip(riskFreeRateDf.DATE, riskFreeRateDf.DTB3))
    masterDf['RISK_FREE_RATE'] = masterDf['QUOTE_DATE'].map(riskFreeRateDict)
    masterDf['RISK_FREE_RATE'] = pd.to_numeric(masterDf['RISK_FREE_RATE'], errors='coerce')
    masterDf = masterDf[masterDf['RISK_FREE_RATE'].notna()]
    
    # Map volatility columns
    masterDf["15DayVol"] = masterDf["QUOTE_DATE"].map(underlying15DVolDict)
    masterDf["30DayVol"] = masterDf["QUOTE_DATE"].map(underlying30DVolDict)
    masterDf["60DayVol"] = masterDf["QUOTE_DATE"].map(underlying60DVolDict)
    masterDf["YZVol"] = masterDf["QUOTE_DATE"].map(underlyingYZVolDict)
    
    # Converting date columns to dates. Requires .date() to become DateTime
    masterDf['EXPIRE_DATE'] = pd.to_datetime(masterDf['EXPIRE_DATE'])
    masterDf['QUOTE_DATE'] = pd.to_datetime(masterDf['QUOTE_DATE'])
    
    # Remove all options past the last date of underlying data
    masterDf = masterDf[~(masterDf['EXPIRE_DATE'] > finalDay)]
    
    # Map option profits
    masterDf['OPTION_PROFIT'] = masterDf.apply(lambda x: getOptionProfit(str(x['EXPIRE_DATE'].date()), x['STRIKE']), axis=1)
    masterDf.reset_index(drop=True)

    return masterDf

def filter_for_moneyness(df, low = None, high = None):
    if low == None and high == None:
        return df
    if low == None:
        return df[df['STRIKE'] / df['UNDERLYING_LAST'] <= high]
    if high == None:
        return df[df['STRIKE'] / df['UNDERLYING_LAST'] >= low]
    return df[(df['STRIKE'] / df['UNDERLYING_LAST'] >= low) & (df['STRIKE'] / df['UNDERLYING_LAST'] <= high)]

# For NN, features are: DTE, Underlying last, Delta, gamma, vega, theta, rho, IB

In [3]:
df = getAndProcessOneYearOptionData()
df

  if (await self.run_code(code, result,  async_=asy)):


Unnamed: 0,QUOTE_DATE,UNDERLYING_LAST,EXPIRE_DATE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,C_IV,...,C_ASK,STRIKE,STRIKE_DISTANCE,STRIKE_DISTANCE_PCT,RISK_FREE_RATE,15DayVol,30DayVol,60DayVol,YZVol,OPTION_PROFIT
13,2021-01-04,129.45,2021-01-08,4.00,0.99330,0.00182,0.00267,-0.01615,0.01475,0.70653,...,23.56,106.0,23.4,0.181,0.09,0.335185,0.290138,0.345166,0.282196,26.050003
16,2021-01-04,129.45,2021-01-08,4.00,0.99171,0.00236,0.00355,-0.01585,0.01479,0.61801,...,20.55,109.0,20.4,0.158,0.09,0.335185,0.290138,0.345166,0.282196,23.050003
17,2021-01-04,129.45,2021-01-08,4.00,0.99211,0.00254,0.00301,-0.01615,0.01530,0.58957,...,19.55,110.0,19.4,0.150,0.09,0.335185,0.290138,0.345166,0.282196,22.050003
18,2021-01-04,129.45,2021-01-08,4.00,0.98019,0.00493,0.00688,-0.04014,0.01468,0.65222,...,18.59,111.0,18.4,0.143,0.09,0.335185,0.290138,0.345166,0.282196,21.050003
19,2021-01-04,129.45,2021-01-08,4.00,0.98415,0.00440,0.00543,-0.02860,0.01464,0.58313,...,17.55,112.0,17.4,0.135,0.09,0.335185,0.290138,0.345166,0.282196,20.050003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19210,2021-12-31,177.58,2022-04-14,103.96,0.06265,0.00433,0.11726,-0.01649,0.02991,0.29953,...,0.76,230.0,52.4,0.295,0.06,0.293058,0.297586,0.243602,0.311005,0.000000
19211,2021-12-31,177.58,2022-04-14,103.96,0.05137,0.00364,0.10102,-0.01461,0.02492,0.30623,...,0.60,235.0,57.4,0.323,0.06,0.293058,0.297586,0.243602,0.311005,0.000000
19212,2021-12-31,177.58,2022-04-14,103.96,0.04452,0.00317,0.08905,-0.01318,0.02085,0.31495,...,0.53,240.0,62.4,0.351,0.06,0.293058,0.297586,0.243602,0.311005,0.000000
19213,2021-12-31,177.58,2022-04-14,103.96,0.03869,0.00268,0.08006,-0.01283,0.01835,0.32534,...,0.48,245.0,67.4,0.380,0.06,0.293058,0.297586,0.243602,0.311005,0.000000


In [4]:
df = filter_for_moneyness(df, 0.8, 1.2)

In [7]:
df.columns

Index(['QUOTE_DATE', 'UNDERLYING_LAST', 'EXPIRE_DATE', 'DTE', 'C_DELTA',
       'C_GAMMA', 'C_VEGA', 'C_THETA', 'C_RHO', 'C_IV', 'C_LAST', 'C_SIZE',
       'C_BID', 'C_ASK', 'STRIKE', 'STRIKE_DISTANCE', 'STRIKE_DISTANCE_PCT',
       'RISK_FREE_RATE', '15DayVol', '30DayVol', '60DayVol', 'YZVol',
       'OPTION_PROFIT'],
      dtype='object')

In [27]:
X = df.iloc[:, [1, 3, 17, 18, 19, 20, 21]].values
#  4, 5, 6, 7, 8, 9, 10, 13, 14, 
y = df.iloc[:, 13].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [28]:
X_a = df.iloc[:, [1, 3, 17, 18, 19, 20, 21]]
X_a

Unnamed: 0,UNDERLYING_LAST,DTE,RISK_FREE_RATE,15DayVol,30DayVol,60DayVol,YZVol
13,129.45,4.00,0.09,0.335185,0.290138,0.345166,0.282196
16,129.45,4.00,0.09,0.335185,0.290138,0.345166,0.282196
17,129.45,4.00,0.09,0.335185,0.290138,0.345166,0.282196
18,129.45,4.00,0.09,0.335185,0.290138,0.345166,0.282196
19,129.45,4.00,0.09,0.335185,0.290138,0.345166,0.282196
...,...,...,...,...,...,...,...
19202,177.58,103.96,0.06,0.293058,0.297586,0.243602,0.311005
19203,177.58,103.96,0.06,0.293058,0.297586,0.243602,0.311005
19204,177.58,103.96,0.06,0.293058,0.297586,0.243602,0.311005
19205,177.58,103.96,0.06,0.293058,0.297586,0.243602,0.311005


In [29]:
sc = MinMaxScaler()
X_train_sc = pd.DataFrame(sc.fit_transform(X_train))
X_test_sc = pd.DataFrame(sc.transform(X_test))

In [30]:
model = MLPRegressor(verbose = True, activation = 'relu')
model.fit(X_train_sc, y_train)


# YZ 100 iterations loss = 11.41, relu
# YZ 100 iterations loss = 13.9, logistic

Iteration 1, loss = 48.14518331
Iteration 2, loss = 34.87599505
Iteration 3, loss = 33.29690384
Iteration 4, loss = 32.90411729
Iteration 5, loss = 32.82908823
Iteration 6, loss = 32.80315023
Iteration 7, loss = 32.79679085
Iteration 8, loss = 32.79339361
Iteration 9, loss = 32.77997246
Iteration 10, loss = 32.76774757




MLPRegressor(verbose=True)

In [16]:
y_train_pred = model.predict(X_train_sc)
y_test_pred = model.predict(X_test_sc)
print(metrics.mean_squared_error(y_train, y_train_pred))
print(metrics.mean_squared_error(y_test, y_test_pred))

20.584004809543885
20.249190186813294


In [21]:
max(y_test)

71.13999899999999

In [20]:
max(y_test_pred)

63.02642172898726