In [65]:
import pandas as pd
import math

csv = pd.read_csv('MSFT.csv',
                  names=['UnderlyingSymbol','UnderlyingPrice','Exchange',
                     'OptionSymbol','OptionExt','Type','Expiration',
                     'DataDate','Strike','Last','Bid','Ask','Volume',
                     'OpenInterest','IV','Delta','Gamma','Theta','Vega','AKA'])

# Remove the exchange columns...
csv = csv.drop(columns=['Exchange','OptionExt','AKA'])

# Parse the date columns into datetimes...
csv['Expiration'] = pd.to_datetime(csv['Expiration'])
csv['DataDate'] = pd.to_datetime(csv['DataDate'])
csv['TimeRemaining'] = (csv['Expiration'] - csv['DataDate']).dt.days

# Tag the quality of the option
bins = [0, 0.16, 0.32, 0.64, 1]
moneynes = pd.cut(x=abs(csv.Delta), bins=bins, labels=['OTM','NTM','ATM','ITM'])
csv['ITM'] = moneynes == 'ITM'
csv['NTM'] = moneynes == 'NTM'
csv['ATM'] = moneynes == 'ATM'
csv['OTM'] = moneynes == 'OTM'


# OneHotEncode the Type property
csv['IsCall'] = csv['Type'] == 'call'
csv['IsPut'] = csv['Type'] == 'put'
csv = csv.drop(columns=['Type'])

# Filter out no/bid records...
csv = csv[(csv.Bid>0) & (csv.OpenInterest >0)]

# Sort the values
csv = csv.sort_values(by=['Expiration','DataDate','Strike'])

In [66]:
csv

Unnamed: 0,UnderlyingSymbol,UnderlyingPrice,OptionSymbol,Expiration,DataDate,Strike,Last,Bid,Ask,Volume,...,Gamma,Theta,Vega,TimeRemaining,ITM,NTM,ATM,OTM,IsCall,IsPut
213,MSFT,60.65,MQF020216C00030000,2002-02-16,2002-02-08,30.0,29.30,30.40,30.80,15,...,0.0000,-0.5758,0.0000,8,True,False,False,False,True,False
215,MSFT,60.65,MQF020216C00035000,2002-02-16,2002-02-08,35.0,25.70,25.40,25.80,0,...,0.0000,-0.6717,0.0000,8,True,False,False,False,True,False
217,MSFT,60.65,MQF020216C00040000,2002-02-16,2002-02-08,40.0,29.90,20.40,20.80,0,...,0.0000,-0.7677,0.0000,8,True,False,False,False,True,False
219,MSFT,60.65,MQF020216C00045000,2002-02-16,2002-02-08,45.0,18.10,15.40,15.80,0,...,0.0000,-0.8637,0.0000,8,True,False,False,False,True,False
221,MSFT,60.65,MSQ020216C00050000,2002-02-16,2002-02-08,50.0,9.60,10.40,10.80,20,...,0.0000,-0.9677,0.0010,8,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2153013,MSFT,157.70,MSFT220121P00200000,2022-01-21,2019-12-31,200.0,47.60,45.00,50.00,0,...,0.0064,-2.8635,77.1793,752,True,False,False,False,False,True
2153006,MSFT,157.70,MSFT220121C00210000,2022-01-21,2019-12-31,210.0,5.72,5.60,6.75,17,...,0.0060,-3.8707,69.1413,752,False,True,False,False,True,False
2153007,MSFT,157.70,MSFT220121P00210000,2022-01-21,2019-12-31,210.0,55.55,53.00,57.80,0,...,0.0059,-2.3131,70.9763,752,True,False,False,False,False,True
2153009,MSFT,157.70,MSFT220121P00220000,2022-01-21,2019-12-31,220.0,64.65,61.50,66.50,2,...,0.0053,-1.8844,65.4867,752,True,False,False,False,False,True


In [67]:
x_data = csv[['UnderlyingPrice','Strike','IV',
              'Delta','Gamma','Theta','Vega',
              'TimeRemaining','IsCall','IsPut',
              'OTM','NTM','ATM','NTM']]

# Take the midprice between bid and ask
y_data = (csv['Bid']+csv['Ask'])/2

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=42)

In [108]:
# Apply GradientDecent
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

from sklearn.decomposition import PCA
pca = PCA(0.95)
pca = pca.fit(x_train)
x_train_pca = pca.transform(x_train_scaled)
x_test_pca = pca.transform(x_test_scaled)


from sklearn.linear_model import SGDRegressor
clf = SGDRegressor()
clf.fit(x_train_pca, y_train)
clf.score(x_test_pca, y_test)
# 51.8%



-8.101642435310715e+21

In [109]:
x_train_pca.shape

(1015582, 2)

In [105]:
y_test[0:10]

664777     0.72
903723     0.32
1042292    4.15
2009787    3.90
390709     1.62
2069916    0.05
1946184    0.12
829416     0.27
123523     1.73
1608226    7.35
Name: Bid, dtype: float64

In [71]:
from sklearn.decomposition import PCA
pca = PCA(0.95)
pca = pca.fit(x_train)
x_train_pca = pca.transform(x_train)
x_test_pca = pca.transform(x_test)

(x_train.shape, x_train_pca.shape)

((1015582, 14), (1015582, 2))

In [80]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

db = DBSCAN().fit(x_train_pca,y_train)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

Estimated number of clusters: 1857
Estimated number of noise points: 1001075


In [82]:
from sklearn.tree import DecisionTreeRegressor
clf = DecisionTreeRegressor().fit(x_train, y_train)
clf.score(x_test, y_test)
# 99.8%

0.9999999231557859

In [96]:
(clf.predict(x_test[1:10]), y_test[1:10])

(array([ 4.85,  9.6 , 14.3 ,  7.6 ,  0.05,  1.04,  4.65,  7.6 ,  1.63]),
 903723     0.32
 1042292    4.15
 2009787    3.90
 390709     1.62
 2069916    0.05
 1946184    0.12
 829416     0.27
 123523     1.73
 1608226    7.35
 Name: Bid, dtype: float64)

In [110]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

#clf = DecisionTreeRegressor().fit(x_train_scaled, y_train)
#clf.score(x_test_scaled, y_test)
#from sklearn.linear_model import Log
#clf = LogisticRegression(solver='lbfgs', ).fit(x_train_scaled, y_train)
#clf.score(x_test_scaled, y_test)

In [112]:
clf.score(x_test_scaled, y_test)

ValueError: shapes (435250,14) and (2,) not aligned: 14 (dim 1) != 2 (dim 0)