In [3]:
def read_option_file(fileName):
    import pandas as pd
    import math

    csv = pd.read_csv(fileName,
                    names=['UnderlyingSymbol','UnderlyingPrice','Exchange',
                        'OptionSymbol','OptionExt','Type','Expiration',
                        'DataDate','Strike','Last','Bid','Ask','Volume',
                        'OpenInterest','IV','Delta','Gamma','Theta','Vega','AKA'])

    # Remove the exchange columns...
    csv = csv.drop(columns=['Exchange','OptionExt','AKA'])

    # Parse the date columns into datetimes...
    csv['Expiration'] = pd.to_datetime(csv['Expiration'])
    csv['DataDate'] = pd.to_datetime(csv['DataDate'])
    csv['TimeRemaining'] = (csv['Expiration'] - csv['DataDate']).dt.days

    # Tag the quality of the option
    bins = [0, 0.16, 0.32, 0.64, 1]
    moneynes = pd.cut(x=abs(csv.Delta), bins=bins, labels=['OTM','NTM','ATM','ITM'])
    csv['ITM'] = moneynes == 'ITM'
    csv['NTM'] = moneynes == 'NTM'
    csv['ATM'] = moneynes == 'ATM'
    csv['OTM'] = moneynes == 'OTM'


    # OneHotEncode the Type property
    csv['IsCall'] = csv['Type'] == 'call'
    csv['IsPut'] = csv['Type'] == 'put'
    csv = csv.drop(columns=['Type'])

    # Filter out no/bid records...
    csv = csv[(csv.Bid>0) & (csv.OpenInterest >0)]

    # Sort the values
    csv = csv.sort_values(by=['Expiration','DataDate','Strike'])
    return csv

#spy = read_option_file('SPY.csv')

In [19]:
def split_csv(csv, test_size):
    x_data = csv[['IV','Delta','Gamma','Theta','Vega',
              'TimeRemaining','IsCall','IsPut',
              'OTM','NTM','ATM','NTM']]
    midprice = (csv['Bid']+csv['Ask'])/2
    #ratio = csv['UnderlyingPrice']/csv['Strike']
    y_data = midprice

    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=test_size, random_state=42)
    
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    
    return x_train, x_test, y_train, y_test

for test_size in [0.1, 0.2, 0.4, 0.6, 0.8, 0.9]:
    # Split the datasets
    x_train, x_test, y_train, y_test = split_csv(msft, test_size)

    # Train the model
    from sklearn.tree import DecisionTreeRegressor
    clf = DecisionTreeRegressor().fit(x_train, y_train)
    print("test_size: %f - score: %f" % (test_size, clf.score(x_test, y_test)))
    

test_size: 0.100000 - score: 0.975440
test_size: 0.200000 - score: 0.975177
test_size: 0.400000 - score: 0.969589
test_size: 0.600000 - score: 0.961804
test_size: 0.800000 - score: 0.941840
test_size: 0.900000 - score: 0.921469


In [20]:
spy = msft

In [21]:
def split_csv(csv, test_size):
    x_data = csv[['IV','Delta','Gamma','Theta','Vega',
              'TimeRemaining','IsCall','IsPut']]
    
    # Calculate the y_predict
    midprice = (csv['Bid']+csv['Ask'])/2
    notional = csv['UnderlyingPrice'] * csv['Delta']
    
    y_data = midprice / csv['UnderlyingPrice']

    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=test_size, random_state=42)
    
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    
    return x_train, x_test, y_train, y_test


# Split the datasets
x_train, x_test, y_train, y_test = split_csv(spy, test_size=0.7)

# Train the model
from sklearn.tree import DecisionTreeRegressor
clf = DecisionTreeRegressor().fit(x_train, y_train)
clf.score(x_test, y_test)

0.9790419810101619

In [22]:
iwm = read_option_file("IWM.csv")

In [23]:
# Split the datasets
x_train, x_test, y_train, y_test = split_csv(iwm, test_size=0.7)

# Train the model
clf.score(x_test, y_test)

0.6500562379900882

In [37]:
# Split the datasets
tlt = read_option_file("TLT.csv")
x_train, x_test, y_train, y_test = split_csv(tlt, test_size=0.7)

# Train the model
clf.score(x_test, y_test)


-1.6790360458719482

In [38]:
# Split the datasets
qqq = read_option_file("QQQ.csv")
x_train, x_test, y_train, y_test = split_csv(qqq, test_size=0.3)

# Train the model
clf.score(x_test, y_test)


0.7603279172626035

In [4]:
spy = read_option_file('SPY.csv')
tlt = read_option_file('TLT.csv')

In [5]:
import pandas as pd
combined = pd.concat([spy, tlt], ignore_index=True)

In [6]:
def split_csv(csv, test_size):
    x_data = csv[['IV','Delta','Gamma','Theta','Vega',
              'TimeRemaining','IsCall','IsPut']]

    # Calculate the y_predict
    midprice = (csv['Bid']+csv['Ask'])/2
    notional = csv['UnderlyingPrice'] * csv['Delta']

    y_data = midprice / csv['UnderlyingPrice']

    from sklearn.model_selection import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=test_size, random_state=42)

    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    return x_train, x_test, y_train, y_test


# Split the datasets
x_train, x_test, y_train, y_test = split_csv(combined, test_size=0.3)

# Train the model
from sklearn.tree import DecisionTreeRegressor
clf = DecisionTreeRegressor().fit(x_train, y_train)
clf.score(x_test, y_test)

0.9873425309886735

In [10]:
qqq = read_option_file('QQQ.csv')
x_train, x_test, y_train, y_test = split_csv(qqq, test_size=0.3)

clf.score(x_test, y_test)

0.7854175346665832

In [8]:
clf.predict(x_test[0:20])

array([6.05331166e-03, 3.70008378e-02, 2.20804299e-01, 2.15965892e-02,
       1.70652956e-02, 5.66639281e-01, 1.45248909e-01, 2.63892127e-01,
       7.19356237e-01, 1.73838887e-01, 8.48155575e-03, 3.52503236e-02,
       4.07468605e-01, 2.39466718e-04, 1.41630747e-01, 1.39384129e-01,
       7.64072179e-03, 2.62914845e-02, 3.03163162e-02, 5.77320791e-02])

In [9]:
y_test[0:20]

2151851     0.006034
9643277     0.036646
409964      0.232573
4356278     0.021770
5887913     0.018521
5627783     0.566230
8658058     0.155650
2083183     0.255774
864762      0.730194
254230      0.161137
6693689     0.008808
2610968     0.034919
478432      0.414078
6673414     0.000053
2058647     0.134447
639922      0.145743
3787667     0.007138
6473848     0.027827
10131491    0.030549
3203165     0.058200
dtype: float64

In [11]:
baba = read_option_file('BABA.csv')
x_train, x_test, y_train, y_test = split_csv(baba, test_size=0.3)

clf.score(x_test, y_test)

0.6496992299420365