In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
#import statsmodels.formula.api as smf
#from sklearn.metrics import roc_curve
#from sklearn.metrics import auc

In [2]:
# Labelled training data: the option features plus the BS label column.
train_df = pd.read_csv("option_train.csv")
# Unlabelled test data (no BS column); the fitted model is meant to score these rows for the submission.
test_df = pd.read_csv("option_test_wolabel.csv")

In [3]:
# Preview the first five rows of the raw training data.
train_df.head()

Unnamed: 0,Value,S,K,tau,r,BS
0,21.670404,431.623898,420,0.34127,0.03013,Under
1,0.125,427.015526,465,0.166667,0.03126,Over
2,20.691244,427.762336,415,0.265873,0.03116,Under
3,1.035002,451.711658,460,0.063492,0.02972,Over
4,39.55302,446.718974,410,0.166667,0.02962,Under


In [4]:
# Preview the first five rows of the test data (same feature columns, no Value/BS).
test_df.head()

Unnamed: 0,S,K,tau,r
0,431.6186,460,0.293651,0.03147
1,432.633296,420,0.18254,0.03147
2,432.633296,430,0.18254,0.03147
3,431.6186,415,0.293651,0.03147
4,434.772855,420,0.043651,0.03147


## Data Prep

In [5]:
# Encode the BS label as 0 ('Under') / 1 ('Over').
# The encoded column is assigned back onto the frame instead of calling
# replace(..., inplace=True) on the selected Series: that chained in-place call
# does not update train_df when pandas copy-on-write is active.
# Values outside the mapping are passed through unchanged, so re-running this
# cell on already-encoded labels leaves them as they are.
label_codes = {'Under': 0, 'Over': 1}
train_df = (
    train_df
    .assign(BS=train_df['BS'].map(lambda label: label_codes.get(label, label)))
    # 'Value' is not used as a model input; errors='ignore' keeps the cell re-runnable.
    .drop(columns='Value', errors='ignore')
)
train_df.head()

Unnamed: 0,S,K,tau,r,BS
0,431.623898,420,0.34127,0.03013,0
1,427.015526,465,0.166667,0.03126,1
2,427.762336,415,0.265873,0.03116,0
3,451.711658,460,0.063492,0.02972,1
4,446.718974,410,0.166667,0.02962,0


In [6]:
# Pairwise correlation matrix of the feature columns and the encoded BS label.
train_df.corr()

Unnamed: 0,S,K,tau,r,BS
S,1.0,0.212302,-0.127183,-0.458171,-0.087638
K,0.212302,1.0,0.139045,-0.174267,0.750451
tau,-0.127183,0.139045,1.0,0.012809,0.097814
r,-0.458171,-0.174267,0.012809,1.0,-0.069523
BS,-0.087638,0.750451,0.097814,-0.069523,1.0


In [7]:
# Target vector: the BS label column.
y = train_df['BS']
# Feature matrix: every remaining column of the training frame.
X = train_df.drop(columns=['BS'])

In [8]:
# Check that X now holds only the feature columns.
X.head()

Unnamed: 0,S,K,tau,r
0,431.623898,420,0.34127,0.03013
1,427.015526,465,0.166667,0.03126
2,427.762336,415,0.265873,0.03116
3,451.711658,460,0.063492,0.02972
4,446.718974,410,0.166667,0.02962


In [9]:
# Work on a copy so the engineered columns are not added to X itself.
Xnew = X.copy()
col = list(Xnew.columns)

# For every unordered pair of original columns add their product and their ratio.
# Each column is paired only with the columns to its right, walked from right to
# left, which yields the column order S_times_r, S_times_tau, S_times_K,
# K_times_r, ... without mutating a list while iterating over it.
for pos, i in enumerate(col):
    for j in reversed(col[pos + 1:]):
        Xnew[i+'_times_'+j] = Xnew[i]*Xnew[j]
        Xnew[i+'_by_'+j] = Xnew[i]/Xnew[j]

# A ratio with a zero denominator comes out as +/-inf; mark those entries as
# missing instead. No imputation is done in this cell.
Xnew = Xnew.replace([np.inf, -np.inf], np.nan)

Xnew.head()

Unnamed: 0,S,K,tau,r,S_times_r,S_by_r,S_times_tau,S_by_tau,S_times_K,S_by_K,K_times_r,K_by_r,K_times_tau,K_by_tau,tau_times_r,tau_by_r
0,431.623898,420,0.34127,0.03013,13.004828,14325.386601,147.300219,1264.758401,181282.037286,1.027676,12.6546,13939.595088,143.333333,1230.697675,0.010282,11.32658
1,427.015526,465,0.166667,0.03126,13.348505,13660.125589,71.169254,2562.09315,198562.219543,0.918313,14.5359,14875.239923,77.5,2789.999994,0.00521,5.331627
2,427.762336,415,0.265873,0.03116,13.329074,13727.931207,113.730463,1608.897145,177521.369606,1.030753,12.9314,13318.356868,110.337302,1560.895522,0.008285,8.53251
3,451.711658,460,0.063492,0.02972,13.42487,15198.91177,28.680105,7114.458665,207787.362588,0.981982,13.6712,15477.792732,29.206349,7245.000056,0.001887,2.136341
4,446.718974,410,0.166667,0.02962,13.231816,15081.666928,74.453163,2680.313841,183154.779504,1.089558,12.1442,13841.99865,68.333333,2459.999995,0.004937,5.626829


In [10]:
# Add the difference between K and S as one more engineered column.
Xnew['K - S'] = Xnew['K'].sub(Xnew['S'])
Xnew.head()

Unnamed: 0,S,K,tau,r,S_times_r,S_by_r,S_times_tau,S_by_tau,S_times_K,S_by_K,K_times_r,K_by_r,K_times_tau,K_by_tau,tau_times_r,tau_by_r,K - S
0,431.623898,420,0.34127,0.03013,13.004828,14325.386601,147.300219,1264.758401,181282.037286,1.027676,12.6546,13939.595088,143.333333,1230.697675,0.010282,11.32658,-11.623898
1,427.015526,465,0.166667,0.03126,13.348505,13660.125589,71.169254,2562.09315,198562.219543,0.918313,14.5359,14875.239923,77.5,2789.999994,0.00521,5.331627,37.984474
2,427.762336,415,0.265873,0.03116,13.329074,13727.931207,113.730463,1608.897145,177521.369606,1.030753,12.9314,13318.356868,110.337302,1560.895522,0.008285,8.53251,-12.762336
3,451.711658,460,0.063492,0.02972,13.42487,15198.91177,28.680105,7114.458665,207787.362588,0.981982,13.6712,15477.792732,29.206349,7245.000056,0.001887,2.136341,8.288342
4,446.718974,410,0.166667,0.02962,13.231816,15081.666928,74.453163,2680.313841,183154.779504,1.089558,12.1442,13841.99865,68.333333,2459.999995,0.004937,5.626829,-36.718974


In [11]:
# Baseline feature set: the original columns plus the difference between K and S.
X['K - S'] = X['K'].sub(X['S'])
X.head()

Unnamed: 0,S,K,tau,r,K - S
0,431.623898,420,0.34127,0.03013,-11.623898
1,427.015526,465,0.166667,0.03126,37.984474
2,427.762336,415,0.265873,0.03116,-12.762336
3,451.711658,460,0.063492,0.02972,8.288342
4,446.718974,410,0.166667,0.02962,-36.718974


In [12]:
cols = Xnew.columns

# Rescale every engineered column to the [0, 1] range.
# The fitted scaler is kept in `feature_scaler` so that the same training
# min/max can later be applied to other data with feature_scaler.transform(...).
# To standardise instead, swap MinMaxScaler() for StandardScaler().
# NOTE(review): the scaler is fitted on all rows before cross-validation, so the
# scaling of each training fold has seen the held-out rows; putting the scaler in
# a Pipeline passed to cross_val_score would avoid that.
feature_scaler = MinMaxScaler()
Xnew = pd.DataFrame(
    feature_scaler.fit_transform(Xnew),
    columns=cols,
    index=Xnew.index,  # keep the row labels so Xnew stays aligned with y
)
Xnew.head()

Unnamed: 0,S,K,tau,r,S_times_r,S_by_r,S_times_tau,S_by_tau,S_times_K,S_by_K,K_times_r,K_by_r,K_times_tau,K_by_tau,tau_times_r,tau_by_r,K - S
0,0.202299,0.36,0.867347,0.261603,0.243026,0.498204,0.829308,0.001031,0.3366,0.527326,0.3041,0.420175,0.749947,0.001778,0.858454,0.876363,0.40333
1,0.050749,0.72,0.418367,0.738397,0.516604,0.153673,0.39564,0.012539,0.604273,0.10261,0.83022,0.600782,0.401345,0.015655,0.43009,0.407063,0.854009
2,0.075309,0.32,0.673469,0.696203,0.501137,0.188789,0.638083,0.004083,0.278347,0.539275,0.381509,0.300257,0.575226,0.004716,0.689737,0.657636,0.392988
3,0.862901,0.68,0.153061,0.088608,0.577393,0.950592,0.153608,0.05292,0.747172,0.349871,0.5884,0.717092,0.145619,0.055301,0.149465,0.156928,0.584228
4,0.698712,0.28,0.418367,0.046414,0.423716,0.889872,0.414346,0.013587,0.365609,0.76765,0.161362,0.401335,0.352805,0.012718,0.407007,0.430172,0.175348


In [17]:
# (rows, columns) of the engineered feature matrix.
Xnew.shape

(1680, 17)

In [13]:
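# Hold-out split kept for reference only; the models below are evaluated with stratified k-fold cross-validation instead.
# X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=1,stratify=y)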

In [14]:
# Shuffled, stratified 10-fold splitter; the fixed seed (1) keeps the fold assignment reproducible.
kfolds = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

## Models with all engineered features

In [22]:
# Candidate classifiers. The two tree ensembles are seeded so that repeated runs
# give the same scores; the fold splitter is already seeded.
lr = LogisticRegression(max_iter = 10000)
lda = LinearDiscriminantAnalysis()
rfc = RandomForestClassifier(n_estimators=150, max_depth=100, random_state=1)
gbc = GradientBoostingClassifier(n_estimators=500, learning_rate=0.5, max_depth=3, random_state=1)

classifiers = {
    "Logistic Regression": lr,
    "Linear Discriminant Analysis": lda,
    "Random Forest Classifier": rfc,
    "Gradient Boosting Classifier": gbc,
}

# Per-fold accuracy of every classifier on the min-max scaled, fully engineered features.
cv_accuracy = {
    label: cross_val_score(model, Xnew, y, cv=kfolds, scoring='accuracy')
    for label, model in classifiers.items()
}

# Despite the `error_` prefix these arrays hold per-fold accuracies; the original
# names are kept so that any other cell referring to them keeps working.
error_lr_cv, error_lda_cv, error_rfc_cv, error_gbc_cv = cv_accuracy.values()

# Report the fold accuracies and the mean classification error (1 - mean accuracy).
for position, (label, fold_acc) in enumerate(cv_accuracy.items()):
    if position:
        print('\n')  # blank separator between consecutive reports
    print(label + ": \n")
    print("accuracies of 10-folds:",fold_acc,"(mean classification error:",1-np.mean(fold_acc),')')

Logistic Regression: 

accuracies of 10-folds: [0.91666667 0.93452381 0.9047619  0.93452381 0.91666667 0.9047619
 0.94047619 0.91666667 0.89285714 0.91666667] (mean classification error: 0.08214285714285707 )


Linear Discriminant Analysis: 

accuracies of 10-folds: [0.92261905 0.94047619 0.9047619  0.94047619 0.91666667 0.91071429
 0.94047619 0.91666667 0.88095238 0.92857143] (mean classification error: 0.0797619047619047 )


Random Forest Classifier: 

accuracies of 10-folds: [0.93452381 0.95238095 0.94642857 0.94642857 0.95238095 0.92261905
 0.94642857 0.92857143 0.9047619  0.94047619] (mean classification error: 0.0625 )


Gradient Boosting Classifier: 

accuracies of 10-folds: [0.95238095 0.95833333 0.93452381 0.96428571 0.92857143 0.93452381
 0.94642857 0.92857143 0.91666667 0.94047619] (mean classification error: 0.059523809523809645 )


In [None]:
# Hyper-parameter grid for gradient boosting: 4 x 4 x 4 = 64 combinations,
# each evaluated with the shared cross-validation splitter `kfolds`.
params = {'n_estimators':[400,500,600,700],'learning_rate':[0.2,0.3,0.4,0.5],
          'max_depth':[2,3,4,5]}

# The booster is seeded so the search result is repeatable, the metric is stated
# explicitly (accuracy, as in the model comparison cells), and all CPU cores are
# used because the search fits hundreds of models.
gbc = GradientBoostingClassifier(random_state=1)
clf = GridSearchCV(gbc, params, cv=kfolds, scoring='accuracy', n_jobs=-1)
clf.fit(Xnew,y)

In [None]:
# Mean cross-validated score of the best parameter combination found by the grid search.
clf.best_score_

In [None]:
# Estimator configured with the best parameter combination found by the grid search.
clf.best_estimator_

## Models with only the (K - S) feature added to the original features

In [30]:
# Candidate classifiers. The two tree ensembles are seeded so that repeated runs
# give the same scores; the fold splitter is already seeded.
lr = LogisticRegression(max_iter = 10000)
lda = LinearDiscriminantAnalysis()
rfc = RandomForestClassifier(n_estimators=150, max_depth=100, random_state=1)
gbc = GradientBoostingClassifier(n_estimators=500, learning_rate=0.5, max_depth=3, random_state=1)

classifiers = {
    "Logistic Regression": lr,
    "Linear Discriminant Analysis": lda,
    "Random Forest Classifier": rfc,
    "Gradient Boosting Classifier": gbc,
}

# Per-fold accuracy of every classifier on X (original columns plus 'K - S').
# NOTE(review): in the cells shown, X is never passed through a scaler whereas
# Xnew is min-max scaled, so the logistic-regression figures of the two sections
# differ in scaling as well as in features - confirm that this is intended.
cv_accuracy = {
    label: cross_val_score(model, X, y, cv=kfolds, scoring='accuracy')
    for label, model in classifiers.items()
}

# Despite the `error_` prefix these arrays hold per-fold accuracies; the original
# names are kept so that any other cell referring to them keeps working.
error_lr_cv, error_lda_cv, error_rfc_cv, error_gbc_cv = cv_accuracy.values()

# Report the fold accuracies and the mean classification error (1 - mean accuracy).
for position, (label, fold_acc) in enumerate(cv_accuracy.items()):
    if position:
        print('\n')  # blank separator between consecutive reports
    print(label + ": \n")
    print("accuracies of 10-folds:",fold_acc,"(mean classification error:",1-np.mean(fold_acc),')')

Logistic Regression: 

accuracies of 10-folds: [0.91666667 0.94047619 0.88095238 0.92261905 0.89285714 0.89880952
 0.93452381 0.9047619  0.875      0.92261905] (mean classification error: 0.09107142857142847 )


Linear Discriminant Analysis: 

accuracies of 10-folds: [0.9047619  0.93452381 0.89285714 0.92857143 0.9047619  0.9047619
 0.93452381 0.91071429 0.89880952 0.91071429] (mean classification error: 0.0874999999999998 )


Random Forest Classifier: 

accuracies of 10-folds: [0.93452381 0.94642857 0.93452381 0.94047619 0.94047619 0.92857143
 0.94642857 0.94047619 0.9047619  0.94047619] (mean classification error: 0.06428571428571439 )


Gradient Boosting Classifier: 

accuracies of 10-folds: [0.95833333 0.95833333 0.93452381 0.95238095 0.93452381 0.92261905
 0.94642857 0.92857143 0.91071429 0.92857143] (mean classification error: 0.06249999999999978 )
