In [54]:
import pandas as pd
import pickle
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
#reference that helped with this:
#https://stackabuse.com/implementing-lda-in-python-with-scikit-learn/

In [2]:
clean = pd.read_pickle("Cleaned_data.pkl")
del clean['key_name']
del clean['mode_name']
del clean['key_mode']
del clean['type']

In [44]:
wanted  = clean.columns[1:13] #remove track name and labels
#separate into features/labels
features = clean[wanted]
labels = clean['is_hit']
#split data 70-30 training/testing
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state= 420)  
#standardize the data
std = StandardScaler()  
X_train = std.fit_transform(X_train)  
X_test = std.transform(X_test)  
# lda = LDA()  
# X_train = lda.fit_transform(X_train, y_train)  
# X_test = lda.transform(X_test)  
# print(X_train)
# lda.coef_
#lda.explained_variance_ratio_

In [5]:
#Run linear SVM 10 times with different C to find out the best one
def runSVM(c):
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    svm.predict(X_test)
    return(svm.score(X_test, y_test))
for c in list(range(1,11)):
    print(runSVM(c=c)) #we can see from our result that the best C can be almost any C. We'll stick with 2 for later

0.886822850238
0.886822850238
0.886822850238
0.886822850238
0.886822850238
0.886822850238
0.886822850238
0.886822850238
0.886822850238
0.886822850238


In [60]:
#now try with polynomial features 
poly = PolynomialFeatures(degree = 3, include_bias = False)
poly_feat = poly.fit_transform(features)
#resplit into testing and training
X_trainP, X_testP, y_trainP, y_testP = train_test_split(poly_feat, labels, test_size=0.3, random_state= 420)  
#redo LinearSVM
svm = LinearSVC(C=2) #kept C = 2 from LinearSVC above
svm.fit(X_trainP, y_trainP)
poly_preds = svm.predict(X_testP)
svm.score(X_testP, y_testP) #accuracy went down a good amount, so polynomial features aren't efficient

0.76646304625092299

In [67]:
#use GridSearchCV() to optimize parameters for SGDClassifier
sgd = SGDClassifier(max_iter =  5, tol = None)
par = {'loss':('hinge', 'huber', 'modified_huber', 'log', 'epsilon_insensitive', 'squared_epsilon_insensitive'), 'alpha':[0.0001, 0.001], 'penalty':['l2', 'l1', 'elasticnet']}
gs = GridSearchCV(sgd, par, return_train_score = True)
gs.fit(X_train, y_train)
print(gs.best_params_)
gs.best_score_
#gs.fit()

{'penalty': 'l1', 'alpha': 0.0001, 'loss': 'hinge'}


0.88753991771915186

In [68]:
#run SGD with best parameters
sgd_best = SGDClassifier(loss = 'hinge', penalty = 'l1', alpha = 0.0001, max_iter = 10000)
sgd_best.fit(X_train, y_train)
sgdPreds = sgd_best.predict(X_test)
sgd_best.score(X_test, y_test)

0.88682285023830298

## we get basically the same accuracy as before, so we can assume 88.7% accuracy is pretty much as accurate as we can get. 

In [59]:
#try seeing auc scores
svm = LinearSVC(C=2)
svm.fit(X_train, y_train)
preds = svm.predict(X_test)
roc_auc_score(y_test, preds)

0.5

In [65]:
print(roc_auc_score(y_testP, poly_preds)) #polynomial performs slightly better from this measure
print(roc_auc_score(y_test, sgdPreds))

0.513893082915
0.5
