In [None]:
# Import necessary packages

%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn.neural_network import MLPClassifier
import sklearn.metrics as metrics
from sklearn.model_selection import KFold
from operator import itemgetter
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Load both CSV inputs from the working directory: the historical rookie seasons
# (split into train/test further down) and the 2017 rookie class (scored by the models).

dfHistorical, dfRookies = (
    pd.read_csv(csvPath) for csvPath in ('historical-rookies.csv', '2017-rookies.csv')
)

In [None]:
# Sanity check: display the first five rows of the historical dataset

dfHistorical.head(n = 5)

In [None]:
# Sanity check: display the first five rows of the 2017 rookies dataset

dfRookies.head(n = 5)

# Prepare rookie dataset for predictions

In [None]:
# Build the rookie feature matrix: keep only the columns the models take as inputs,
# in the same order used for the training features.

rookieFeatureCols = ['G', 'MPG', 'FG/G', 'FGA/G', '2P%', '3P%', 'FT%', 'TRB/G', 'AST/G', 'STL/G',
                     'BLK/G', 'TOV/G', 'PF/G', 'TS%', '3PAr', 'FTr']
rookieTest = dfRookies[rookieFeatureCols]

# Column at position 1 is printed next to each prediction later on
# (presumably the player name -- verify against the CSV header).
rookieNames = dfRookies.iloc[:, 1]

# Create data splits for HOF models

In [None]:
# 75/25 split of the historical data; the fixed seed keeps the partition repeatable.
train, test = train_test_split(dfHistorical, random_state = 0, test_size = 0.25)

# Both partitions use the same 16 predictor columns; the target is 'Hall of Fame'.
hofFeatureCols = ['G', 'MPG', 'FG/G', 'FGA/G', '2P%', '3P%', 'FT%', 'TRB/G', 'AST/G', 'STL/G',
                  'BLK/G', 'TOV/G', 'PF/G', 'TS%', '3PAr', 'FTr']
hofTargetCol = ['Hall of Fame']

# Targets stay single-column DataFrames; later cells flatten them with .values.ravel()
xtrain, ytrain = train[hofFeatureCols], train[hofTargetCol]
xtest, ytest = test[hofFeatureCols], test[hofTargetCol]

# Create models and confusion matrices for HOF models

In [None]:
# Fit an RBF-kernel SVC for the Hall of Fame target and score it on the held-out split.
# probability=True enables predict_proba; the probability calibration shuffles data
# internally, so random_state is pinned to make the log loss / ROC numbers repeat on
# every run (the random forest and MLP in this notebook are seeded as well).
svc = SVC(kernel='rbf', gamma=1e-4, C=10, probability = True, random_state = 0)
svc.fit(xtrain, ytrain.values.ravel())

# Hard class predictions on the test split (also used by the confusion-matrix cell)
y_svc = svc.predict(xtest)

print("Accuracy score: %.3f" % metrics.accuracy_score(ytest, y_svc))

# Per-class probabilities, one column per class in svc.classes_
proba = svc.predict_proba(xtest)
print("Log loss: %.3f" % metrics.log_loss(ytest, proba))

# Column 1 is the second class, treated here as the positive ("HOF") class
posProb = proba[:, 1]
print("Area under ROC curve: %.3f" % metrics.roc_auc_score(ytest, posProb))

# ROC points and AUC are kept for the combined ROC figure drawn later
fprSVC, tprSVC, thresholdSVC = metrics.roc_curve(ytest, posProb)
roc_aucSVC = metrics.auc(fprSVC, tprSVC)

# NOTE(review): this cross-validates on the test split only (clones of svc are refit on
# folds of xtest) -- confirm that is intended rather than the training split or full data.
cvScoreSVC = cross_val_score(svc, xtest, ytest.values.ravel(), cv = 3, scoring = 'accuracy')
print("Accuracy (cross validation score): %0.2f (+/- %0.2f)" % (cvScoreSVC.mean(), cvScoreSVC.std() * 2))

In [None]:
# Confusion matrix of the SVC Hall of Fame model on the test split, drawn as an
# annotated heatmap and written to 'svc-hof-cm.png'.
cm = metrics.confusion_matrix(ytest, y_svc)

plt.style.use("fivethirtyeight")
svcHofCM, ax = plt.subplots()

# fmt='d' prints the counts as plain integers; seaborn's default '.2g' switches to
# scientific notation (e.g. 1.2e+02) as soon as a cell count reaches three digits.
sns.heatmap(cm, annot=True, fmt='d', ax = ax, linewidth = 2)

# confusion_matrix puts true classes on rows (y axis) and predictions on columns (x axis)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")

# Tick order follows the sorted class values -- assumes 0 = not HOF, 1 = HOF; TODO confirm
labels = ["Not HOF", "HOF"]
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)

svcHofCM.suptitle("SVC Confusion Matrix", weight = 'bold', size = 18, y = 1.04, x = .45)
ax.set_title("http://dribbleanalytics.blogspot.com", size = 14, fontname = 'Rockwell', y = 1.02)

svcHofCM.savefig('svc-hof-cm.png', dpi = 400, bbox_inches = 'tight')

In [None]:
# Random forest for the Hall of Fame target: 100 gini trees, seeded for repeatability.
rf = RandomForestClassifier(n_estimators = 100, criterion = 'gini', random_state = 999)

# fit() returns the estimator itself, so rfPred is just a second handle on rf here
rfPred = rf.fit(xtrain, np.ravel(ytrain))
y_rf = rfPred.predict(xtest)

# Class probabilities; column 1 (second class) is treated as the positive class
proba = rf.predict_proba(xtest)
posProb = proba[:, 1]

# ROC points and AUC are kept for the combined ROC figure drawn later
fprRF, tprRF, thresholdRF = metrics.roc_curve(ytest, posProb)
roc_aucRF = metrics.auc(fprRF, tprRF)

# NOTE(review): cross-validation runs on the test split only -- confirm this is intended
cvScoreRF = cross_val_score(rf, xtest, np.ravel(ytest), scoring = 'accuracy', cv = 3)

print("Accuracy score: %.3f" % metrics.accuracy_score(ytest, y_rf))
print("Log loss: %.3f" % metrics.log_loss(ytest, proba))
print("Area under ROC curve: %.3f" % metrics.roc_auc_score(ytest, posProb))
print("Accuracy (cross validation score): %0.2f (+/- %0.2f)" % (cvScoreRF.mean(), cvScoreRF.std() * 2))

In [None]:
# Confusion matrix of the random forest Hall of Fame model on the test split, drawn as
# an annotated heatmap and written to 'rf-hof-cm.png'.
cm = metrics.confusion_matrix(ytest, y_rf)

plt.style.use("fivethirtyeight")
rfHofCM, ax = plt.subplots()

# fmt='d' prints the counts as plain integers; seaborn's default '.2g' switches to
# scientific notation (e.g. 1.2e+02) as soon as a cell count reaches three digits.
sns.heatmap(cm, annot=True, fmt='d', ax = ax, linewidth = 2)

# confusion_matrix puts true classes on rows (y axis) and predictions on columns (x axis)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")

# Tick order follows the sorted class values -- assumes 0 = not HOF, 1 = HOF; TODO confirm
labels = ["Not HOF", "HOF"]
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)

rfHofCM.suptitle("RF Confusion Matrix", weight = 'bold', size = 18, y = 1.04, x = .45)
ax.set_title("http://dribbleanalytics.blogspot.com", size = 14, fontname = 'Rockwell', y = 1.02)

rfHofCM.savefig('rf-hof-cm.png', dpi = 400, bbox_inches = 'tight')

In [None]:
# k-nearest-neighbours for the Hall of Fame target: 8 neighbours, equal vote weights.
knn = neighbors.KNeighborsClassifier(weights = 'uniform', n_neighbors = 8)
knn.fit(xtrain, np.ravel(ytrain))

y_knn = knn.predict(xtest)

# Class probabilities; column 1 (second class) is treated as the positive class
proba = knn.predict_proba(xtest)
posProb = proba[:, 1]

# ROC points and AUC are kept for the combined ROC figure drawn later
fprKNN, tprKNN, thresholdKNN = metrics.roc_curve(ytest, posProb)
roc_aucKNN = metrics.auc(fprKNN, tprKNN)

# NOTE(review): cross-validation runs on the test split only -- confirm this is intended
cvScoreKNN = cross_val_score(knn, xtest, np.ravel(ytest), scoring = 'accuracy', cv = 3)

print("Accuracy score: %.3f" % metrics.accuracy_score(ytest, y_knn))
print("Log loss: %.3f" % metrics.log_loss(ytest, proba))
print("Area under ROC curve: %.3f" % metrics.roc_auc_score(ytest, posProb))
print("Accuracy (cross validation score): %0.2f (+/- %0.2f)" % (cvScoreKNN.mean(), cvScoreKNN.std() * 2))

In [None]:
# Confusion matrix of the KNN Hall of Fame model on the test split, drawn as an
# annotated heatmap and written to 'knn-hof-cm.png'.
cm = metrics.confusion_matrix(ytest, y_knn)

plt.style.use("fivethirtyeight")
knnHofCM, ax = plt.subplots()

# fmt='d' prints the counts as plain integers; seaborn's default '.2g' switches to
# scientific notation (e.g. 1.2e+02) as soon as a cell count reaches three digits.
sns.heatmap(cm, annot=True, fmt='d', ax = ax, linewidth = 2)

# confusion_matrix puts true classes on rows (y axis) and predictions on columns (x axis)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")

# Tick order follows the sorted class values -- assumes 0 = not HOF, 1 = HOF; TODO confirm
labels = ["Not HOF", "HOF"]
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)

knnHofCM.suptitle("KNN Confusion Matrix", weight = 'bold', size = 18, y = 1.04, x = .45)
ax.set_title("http://dribbleanalytics.blogspot.com", size = 14, fontname = 'Rockwell', y = 1.02)

knnHofCM.savefig('knn-hof-cm.png', dpi = 400, bbox_inches = 'tight')

In [None]:
# MLP for the Hall of Fame target: hidden_layer_sizes=100, identity activation,
# L-BFGS solver, seeded so the fit is repeatable.
dnnSettings = dict(
    solver='lbfgs',
    hidden_layer_sizes=100,
    max_iter=10000,
    shuffle=False,
    random_state=0,
    activation='identity')
dnn = MLPClassifier(**dnnSettings)

dnn.fit(xtrain, np.ravel(ytrain))

y_dnn = dnn.predict(xtest)

# Class probabilities; column 1 (second class) is treated as the positive class
proba = dnn.predict_proba(xtest)
posProb = proba[:, 1]

# ROC points and AUC are kept for the combined ROC figure drawn later
fprDNN, tprDNN, thresholdDNN = metrics.roc_curve(ytest, posProb)
roc_aucDNN = metrics.auc(fprDNN, tprDNN)

# NOTE(review): cross-validation runs on the test split only -- confirm this is intended
cvScoreDNN = cross_val_score(dnn, xtest, np.ravel(ytest), scoring = 'accuracy', cv = 3)

print("Accuracy score: %.3f" % metrics.accuracy_score(ytest, y_dnn))
print("Log loss: %.3f" % metrics.log_loss(ytest, proba))
print("Area under ROC curve: %.3f" % metrics.roc_auc_score(ytest, posProb))
print("Accuracy (cross validation score): %0.2f (+/- %0.2f)" % (cvScoreDNN.mean(), cvScoreDNN.std() * 2))

In [None]:
# Confusion matrix of the MLP ("DNN") Hall of Fame model on the test split, drawn as an
# annotated heatmap and written to 'dnn-hof-cm.png'.
cm = metrics.confusion_matrix(ytest, y_dnn)

plt.style.use("fivethirtyeight")
dnnHofCM, ax = plt.subplots()

# fmt='d' prints the counts as plain integers; seaborn's default '.2g' switches to
# scientific notation (e.g. 1.2e+02) as soon as a cell count reaches three digits.
sns.heatmap(cm, annot=True, fmt='d', ax = ax, linewidth = 2)

# confusion_matrix puts true classes on rows (y axis) and predictions on columns (x axis)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")

# Tick order follows the sorted class values -- assumes 0 = not HOF, 1 = HOF; TODO confirm
labels = ["Not HOF", "HOF"]
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)

dnnHofCM.suptitle("DNN Confusion Matrix", weight = 'bold', size = 18, y = 1.04, x = .45)
ax.set_title("http://dribbleanalytics.blogspot.com", size = 14, fontname = 'Rockwell', y = 1.02)

dnnHofCM.savefig('dnn-hof-cm.png', dpi = 400, bbox_inches = 'tight')

# Randomized search CV for HOF models

In [None]:
# SVC

# Search space: 10 C values spread over [1, 100] (truncated to int), both kernels,
# and 10 gamma values spread linearly over [1e-5, 10].
C = np.linspace(1, 100, num = 10).astype(int).tolist()
kernel = ['rbf', 'linear']
gamma = np.linspace(1e-5, 10, num = 10).tolist()

random_grid = dict(C = C, kernel = kernel, gamma = gamma)

# 25 sampled candidates, each scored with 3-fold CV; the search is only configured
# here -- fitting happens in the next cell.
svc_random = RandomizedSearchCV(svc, random_grid, n_iter = 25, cv = 3,
                                verbose = 2, random_state = 42, n_jobs = 20)

In [None]:
# Run the randomized search on the training split, then compare the tuned model's
# test accuracy against the hand-configured SVC's.
svc_random.fit(xtrain, ytrain.values.ravel())

# predict() on the search object delegates to the best candidate it found
y_svcrand = svc_random.predict(xtest)

searchScore = metrics.accuracy_score(ytest, y_svcrand)
nonSearchScore = metrics.accuracy_score(ytest, y_svc)

# Relative accuracy change, scaled by 100 so the printed value really is a percentage
# (previously the raw fraction was printed under the "Percent" label).
improvement = 100 * (searchScore - nonSearchScore) / nonSearchScore

print("Percent improvement: %.5f" % improvement)

In [None]:
# Random forest

# Search space for the randomized search over random-forest hyper-parameters.

# Number of trees: 10 values spread over [10, 500], truncated to int
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 500, num = 10)]

# Features considered per split. 'auto' was dropped: for classifiers it was only an
# alias of 'sqrt' (so the old grid had one effective value) and recent scikit-learn
# releases reject it outright. 'log2' gives the search a genuinely different option.
max_features = ['sqrt', 'log2']

# Maximum tree depth: 10, 20, ..., 110, plus None (no depth limit)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

min_samples_split = [2, 5, 10]

min_samples_leaf = [1, 2, 4]

bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# 25 sampled candidates, 3-fold CV each; fitting happens in the next cell
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 25, cv = 3, 
                               verbose=2, random_state=42, n_jobs = 20)

In [None]:
# Run the randomized search on the training split, then compare the tuned model's
# test accuracy against the hand-configured random forest's.
rf_random.fit(xtrain, ytrain.values.ravel())

# predict() on the search object delegates to the best candidate it found
y_rfrand = rf_random.predict(xtest)

searchScore = metrics.accuracy_score(ytest, y_rfrand)
nonSearchScore = metrics.accuracy_score(ytest, y_rf)

# Relative accuracy change, scaled by 100 so the printed value really is a percentage
# (previously the raw fraction was printed under the "Percent" label).
improvement = 100 * (searchScore - nonSearchScore) / nonSearchScore

print("Percent improvement: %.5f" % improvement)

In [None]:
# KNN

# Search space: 10 neighbour counts spread over [1, 50] (truncated to int) and both
# vote-weighting schemes.
n_neighbors = [int(x) for x in np.linspace(1, 50, num = 10)]

weights = ['distance', 'uniform']

random_grid = {'n_neighbors': n_neighbors,
               'weights': weights}

# The grid is discrete and holds only len(n_neighbors) * len(weights) combinations, so
# asking for more iterations than that just triggers a scikit-learn warning before it
# falls back to the full grid. Cap n_iter explicitly so the candidate count is honest.
n_candidates = min(25, len(n_neighbors) * len(weights))

knn_random = RandomizedSearchCV(estimator = knn, param_distributions = random_grid, n_iter = n_candidates, cv = 3, 
                               verbose=2, random_state=42, n_jobs = 20)

In [None]:
# Run the randomized search on the training split, then compare the tuned model's
# test accuracy against the hand-configured KNN's.
knn_random.fit(xtrain, ytrain.values.ravel())

# predict() on the search object delegates to the best candidate it found
y_knnrand = knn_random.predict(xtest)

searchScore = metrics.accuracy_score(ytest, y_knnrand)
nonSearchScore = metrics.accuracy_score(ytest, y_knn)

# Relative accuracy change, scaled by 100 so the printed value really is a percentage
# (previously the raw fraction was printed under the "Percent" label).
improvement = 100 * (searchScore - nonSearchScore) / nonSearchScore

print("Percent improvement: %.5f" % improvement)

In [None]:
# DNN

# Search space for the MLP: hidden-layer width, activation function and solver.

# 10 widths spread over [10, 500], truncated to int (each is a single hidden layer)
hidden_layers = [int(x) for x in np.linspace(start = 10, stop = 500, num = 10)]

activation = ['identity', 'logistic', 'relu', 'tanh']

solver = ['lbfgs', 'adam', 'sgd']

# The key must match the estimator's constructor argument: MLPClassifier calls it
# 'hidden_layer_sizes'. The previous key 'hidden_layers' is not a parameter of the
# estimator, so the search failed with "Invalid parameter" as soon as it was fit.
random_grid = {'hidden_layer_sizes': hidden_layers,
               'activation': activation,
               'solver': solver}

# 25 sampled candidates, 3-fold CV each; fitting happens in the next cell
dnn_random = RandomizedSearchCV(estimator = dnn, param_distributions = random_grid, n_iter = 25, cv = 3, 
                               verbose=2, random_state=42, n_jobs = 20)

In [None]:
# Run the randomized search on the training split, then compare the tuned model's
# test accuracy against the hand-configured MLP's.
dnn_random.fit(xtrain, ytrain.values.ravel())

# predict() on the search object delegates to the best candidate it found
y_dnnrand = dnn_random.predict(xtest)

searchScore = metrics.accuracy_score(ytest, y_dnnrand)
nonSearchScore = metrics.accuracy_score(ytest, y_dnn)

# Relative accuracy change, scaled by 100 so the printed value really is a percentage
# (previously the raw fraction was printed under the "Percent" label).
improvement = 100 * (searchScore - nonSearchScore) / nonSearchScore

print("Percent improvement: %.5f" % improvement)

# ROC curves for HOF

In [None]:
# Combined ROC figure: one panel per classifier on a 2x2 grid with shared axes,
# each titled with the model name and its AUC, saved to 'roc-hof.png'.
plt.style.use('fivethirtyeight')

rocHOF, rocGrid = plt.subplots(2, 2, sharey = True, sharex = True)
(ax1, ax2), (ax3, ax4) = rocGrid

# End points of the dashed chance-level diagonal
diagonal = [0, 1]

# Only the first panel's lines carry labels, so the figure-level legend shows each
# line style exactly once.
ax1.plot(fprSVC, tprSVC, label = 'ROC curve')
ax1.plot(diagonal, diagonal, linestyle = '--', label = 'Reference line')
ax1.set_title("SVC: %.2f" % roc_aucSVC, size = 21, x = .485, ha = 'center')

unlabelledPanels = (
    (ax2, fprRF, tprRF, "RF: %.2f" % roc_aucRF),
    (ax3, fprKNN, tprKNN, "KNN: %.2f" % roc_aucKNN),
    (ax4, fprDNN, tprDNN, "DNN: %.2f" % roc_aucDNN),
)
for panel, fpr, tpr, panelTitle in unlabelledPanels:
    panel.plot(fpr, tpr)
    panel.plot(diagonal, diagonal, linestyle = '--')
    panel.set_title(panelTitle, size = 21, x = .485, ha = 'center')

# Shared legend and axis captions are placed in figure coordinates
rocHOF.legend(loc = (.25, .87), ncol=2, prop={'size': 12, "family": "Rockwell"})
rocHOF.text(-0.03, 0.5, "True positive rate", va='center', rotation='vertical', size = 18)
rocHOF.text(0.5, -0.04, "False positive rate", ha = 'center', size = 18)

rocHOF.suptitle("http://dribbleanalytics.blogspot.com", y = 1.13, fontname = 'Rockwell', size = 14)

rocHOF.savefig('roc-hof.png', dpi = 400, bbox_inches = 'tight')

# Predict rookies with HOF models

In [None]:
# Score the rookies with the current `svc` (the Hall of Fame model when the notebook is
# run top to bottom) and print one "<prediction> <name>" line per rookie.
svcPred = svc.predict(rookieTest)

for prediction, rookie in zip(svcPred, rookieNames):
    print(prediction, rookie)

In [None]:
# Score the rookies with the current `rf` (the Hall of Fame model when the notebook is
# run top to bottom) and print one "<prediction> <name>" line per rookie.
# Note: this rebinds rfPred, which earlier held the fitted estimator.
rfPred = rf.predict(rookieTest)

for prediction, rookie in zip(rfPred, rookieNames):
    print(prediction, rookie)

In [None]:
# Score the rookies with the current `knn` (the Hall of Fame model when the notebook is
# run top to bottom) and print one "<prediction> <name>" line per rookie.
knnPred = knn.predict(rookieTest)

for prediction, rookie in zip(knnPred, rookieNames):
    print(prediction, rookie)

In [None]:
# Score the rookies with the current `dnn` (the Hall of Fame model when the notebook is
# run top to bottom) and print one "<prediction> <name>" line per rookie.
dnnPred = dnn.predict(rookieTest)

for prediction, rookie in zip(dnnPred, rookieNames):
    print(prediction, rookie)

# Create data splits for All Star models

In [None]:
# Same 75/25 seeded split as for the HOF models; only the target column changes.
# This rebinds xtrain / ytrain / xtest / ytest, so every cell below works on the
# All Star target from here on.
train, test = train_test_split(dfHistorical, random_state = 0, test_size = 0.25)

allStarFeatureCols = ['G', 'MPG', 'FG/G', 'FGA/G', '2P%', '3P%', 'FT%', 'TRB/G', 'AST/G', 'STL/G',
                      'BLK/G', 'TOV/G', 'PF/G', 'TS%', '3PAr', 'FTr']
allStarTargetCol = ['All Star']

# Targets stay single-column DataFrames; later cells flatten them with .values.ravel()
xtrain, ytrain = train[allStarFeatureCols], train[allStarTargetCol]
xtest, ytest = test[allStarFeatureCols], test[allStarTargetCol]

# Create models and confusion matrices for All Star models

In [None]:
# Fit an RBF-kernel SVC for the All Star target and score it on the held-out split.
# probability=True enables predict_proba; the probability calibration shuffles data
# internally, so random_state is pinned to make the log loss / ROC numbers repeat on
# every run (the random forest and MLP in this notebook are seeded as well).
svc = SVC(kernel='rbf', gamma=1e-4, C=100, probability = True, random_state = 0)
svc.fit(xtrain, ytrain.values.ravel())

# Hard class predictions on the test split (also used by the confusion-matrix cell)
y_svc = svc.predict(xtest)

print("Accuracy score: %.3f" % metrics.accuracy_score(ytest, y_svc))

# Per-class probabilities, one column per class in svc.classes_
proba = svc.predict_proba(xtest)
print("Log loss: %.3f" % metrics.log_loss(ytest, proba))

# Column 1 is the second class, treated here as the positive ("All Star") class
posProb = proba[:, 1]
print("Area under ROC curve: %.3f" % metrics.roc_auc_score(ytest, posProb))

# ROC points and AUC are kept for the combined ROC figure drawn later
fprSVC, tprSVC, thresholdSVC = metrics.roc_curve(ytest, posProb)
roc_aucSVC = metrics.auc(fprSVC, tprSVC)

# NOTE(review): this cross-validates on the test split only (clones of svc are refit on
# folds of xtest) -- confirm that is intended rather than the training split or full data.
cvScoreSVC = cross_val_score(svc, xtest, ytest.values.ravel(), cv = 3, scoring = 'accuracy')
print("Accuracy (cross validation score): %0.2f (+/- %0.2f)" % (cvScoreSVC.mean(), cvScoreSVC.std() * 2))

In [None]:
# Confusion matrix of the SVC All Star model on the test split, drawn as an
# annotated heatmap and written to 'svc-hof-as.png'.
cm = metrics.confusion_matrix(ytest, y_svc)

plt.style.use("fivethirtyeight")
svcHofAS, ax = plt.subplots()

# fmt='d' prints the counts as plain integers; seaborn's default '.2g' switches to
# scientific notation (e.g. 1.2e+02) as soon as a cell count reaches three digits.
sns.heatmap(cm, annot=True, fmt='d', ax = ax, linewidth = 2)

# confusion_matrix puts true classes on rows (y axis) and predictions on columns (x axis)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")

# Tick order follows the sorted class values -- assumes 0 = not All Star, 1 = All Star; TODO confirm
labels = ["Not All Star", "All Star"]
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)

svcHofAS.suptitle("SVC Confusion Matrix", weight = 'bold', size = 18, y = 1.04, x = .45)
ax.set_title("http://dribbleanalytics.blogspot.com", size = 14, fontname = 'Rockwell', y = 1.02)

svcHofAS.savefig('svc-hof-as.png', dpi = 400, bbox_inches = 'tight')

In [None]:
# Random forest for the All Star target: 200 gini trees, seeded for repeatability.
rf = RandomForestClassifier(n_estimators = 200, criterion = 'gini', random_state = 999)

# fit() returns the estimator itself, so rfPred is just a second handle on rf here
rfPred = rf.fit(xtrain, np.ravel(ytrain))
y_rf = rfPred.predict(xtest)

# Class probabilities; column 1 (second class) is treated as the positive class
proba = rf.predict_proba(xtest)
posProb = proba[:, 1]

# ROC points and AUC are kept for the combined ROC figure drawn later
fprRF, tprRF, thresholdRF = metrics.roc_curve(ytest, posProb)
roc_aucRF = metrics.auc(fprRF, tprRF)

# NOTE(review): cross-validation runs on the test split only -- confirm this is intended
cvScoreRF = cross_val_score(rf, xtest, np.ravel(ytest), scoring = 'accuracy', cv = 3)

print("Accuracy score: %.3f" % metrics.accuracy_score(ytest, y_rf))
print("Log loss: %.3f" % metrics.log_loss(ytest, proba))
print("Area under ROC curve: %.3f" % metrics.roc_auc_score(ytest, posProb))
print("Accuracy (cross validation score): %0.2f (+/- %0.2f)" % (cvScoreRF.mean(), cvScoreRF.std() * 2))

In [None]:
# Confusion matrix of the random forest All Star model on the test split, drawn as
# an annotated heatmap and written to 'rf-hof-as.png'.
cm = metrics.confusion_matrix(ytest, y_rf)

plt.style.use("fivethirtyeight")
rfHofAS, ax = plt.subplots()

# fmt='d' prints the counts as plain integers; seaborn's default '.2g' switches to
# scientific notation (e.g. 1.2e+02) as soon as a cell count reaches three digits.
sns.heatmap(cm, annot=True, fmt='d', ax = ax, linewidth = 2)

# confusion_matrix puts true classes on rows (y axis) and predictions on columns (x axis)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")

# Tick order follows the sorted class values -- assumes 0 = not All Star, 1 = All Star; TODO confirm
labels = ["Not All Star", "All Star"]
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)

rfHofAS.suptitle("RF Confusion Matrix", weight = 'bold', size = 18, y = 1.04, x = .45)
ax.set_title("http://dribbleanalytics.blogspot.com", size = 14, fontname = 'Rockwell', y = 1.02)

rfHofAS.savefig('rf-hof-as.png', dpi = 400, bbox_inches = 'tight')

In [None]:
# k-nearest-neighbours for the All Star target: 7 neighbours, equal vote weights.
knn = neighbors.KNeighborsClassifier(weights = 'uniform', n_neighbors = 7)
knn.fit(xtrain, np.ravel(ytrain))

y_knn = knn.predict(xtest)

# Class probabilities; column 1 (second class) is treated as the positive class
proba = knn.predict_proba(xtest)
posProb = proba[:, 1]

# ROC points and AUC are kept for the combined ROC figure drawn later
fprKNN, tprKNN, thresholdKNN = metrics.roc_curve(ytest, posProb)
roc_aucKNN = metrics.auc(fprKNN, tprKNN)

# NOTE(review): cross-validation runs on the test split only -- confirm this is intended
cvScoreKNN = cross_val_score(knn, xtest, np.ravel(ytest), scoring = 'accuracy', cv = 3)

print("Accuracy score: %.3f" % metrics.accuracy_score(ytest, y_knn))
print("Log loss: %.3f" % metrics.log_loss(ytest, proba))
print("Area under ROC curve: %.3f" % metrics.roc_auc_score(ytest, posProb))
print("Accuracy (cross validation score): %0.2f (+/- %0.2f)" % (cvScoreKNN.mean(), cvScoreKNN.std() * 2))

In [None]:
# Confusion matrix of the KNN All Star model on the test split, drawn as an
# annotated heatmap and written to 'knn-hof-as.png'.
cm = metrics.confusion_matrix(ytest, y_knn)

plt.style.use("fivethirtyeight")
knnHofAS, ax = plt.subplots()

# fmt='d' prints the counts as plain integers; seaborn's default '.2g' switches to
# scientific notation (e.g. 1.2e+02) as soon as a cell count reaches three digits.
sns.heatmap(cm, annot=True, fmt='d', ax = ax, linewidth = 2)

# confusion_matrix puts true classes on rows (y axis) and predictions on columns (x axis)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")

# Tick order follows the sorted class values -- assumes 0 = not All Star, 1 = All Star; TODO confirm
labels = ["Not All Star", "All Star"]
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)

knnHofAS.suptitle("KNN Confusion Matrix", weight = 'bold', size = 18, y = 1.04, x = .45)
ax.set_title("http://dribbleanalytics.blogspot.com", size = 14, fontname = 'Rockwell', y = 1.02)

knnHofAS.savefig('knn-hof-as.png', dpi = 400, bbox_inches = 'tight')

In [None]:
# MLP for the All Star target: hidden_layer_sizes=100, identity activation,
# L-BFGS solver, seeded so the fit is repeatable.
dnnSettings = dict(
    solver='lbfgs',
    hidden_layer_sizes=100,
    max_iter=10000,
    shuffle=False,
    random_state=0,
    activation='identity')
dnn = MLPClassifier(**dnnSettings)

dnn.fit(xtrain, np.ravel(ytrain))

y_dnn = dnn.predict(xtest)

# Class probabilities; column 1 (second class) is treated as the positive class
proba = dnn.predict_proba(xtest)
posProb = proba[:, 1]

# ROC points and AUC are kept for the combined ROC figure drawn later
fprDNN, tprDNN, thresholdDNN = metrics.roc_curve(ytest, posProb)
roc_aucDNN = metrics.auc(fprDNN, tprDNN)

# NOTE(review): cross-validation runs on the test split only -- confirm this is intended
cvScoreDNN = cross_val_score(dnn, xtest, np.ravel(ytest), scoring = 'accuracy', cv = 3)

print("Accuracy score: %.3f" % metrics.accuracy_score(ytest, y_dnn))
print("Log loss: %.3f" % metrics.log_loss(ytest, proba))
print("Area under ROC curve: %.3f" % metrics.roc_auc_score(ytest, posProb))
print("Accuracy (cross validation score): %0.2f (+/- %0.2f)" % (cvScoreDNN.mean(), cvScoreDNN.std() * 2))

In [None]:
# Confusion matrix of the MLP ("DNN") All Star model on the test split, drawn as an
# annotated heatmap and written to 'dnn-hof-as.png'.
cm = metrics.confusion_matrix(ytest, y_dnn)

plt.style.use("fivethirtyeight")
dnnHofAS, ax = plt.subplots()

# fmt='d' prints the counts as plain integers; seaborn's default '.2g' switches to
# scientific notation (e.g. 1.2e+02) as soon as a cell count reaches three digits.
sns.heatmap(cm, annot=True, fmt='d', ax = ax, linewidth = 2)

# confusion_matrix puts true classes on rows (y axis) and predictions on columns (x axis)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")

# Tick order follows the sorted class values -- assumes 0 = not All Star, 1 = All Star; TODO confirm
labels = ["Not All Star", "All Star"]
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)

dnnHofAS.suptitle("DNN Confusion Matrix", weight = 'bold', size = 18, y = 1.04, x = .45)
ax.set_title("http://dribbleanalytics.blogspot.com", size = 14, fontname = 'Rockwell', y = 1.02)

dnnHofAS.savefig('dnn-hof-as.png', dpi = 400, bbox_inches = 'tight')

# Randomized search CV for All Star models

In [None]:
# SVC

# Search space: 10 C values spread over [1, 100] (truncated to int), both kernels,
# and 10 gamma values spread linearly over [1e-5, 10].
C = np.linspace(1, 100, num = 10).astype(int).tolist()
kernel = ['rbf', 'linear']
gamma = np.linspace(1e-5, 10, num = 10).tolist()

random_grid = dict(C = C, kernel = kernel, gamma = gamma)

# 25 sampled candidates, each scored with 3-fold CV; the search is only configured
# here -- fitting happens in the next cell.
svc_random = RandomizedSearchCV(svc, random_grid, n_iter = 25, cv = 3,
                                verbose = 2, random_state = 42, n_jobs = 20)

In [None]:
# Run the randomized search on the training split, then compare the tuned model's
# test accuracy against the hand-configured SVC's.
svc_random.fit(xtrain, ytrain.values.ravel())

# predict() on the search object delegates to the best candidate it found
y_svcrand = svc_random.predict(xtest)

searchScore = metrics.accuracy_score(ytest, y_svcrand)
nonSearchScore = metrics.accuracy_score(ytest, y_svc)

# Relative accuracy change, scaled by 100 so the printed value really is a percentage
# (previously the raw fraction was printed under the "Percent" label).
improvement = 100 * (searchScore - nonSearchScore) / nonSearchScore

print("Percent improvement: %.5f" % improvement)

In [None]:
# Random forest

# Search space for the randomized search over random-forest hyper-parameters.

# Number of trees: 10 values spread over [10, 500], truncated to int
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 500, num = 10)]

# Features considered per split. 'auto' was dropped: for classifiers it was only an
# alias of 'sqrt' (so the old grid had one effective value) and recent scikit-learn
# releases reject it outright. 'log2' gives the search a genuinely different option.
max_features = ['sqrt', 'log2']

# Maximum tree depth: 10, 20, ..., 110, plus None (no depth limit)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

min_samples_split = [2, 5, 10]

min_samples_leaf = [1, 2, 4]

bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# 25 sampled candidates, 3-fold CV each; fitting happens in the next cell
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 25, cv = 3, 
                               verbose=2, random_state=42, n_jobs = 20)

In [None]:
# Run the randomized search on the training split, then compare the tuned model's
# test accuracy against the hand-configured random forest's.
rf_random.fit(xtrain, ytrain.values.ravel())

# predict() on the search object delegates to the best candidate it found
y_rfrand = rf_random.predict(xtest)

searchScore = metrics.accuracy_score(ytest, y_rfrand)
nonSearchScore = metrics.accuracy_score(ytest, y_rf)

# Relative accuracy change, scaled by 100 so the printed value really is a percentage
# (previously the raw fraction was printed under the "Percent" label).
improvement = 100 * (searchScore - nonSearchScore) / nonSearchScore

print("Percent improvement: %.5f" % improvement)

In [None]:
# KNN

# Search space: 10 neighbour counts spread over [1, 50] (truncated to int) and both
# vote-weighting schemes.
n_neighbors = [int(x) for x in np.linspace(1, 50, num = 10)]

weights = ['distance', 'uniform']

random_grid = {'n_neighbors': n_neighbors,
               'weights': weights}

# The grid is discrete and holds only len(n_neighbors) * len(weights) combinations, so
# asking for more iterations than that just triggers a scikit-learn warning before it
# falls back to the full grid. Cap n_iter explicitly so the candidate count is honest.
n_candidates = min(25, len(n_neighbors) * len(weights))

knn_random = RandomizedSearchCV(estimator = knn, param_distributions = random_grid, n_iter = n_candidates, cv = 3, 
                               verbose=2, random_state=42, n_jobs = 20)

In [None]:
# Run the randomized search on the training split, then compare the tuned model's
# test accuracy against the hand-configured KNN's.
knn_random.fit(xtrain, ytrain.values.ravel())

# predict() on the search object delegates to the best candidate it found
y_knnrand = knn_random.predict(xtest)

searchScore = metrics.accuracy_score(ytest, y_knnrand)
nonSearchScore = metrics.accuracy_score(ytest, y_knn)

# Relative accuracy change, scaled by 100 so the printed value really is a percentage
# (previously the raw fraction was printed under the "Percent" label).
improvement = 100 * (searchScore - nonSearchScore) / nonSearchScore

print("Percent improvement: %.5f" % improvement)

In [None]:
# DNN

# Search space for the MLP: hidden-layer width, activation function and solver.

# 10 widths spread over [10, 500], truncated to int (each is a single hidden layer)
hidden_layers = [int(x) for x in np.linspace(start = 10, stop = 500, num = 10)]

activation = ['identity', 'logistic', 'relu', 'tanh']

solver = ['lbfgs', 'adam', 'sgd']

# The key must match the estimator's constructor argument: MLPClassifier calls it
# 'hidden_layer_sizes'. The previous key 'hidden_layers' is not a parameter of the
# estimator, so the search failed with "Invalid parameter" as soon as it was fit.
random_grid = {'hidden_layer_sizes': hidden_layers,
               'activation': activation,
               'solver': solver}

# 25 sampled candidates, 3-fold CV each; fitting happens in the next cell
dnn_random = RandomizedSearchCV(estimator = dnn, param_distributions = random_grid, n_iter = 25, cv = 3, 
                               verbose=2, random_state=42, n_jobs = 20)

In [None]:
# Run the randomized search on the training split, then compare the tuned model's
# test accuracy against the hand-configured MLP's.
dnn_random.fit(xtrain, ytrain.values.ravel())

# predict() on the search object delegates to the best candidate it found
y_dnnrand = dnn_random.predict(xtest)

searchScore = metrics.accuracy_score(ytest, y_dnnrand)
nonSearchScore = metrics.accuracy_score(ytest, y_dnn)

# Relative accuracy change, scaled by 100 so the printed value really is a percentage
# (previously the raw fraction was printed under the "Percent" label).
improvement = 100 * (searchScore - nonSearchScore) / nonSearchScore

print("Percent improvement: %.5f" % improvement)

# ROC curves for All Star

In [None]:
# Combined ROC figure: one panel per classifier on a 2x2 grid with shared axes,
# each titled with the model name and its AUC, saved to 'roc-as.png'.
plt.style.use('fivethirtyeight')

rocAS, rocGrid = plt.subplots(2, 2, sharey = True, sharex = True)
(ax1, ax2), (ax3, ax4) = rocGrid

# End points of the dashed chance-level diagonal
diagonal = [0, 1]

# Only the first panel's lines carry labels, so the figure-level legend shows each
# line style exactly once.
ax1.plot(fprSVC, tprSVC, label = 'ROC curve')
ax1.plot(diagonal, diagonal, linestyle = '--', label = 'Reference line')
ax1.set_title("SVC: %.2f" % roc_aucSVC, size = 21, x = .485, ha = 'center')

unlabelledPanels = (
    (ax2, fprRF, tprRF, "RF: %.2f" % roc_aucRF),
    (ax3, fprKNN, tprKNN, "KNN: %.2f" % roc_aucKNN),
    (ax4, fprDNN, tprDNN, "DNN: %.2f" % roc_aucDNN),
)
for panel, fpr, tpr, panelTitle in unlabelledPanels:
    panel.plot(fpr, tpr)
    panel.plot(diagonal, diagonal, linestyle = '--')
    panel.set_title(panelTitle, size = 21, x = .485, ha = 'center')

# Shared legend and axis captions are placed in figure coordinates
rocAS.legend(loc = (.25, .87), ncol=2, prop={'size': 12, "family": "Rockwell"})
rocAS.text(-0.03, 0.5, "True positive rate", va='center', rotation='vertical', size = 18)
rocAS.text(0.5, -0.04, "False positive rate", ha = 'center', size = 18)

rocAS.suptitle("http://dribbleanalytics.blogspot.com", y = 1.13, fontname = 'Rockwell', size = 14)

rocAS.savefig('roc-as.png', dpi = 400, bbox_inches = 'tight')

# Predict rookies with All Star models

In [None]:
# Score the rookies with the current `svc` (the All Star model when the notebook is
# run top to bottom) and print one "<prediction> <name>" line per rookie.
svcPred = svc.predict(rookieTest)

for prediction, rookie in zip(svcPred, rookieNames):
    print(prediction, rookie)

In [None]:
# Score the rookies with the current `rf` (the All Star model when the notebook is
# run top to bottom) and print one "<prediction> <name>" line per rookie.
# Note: this rebinds rfPred, which earlier held the fitted estimator.
rfPred = rf.predict(rookieTest)

for prediction, rookie in zip(rfPred, rookieNames):
    print(prediction, rookie)

In [None]:
# Score the rookies with the current `knn` (the All Star model when the notebook is
# run top to bottom) and print one "<prediction> <name>" line per rookie.
knnPred = knn.predict(rookieTest)

for prediction, rookie in zip(knnPred, rookieNames):
    print(prediction, rookie)

In [None]:
# Score the rookies with the current `dnn` (the All Star model when the notebook is
# run top to bottom) and print one "<prediction> <name>" line per rookie.
dnnPred = dnn.predict(rookieTest)

for prediction, rookie in zip(dnnPred, rookieNames):
    print(prediction, rookie)