In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics as metrics
from scikitplot.helpers import cumulative_gain_curve

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import BorderlineSMOTE
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, LeakyReLU
from keras.regularizers import l1, l2
from keras.callbacks import EarlyStopping

from keras.models import load_model

In [None]:
# Load the data set that is split into training, validation and test sets below.
df = pd.read_csv(filepath_or_buffer = 'full_nba_data.csv')

In [None]:
# Label column, kept as a one-element list so that frame[output] stays a DataFrame.
output = ['all-nba']

# Predictor columns: every column from position 10 up to, but not including, the last one.
# NOTE(review): presumably the first ten columns are identifiers/metadata and the last
# column is the label -- confirm against the CSV layout.
features = df.columns.to_numpy()[10:-1]

# Create training, validation, and testing sets

In [None]:
# Hold out 25% of the rows as the test set, stratified on the label column.
train, test = train_test_split(
    df,
    test_size = 0.25,
    stratify = df['all-nba'],
    random_state = 0,
)

# Separate predictors and label for each partition.
xtrain, ytrain = train[features], train[output]
xtest, ytest = test[features], test[output]

print("Training set size: %.0f" % len(xtrain))
print("Testing set size: %.0f" % len(xtest))

In [None]:
# Carve a validation set (33%) out of the remaining training rows, stratified on the label.
# The held-out frame is named `val` so that the `test` frame from the first split is not
# overwritten with validation rows.
# NOTE: this cell reassigns xtrain/ytrain; re-running it on its own splits the already
# reduced training set again, so re-run the train/test split cell first.
train, val = train_test_split(xtrain.join(ytrain), test_size = 0.33, stratify = ytrain['all-nba'], random_state = 0)

xtrain = train[features]
ytrain = train[output]

xval = val[features]
yval = val[output]

print("Training set size: %.0f" % len(xtrain))
# This is the validation partition, not the test partition, so label it accordingly.
print("Validation set size: %.0f" % len(xval))

# SMOTE and data splitting

In [None]:
# Oversample the training partition only; xval/xtest are not touched here.
# The resampler is given plain arrays: the feature matrix and a flattened label vector.
sm = BorderlineSMOTE(random_state = 0)
xtrain, ytrain = sm.fit_resample(
    xtrain.to_numpy(),
    ytrain.to_numpy().ravel(),
)

# Create model

In [None]:
# Hidden stack: five Dense -> LeakyReLU -> Dropout groups narrowing from 47 to 4 units.
# The three widest layers carry an L1 kernel penalty, the two narrowest an L2 penalty.
hidden_layers = [(47, l1), (32, l1), (16, l1), (8, l2), (4, l2)]

model = Sequential()

for units, penalty in hidden_layers:
    model.add(Dense(units, kernel_regularizer = penalty(0.01)))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(0.2, seed = 0))

# Output layer: a single sigmoid unit.
model.add(Dense(1, activation = 'sigmoid', kernel_regularizer = l2(0.01)))

In [None]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked per epoch.
model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy'],
)

In [None]:
# Train on the resampled training data and evaluate on the validation split after each epoch.
# No callbacks are passed, so training runs for all 200 epochs.
history = model.fit(
    xtrain,
    ytrain,
    validation_data = (xval, yval),
    batch_size = 32,
    epochs = 200,
    verbose = 2,
)

In [None]:
# Persist the model as trained by the fit call above.
model.save(filepath = 'full_model.h5')

In [None]:
plt.style.use('fivethirtyeight')
full_loss, ax = plt.subplots()

# One curve per recorded loss series: training first, then validation.
for key, label in (('loss', 'train'), ('val_loss', 'valid.')):
    ax.plot(history.history[key], label = label)

ax.set(xlabel = "Epoch #", ylabel = "Loss")

ax.legend(loc='best', prop={'size': 15, "family": "Rockwell"})

# Footer: a faint rule with the site URL underneath, placed below the axes.
full_loss.text(
    x = -0.05, y = -0.07,
    s = '_____________________________________________________________',
    color = 'grey', alpha = .3, fontsize = 14, horizontalalignment='left',
)
full_loss.text(
    x = -0.05, y = -.13,
    s = 'https://dribbleanalytics.blog                     ',
    color = 'grey', fontname = 'Rockwell', fontsize = 14, horizontalalignment='left',
)

full_loss.savefig('full_loss.png', dpi = 400, bbox_inches = 'tight')

In [None]:
# NOTE(review): this figure is drawn from the same `history` object as the 'full_loss.png'
# figure; it was presumably meant to follow a re-fit with EarlyStopping -- confirm.
plt.style.use('fivethirtyeight')
es_loss, ax = plt.subplots()

losses = history.history
ax.plot(losses['loss'], label = 'train')
ax.plot(losses['val_loss'], label = 'valid.')

ax.set_xlabel("Epoch #")
ax.set_ylabel("Loss")

ax.legend(prop={'size': 15, "family": "Rockwell"}, loc='best')

# Footer rule.
es_loss.text(
    -0.05, -0.07,
    '_____________________________________________________________',
    fontsize = 14, horizontalalignment='left', color = 'grey', alpha = .3,
)

# Footer URL.
es_loss.text(
    -0.05, -.13,
    'https://dribbleanalytics.blog                     ',
    fontsize = 14, fontname = 'Rockwell', horizontalalignment='left', color = 'grey',
)

es_loss.savefig('es_loss.png', bbox_inches = 'tight', dpi = 400)

In [None]:
# NOTE(review): no re-fit is visible between saving 'full_model.h5' and this call, so on a
# top-to-bottom run both files hold the same model -- confirm whether a fit with
# EarlyStopping was intended before this save.
model.save(filepath = 'es_model.h5')

In [None]:
# Report the accuracy recorded for the last epoch, i.e. the state of the model that was
# saved. Averaging over every epoch mixes in the early, barely-trained epochs and does not
# describe the final model.
print('Train accuracy: %.2f' % history.history['accuracy'][-1])
print('Validation accuracy: %.2f' % history.history['val_accuracy'][-1])

# Model evaluation

In [None]:
# Predicted probabilities for the held-out test set; each row holds one value.
y_prob = model.predict(xtest)

# Turn each probability into a hard 0/1 label with round(), then store as an int64 array.
rounded = list(map(lambda row: round(row[0]), y_prob))
y_pred = np.asarray(rounded, dtype = 'int64')

In [None]:
plt.style.use("fivethirtyeight")
cm_plot, ax = plt.subplots()

# Confusion matrix of the test-set labels against the thresholded predictions.
cm = metrics.confusion_matrix(ytest, y_pred)
sns.heatmap(cm, annot=True, fmt='g', linewidth = 2, ax = ax)

ax.set(xlabel = "Predicted", ylabel = "Actual")

cm_plot.suptitle("Model Confusion Matrix", x = .45, size = 18, weight = 'bold')

# Footer: faint rule plus the site URL below the axes.
cm_plot.text(
    -0.02, -0.08,
    '___________________________________________________________',
    color = 'grey', alpha = .3, fontsize = 14, horizontalalignment='left',
)
cm_plot.text(
    -0.02, -.14,
    'https://dribbleanalytics.blog                     ',
    color = 'grey', fontname = 'Rockwell', fontsize = 14, horizontalalignment='left',
)

cm_plot.savefig('cm.png', dpi = 400, bbox_inches = 'tight')

In [None]:
# Threshold-based metrics use the hard 0/1 predictions.
print("Accuracy score: %.3f" % metrics.accuracy_score(ytest, y_pred))
print("Recall: %.3f" % metrics.recall_score(ytest, y_pred))
print("Precision: %.3f" % metrics.precision_score(ytest, y_pred))
print("F1: %.3f" % metrics.f1_score(ytest, y_pred))

# Probability-based metrics: flatten the single-column prediction array so the scoring
# functions receive a plain 1-D vector of positive-class probabilities instead of relying
# on how a given scikit-learn version interprets single-column 2-D input.
prob_pos = np.ravel(y_prob)

print("Log loss: %.3f" % metrics.log_loss(ytest, prob_pos))
# Label spelling corrected: the metric is the Brier score.
print("Brier score: %.3f" % metrics.brier_score_loss(ytest, prob_pos))

In [None]:
# Plot the ROC curve and show the AUC-ROC in the axes title.

plt.style.use('fivethirtyeight')

roc, ax = plt.subplots()

# Curve points and area under the curve, both computed from the predicted probabilities.
fpr, tpr, _ = metrics.roc_curve(ytest, y_prob)
auc_value = metrics.roc_auc_score(ytest, y_prob)

ax.plot(fpr, tpr)
# Dashed diagonal for reference.
ax.plot([0, 1], [0, 1], linestyle = '--')

ax.set(xlabel = "False Positive Rate", ylabel = "True Positive Rate")
ax.set_title("AUC-ROC: %.2f" % auc_value, fontname = 'Rockwell', fontsize = 14)

roc.suptitle("ROC Curve", y = 1.007, size = 18, weight = 'bold')

# Footer: faint rule plus the site URL below the axes.
roc.text(
    -0.03, -0.08,
    '______________________________________________________________',
    color = 'grey', alpha = .3, fontsize = 14, horizontalalignment='left',
)
roc.text(
    -0.03, -.14,
    'https://dribbleanalytics.blog                     ',
    color = 'grey', fontname = 'Rockwell', fontsize = 14, horizontalalignment='left',
)

roc.savefig('roc.png', dpi = 400, bbox_inches = 'tight')

In [None]:
# These two functions are adapted directly from scikit-plot's own functions. I adapted them so that the plots can be
# customized more easily (e.g. moving the legend or changing the axis titles).
# The original code is found here: https://github.com/reiinakano/scikit-plot/blob/master/scikitplot/metrics.py

def plot_cumulative_gain(y_true, y_probas, ax = None):
    """Plot the cumulative gain curve of each class of a binary classifier.

    Parameters
    ----------
    y_true : array-like
        True labels; must contain exactly two distinct values.
    y_probas : array-like with two columns
        Predicted probabilities. Column 0 is paired with the smaller label value and
        column 1 with the larger one (labels are ordered by ``np.unique``).
    ax : matplotlib Axes, optional
        Axes to draw on. A new figure and axes are created when omitted.

    Returns
    -------
    The axes the curves were drawn on.

    Raises
    ------
    ValueError
        If ``y_true`` does not contain exactly two distinct classes.
    """
    y_true = np.array(y_true)
    y_probas = np.array(y_probas)

    classes = np.unique(y_true)
    # Validate before plotting: indexing classes[1] on single-class data would otherwise
    # fail with an uninformative IndexError.
    if len(classes) != 2:
        raise ValueError('Cannot calculate Cumulative Gains for data with '
                         '{} category/ies'.format(len(classes)))

    # Honour the documented default: create axes when the caller did not supply any.
    if ax is None:
        _, ax = plt.subplots()

    percentages, gains1 = cumulative_gain_curve(y_true, y_probas[:, 0],
                                                classes[0])
    percentages, gains2 = cumulative_gain_curve(y_true, y_probas[:, 1],
                                                classes[1])

    ax.plot(percentages, gains1, lw=3, label='Class {}'.format(classes[0]))
    ax.plot(percentages, gains2, lw=3, label='Class {}'.format(classes[1]))

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.01])

    # Diagonal reference line.
    ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Baseline')

    ax.grid(True)
    ax.legend(loc='best', prop={'size': 15, "family": "Rockwell"})

    return ax


def plot_lift_curve(y_true, y_probas, ax = None):
    """Plot the lift curve of each class of a binary classifier.

    Lift is the cumulative gain divided by the fraction of the sample considered.

    Parameters
    ----------
    y_true : array-like
        True labels; must contain exactly two distinct values.
    y_probas : array-like with two columns
        Predicted probabilities. Column 0 is paired with the smaller label value and
        column 1 with the larger one (labels are ordered by ``np.unique``).
    ax : matplotlib Axes, optional
        Axes to draw on. A new figure and axes are created when omitted.

    Returns
    -------
    The axes the curves were drawn on.

    Raises
    ------
    ValueError
        If ``y_true`` does not contain exactly two distinct classes.
    """
    y_true = np.array(y_true)
    y_probas = np.array(y_probas)

    classes = np.unique(y_true)
    # Validate before plotting: indexing classes[1] on single-class data would otherwise
    # fail with an uninformative IndexError.
    if len(classes) != 2:
        raise ValueError('Cannot calculate Lift Curve for data with '
                         '{} category/ies'.format(len(classes)))

    # Honour the documented default: create axes when the caller did not supply any.
    if ax is None:
        _, ax = plt.subplots()

    percentages, gains1 = cumulative_gain_curve(y_true, y_probas[:, 0],
                                                classes[0])
    percentages, gains2 = cumulative_gain_curve(y_true, y_probas[:, 1],
                                                classes[1])

    # Drop the first point of every series before dividing.
    # NOTE(review): presumably the curve starts at 0% of the sample, which would divide
    # by zero -- confirm against cumulative_gain_curve.
    percentages = percentages[1:]
    gains1 = gains1[1:]
    gains2 = gains2[1:]

    gains1 = gains1 / percentages
    gains2 = gains2 / percentages

    ax.plot(percentages, gains1, lw=3, label='Class {}'.format(classes[0]))
    ax.plot(percentages, gains2, lw=3, label='Class {}'.format(classes[1]))

    # Horizontal reference line at a lift of 1.
    ax.plot([0, 1], [1, 1], 'k--', lw=2, label='Baseline')

    ax.grid(True)
    ax.legend(loc='best', prop={'size': 15, "family": "Rockwell"})

    return ax

In [None]:
# Expand each single probability p into the two-column form [1 - p, p] that
# plot_cumulative_gain indexes by column.
dual_prob = [[1 - row[0], row[0]] for row in y_prob]

gain, ax = plt.subplots()

plot_cumulative_gain(ytest, dual_prob, ax = ax)

ax.set(xlabel = '% of sample', ylabel = 'Gain')

gain.suptitle("Model Cumulative Gain", size = 18, weight = 'bold')

# Footer: faint rule plus the site URL below the axes.
gain.text(
    -0.03, -0.08,
    '______________________________________________________________',
    color = 'grey', alpha = .3, fontsize = 14, horizontalalignment='left',
)
gain.text(
    -0.03, -.14,
    'https://dribbleanalytics.blog                     ',
    color = 'grey', fontname = 'Rockwell', fontsize = 14, horizontalalignment='left',
)

gain.savefig('gain.png', dpi = 400, bbox_inches = 'tight')

In [None]:
# Lift curve for the test set, drawn from the two-column probabilities in `dual_prob`.
lift, ax = plt.subplots()
plot_lift_curve(ytest, dual_prob, ax = ax)

ax.set(xlabel = '% of sample', ylabel = 'Lift')

lift.suptitle("Model Lift Curve", size = 18, weight = 'bold')

# Footer: faint rule plus the site URL below the axes.
lift.text(
    -0.03, -0.08,
    '______________________________________________________________',
    color = 'grey', alpha = .3, fontsize = 14, horizontalalignment='left',
)
lift.text(
    -0.03, -.14,
    'https://dribbleanalytics.blog                     ',
    color = 'grey', fontname = 'Rockwell', fontsize = 14, horizontalalignment='left',
)

lift.savefig('lift.png', dpi = 400, bbox_inches = 'tight')

# Make predictions

In [None]:
# Replace the in-memory model with the one stored in 'es_model.h5'.
model = load_model(filepath = 'es_model.h5')

In [None]:
# Data to score; the same `features` columns used for training are selected from it.
df_curr = pd.read_csv(filepath_or_buffer = 'current_data.csv')

x_pred = df_curr.loc[:, features]

In [None]:
# Pair every player with the single probability the model outputs for that row.
probs = [row[0] for row in model.predict(x_pred)]
df_pred = pd.DataFrame(
    zip(df_curr['player'], probs),
    columns = ['player', 'all-nba-prob'],
)

# Write the table ordered from highest to lowest probability, without the index column.
ranked = df_pred.sort_values(by = ['all-nba-prob'], ascending = False)
ranked.to_csv('results.csv', index = False)