In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
from operator import itemgetter
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
import shap

In [None]:
# Historical draft data: used below for the box plots and to train/evaluate the classifiers.
df_historical = pd.read_csv('historical-draft-stats.csv')
# Players to score with the fitted models in the "Prediction" section.
df_current = pd.read_csv('draft-predict.csv')

In [None]:
# Preview the first rows of the historical data.
df_historical.head()

In [None]:
# List the available columns; the model features and the 'All-Star' label are picked from these below.
df_historical.columns

# Visualizing All-Star selections

In [None]:
# Partition the historical players by the binary 'All-Star' label for the box plots below.
all_star = df_historical[df_historical['All-Star'].eq(1)]
non_all_star = df_historical[df_historical['All-Star'].eq(0)]

In [None]:
def boxplot(stat, title_stat, file_name):
    """Draw and save a box plot of one statistic for All-Stars vs. non-All-Stars.

    Reads the module-level ``all_star`` and ``non_all_star`` frames, drops
    missing values of ``stat`` from each, and saves the figure as
    ``<file_name>.png``.

    Parameters
    ----------
    stat : str
        Column to plot.
    title_stat : str
        Name of the statistic as it should appear in the figure title.
    file_name : str
        Output file name without the ``.png`` extension.
    """
    plt.style.use('fivethirtyeight')

    fig, ax = plt.subplots()

    # First sample is the All-Star group, second the non-All-Star group.
    samples = [frame[stat].dropna() for frame in (all_star, non_all_star)]
    bp = ax.boxplot(samples, patch_artist = True)

    # Box bodies: black outline, then the fill (kept as two calls so the fill is applied last).
    for patch in bp['boxes']:
        patch.set(color = 'black', linewidth = 2)
        patch.set(facecolor = 'C0')

    # Whiskers and caps share the same line styling.
    for line in bp['whiskers'] + bp['caps']:
        line.set(color='black', linewidth=2)

    for median_line in bp['medians']:
        median_line.set(color='yellow', linewidth=2.5)

    ax.set_xticklabels(['All-Stars', 'Not All-Stars'])

    fig.suptitle("%s boxplot for top-10 picks" % title_stat, weight = 'bold', size = 18)

    # Footer: a rule line with the blog URL underneath.
    footer_style = dict(fontsize = 14, color = 'grey', horizontalalignment='left')
    fig.text(x = 0.02, y = 0,
        s = '____________________________________________________________',
        **footer_style)
    fig.text(x = 0.02, y = -.06,
        s = 'https://dribbleanalytics.blog                     ',
        fontname = 'Rockwell', **footer_style)

    fig.savefig('%s.png' % file_name, dpi = 400, bbox_inches = 'tight')

In [None]:
# 'PTS' column, titled "PPG"; saved as ppg.png.
boxplot('PTS', 'PPG', 'ppg')

In [None]:
# 'TRB' column, titled "REB"; saved as trb.png.
boxplot('TRB', 'REB', 'trb')

In [None]:
# 'AST' column; saved as ast.png.
boxplot('AST', 'AST', 'ast')

In [None]:
# 'SOS' column (presumably strength of schedule — confirm against the data source); saved as sos.png.
boxplot('SOS', 'SOS', 'sos')

In [None]:
# 'Pick' column (draft position); saved as pick.png.
boxplot('Pick', 'Pick', 'pick')

# Create models

In [None]:
# Model inputs and the binary target column.
features = ['Pick', 'PTS', 'TRB', 'AST', 'STL', 'BLK', 'TS%', '3PAr', 'FTr', 'SOS']
output = ['All-Star']

# Keep only players with every feature present.
# Note: this rebinds df_historical, so every later cell sees the filtered frame.
df_historical = df_historical.dropna(subset = features)

# 75/25 split with a fixed seed so the split is the same on every run.
train, test = train_test_split(df_historical, random_state = 0, test_size = 0.25)

xtrain, ytrain = train[features], train[output]
xtest, ytest = test[features], test[output]

print("Training set size: %.0f" % len(xtrain))
print("Testing set size: %.0f" % len(xtest))

In [None]:
# Share of All-Stars (label == 1) among the labelled rows of each split.
# Computed on the 'All-Star' column so the format operator receives a scalar, not a one-element Series.
print("All-Star percentage in training set: %.2f" % ((ytrain['All-Star'] == 1).sum() / ytrain['All-Star'].count()))
print("All-Star percentage in testing set: %.2f" % ((ytest['All-Star'] == 1).sum() / ytest['All-Star'].count()))

In [None]:
def scores(model):
    """Fit ``model`` on the training split and print its metrics on the test split.

    Uses the module-level ``xtrain``/``ytrain``/``xtest``/``ytest``. Prints
    accuracy, recall, precision, F1, log loss and ROC AUC, followed by a
    3-fold cross-validated accuracy.

    Parameters
    ----------
    model : estimator
        Must provide ``fit``, ``predict`` and ``predict_proba``.

    Returns
    -------
    The class predictions for ``xtest``.
    """
    model.fit(xtrain, ytrain.values.ravel())
    y_pred = model.predict(xtest)

    print("Accuracy score: %.3f" % metrics.accuracy_score(ytest, y_pred))
    print("Recall: %.3f" % metrics.recall_score(ytest, y_pred))
    print("Precision: %.3f" % metrics.precision_score(ytest, y_pred))
    print("F1: %.3f" % metrics.f1_score(ytest, y_pred))

    class_probs = model.predict_proba(xtest)
    print("Log loss: %.3f" % metrics.log_loss(ytest, class_probs))

    # Column 1 holds the probability of the positive (All-Star) class.
    pos_prob = class_probs[:, 1]
    print("Area under ROC curve: %.3f" % metrics.roc_auc_score(ytest, pos_prob))

    # NOTE(review): the cross-validation below runs on the *test* split (xtest/ytest),
    # not the training split — confirm this is intended.
    cv_accuracy = cross_val_score(model, xtest, ytest.values.ravel(), cv = 3, scoring = 'accuracy')
    print("Accuracy (cross validation score): %0.3f (+/- %0.3f)" % (cv_accuracy.mean(), cv_accuracy.std() * 2))

    return y_pred

In [None]:
# Shared 3-fold stratified splitter for every grid search.
# random_state is omitted: without shuffle=True it has no effect on the folds
# (they are already deterministic), and scikit-learn >= 0.24 raises a ValueError
# when it is passed together with shuffle=False.
cv = StratifiedKFold(n_splits = 3)

def grid_search(model, grid):
    """Run an exhaustive grid search for ``model`` and report the result.

    The search is handed to ``scores``, which fits it on the training split and
    prints the test metrics of the refit best estimator; the winning
    parameter combination is printed afterwards.

    Parameters
    ----------
    model : estimator
        Base estimator to tune.
    grid : dict
        Mapping of parameter name to the list of values to try.
    """
    # The `iid` argument was removed from GridSearchCV in scikit-learn 0.24;
    # plain per-fold averaging (the former iid=False) is the only behaviour left.
    clf = GridSearchCV(model, grid, cv = cv, n_jobs = -1, verbose = 2)
    scores(clf)

    print(clf.best_params_)

In [None]:
# Logistic regression with the liblinear solver and otherwise default settings.
log = LogisticRegression(solver = 'liblinear')

# Fit, print the metrics, and keep the test-set predictions for the confusion matrix below.
y_log = scores(log)

In [None]:
# Search space for the logistic regression: 20 values of C spread evenly over
# [1, 50] and truncated to integers, both penalties, liblinear solver only.
C = list(map(int, np.linspace(start = 1, stop = 50, num = 20)))
penalty = ['l1', 'l2']
solver = ['liblinear']

grid = dict(C = C, penalty = penalty, solver = solver)

In [None]:
# Tune the logistic regression over the grid defined above.
grid_search(log, grid)

In [None]:
# Initial RBF-kernel SVM; probability=True is required because scores() calls predict_proba.
svc = SVC(kernel = 'rbf', gamma = 1e-2, C = 10, probability = True)

y_svc = scores(svc)

In [None]:
# Search space for the SVM.
# gamma: 1e-4 ... 1e1 and C: 1e-2 ... 1e2, one value per decade.
# `endpoint` is a boolean flag in np.logspace (include the stop value), not the
# stop value itself; the previous endpoint=10 / endpoint=100 only worked because
# they are truthy. The default (True) is used instead.
gamma = list(np.logspace(-4, 1, num = 6))
C = list(np.logspace(-2, 2, num = 5))
kernel = ['rbf', 'sigmoid', 'linear']
# scores() needs predict_proba, so probability estimates must stay enabled.
probability = [True]

grid = {'gamma': gamma,
        'C': C,
        'kernel': kernel,
        'probability': probability}

In [None]:
# Tune the SVM over the grid defined above.
grid_search(svc, grid)

In [None]:
# Final SVM, replacing the RBF model above (presumably the grid-search winner — confirm).
# The coefficient plot further down reads svc.coef_, so the kernel here needs to stay linear.
svc = SVC(C = .1, gamma = .0001, kernel = 'linear', probability = True)

y_svc = scores(svc)

In [None]:
# Initial random forest: 50 trees, fixed seed for repeatable results.
rf = RandomForestClassifier(random_state = 99, n_estimators = 50)

y_rf = scores(rf)

In [None]:
# Search space for the random forest.
# Depths 10..100 in steps of 10 and 25..250 trees in steps of 25 (the same values
# the former np.linspace calls produced, written as exact integer ranges).
max_depth = list(range(10, 101, 10))
# 'auto' was only an alias of 'sqrt' for classifiers and has been removed from
# scikit-learn (deprecated 1.1, removed 1.3); listing both also doubled the search.
max_features = ['sqrt']
n_estimators = list(range(25, 251, 25))
random_state = [0]

grid = {'max_depth': max_depth,
        'max_features': max_features,
        'n_estimators': n_estimators,
        'random_state': random_state}

In [None]:
# Tune the random forest over the grid defined above.
grid_search(rf, grid)

In [None]:
# Final random forest with the hyperparameters from an earlier grid search.
# max_features='sqrt' is what the removed 'auto' alias meant for classifiers,
# so the model is the same and also runs on current scikit-learn.
rf = RandomForestClassifier(max_depth = 10, max_features = 'sqrt', n_estimators = 125, random_state = 0)

y_rf = scores(rf)

The hyperparameters for the random forest above and for the final gradient booster below were found using grid search CV with no random_state on the stratified k-fold, so we use the hyperparameters that the grid search found. However, to keep results consistent, I later assigned a random_state to the k-fold, which means the grid search in this notebook will technically not reproduce those hyperparameters. They were nevertheless obtained from that earlier grid search.

In [None]:
# Initial gradient boosting classifier with default settings (no random_state is set here).
gbc = GradientBoostingClassifier()

y_gbc = scores(gbc)

In [None]:
# Search space for the gradient boosting classifier.
# The loss is no longer searched: 'deviance' was the only candidate, it is the
# estimator's default loss, and the name was removed in scikit-learn 1.3
# (renamed 'log_loss'), so leaving it out keeps the same model on every version.
max_depth = list(range(10, 101, 10))
# 'auto' was an alias of 'sqrt' for classifiers and has been removed as well.
max_features = ['sqrt']
n_estimators = list(range(25, 251, 25))
random_state = [66]

grid = {'max_depth': max_depth,
        'max_features': max_features,
        'n_estimators': n_estimators,
        'random_state': random_state}

In [None]:
# Tune the gradient boosting classifier over the grid defined above.
grid_search(gbc, grid)

In [None]:
# Final gradient boosting classifier with the hyperparameters from an earlier grid search.
# loss='deviance' is dropped: it is the default loss, and the name was removed in
# scikit-learn 1.3 (renamed 'log_loss'), so omitting it gives the same model everywhere.
gbc = GradientBoostingClassifier(max_depth = 30, max_features = 'sqrt', n_estimators = 200, random_state = 66)

y_gbc = scores(gbc)

# Dummy Classifier

In [None]:
# Baseline for comparison: a dummy classifier using the "stratified" strategy with a fixed seed,
# scored with the same scores() routine as the real models.
dummy = DummyClassifier(strategy="stratified", random_state = 99)
y_dummy = scores(dummy)

# Create confusion matrices

In [None]:
def confusion_matrix(y_pred, model_name):
    """Plot and save the confusion matrix of ``y_pred`` against the test labels.

    Compares against the module-level ``ytest`` and writes the heatmap to
    ``<model_name>_cm.png``.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels for the test split (as returned by ``scores``).
    model_name : str
        Short model tag; upper-cased in the title and used in the file name.
    """
    counts = metrics.confusion_matrix(ytest, y_pred)

    plt.style.use("fivethirtyeight")
    fig, ax = plt.subplots()

    # fmt='g' prints the cell counts as plain numbers.
    sns.heatmap(counts, annot=True, ax = ax, linewidth = 2, fmt='g')

    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

    fig.suptitle("%s Confusion Matrix" % model_name.upper(), weight = 'bold', size = 18, x = .45)

    # Footer: a rule line with the blog URL underneath.
    # NOTE(review): this URL differs from the 'https://dribbleanalytics.blog' footer
    # used by the other figures in this notebook — confirm which one is wanted.
    footer_style = dict(fontsize = 14, color = 'grey', horizontalalignment='left')
    fig.text(x = 0, y = -0.08,
        s = '__________________________________________________________',
        **footer_style)
    fig.text(x = 0, y = -.14,
        s = 'dribbleanalytics.blogspot.com                     ',
        fontname = 'Rockwell', **footer_style)

    fig.savefig('%s_cm.png' % model_name, dpi = 400, bbox_inches = 'tight')

In [None]:
# Logistic regression; saved as log_cm.png.
confusion_matrix(y_log, 'log')

In [None]:
# Support vector classifier; saved as svc_cm.png.
confusion_matrix(y_svc, 'svc')

In [None]:
# Random forest; saved as rf_cm.png.
confusion_matrix(y_rf, 'rf')

In [None]:
# Gradient boosting classifier; saved as gbc_cm.png.
confusion_matrix(y_gbc, 'gbc')

# Create ROC curve

In [None]:
def roc_curve(model):
    """Compute the ROC curve of an already fitted ``model`` on the test split.

    Uses the module-level ``xtest``/``ytest``.

    Returns
    -------
    tuple
        ``(fpr, tpr, pos_prob)``: false-positive rates, true-positive rates and
        the predicted positive-class probabilities the curve was built from.
    """
    # Column 1 of predict_proba is the positive-class probability.
    pos_prob = model.predict_proba(xtest)[:, 1]

    # The thresholds returned by sklearn are not needed by the callers.
    fpr, tpr, _ = metrics.roc_curve(ytest, pos_prob)

    return (fpr, tpr, pos_prob)

In [None]:
plt.style.use('fivethirtyeight')

# 2x2 grid of ROC curves with shared axes, one panel per fitted model.
roc, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharey = True, sharex = True)

# (panel, fitted model, title format); the test-set AUC is filled into the title.
panels = ((ax1, log, "LOG: %.2f"),
          (ax2, svc, "SVC: %.2f"),
          (ax3, rf, "RF: %.2f"),
          (ax4, gbc, "GBC: %.2f"))

for panel, fitted, title_fmt in panels:
    fpr, tpr, pos_prob = roc_curve(fitted)
    panel.plot(fpr, tpr)
    # Dashed diagonal from (0, 0) to (1, 1) as the reference line.
    panel.plot([0, 1], [0, 1], linestyle = '--')
    panel.set_title(title_fmt % metrics.roc_auc_score(ytest, pos_prob), size = 15, x = .485, ha = 'center')

# Shared axis labels, placed on the figure rather than on each panel.
roc.text(-0.03, 0.5, "True positive rate", va='center', rotation='vertical', size = 18)
roc.text(0.5, -0.045, "False positive rate", ha = 'center', size = 18)

roc.suptitle("Model ROC Curves", y = 1.045, weight = 'bold', size = 18)

# Footer: a rule line with the blog URL underneath.
roc.text(x = -0.03, y = -0.08,
        s = '______________________________________________________________',
        fontsize = 14, color = 'grey', horizontalalignment='left')

roc.text(x = -0.03, y = -.14,
    s = 'https://dribbleanalytics.blog                     ',
    fontsize = 14, fontname = 'Rockwell', color = 'grey', horizontalalignment='left')

roc.savefig('roc.png', dpi = 400, bbox_inches = 'tight')

# Feature importance/coefficients

In [None]:
plt.style.use('fivethirtyeight')

coef, ax = plt.subplots()

# One row per feature: coefficients of the logistic regression and of the SVC.
x1, x2 = log.coef_[0], svc.coef_[0]
y = np.arange(len(x1))

ax.scatter(x1, y, label = 'LOG')
ax.scatter(x2, y, label = 'SVC')
# Faint vertical line at zero to separate negative from positive coefficients.
ax.axvline(x = 0, c = 'black', alpha = .3)

# Label each row with its feature name.
ticks = features
ax.set_yticks(y)
ax.set_yticklabels(ticks)

ax.set_xlabel('Coefficient')

coef.suptitle("Model Coefficients", y = .95, weight = 'bold', size = 18)
ax.legend(loc = 'best')

# Footer: a rule line with the blog URL underneath.
coef.text(x = 0, y = -0.08,
        s = '___________________________________________________________',
        fontsize = 14, color = 'grey', horizontalalignment='left')

coef.text(x = 0, y = -.14,
    s = 'https://dribbleanalytics.blog                     ',
    fontsize = 14, fontname = 'Rockwell', color = 'grey', horizontalalignment='left')

coef.savefig('coef.png', dpi = 400, bbox_inches = 'tight')

In [None]:
plt.style.use('fivethirtyeight')

fi, ax = plt.subplots()

# One row per feature: importances from the random forest and the gradient booster.
x1, x2 = rf.feature_importances_, gbc.feature_importances_
y = np.arange(len(x1))

ax.scatter(x1, y, label = 'RF')
ax.scatter(x2, y, label = 'GBC')
# Reference line at 0.1, annotated below as the average feature importance
# (ten features are used).
ax.axvline(x = 0.1, c = 'C2', alpha = .7)

# Label each row with its feature name.
ticks = features
ax.set_yticks(y)
ax.set_yticklabels(ticks)

ax.set_xlabel('Feature Importance')

fi.suptitle("Model Feature Importance", y = .95, weight = 'bold', size = 18)
ax.legend(loc = 'best')

# Annotation for the reference line; positioned by hand in figure coordinates.
fi.text(x = .4, y = .6,
        s = 'Average\nfeature importance',
        fontsize = 14, color = 'C2', rotation = 90, horizontalalignment = 'center')

# Footer: a rule line with the blog URL underneath.
fi.text(x = 0, y = -0.08,
        s = '___________________________________________________________',
        fontsize = 14, color = 'grey', horizontalalignment='left')

fi.text(x = 0, y = -.14,
    s = 'https://dribbleanalytics.blog                     ',
    fontsize = 14, fontname = 'Rockwell', color = 'grey', horizontalalignment='left')

fi.savefig('fi.png', dpi = 400, bbox_inches = 'tight')

# SHAP and model explanations

In [None]:
# Load SHAP's JavaScript support for the notebook.
shap.initjs()
# Reset matplotlib to its defaults, then re-apply the style used by the other figures.
plt.rcParams.update(plt.rcParamsDefault)
plt.style.use('fivethirtyeight')

In [None]:
# Summarise the training features with shap.kmeans (k = 5) to use as KernelExplainer background data.
k_sample = shap.kmeans(xtrain, 5)
# use shap.kmeans to allow KernelExplainer to run on fewer data points - recommended for speed

# Explain the logistic regression through its predict method (class labels, not predict_proba),
# evaluated on every training row.
explainer = shap.KernelExplainer(log.predict, k_sample)
shap_values = explainer.shap_values(xtrain)

shap.summary_plot(shap_values, xtrain)

In [None]:
# Bar-chart version of the summary for the logistic regression SHAP values computed in the previous cell.
shap.summary_plot(shap_values, xtrain, plot_type = 'bar')

In [None]:
# Same KernelExplainer setup for the SVC, reusing the k-means background sample.
# shap_values is overwritten, so the plots below refer to the SVC from here on.
explainer = shap.KernelExplainer(svc.predict, k_sample)
shap_values = explainer.shap_values(xtrain)

shap.summary_plot(shap_values, xtrain)

In [None]:
# Bar-chart version of the summary for the SVC SHAP values computed in the previous cell.
shap.summary_plot(shap_values, xtrain, plot_type = 'bar')

In [None]:
# Tree-based explainer for the random forest; no background sample is passed here.
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(xtrain)

# NOTE(review): index 1 presumably selects the positive (All-Star) class; the shape
# returned by shap_values depends on the installed shap version — confirm.
shap.summary_plot(shap_values[1], xtrain)

In [None]:
# Bar-chart version of the summary for the random forest SHAP values (same [1] slice as above).
shap.summary_plot(shap_values[1], xtrain, plot_type = 'bar')

In [None]:
# Tree-based explainer for the gradient boosting classifier.
explainer = shap.TreeExplainer(gbc)
shap_values = explainer.shap_values(xtrain)

# Unlike the random forest cell, the result is plotted directly without a class index.
shap.summary_plot(shap_values, xtrain)

In [None]:
# Bar-chart version of the summary for the gradient boosting SHAP values computed in the previous cell.
shap.summary_plot(shap_values, xtrain, plot_type = 'bar')

# Prediction

In [None]:
# First column of the prediction file, used as the label for each row
# (presumably the player name — confirm against draft-predict.csv).
df_current_names = df_current.iloc[:, 0]
# Same feature columns, in the same order, as the models were trained on.
df_current_predict = df_current[features]

df_current.head()

In [None]:
def make_pred(model, sort):
    """Pair every current player with ``model``'s positive-class probability.

    Uses the module-level ``df_current_predict`` (features) and
    ``df_current_names`` (labels).

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict_proba``.
    sort : bool
        If truthy, the pairs are ordered by probability (highest first) and
        printed one per line; otherwise they stay in input order and nothing
        is printed.

    Returns
    -------
    list
        ``[name, probability]`` pairs.
    """
    # Column 1 of predict_proba is the positive-class probability.
    pos_prob = model.predict_proba(df_current_predict)[:, 1]

    combined_list = [[name, prob] for name, prob in zip(df_current_names, pos_prob)]
    if not sort:
        return combined_list

    combined_list.sort(key = itemgetter(1), reverse = True)
    for pair in combined_list:
        print(pair)

    return combined_list

In [None]:
def pred_graph(pred_list, model_name, text_lim):
    """Draw and save a bar chart of predicted All-Star probabilities.

    Parameters
    ----------
    pred_list : list
        ``[name, probability]`` pairs, plotted left to right in the given order.
    model_name : str
        Short model tag; upper-cased in the title and used in the file name
        ``<model_name>-predict.png``.
    text_lim : number
        Bars whose left edge lies beyond this x position get their label just
        above the bar; the remaining bars get it at the base of the bar.
    """
    fig, ax = plt.subplots()

    labels = [pair[0] for pair in pred_list]
    y = [pair[1] for pair in pred_list]
    x = np.arange(len(y))

    ax.bar(x, y, color = 'C2', edgecolor = 'white', linewidth = 2.5)

    # Names are written on the bars, so the x axis itself is hidden.
    ax.xaxis.set_visible(False)

    for rect, label in zip(ax.patches, labels):
        above_bar = rect.get_x() > text_lim
        text_y = rect.get_height() + .02 if above_bar else .02
        ax.text(rect.get_x() + rect.get_width() / 1.75, text_y, label,
        ha='center', va='bottom', rotation = 'vertical', color = 'black')

    ax.set_ylabel('All-Star probability')
    # Relabel the existing y ticks as whole percentages.
    vals = ax.get_yticks()
    ax.set_yticklabels(['{:,.0%}'.format(tick) for tick in vals])

    fig.suptitle("%s predictions" % model_name.upper(), size = 18, weight = 'bold', y = .95)

    # Footer: a rule line with the blog URL underneath.
    footer_style = dict(fontsize = 14, color = 'grey', horizontalalignment='left')
    fig.text(x = -0.05, y = 0.01,
        s = '______________________________________________________________',
        **footer_style)
    fig.text(x = -0.05, y = -.05,
        s = 'https://dribbleanalytics.blog                     ',
        fontname = 'Rockwell', **footer_style)

    fig.savefig('%s-predict.png' % model_name, dpi = 400, bbox_inches = 'tight')

In [None]:
# Logistic regression probabilities, sorted highest first and printed.
log_prob = make_pred(log, True)

In [None]:
# text_lim = 2: bars whose left edge is at x <= 2 are labelled at their base, the rest above the bar.
pred_graph(log_prob, 'log', 2)

In [None]:
# SVC probabilities, sorted highest first and printed.
svc_prob = make_pred(svc, True)

In [None]:
# text_lim = 2: bars whose left edge is at x <= 2 are labelled at their base, the rest above the bar.
pred_graph(svc_prob, 'svc', 2)

In [None]:
# Random forest probabilities, sorted highest first and printed.
rf_prob = make_pred(rf, True)

In [None]:
# text_lim = 5: bars whose left edge is at x <= 5 are labelled at their base, the rest above the bar.
pred_graph(rf_prob, 'rf', 5)

In [None]:
# Gradient boosting probabilities, sorted highest first and printed.
gbc_prob = make_pred(gbc, True)

In [None]:
# text_lim = 3: bars whose left edge is at x <= 3 are labelled at their base, the rest above the bar.
pred_graph(gbc_prob, 'gbc', 3)

In [None]:
# Unsorted predictions keep every model's rows in the same (input) order,
# so they can be averaged position by position.
per_model = [make_pred(fitted, False) for fitted in (log, svc, rf, gbc)]

# Mean of the four models' positive-class probabilities for each player.
avg_prob = [(i[1] + j[1] + k[1] + l[1]) / 4 for i, j, k, l in zip(*per_model)]

# Pair each average with its player, highest probability first.
avg_list = sorted(([name, prob] for name, prob in zip(df_current_names, avg_prob)),
                  key = itemgetter(1), reverse = True)

for i in avg_list:
    print(i)

In [None]:
# Chart of the four-model average; text_lim = 4 puts the label at the base for bars whose left edge is at x <= 4.
pred_graph(avg_list, 'avg', 4)

In [None]:
# Historical All-Star rate for each draft slot 1..10:
# exp_star[k] is the share of All-Stars among players taken with pick k + 1.
exp_star = []

for pick in range(1, 11):
    labels_at_pick = df_historical['All-Star'].loc[df_historical['Pick'] == pick]
    exp_star.append(labels_at_pick.sum() / labels_at_pick.count())

In [None]:
# Difference between each player's ensemble probability and the historical
# All-Star rate of a draft slot.
#
# The ensemble average is rebuilt from scratch here. Previously this cell
# appended to the already-filled avg_prob (so the new values were never used
# and the list grew on every re-run) and averaged svc twice instead of
# including gbc.
per_model = [make_pred(fitted, False) for fitted in (log, svc, rf, gbc)]
avg_prob = [(i[1] + j[1] + k[1] + l[1]) / 4 for i, j, k, l in zip(*per_model)]

# Unsorted on purpose: rows must stay in df_current order for the pairing below.
avg_list = [[name, prob] for name, prob in zip(df_current_names, avg_prob)]

# exp_star is ordered by pick 1..10 and is paired with the players by position.
# NOTE(review): this assumes the rows of df_current are ordered by pick 1..10 — confirm.
diff_star = [[name, prob - expected]
             for (name, prob), expected in zip(avg_list, exp_star)]

In [None]:
# Horizontal bar chart of diff_star in its current (input) order.
fig, ax = plt.subplots()

labels = [entry[0] for entry in diff_star]
y = [entry[1] for entry in diff_star]

x = np.arange(len(y))

# Negative differences are drawn in 'C1', all others in 'C3'.
colors = ['C1' if value < 0 else 'C3' for value in y]

ax.barh(x, y, color = colors, edgecolor = 'white', linewidth = 2.5)

# Names are written next to the bars, so the y axis itself is hidden.
ax.yaxis.set_visible(False)

# Put each name on the opposite side of the bar's starting edge from the bar.
for rect, label in zip(ax.patches, labels):
    offset, align = (.01, 'left') if rect.get_width() < 0 else (-.01, 'right')
    ax.text(rect.get_x() + offset, rect.get_y() + .75, label,
    ha=align, va='bottom', rotation = 'horizontal', color = 'black')

# Relabel the existing x ticks as whole percentages.
vals = ax.get_xticks()
ax.set_xticklabels(['{:,.0%}'.format(tick) for tick in vals])

# First entry at the top.
ax.invert_yaxis()
ax.grid(alpha = .5)

fig.suptitle("All-Star probability above\nAll-Star percent at player pick #", size = 18, weight = 'bold', y = 1.03)

# Footer: a rule line with the blog URL underneath.
fig.text(x = .05, y = -0.01,
    s = '_______________________________________________________',
    fontsize = 14, color = 'grey', horizontalalignment='left')

fig.text(x = .05, y = -.07,
    s = 'https://dribbleanalytics.blog                     ',
    fontsize = 14, fontname = 'Rockwell', color = 'grey', horizontalalignment='left')

fig.savefig('prob-above-average.png', dpi = 400, bbox_inches = 'tight')

In [None]:
# Same chart as the previous cell, with the players ordered by difference (largest first).
# Note: this rebinds diff_star to the sorted list.
diff_star = sorted(diff_star, key = itemgetter(1), reverse = True)

fig, ax = plt.subplots()

labels = [entry[0] for entry in diff_star]
y = [entry[1] for entry in diff_star]

x = np.arange(len(y))

# Negative differences are drawn in 'C1', all others in 'C3'.
colors = ['C1' if value < 0 else 'C3' for value in y]

ax.barh(x, y, color = colors, edgecolor = 'white', linewidth = 2.5)

# Names are written next to the bars, so the y axis itself is hidden.
ax.yaxis.set_visible(False)

# Put each name on the opposite side of the bar's starting edge from the bar.
for rect, label in zip(ax.patches, labels):
    offset, align = (.01, 'left') if rect.get_width() < 0 else (-.01, 'right')
    ax.text(rect.get_x() + offset, rect.get_y() + .75, label,
    ha=align, va='bottom', rotation = 'horizontal', color = 'black')

# Relabel the existing x ticks as whole percentages.
vals = ax.get_xticks()
ax.set_xticklabels(['{:,.0%}'.format(tick) for tick in vals])

# First entry at the top.
ax.invert_yaxis()
ax.grid(alpha = .5)

fig.suptitle("All-Star probability above\nAll-Star percent at player pick #", size = 18, weight = 'bold', y = 1.03)

# Footer: a rule line with the blog URL underneath.
fig.text(x = .05, y = -0.01,
    s = '_______________________________________________________',
    fontsize = 14, color = 'grey', horizontalalignment='left')

fig.text(x = .05, y = -.07,
    s = 'https://dribbleanalytics.blog                     ',
    fontsize = 14, fontname = 'Rockwell', color = 'grey', horizontalalignment='left')

fig.savefig('prob-above-average-sorted.png', dpi = 400, bbox_inches = 'tight')