In [None]:
# Import necessary packages

%matplotlib inline

# basic packages used throughout
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn packages for All-NBA models
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn.ensemble import GradientBoostingClassifier
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import StratifiedKFold

In [None]:
# Seed NumPy's global RNG so that library calls below which are not given an
# explicit random_state are repeatable on a fresh "Restart & Run All".
np.random.seed(0)

In [None]:
# Load the full dataset. Later cells rely on the columns 'season_start',
# 'player', 'all-nba' and the stat columns listed in `features`.
df_all_players = pd.read_csv('full_nba_data.csv')

# Create All-NBA models from 1979-2009 data

In [None]:
# Predictor columns fed to every model.
features = ['g', 'mp', 'pts', 'trb', 'ast', 'vorp', 'ws']
# Target column, kept as a one-element list so train[output] is a DataFrame
# (it is flattened with .values.ravel() before fitting).
output = ['all-nba']

In [None]:
# Modelling set: rows whose season_start is before 2009. Rows from 2009 onward
# are not used for fitting; they are scored later as the prediction set.
df_subset = df_all_players[df_all_players['season_start'] < 2009]

In [None]:
# Hold out 25% of the modelling rows for testing.
# random_state is passed explicitly so that re-running this cell always
# reproduces the same split, instead of depending on how far the global NumPy
# RNG has advanced since np.random.seed(0). On a fresh Run All the resulting
# permutation is the same one the global generator would have produced.
train, test = train_test_split(df_subset, test_size = 0.25, random_state = 0)

# Feature matrices and (single-column DataFrame) targets for each split.
xtrain = train[features]
ytrain = train[output]

xtest = test[features]
ytest = test[output]

print("Training set size: %.0f" % len(xtrain))
print("Testing set size: %.0f" % len(xtest))

In [None]:
# Class balance check: number of positive 'all-nba' rows in each split, then
# the same counts as a fraction of the split size.
all_nba_counts = [sum(split['all-nba']) for split in (train, test)]
print(*all_nba_counts)
print(all_nba_counts[0] / len(xtrain), all_nba_counts[1] / len(xtest))

In [None]:
# 3-fold stratified CV shared by every grid search. No shuffling is requested,
# so the folds are deterministic and random_state must be left unset:
# scikit-learn >= 0.24 raises a ValueError when random_state is given together
# with shuffle=False, and earlier releases did not use it in that case.
cv = StratifiedKFold(n_splits = 3)

def grid_search(model, grid):
    """Tune `model` over `grid` (optimising recall) and report the result.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn classifier to tune.
    grid : dict
        Parameter grid handed straight to GridSearchCV.

    The search object is fitted and evaluated by `scores`, after which the
    winning parameter combination is printed. Nothing is returned.
    """
    import inspect

    search_kwargs = dict(cv = cv, n_jobs = -1, verbose = 2, scoring = 'recall')

    # `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where
    # passing it is a TypeError. Keep the original iid=False on releases that
    # still accept it so their fold averaging is unchanged.
    if 'iid' in inspect.signature(GridSearchCV).parameters:
        search_kwargs['iid'] = False

    clf = GridSearchCV(model, grid, **search_kwargs)
    scores(clf)

    print(clf.best_params_)

In [None]:
def scores(model):
    """Fit `model` on the training split and print its test-split metrics.

    Printed, in order: accuracy, recall, precision and F1 of the hard
    predictions; log loss and ROC AUC of the predicted probabilities; then
    3-fold cross-validated accuracy and recall. The cross-validation is run
    on the test split (xtest / ytest), not on the training data.

    Reads the notebook-level xtrain, ytrain, xtest and ytest.

    Returns
    -------
    The predicted class labels for xtest.
    """
    model.fit(xtrain, ytrain.values.ravel())
    y_pred = model.predict(xtest)

    # Metrics computed from the hard class predictions.
    label_metrics = (
        ("Accuracy score: %.3f", metrics.accuracy_score),
        ("Recall: %.3f", metrics.recall_score),
        ("Precision: %.3f", metrics.precision_score),
        ("F1: %.3f", metrics.f1_score),
    )
    for template, metric_fn in label_metrics:
        print(template % metric_fn(ytest, y_pred))

    # Metrics computed from the predicted probabilities; column 1 is the
    # positive class.
    class_proba = model.predict_proba(xtest)
    print("Log loss: %.3f" % metrics.log_loss(ytest, class_proba))
    print("Area under ROC curve: %.3f" % metrics.roc_auc_score(ytest, class_proba[:, 1]))

    # 3-fold cross-validation of the same estimator on the test split.
    y_test_flat = ytest.values.ravel()
    cv_reports = (
        ("Accuracy (cross validation score): %0.3f (+/- %0.3f)", 'accuracy'),
        ("Recall (cross validation score): %0.3f (+/- %0.3f)", 'recall'),
    )
    for template, scoring in cv_reports:
        fold_scores = cross_val_score(model, xtest, y_test_flat, cv = 3, scoring = scoring)
        print(template % (fold_scores.mean(), fold_scores.std() * 2))

    return y_pred

In [None]:
# Baseline SVC. probability=True is needed because scores() calls
# predict_proba on the fitted model.
svc = SVC(probability = True, gamma = 'auto')

# Fit, print the metrics and keep the test-set predictions.
y_svc = scores(svc)

In [None]:
# SVC search grid: five log-spaced values each for gamma (1e-4 .. 1) and
# C (0.1 .. 1000); probability stays on so scores() can call predict_proba.
gamma = list(np.logspace(-4, 0, num = 5))
C = list(np.logspace(-1, 3, num = 5))
probability = [True]

grid = dict(gamma = gamma, C = C, probability = probability)

grid_search(svc, grid)

In [None]:
# SVC with fixed hyperparameters; this rebinds `svc`, so it is the SVC used by
# the plots and predictions further down.
# NOTE(review): gamma/C are hard-coded, presumably copied from the grid search
# output above — re-check them if the grid or data changes.
svc = SVC(kernel = 'rbf', gamma = 0.01, C = 10, probability = True)

y_svc = scores(svc)

In [None]:
# Baseline random forest: 100 trees, fixed seed.
rf = RandomForestClassifier(n_estimators = 100, random_state = 0)

# Fit, print the metrics and keep the test-set predictions.
y_rf = scores(rf)

In [None]:
# Random-forest search grid: depths 10..100 and 25..250 trees, ten values each.
max_depth = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]
# 'auto' is no longer listed: for classifiers it was only an alias of 'sqrt'
# (so every configuration was fitted twice) and it was removed in
# scikit-learn 1.3, where passing it raises an error.
max_features = ['sqrt']
n_estimators = [int(x) for x in np.linspace(start = 25, stop = 250, num = 10)]
random_state = [0]

grid = {'max_depth': max_depth,
        'max_features': max_features,
        'n_estimators': n_estimators,
        'random_state': random_state}

grid_search(rf, grid)

In [None]:
# Random forest with fixed hyperparameters; this rebinds `rf`, so it is the
# forest used by the plots and predictions further down.
# max_features='sqrt' builds the same forest the former 'auto' alias did for
# classifiers, and still works on scikit-learn >= 1.3 where 'auto' was removed.
# NOTE(review): the values are hard-coded, presumably from the grid search
# above — re-check them if the grid or data changes.
rf = RandomForestClassifier(max_depth = 20, max_features = 'sqrt', n_estimators = 175, random_state = 0)

y_rf = scores(rf)

In [None]:
# Baseline k-nearest-neighbours classifier with library defaults.
knn = neighbors.KNeighborsClassifier()

# Fit, print the metrics and keep the test-set predictions.
y_knn = scores(knn)

In [None]:
# KNN search grid: k from 5 to 20 inclusive, with both weighting schemes.
n_neighbors = list(np.arange(5, 21))
weights = ['uniform', 'distance']

grid = dict(n_neighbors = n_neighbors, weights = weights)

grid_search(knn, grid)

In [None]:
# KNN with fixed hyperparameters; this rebinds `knn`, so it is the KNN used by
# the plots and predictions further down.
# NOTE(review): the values are hard-coded, presumably from the grid search
# above — re-check them if the grid or data changes.
knn = neighbors.KNeighborsClassifier(n_neighbors = 5, weights = 'distance')

y_knn = scores(knn)

In [None]:
# Baseline gradient-boosting classifier: library defaults with a fixed seed.
gbc = GradientBoostingClassifier(random_state = 0)

# Fit, print the metrics and keep the test-set predictions.
y_gbc = scores(gbc)

In [None]:
# Gradient-boosting search grid: depths 10..100 and 25..250 stages, ten values
# each.
# `loss` is not searched: the only value listed was 'deviance', which is the
# classifier's default, and that spelling was removed in scikit-learn 1.3.
# Leaving the parameter out keeps the default loss on every release.
max_depth = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]
# 'auto' is no longer listed: for GradientBoostingClassifier it meant
# sqrt(n_features), i.e. a duplicate of 'sqrt', and it was removed in
# scikit-learn 1.3.
max_features = ['sqrt']
n_estimators = [int(x) for x in np.linspace(start = 25, stop = 250, num = 10)]
random_state = [0]

grid = {'max_depth': max_depth,
        'max_features': max_features,
        'n_estimators': n_estimators,
        'random_state': random_state}

grid_search(gbc, grid)

In [None]:
# GBC used by the plots and predictions further down. It is constructed with
# the same arguments as the baseline above (defaults plus random_state=0); no
# grid-search result is applied here.
gbc = GradientBoostingClassifier(random_state = 0)

y_gbc = scores(gbc)

In [None]:
# Chance-level reference: DummyClassifier with the "stratified" strategy and a
# fixed seed, evaluated with the same scores() report as the real models.
dummy = DummyClassifier(strategy= "stratified", random_state = 0)

y_dummy = scores(dummy)

# Confusion matrices

In [None]:
def confusion_matrix(y_pred, model_name):
    """Plot one model's test-set confusion matrix and save it as a PNG.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels for xtest (e.g. the value returned by scores()).
    model_name : str
        Short tag for the model; shown upper-cased in the figure title and
        used as the prefix of the '<model_name>_cm.png' output file.

    Reads the notebook-level ytest for the true labels.
    """
    counts = metrics.confusion_matrix(ytest, y_pred)

    plt.style.use("fivethirtyeight")
    fig, ax = plt.subplots()

    # Annotated heatmap of the raw counts.
    sns.heatmap(counts, annot=True, ax = ax, linewidth = 2, fmt='g')
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

    title = "%s Confusion Matrix" % model_name.upper()
    fig.suptitle(title, weight = 'bold', size = 18, x = .45)

    # Footer: a rule line followed by the site credit, both placed below the axes.
    footer_style = dict(fontsize = 14, color = 'grey', horizontalalignment = 'left')
    fig.text(x = -0.02, y = -0.08,
             s = '__________________________________________________________',
             **footer_style)
    fig.text(x = -0.02, y = -.14,
             s = 'https://dribbleanalytics.blog                     ',
             fontname = 'Rockwell', **footer_style)

    fig.savefig('%s_cm.png' % model_name, dpi = 400, bbox_inches = 'tight')

In [None]:
# SVC confusion matrix; also written to 'svc_cm.png'.
confusion_matrix(y_svc, 'svc')

In [None]:
# Random-forest confusion matrix; also written to 'rf_cm.png'.
confusion_matrix(y_rf, 'rf')

In [None]:
# KNN confusion matrix; also written to 'knn_cm.png'.
confusion_matrix(y_knn, 'knn')

In [None]:
# Gradient-boosting confusion matrix; also written to 'gbc_cm.png'.
confusion_matrix(y_gbc, 'gbc')

# ROC curves

In [None]:
def roc_curve(model):
    """Compute ROC curve points for an already-fitted `model` on the test split.

    Reads the notebook-level xtest and ytest.

    Returns
    -------
    (fpr, tpr, pos_prob) : the false-positive rates, true-positive rates and
    the positive-class probabilities (column 1 of predict_proba) they were
    computed from. The thresholds are discarded.
    """
    pos_prob = model.predict_proba(xtest)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(ytest, pos_prob)

    return fpr, tpr, pos_prob

In [None]:
plt.style.use('fivethirtyeight')

# 2x2 grid of ROC panels with shared axes.
roc, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharey = True, sharex = True)

# One panel per fitted model: its ROC curve, the dashed chance diagonal, and a
# title carrying the test-set AUC.
for panel_ax, title_fmt, fitted_model in (
        (ax1, "SVC: %.2f", svc),
        (ax2, "RF: %.2f", rf),
        (ax3, "KNN: %.2f", knn),
        (ax4, "GBC: %.2f", gbc)):
    fpr, tpr, pos_prob = roc_curve(fitted_model)
    panel_ax.plot(fpr, tpr)
    panel_ax.plot([0, 1], [0, 1], linestyle = '--')
    panel_ax.set_title(title_fmt % metrics.roc_auc_score(ytest, pos_prob),
                       size = 15, x = .485, ha = 'center')

# Shared axis labels and the figure title.
roc.text(-0.03, 0.5, "True positive rate", va='center', rotation='vertical', size = 18)
roc.text(0.5, -0.045, "False positive rate", ha = 'center', size = 18)

roc.suptitle("Model ROC Curves", y = 1.045, weight = 'bold', size = 18)

# Footer: a rule line followed by the site credit, both placed below the axes.
footer_style = dict(fontsize = 14, color = 'grey', horizontalalignment = 'left')
roc.text(x = -0.03, y = -0.08,
         s = '______________________________________________________________',
         **footer_style)
roc.text(x = -0.03, y = -.14,
         s = 'https://dribbleanalytics.blog                     ',
         fontname = 'Rockwell', **footer_style)

roc.savefig('roc.png', dpi = 400, bbox_inches = 'tight')

# Predict All-NBA score

In [None]:
def make_pred(model_list, df_pred):
    """Return each model's positive-class probabilities for `df_pred`.

    Parameters
    ----------
    model_list : iterable of fitted classifiers exposing predict_proba.
    df_pred : feature rows to score.

    Returns
    -------
    list
        One array per model, in the order of `model_list`, holding column 1
        of that model's predict_proba output.
    """
    return [model.predict_proba(df_pred)[:, 1] for model in model_list]

In [None]:
# Prediction set: every row with season_start >= 2009 (none of these rows were
# in the modelling subset). The index is reset to 0..n-1 so it matches the
# default index of the probability table built from prob_list.
df_pred = df_all_players[df_all_players['season_start'] >= 2009].reset_index(drop = True)

# One array of positive-class probabilities per fitted model, in the order
# svc, rf, knn, gbc.
prob_list = make_pred([svc, rf, knn, gbc], df_pred[features])

In [None]:
# One column of probabilities per model (prob_list is ordered svc, rf, knn,
# gbc), plus their unweighted mean.
pred_vals = pd.DataFrame(dict(zip(['svc', 'rf', 'knn', 'gbc'], prob_list)))
pred_vals['avg'] = (
    pred_vals['svc'] + pred_vals['rf'] + pred_vals['knn'] + pred_vals['gbc']
) / 4

In [None]:
# Prediction table: identifying columns from df_pred, then each model's
# probability and the ensemble average. df_pred and pred_vals share the same
# 0..n-1 index, so the columns line up row for row.
df = df_pred[['player', 'season_start']].copy()

for score_col in ['svc', 'rf', 'knn', 'gbc', 'avg']:
    df[score_col] = pred_vals[score_col]

df.to_csv('all-nba-predictions.csv', index = False)

In [None]:
# Running (cumulative) probability totals per player, accumulated in the row
# order of `df`.
# NOTE(review): this assumes each player's rows already appear in
# chronological order — confirm against the input file.
score_cols = ['svc', 'rf', 'knn', 'gbc', 'avg']

df_sum = df[['player', 'season_start']].copy()

# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# names was deprecated in pandas 1.0 and raises in pandas 2.x. GroupBy.cumsum()
# returns a frame aligned to df's index, the same result the former
# transform(pd.Series.cumsum) call produced.
df_sum[score_cols] = df.groupby(by = ['player'])[score_cols].cumsum()

df_sum.to_csv('all-nba-cumulative.csv', index = False)

# All-decade teams by model

In [None]:
# Reload the per-season predictions written to disk above; this rebinds `df`.
df = pd.read_csv('all-nba-predictions.csv')

In [None]:
# Per-player totals of each model's probability over all rows in `df`;
# tot_nba is the total of the ensemble average.
svc_nba, rf_nba, knn_nba, gbc_nba, tot_nba = (
    df.groupby(by = ['player'])[score_col].sum()
    for score_col in ('svc', 'rf', 'knn', 'gbc', 'avg')
)

In [None]:
# Players ranked by summed SVC probability, highest first.
svc_nba.sort_values(ascending = False)

In [None]:
# Players ranked by summed random-forest probability, highest first.
rf_nba.sort_values(ascending = False)

In [None]:
# Players ranked by summed KNN probability, highest first.
knn_nba.sort_values(ascending = False)

In [None]:
# Players ranked by summed gradient-boosting probability, highest first.
gbc_nba.sort_values(ascending = False)

In [None]:
# Players ranked by summed ensemble-average probability, highest first.
tot_nba.sort_values(ascending = False)

# Best player seasons in the decade

In [None]:
# Highest ensemble-average probability within each (player, season_start) group.
# NOTE(review): if the data has exactly one row per player-season, this is
# simply that row's 'avg' re-indexed by (player, season_start) — confirm.
max_nba = df.groupby(by = ['player', 'season_start'])['avg'].max()

# Display the groups ranked by that value, highest first.
max_nba.sort_values(ascending = False)