In [None]:
%matplotlib inline

# import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier

In [None]:
# load full data: read the complete player table from a CSV path that is
# relative to the notebook's working directory
df_all_players = pd.read_csv('final_csv_data/full_nba_data.csv')

# Create All-NBA models from full data

In [None]:
# define features and output for model
# `features` lists the predictor columns; `output` is a one-element list, so
# indexing a DataFrame with it yields a single-column frame rather than a Series
# NOTE(review): the column meanings (games, minutes, points, ...) are inferred
# from the names only -- confirm against the CSV schema
features = ['g', 'mp', 'pts', 'trb', 'ast', 'vorp', 'ws']
output = ['all-nba']

In [None]:
# hold out 25% of the rows for testing; the seed is fixed so the split is repeatable
train, test = train_test_split(
    df_all_players,
    test_size=0.25,
    random_state=0,
)

# predictor frame and single-column label frame for each partition
xtrain, ytrain = train[features], train[output]
xtest, ytest = test[features], test[output]

print("Training set size: %.0f" % len(xtrain))
print("Testing set size: %.0f" % len(xtest))

In [None]:
# function that fits model and returns performance metrics
def scores(model):
    """Fit ``model`` on the training split and print its performance.

    Reads the notebook-level ``xtrain``/``ytrain``/``xtest``/``ytest`` frames.
    The model is fitted in place on the training split, then accuracy, recall,
    precision, F1, log loss and ROC AUC are printed for the test split,
    followed by 3-fold cross-validated accuracy and recall.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``.

    Returns
    -------
    y_pred
        Whatever ``model.predict(xtest)`` returns (the test-set predictions).
    """
    # Flatten the single-column label frames once so every call gets 1-D labels.
    y_train = ytrain.values.ravel()
    y_true = ytest.values.ravel()

    model.fit(xtrain, y_train)
    y_pred = model.predict(xtest)

    print("Accuracy score: %.3f" % metrics.accuracy_score(y_true, y_pred))
    print("Recall: %.3f" % metrics.recall_score(y_true, y_pred))
    print("Precision: %.3f" % metrics.precision_score(y_true, y_pred))
    print("F1: %.3f" % metrics.f1_score(y_true, y_pred))

    proba = model.predict_proba(xtest)
    print("Log loss: %.3f" % metrics.log_loss(y_true, proba))

    # Second column of predict_proba is used as the positive-class probability.
    pos_prob = proba[:, 1]
    print("Area under ROC curve: %.3f" % metrics.roc_auc_score(y_true, pos_prob))

    # Cross-validate on the TRAINING split. The previous version passed the
    # test split here, which re-trained the model on held-out rows and based
    # the CV estimate on the smaller partition. cross_val_score works on
    # clones, so the model fitted above is left untouched.
    for label, scoring in (("Accuracy", "accuracy"), ("Recall", "recall")):
        cv = cross_val_score(model, xtrain, y_train, cv=3, scoring=scoring)
        print("%s (cross validation score): %0.3f (+/- %0.3f)"
              % (label, cv.mean(), cv.std() * 2))

    return y_pred

# Define and evaluate models

In [None]:
# Random forest with 100 trees and a fixed seed.
rf = RandomForestClassifier(n_estimators=100, random_state=0)

# Fit on the training split, print the metrics, and keep the test-set predictions.
y_rf = scores(rf)

In [None]:
# Gradient boosting classifier: library-default hyperparameters, fixed seed.
gbc = GradientBoostingClassifier(random_state=0)

# Fit on the training split, print the metrics, and keep the test-set predictions.
y_gbc = scores(gbc)

In [None]:
# XGBoost classifier: library-default hyperparameters, fixed seed.
xgb = xgboost.XGBClassifier(random_state=0)

# Fit on the training split, print the metrics, and keep the test-set predictions.
y_xgb = scores(xgb)

In [None]:
# Baseline for comparison: a DummyClassifier using the "stratified" strategy
# (random predictions drawn according to the training class distribution).
dummy = DummyClassifier(strategy="stratified", random_state=0)

# Same evaluation as the real models, so the numbers are directly comparable.
y_dummy = scores(dummy)

# Predict historical All-NBA probabilities

In [None]:
# function that trains models on all data except for year x, then outputs All-NBA probabilities for year x
def make_pred(model_list, df, years=None):
    """Leave-one-season-out ensemble predictions.

    For each season in ``years`` every model in ``model_list`` is re-fitted on
    the rows of all *other* seasons and then scores the held-out season. The
    second column of each model's ``predict_proba`` output (the positive class
    for a 0/1 label) is averaged across models and stored in a new
    ``pred_all_nba`` column.

    Parameters
    ----------
    model_list : iterable of estimators
        Objects exposing ``fit`` and ``predict_proba``. They are fitted in
        place, so after the call each one holds the fit from the last season
        processed.
    df : pandas.DataFrame
        Must contain ``season_start``, ``season_id`` and the columns named by
        the notebook-level ``features`` and ``output`` lists.
    years : iterable of int, optional
        Seasons (values of ``season_start``) to hold out in turn. Defaults to
        ``range(1979, 2019)``, the range this function originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        The held-out rows of every processed season with the extra
        ``pred_all_nba`` column, sorted by ``season_id`` with a fresh index.
    """
    if years is None:
        years = range(1979, 2019)

    df_year_order = df.sort_values(by='season_start')
    df_pred_list = []
    for year in years:
        is_curr = df_year_order['season_start'] == year
        df_curr = df_year_order[is_curr].reset_index(drop=True)
        if df_curr.empty:
            # No rows for this season: predict_proba would fail on an empty
            # frame, and there is nothing to score anyway.
            continue
        df_train = df_year_order[~is_curr].reset_index(drop=True)

        # Training matrices are the same for every model; build them once.
        x_train = df_train[features]
        y_train = df_train[output].values.ravel()
        x_curr = df_curr[features]

        prob_list = []
        for model in model_list:
            model.fit(x_train, y_train)
            prob_list.append(model.predict_proba(x_curr)[:, 1])

        # Unweighted mean of the per-model probabilities for each row.
        df_curr['pred_all_nba'] = np.mean(prob_list, axis=0)
        df_pred_list.append(df_curr)
    return pd.concat(df_pred_list).sort_values(by='season_id').reset_index(drop=True)

In [None]:
# Leave-one-season-out predictions from the three real models (the dummy baseline is excluded).
# NOTE: despite its name, df_pred_list is the single DataFrame returned by make_pred.
df_pred_list = make_pred([rf, gbc, xgb], df_all_players)

In [None]:
# function to get metrics of leave-one-out predictions
def get_loo_metrics(df):
    """Print classification metrics for the predictions stored in ``df``.

    ``df`` must provide the true labels in ``'all-nba'`` and the predicted
    probabilities in ``'pred_all_nba'``. Probabilities above 0.5 are counted
    as positive predictions. Nothing is returned; results go to stdout.
    """
    actual = df['all-nba'].values
    prob_pos = df['pred_all_nba'].values
    predicted = (prob_pos > 0.5).astype(int)

    # Metrics that compare hard 0/1 predictions against the true labels.
    label_metrics = (
        ("Accuracy score: %.3f", metrics.accuracy_score),
        ("Recall: %.3f", metrics.recall_score),
        ("Precision: %.3f", metrics.precision_score),
        ("F1: %.3f", metrics.f1_score),
    )
    for template, metric in label_metrics:
        print(template % metric(actual, predicted))

    # log_loss is given an (n_samples, 2) matrix: [P(negative), P(positive)].
    prob_matrix = np.column_stack((1 - prob_pos, prob_pos))
    print("Log loss: %.3f" % metrics.log_loss(actual, prob_matrix))

    print("Area under ROC curve: %.3f" % metrics.roc_auc_score(actual, prob_pos))

In [None]:
# Print the metrics for the held-out-season probabilities in df_pred_list.
get_loo_metrics(df_pred_list)

In [None]:
# keep only the identifying columns plus the predicted probability, then write to csv
out_df = df_pred_list.loc[
    :,
    [
        'player',
        'player_id',
        'age',
        'player_season',
        'season_start',
        'pred_all_nba',
    ],
]
out_df.to_csv('results/all_nba_preds.csv')