In [None]:
# import necessary packages

%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import StratifiedKFold

In [None]:
# seed numpy's global random number generator so unseeded random steps are repeatable

np.random.seed(seed = 0)

In [None]:
# read both game-log files: `df` is used to fit the models, `df_pred` is the 2019 file scored later

df = pd.read_csv(filepath_or_buffer = 'full_game_logs.csv')
df_pred = pd.read_csv(filepath_or_buffer = 'full_game_logs_2019.csv')

In [None]:
# columns the models read as inputs, and the single column they learn to predict

output = ['win_loss']
features = [
    'home_away', 'mp',
    '3p', 'ft',
    'trb', 'ast', 'stl', 'blk', 'tov',
    'pts',
]

In [None]:
# create grid search function to return best parameters for each model

# unshuffled stratified 3-fold splitter shared by every grid search; random_state is left
# out because it has no effect unless shuffle = True, and recent scikit-learn releases
# raise a ValueError when it is passed without shuffling
cv = StratifiedKFold(n_splits = 3)

def grid_search(model, grid):
    """Run an accuracy-scored exhaustive grid search and print the winning parameters.

    The search object is handed to ``scores``, which fits it on the notebook-level
    training split and prints its metrics. ``scores`` also cross-validates whatever
    estimator it receives, so the whole search is repeated inside each of those folds;
    expect this to be slow on large grids.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to tune.
    grid : dict
        Maps parameter names to the list of candidate values to try.

    Returns
    -------
    None
        The best parameter combination is printed, not returned.
    """
    # `iid` is no longer passed: GridSearchCV dropped that argument, and supplying it
    # raises a TypeError on current scikit-learn
    clf = GridSearchCV(model, grid, cv = cv, n_jobs = -1, verbose = 2, scoring = 'accuracy')
    scores(clf)

    print(clf.best_params_)

In [None]:
# create function to evaluate model performance

def scores(model):
    """Fit ``model`` on the training split and print hold-out and cross-validated metrics.

    Reads the notebook-level ``xtrain``, ``ytrain``, ``xtest`` and ``ytest`` frames, so
    whichever split cell ran last decides the data that is used.

    Prints accuracy, recall, precision, F1, log loss and ROC AUC for the test split,
    followed by 3-fold cross-validated accuracy and recall computed on the test split.

    Parameters
    ----------
    model : estimator
        Must provide ``fit``, ``predict`` and ``predict_proba``. It is fitted in place.

    Returns
    -------
    array-like
        Labels predicted for ``xtest`` by the model fitted on the training split.
    """
    model.fit(xtrain, ytrain.values.ravel())
    y_pred = model.predict(xtest)

    print("Accuracy score: %.3f" % metrics.accuracy_score(ytest, y_pred))
    print("Recall: %.3f" % metrics.recall_score(ytest, y_pred))
    print("Precision: %.3f" % metrics.precision_score(ytest, y_pred))
    print("F1: %.3f" % metrics.f1_score(ytest, y_pred))

    proba = model.predict_proba(xtest)
    print("Log loss: %.3f" % metrics.log_loss(ytest, proba))

    # column 1 holds the probability of the second class listed in model.classes_
    pos_prob = proba[:, 1]
    print("Area under ROC curve: %.3f" % metrics.roc_auc_score(ytest, pos_prob))

    # one cross_validate pass scores both metrics on the same three folds, so each fold
    # is fitted once instead of once per metric; the estimator is cloned for every fold,
    # leaving the fitted `model` above untouched
    fold_results = cross_validate(model, xtest, ytest.values.ravel(), cv = 3,
                                  scoring = ['accuracy', 'recall'])

    acc_folds = fold_results['test_accuracy']
    print("Accuracy (cross validation score): %0.3f (+/- %0.3f)" % (acc_folds.mean(), acc_folds.std() * 2))

    recall_folds = fold_results['test_recall']
    print("Recall (cross validation score): %0.3f (+/- %0.3f)" % (recall_folds.mean(), recall_folds.std() * 2))

    return y_pred

# Create stat line models from 2014-15 to 2017-18 data

# Run grid search cv on randomly selected subset of data

In [None]:
# randomly select 10% of the data set for the purpose of running grid search on it

# an explicit random_state makes the draw the same every time this cell runs, instead of
# depending on how far numpy's global generator has advanced since it was seeded
df_sample = df.sample(n = len(df) // 10, random_state = 0)

train, test = train_test_split(df_sample, test_size = 0.25, random_state = 0)

# these four names are read as globals by scores(), so this cell decides what it evaluates
xtrain = train[features]
ytrain = train[output]

xtest = test[features]
ytest = test[output]

print("Training set size: %.0f" % len(xtrain))
print("Testing set size: %.0f" % len(xtest))

In [None]:
# logistic regression with default regularization, evaluated on the current train/test split
log = LogisticRegression(solver = 'liblinear')
y_log = scores(model = log)

In [None]:
# logistic regression search space: 20 truncated-to-integer C values spread evenly over
# [1, 50], crossed with both penalties that the liblinear solver supports
solver = ['liblinear']
penalty = ['l1', 'l2']
C = np.linspace(start = 1, stop = 50, num = 20).astype(int).tolist()

grid = dict(C = C, penalty = penalty, solver = solver)

grid_search(model = log, grid = grid)

In [None]:
# linear discriminant analysis with library defaults, evaluated on the current split
lda = LinearDiscriminantAnalysis()
y_lda = scores(model = lda)

In [None]:
# the only LDA setting searched is the solver
solver = ['svd', 'lsqr', 'eigen']
grid = dict(solver = solver)

grid_search(model = lda, grid = grid)

In [None]:
# 100-tree random forest with a fixed seed, evaluated on the current split
rf = RandomForestClassifier(random_state = 0, n_estimators = 100)
y_rf = scores(model = rf)

In [None]:
# random forest search space
max_depth = list(range(10, 101, 10))        # 10, 20, ..., 100
# 'auto' was an alias of 'sqrt' for classifiers and is rejected by current scikit-learn,
# so listing both only doubled the number of fits without adding a distinct candidate
max_features = ['sqrt']
n_estimators = list(range(25, 251, 25))     # 25, 50, ..., 250
random_state = [0]                          # fixed so every candidate is reproducible

grid = {'max_depth': max_depth,
        'max_features': max_features,
        'n_estimators': n_estimators,
        'random_state': random_state}

grid_search(rf, grid)

In [None]:
# gradient boosting with library defaults and a fixed seed, evaluated on the current split
gbc = GradientBoostingClassifier(random_state = 0)
y_gbc = scores(model = gbc)

In [None]:
# gradient boosting search space
# `loss` is no longer searched: 'deviance' was the only value tried, it is the estimator's
# default loss, and current scikit-learn rejects that spelling (it is now 'log_loss'), so
# leaving the default in place selects the same loss on old and new releases
max_depth = list(range(10, 101, 10))        # 10, 20, ..., 100
# 'auto' was an alias of 'sqrt' for this classifier and is rejected by current scikit-learn
max_features = ['sqrt']
n_estimators = list(range(25, 251, 25))     # 25, 50, ..., 250
random_state = [0]                          # fixed so every candidate is reproducible

grid = {'max_depth': max_depth,
        'max_features': max_features,
        'n_estimators': n_estimators,
        'random_state': random_state}

grid_search(gbc, grid)

In [None]:
# multi-layer perceptron with the default architecture, capped at 1000 training iterations
dnn = MLPClassifier(random_state = 0, max_iter = 1000)
y_dnn = scores(model = dnn)

In [None]:
# multi-layer perceptron search space: three 3-layer architectures crossed with the
# activation, optimizer, L2 penalty and learning-rate schedule options below
hidden_layer_sizes = [(50, 50, 50), (100, 50, 25), (100, 100, 100)]
activation = ['tanh', 'relu']
solver = ['sgd', 'adam']
alpha = [.0001, .001, .01, .05]
learning_rate = ['constant', 'adaptive']
# single-valued entries: held constant for every candidate
max_iter = [1000]
random_state = [0]

grid = dict(
    hidden_layer_sizes = hidden_layer_sizes,
    activation = activation,
    solver = solver,
    alpha = alpha,
    learning_rate = learning_rate,
    max_iter = max_iter,
    random_state = random_state,
)

grid_search(model = dnn, grid = grid)

# Create models on full data

In [None]:
# now, use hyperparameters from grid search on the full data set
# (this rebinds xtrain / ytrain / xtest / ytest, so every later scores() call sees the full split)

train, test = train_test_split(df, random_state = 0, test_size = 0.25)

xtrain, ytrain = train[features], train[output]
xtest, ytest = test[features], test[output]

print("Training set size: %.0f" % xtrain.shape[0])
print("Testing set size: %.0f" % xtest.shape[0])

In [None]:
# find ytrain class distribution

# mean of the label column (the share of wins, assuming win_loss is coded 0/1 — confirm);
# Series.mean is vectorized, whereas the builtin sum walks the column element by element
ytrain['win_loss'].mean()

In [None]:
# final logistic regression, fitted on the current (full-data) training split
log = LogisticRegression(solver = 'liblinear')
y_log = scores(model = log)

In [None]:
# final linear discriminant analysis, using the 'svd' solver
lda = LinearDiscriminantAnalysis(solver = 'svd')
y_lda = scores(model = lda)

In [None]:
# final random forest; max_features is spelled 'sqrt' because 'auto' (its former alias for
# classifiers) is rejected by current scikit-learn
rf = RandomForestClassifier(max_depth = 10, max_features = 'sqrt', n_estimators = 75, random_state = 0)

y_rf = scores(rf)

In [None]:
# final gradient boosting model: library defaults with a fixed seed
gbc = GradientBoostingClassifier(random_state = 0)
y_gbc = scores(model = gbc)

In [None]:
# final multi-layer perceptron with explicitly chosen hyperparameters
dnn_params = dict(
    hidden_layer_sizes = (100, 50, 25),
    activation = 'relu',
    solver = 'sgd',
    alpha = 0.01,
    learning_rate = 'adaptive',
    max_iter = 1000,
    random_state = 0,
)
dnn = MLPClassifier(**dnn_params)
y_dnn = scores(model = dnn)

In [None]:
# reference point: a dummy classifier using the 'stratified' strategy with a fixed seed
dummy = DummyClassifier(random_state = 0, strategy = "stratified")
y_dummy = scores(model = dummy)

# Confusion matrices

In [None]:
# function to create confusion matrices for each model

def confusion_matrix(y_pred, model_name, y_true = None):
    """Draw a confusion-matrix heatmap and save it as ``<model_name>_cm.png``.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels, compared against ``y_true``.
    model_name : str
        Short model label; upper-cased in the figure title and used unchanged as the
        prefix of the saved file name.
    y_true : array-like, optional
        Actual labels. When omitted, the notebook-level ``ytest`` is used, so existing
        two-argument calls behave as before.

    Returns
    -------
    None
        The figure is written to disk in the working directory.
    """
    if y_true is None:
        y_true = ytest

    cm = metrics.confusion_matrix(y_true, y_pred)

    # the style is applied globally, so figures created afterwards inherit it as well
    plt.style.use("fivethirtyeight")
    fig, ax = plt.subplots()

    # linewidths is the keyword seaborn documents for the gap drawn between cells;
    # fmt = 'g' prints the counts without scientific notation
    sns.heatmap(cm, annot = True, ax = ax, linewidths = 2, fmt = 'g')

    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

    fig.suptitle("%s Confusion Matrix" % model_name.upper(), weight = 'bold', size = 18, x = .45)

    # footer: a rule made of underscores with the site credit beneath it, both placed
    # just below the axes area
    fig.text(x = -0.02, y = -0.08,
        s = '__________________________________________________________',
        fontsize = 14, color = 'grey', horizontalalignment='left')

    fig.text(x = -0.02, y = -.14,
        s = 'https://dribbleanalytics.blog                     ',
        fontsize = 14, fontname = 'Rockwell', color = 'grey', horizontalalignment='left')

    # bbox_inches = 'tight' keeps the footer, which sits outside the default canvas, in the image
    fig.savefig('%s_cm.png' % model_name, dpi = 400, bbox_inches = 'tight')

In [None]:
# logistic regression confusion matrix
confusion_matrix(y_pred = y_log, model_name = 'log')

In [None]:
# linear discriminant analysis confusion matrix
confusion_matrix(y_pred = y_lda, model_name = 'lda')

In [None]:
# random forest confusion matrix
confusion_matrix(y_pred = y_rf, model_name = 'rf')

In [None]:
# gradient boosting confusion matrix
confusion_matrix(y_pred = y_gbc, model_name = 'gbc')

In [None]:
# multi-layer perceptron confusion matrix
confusion_matrix(y_pred = y_dnn, model_name = 'dnn')

# Predict wins given stats

In [None]:
# function to create predictions from each model

def make_pred(model_list, df_pred):
    """Collect, for every fitted model, the second ``predict_proba`` column.

    Parameters
    ----------
    model_list : iterable of fitted estimators
        Each one must provide ``predict_proba``.
    df_pred : feature table passed unchanged to every model.

    Returns
    -------
    list
        One entry per model, in the order given, holding column 1 of that model's
        ``predict_proba`` output.
    """
    return [fitted.predict_proba(df_pred)[:, 1] for fitted in model_list]

In [None]:
# keep only 2018-19 players with more than 41 counted games, then score each of their games
# with every fitted model and average the five probabilities
# NOTE(review): the filter is strictly "> 41", so a player with exactly 41 games is dropped,
# although this cell was previously described as "at least 41 games" — confirm which is intended

model_names = ['log', 'lda', 'rf', 'gbc', 'dnn']

game_count = df_pred.groupby(['player'])['g'].count()
game_index = game_count.loc[game_count > 41].index

# the reset gives a fresh 0..n-1 index that lines up with the prediction table built below
df_pred = df_pred.loc[df_pred['player'].isin(game_index)].reset_index(drop = True)

prob_list = make_pred(model_list = [log, lda, rf, gbc, dnn], df_pred = df_pred[features])

# one column per model, one row per game
pred_vals = pd.DataFrame(data = np.column_stack(prob_list), columns = model_names)

pred_vals['avg'] = sum(pred_vals[name] for name in model_names) / 5

In [None]:
# attach the per-model probabilities to each game row, add each player's running total of
# the averaged probability (in row order), and save the game-level table
df_games_pred = df_pred.join(pred_vals).assign(
    cumulative_tws = lambda games: games.groupby(['player'])['avg'].transform(pd.Series.cumsum)
)

df_games_pred.to_csv(path_or_buf = 'game-log-pred.csv', index = False)

# Prepare true win shares

In [None]:
# reload the game-level predictions written to 'game-log-pred.csv'
df_results = pd.read_csv(filepath_or_buffer = 'game-log-pred.csv')

In [None]:
# aggregate true win shares by player: season totals per model, highest average-model total first

tws_sum = (
    df_results
    .groupby(['player'])[['log', 'lda', 'rf', 'gbc', 'dnn', 'avg']]
    .sum()
    .sort_values(by = 'avg', ascending = False)
    .reset_index()
)

# the default index is written too, so the first CSV column is the 0-based position after sorting
tws_sum.to_csv('cumulative-tws.csv')

In [None]:
# per-game mean of each model's probability by player, highest average-model mean first
tws_avg = (
    df_results
    .groupby(['player'])[['log', 'lda', 'rf', 'gbc', 'dnn', 'avg']]
    .mean()
    .sort_values(by = 'avg', ascending = False)
    .reset_index()
)

# the default index is written too, so the first CSV column is the 0-based position after sorting
tws_avg.to_csv('average-tws.csv')