In [286]:
# Importing Libraries & Functions
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time
import re
# import warnings
# warnings.filterwarnings('ignore')
from IPython.lib.display import Audio
framerate = 4410
# play_time_seconds = 3
# t = np.linspace(0, play_time_seconds, framerate*play_time_seconds)
# audio_data = np.sin(2*np.pi*300*t) + np.sin(2*np.pi*240*t)
# Audio(audio_data, rate=framerate, autoplay=True)

#SciKit Learn 
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import r2_score, mean_squared_error, confusion_matrix, mean_absolute_error
from sklearn.metrics import accuracy_score, auc, roc_curve, roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import statsmodels.formula.api as smf

import xgboost as xgb
from xgboost import XGBClassifier

def outputs(cm):
    """Summarise a binary confusion matrix as ``[accuracy, TPR, FPR]``.

    The matrix is flattened with ``ravel()`` and its first four cells are read
    as (TN, FP, FN, TP), which is the order ``confusion_matrix`` yields for a
    two-class problem. Each metric is rounded to four decimal places.
    """
    cells = cm.ravel()
    true_neg, false_pos, false_neg, true_pos = cells[0], cells[1], cells[2], cells[3]

    accuracy = np.round((true_neg + true_pos) / sum(cells), 4)
    # TPR = TP / (TP + FN)
    true_pos_rate = np.round(true_pos / (true_pos + false_neg), 4)
    # FPR = FP / (FP + TN)
    false_pos_rate = np.round(false_pos / (false_pos + true_neg), 4)
    return [accuracy, true_pos_rate, false_pos_rate]

## Loading Data + Train/Test Split

First, we load the two CSVs produced by the FP1_CUM_STATS and FP2_XFACTOR notebooks and join them on 'GAME_ID' as the key. We then set our training data to be all data from the 2007 Season to the 2016 Season (the range the code filters on), and our testing data to be all data from the 2017 Season to the 2018 Season. The cell below performs this task and outputs the counts and percentages of training and testing data.

In [420]:
# Merge the two feature tables on GAME_ID and split by season into train / test
FP1 = pd.read_csv('FP1.csv')
FP2 = pd.read_csv('FP2.csv')
df = (
    pd.merge(FP1, FP2, on='GAME_ID')
    .rename(columns={'SEASON_y': 'SEASON'})
    .drop(['TIMESTAMP', 'HOME_TEAM_NAME', 'SEASON_x'], axis=1)
)

# Seasons 2007-2016 are used for fitting; 2017-2018 are held out for testing.
# SEASON and GAME_ID are only needed for the split, so they are dropped afterwards.
id_cols = ['SEASON', 'GAME_ID']
train_df = df[df['SEASON'].between(2007, 2016)].drop(id_cols, axis=1)
test_df = df[df['SEASON'].between(2017, 2018)].drop(id_cols, axis=1)

X_train = train_df.drop('HOME_TEAM_WINS', axis=1)
y_train = train_df['HOME_TEAM_WINS']
X_test = test_df.drop('HOME_TEAM_WINS', axis=1)
y_test = test_df['HOME_TEAM_WINS']

# Share of all merged rows that lands in each split
train_prop = np.round(len(X_train) / len(df), 3)
test_prop = np.round(len(X_test) / len(df), 3)

print(f"Training Data: {len(X_train)!s} rows -- {np.round(train_prop * 100, 2)!s}%")
print(f"Testing Data: {len(X_test)!s} rows -- {np.round(test_prop * 100, 2)!s}%")

Training Data: 12841 rows -- 83.0%
Testing Data: 2624 rows -- 17.0%


## Model \#0: Baseline (Dummy) Model

Our dummy model predicts the most frequent label in the training set, which is HOME_TEAM_WINS = 1. This is the same as always predicting the home team to win.

In [422]:
# Baseline: always predict the most frequent training label
model_0 = DummyClassifier(strategy="most_frequent")
model_0.fit(X_train, y_train)
y_pred_0 = model_0.predict(X_test)

# Confusion matrix and summary metrics on the held-out test rows
cm_0 = confusion_matrix(y_test, y_pred_0)
outputs_0 = outputs(cm_0)
print("\nConfusion Matrix : \n", cm_0)
print("\nAccuracy : ", outputs_0[0])


Confusion Matrix : 
 [[   0 1079]
 [   0 1545]]

Accuracy :  0.5888


## PCA for Feature Reduction


In [178]:
# # Implementing PCA

# # Fitting Scaler to X_train 
# scaler = StandardScaler()
# scaler.fit(X_train)

# # Apply transform to both the training set and the test set.
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)

# # Make an instance of the Model ==> 0.95 means 95% of variance explained
# pca = PCA(n_components = 0.999999999999999)
# pca.fit(X_train, y_train)
# print('Number of Components: ', pca.n_components_)

# X_train = pca.transform(X_train)
# X_test = pca.transform(X_test)

In [203]:
# # Scree Plots
# yy = pca.explained_variance_ratio_
# xx = np.arange(1,len(yy)+1)

# sns.set_style('darkgrid')
# fig, ax = plt.subplots(figsize = (12,6))

# plt.plot(xx, yy)
# plt.xlabel('PCA #', fontsize = 16), plt.ylabel('Explained Variance %', fontsize = 16);

## Model \#1: Logistic Regression

The cell below creates a Logistic Regression model and calculates the outputs to be presented later.

In [423]:
# Logistic Regression
# formula = 'HOME_TEAM_WINS' + '~' + ' + '.join([col for col in train_df.columns if 'HOME_TEAM_WINS' not in col])
# model = smf.logit(formula, data = train_df).fit_regularized(maxiter = 1000)

# y_prob_1 = model_1.predict_proba(test_df)
# y_pred_1 = pd.Series([1 if x > 0.5 else 0 for x in y_prob_1[:,1]], index = y_test.index)

# # Confusion Matrix & Outputs 
# cm_1 = confusion_matrix(y_test, y_pred_1)
# outputs_1 = outputs(cm_1)
# print ("\nConfusion Matrix : \n", cm_1) 
# print ("\nAccuracy : ", outputs_1[0]) 
# out = pd.DataFrame(index = model.params.index, data = {'Coefficient' : np.round(model.params,4), 'P-Values' : model.pvalues})

# # Logistic Regression

# L2-penalised logistic regression (C = 0.25) on the full feature set
model_1 = LogisticRegression(
    penalty='l2', C=0.25, fit_intercept=True,
    tol=0.001, max_iter=100,
    random_state=69, n_jobs=-1, verbose=3,
)
model_1.fit(X_train, y_train)

# Predict 1 when the probability in column 1 of predict_proba exceeds 0.5
y_prob_1 = model_1.predict_proba(X_test)
y_pred_1 = pd.Series((y_prob_1[:, 1] > 0.5).astype('int64'), index=y_test.index)

# Confusion matrix and summary metrics on the test rows
cm_1 = confusion_matrix(y_test, y_pred_1)
outputs_1 = outputs(cm_1)
print("\nConfusion Matrix : \n", cm_1)
print("\nAccuracy : ", outputs_1[0])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.



Confusion Matrix : 
 [[ 474  605]
 [ 282 1263]]

Accuracy :  0.662


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    2.2s finished


In [None]:
# cm_df = pd.DataFrame(cm_1).rename(columns = {0: 'TRUE HOME WINS', 1: 'TRUE AWAY WINS'})
# cm_df = cm_df.rename({0: 'PREDICT HOME WINS', 1: 'PREDICT AWAY WINS'})
# print(cm_df.to_latex())

# out_df = pd.DataFrame(outputs_1).rename({0:'Accuracy', 1:'TPR', 2:'FPR'})
# out_df = out_df.rename(columns = {0:'Test Set Statistic'})
# print(out_df.to_latex())

## Model \#2: Random Forest 

The cells below fit a Random Forest model and then cross-validate its `max_features` parameter on the most important features.

In [415]:
# # Random Forest

# tic = time.time()

# model_2_acc_list = []
# max_feature_list = np.linspace(1, len(X_train.columns), 100, dtype = 'int32')

# for i in max_feature_list: 
    
#     model_2 = RandomForestClassifier(max_features = i, n_estimators = 250, random_state = 69, 
#                                      n_jobs = -1, min_samples_leaf = 5)
#     model_2.fit(X_train, y_train)
   
#     y_prob_2 = model_2.predict_proba(X_val)
#     y_pred_2 = pd.Series([1 if x > 0.5 else 0 for x in y_prob_2[:,1]], index = y_val.index)
    
#     # Confusion Matrix & Outputs 
#     cm_2 = confusion_matrix(y_val, y_pred_2)
#     accuracy = outputs(cm_2)[0]
    
#     model_2_acc_list.append(accuracy)
#     print(f'Max Features = {str(i)} -- {time.time()-tic}')
    
# opt_max_features = max_feature_list[np.argmax(model_2_acc_list)]
    
# print('Optimal Max Features: ', opt_max_features)
# print('Optimal Validation Accuracy: ', np.max(model_2_acc_list))

# toc = time.time()
# minutes, seconds = np.floor((toc-tic)/60), np.round((toc-tic) - (60*minutes),3)
# print(f'Elapsed Time:  {int(minutes)} min  {seconds} sec\n')

In [424]:
# Initial forest with hand-picked (not yet tuned) hyperparameters
model_2 = RandomForestClassifier(
    n_estimators=1000, min_samples_split=5, min_samples_leaf=5,
    random_state=69, n_jobs=-1, verbose=3,
)
model_2.fit(X_train, y_train)

# Predict 1 when the probability in column 1 of predict_proba exceeds 0.5
y_prob_2 = model_2.predict_proba(X_test)
y_pred_2 = pd.Series((y_prob_2[:, 1] > 0.5).astype('int64'), index=y_test.index)

# Confusion matrix and summary metrics on the test rows
cm_2 = confusion_matrix(y_test, y_pred_2)
outputs_2 = outputs(cm_2)
print("\nConfusion Matrix : \n", cm_2)
print("\nAccuracy : ", outputs_2[0])


Confusion Matrix : 
 [[ 431  648]
 [ 247 1298]]

Accuracy :  0.6589


In [425]:
# Rank every feature by the fitted forest's importance score and keep the 30 highest
out = (
    pd.DataFrame({'Feature': list(X_train.columns),
                  'Importance Score': model_2.feature_importances_})
    .sort_values('Importance Score', ascending=False)
)
top = out['Feature'].values[:30]
# top = out[out['Importance Score'] > 0.0018]['Feature'].values
out.head(30)

Unnamed: 0,Feature,Importance Score
773,cum_10_G_PLUS_MINUS_away_H2H,0.009961
766,cum_5_F_PLUS_MINUS_away_H2H,0.009459
772,cum_5_G_PLUS_MINUS_away_H2H,0.009407
376,cum_5_G_PLUS_MINUS_home_H2H,0.009193
374,cum_10_G_PLUS_MINUS_home,0.009114
368,cum_10_F_PLUS_MINUS_home,0.009064
377,cum_10_G_PLUS_MINUS_home_H2H,0.008848
767,cum_10_F_PLUS_MINUS_away_H2H,0.008273
370,cum_5_F_PLUS_MINUS_home_H2H,0.008163
770,cum_10_G_PLUS_MINUS_away,0.007913


In [435]:
# Restrict the training matrix to the columns selected in `top`
X_train_top = X_train[top]

# Try every max_features value from 1 up to the number of retained columns
grid = {'max_features': np.arange(1, X_train_top.shape[1] + 1)}

# 5-fold grid search over max_features, scored on accuracy
model_2 = GridSearchCV(
    RandomForestClassifier(n_estimators=500, random_state=69, n_jobs=-1,
                           min_samples_split=5, min_samples_leaf=5),
    param_grid=grid, scoring='accuracy', cv=5, verbose=2, n_jobs=-1,
)
model_2.fit(X_train_top, y_train)

print('Optimal Max Features: ', model_2.best_params_)

# Evaluate the refitted best estimator on the same feature subset of the test rows
y_prob_2 = model_2.best_estimator_.predict_proba(X_test[top])
y_pred_2 = pd.Series((y_prob_2[:, 1] > 0.5).astype('int64'), index=y_test.index)

# Confusion matrix and summary metrics on the test rows
cm_2 = confusion_matrix(y_test, y_pred_2)
outputs_2 = outputs(cm_2)
print("\nConfusion Matrix : \n", cm_2)
print("\nAccuracy : ", outputs_2[0])

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 11.0min finished


Optimal Max Features:  {'max_features': 2}

Confusion Matrix : 
 [[ 476  603]
 [ 305 1240]]

Accuracy :  0.654


In [440]:
# Refit the forest on the top features with max_features fixed at 2
model_2 = RandomForestClassifier(
    max_features=2, n_estimators=500,
    min_samples_split=5, min_samples_leaf=5,
    random_state=69, n_jobs=-1, verbose=1,
)
model_2.fit(X_train_top, y_train)

# Predict 1 when the probability in column 1 of predict_proba exceeds 0.5
y_prob_2 = model_2.predict_proba(X_test[top])
y_pred_2 = pd.Series((y_prob_2[:, 1] > 0.5).astype('int64'), index=y_test.index)

# Confusion matrix and summary metrics on the test rows
cm_2 = confusion_matrix(y_test, y_pred_2)
outputs_2 = outputs(cm_2)
print("\nConfusion Matrix : \n", cm_2)
print("\nAccuracy : ", outputs_2[0])

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    1.2s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s



Confusion Matrix : 
 [[ 476  603]
 [ 305 1240]]

Accuracy :  0.654


[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 500 out of 500 | elapsed:    0.1s finished


In [None]:
# max_features_RF = RF_model.cv_results_['param_max_features'].data
# acc_scores_RF = RF_model.cv_results_['mean_test_score']

# sns.set_style('darkgrid')
# plt.figure(figsize=(8, 6))
# plt.title('\nMean Cross-Validation Accuracy vs. max_features\n', fontsize=16)
# plt.xlabel('max_features', fontsize=16)
# plt.ylabel('Mean Cross-Validation Accuracy', fontsize=16)
# plt.plot(max_features_RF, acc_scores_RF, linewidth=3, color='orange')
# plt.scatter(max_features_RF[acc_scores_RF.argmax(axis=0)], np.max(acc_scores_RF), s=125, marker='o')
# plt.grid(True, which='both')
# plt.show()

## Model \#3: XGBoost

The cells below go through the entire process of creating and cross-validating an XGBoost model. First, we set default parameters and use the XGBoost cross-validation function to determine the optimal number of trees for the specified learning rate. We then use this number of estimators to cross-validate and select approximate values for *max_depth* and *min_child_weight*, which we then refine by shrinking the search space.

In [320]:
# Training frame for the XGBoost section: seasons 2007-2016 with SEASON / GAME_ID removed
train_df_3 = df[df['SEASON'].between(2007, 2016)].drop(['SEASON', 'GAME_ID'], axis=1)

X_train_3 = train_df_3.drop('HOME_TEAM_WINS', axis=1)
y_train_3 = train_df_3['HOME_TEAM_WINS']

In [321]:
# Defining the Cross-Validation Function 
def modelfit(alg, train, predictors, useTrainCV = True, cv_folds = 5, early_stopping_rounds = 50,
             target = 'HOME_TEAM_WINS'):
    """Optionally tune ``n_estimators`` with ``xgb.cv``, then fit ``alg`` and print a training report.

    Parameters
    ----------
    alg : estimator exposing ``get_xgb_params``, ``get_params``, ``set_params``,
        ``fit``, ``predict`` and ``predict_proba`` (an ``XGBClassifier`` in this
        notebook). It is modified in place.
    train : DataFrame holding both the predictor columns and the ``target`` column.
    predictors : column labels of ``train`` used as features.
    useTrainCV : when True, run ``xgb.cv`` first and overwrite ``n_estimators``
        with the number of rows in the returned result.
    cv_folds : number of folds passed to ``xgb.cv``.
    early_stopping_rounds : early-stopping patience passed to ``xgb.cv``.
    target : name of the label column in ``train``. The CV branch used to read a
        notebook-level global ``target`` while the rest of the function
        hard-coded ``'HOME_TEAM_WINS'``; every use now goes through this argument,
        so the function no longer depends on a variable defined in a later cell.

    Returns
    -------
    None. Accuracy and AUC measured on the training data are printed.
    """
    features = train[predictors]
    labels = train[target]

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(features.values, label=labels.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold = cv_folds,
                          metrics = 'error', early_stopping_rounds = early_stopping_rounds, verbose_eval = True)
        # One row per boosting round in the CV result, so its length becomes the tree count
        alg.set_params(n_estimators = cvresult.shape[0])
    
    #Fit the algorithm on the data
    # NOTE(review): newer xgboost releases expect eval_metric on the constructor
    # rather than on fit() -- confirm against the installed version.
    alg.fit(features, labels, eval_metric = 'error')
        
    #Predict training set:
    train_predictions = alg.predict(features)
    train_predprob = alg.predict_proba(features)[:,1]
        
    #Print model report (training-set metrics, not a held-out estimate):
    print("\nModel Report")
    print("Accuracy : %.4g" % accuracy_score(labels.values, train_predictions))
    print("AUC Score (Train): %f" % roc_auc_score(labels, train_predprob))

Run the first cell below; the final number it prints in square brackets is the n_estimators value to plug into the GridSearchCV objects below.

STEP \#1 : Calculate n_estimators for learning_rate = 0.1 (VERY HIGH)

STEP \#2 : Use Cross-Validation to tune the other hyperparameters

STEP \#3 : Now reduce the learning rate to 0.01 or something smaller and calculate the new n_estimators for the hyperparameters found above. This will mean that many more trees are used.

STEP \#4 : Run our final model using the learning rate set above, the hyperparameters found using cross-validation, and the corresponding (now higher) n_estimators.

In [322]:
# Find the number of boosting rounds for learning_rate = 0.1 via modelfit's CV step
target = 'HOME_TEAM_WINS'
train = pd.concat([X_train_3, y_train_3], axis=1)
predictors = [col for col in train.columns if col != target]

model_3 = XGBClassifier(
    objective='binary:logistic', learning_rate=0.1, n_estimators=1000,
    max_depth=5, min_child_weight=1, gamma=0,
    subsample=0.8, colsample_bytree=0.8,
    scale_pos_weight=1, nthread=4, seed=69,
)
modelfit(model_3, train, X_train_3.columns, useTrainCV=True)

[0]	train-error:0.30743+0.00701	test-error:0.34211+0.00763
[1]	train-error:0.29597+0.00441	test-error:0.33541+0.01080
[2]	train-error:0.28857+0.00521	test-error:0.33284+0.00777
[3]	train-error:0.28421+0.00481	test-error:0.33043+0.00546
[4]	train-error:0.28035+0.00391	test-error:0.32786+0.00736
[5]	train-error:0.27778+0.00277	test-error:0.32934+0.00575
[6]	train-error:0.27632+0.00419	test-error:0.32871+0.00749
[7]	train-error:0.27459+0.00516	test-error:0.32614+0.00784
[8]	train-error:0.27217+0.00425	test-error:0.32404+0.01012
[9]	train-error:0.26955+0.00450	test-error:0.32326+0.00956
[10]	train-error:0.26778+0.00359	test-error:0.32256+0.00879
[11]	train-error:0.26598+0.00419	test-error:0.32311+0.00988
[12]	train-error:0.26474+0.00307	test-error:0.32069+0.00922
[13]	train-error:0.26308+0.00306	test-error:0.32030+0.00915
[14]	train-error:0.26020+0.00282	test-error:0.31952+0.00832
[15]	train-error:0.25859+0.00295	test-error:0.32100+0.00865
[16]	train-error:0.25621+0.00329	test-error:0.3197

In [323]:
# Coarse 5-fold grid over tree depth and minimum child weight
param_test1 = {
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
}

# Hyperparameters held fixed during this search.
# n_estimators=81 was presumably read off an earlier xgb.cv run -- verify before reuse.
xgb_fixed_1 = dict(learning_rate=0.1, n_estimators=81, gamma=0,
                   subsample=0.8, colsample_bytree=0.8,
                   objective='binary:logistic', nthread=4,
                   scale_pos_weight=1, seed=27)

# NOTE(review): `iid` was removed from GridSearchCV in scikit-learn 0.24 -- confirm the pinned version.
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(**xgb_fixed_1),
    param_grid=param_test1,
    scoring='accuracy', n_jobs=-1, iid=False, cv=5, verbose=3,
)
gsearch1.fit(train[predictors], train[target])
print('Best Approx. Parameters: \n', gsearch1.best_params_, gsearch1.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 out of  60 | elapsed:  7.2min remaining:  1.4min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  7.3min finished


Best Approx. Parameters: 
 {'max_depth': 3, 'min_child_weight': 3} 0.6632677963594208


In [324]:
# Finer 5-fold grid around the best values from the coarse search
param_test2 = {
    'max_depth': [2, 3, 4],
    'min_child_weight': [2, 3, 4],
}

# Hyperparameters held fixed during this search
xgb_fixed_2 = dict(learning_rate=0.1, n_estimators=81, gamma=0,
                   subsample=0.8, colsample_bytree=0.8,
                   objective='binary:logistic', nthread=4,
                   scale_pos_weight=1, seed=27)

# NOTE(review): `iid` was removed from GridSearchCV in scikit-learn 0.24 -- confirm the pinned version.
gsearch2 = GridSearchCV(
    estimator=XGBClassifier(**xgb_fixed_2),
    param_grid=param_test2,
    scoring='accuracy', n_jobs=-1, iid=False, cv=5, verbose=3,
)
gsearch2.fit(train[predictors], train[target])
print('Best Parameters: \n', gsearch2.best_params_, gsearch2.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  45 | elapsed:  1.5min remaining:   45.7s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  2.6min finished


Best Parameters: 
 {'max_depth': 2, 'min_child_weight': 4} 0.6689529727193024


We then tune *gamma* in the same way and calculate the new optimal number of estimators for the updated set of hyperparameters. With that new number of estimators, we then tune *subsample*, *colsample_bytree*, and *reg_alpha* in the same way.

In [325]:
# 5-fold grid over gamma with max_depth / min_child_weight fixed at 2 / 4
param_test3 = {
    'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
}

# Hyperparameters held fixed during this search
xgb_fixed_3 = dict(learning_rate=0.1, n_estimators=81,
                   max_depth=2, min_child_weight=4,
                   subsample=0.8, colsample_bytree=0.8,
                   objective='binary:logistic', nthread=4,
                   scale_pos_weight=1, seed=27)

# NOTE(review): `iid` was removed from GridSearchCV in scikit-learn 0.24 -- confirm the pinned version.
gsearch3 = GridSearchCV(
    estimator=XGBClassifier(**xgb_fixed_3),
    param_grid=param_test3,
    scoring='accuracy', n_jobs=-1, iid=False, cv=5, verbose=3,
)
gsearch3.fit(train[predictors], train[target])
gsearch3.best_params_, gsearch3.best_score_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  25 | elapsed:   33.2s remaining:  4.1min
[Parallel(n_jobs=-1)]: Done  12 out of  25 | elapsed:   33.3s remaining:   36.1s
[Parallel(n_jobs=-1)]: Done  21 out of  25 | elapsed:   56.6s remaining:   10.8s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   56.6s finished




({'gamma': 0.0}, 0.6689529727193024)

In [327]:
# Re-run the CV step to get the boosting-round count for the tuned depth / child-weight / gamma
xgb2 = XGBClassifier(
    objective='binary:logistic', learning_rate=0.1, n_estimators=1000,
    max_depth=2, min_child_weight=4, gamma=0.0,
    subsample=0.8, colsample_bytree=0.8,
    scale_pos_weight=1, nthread=4, seed=27,
)
modelfit(xgb2, train, predictors, useTrainCV=True)

[0]	train-error:0.34569+0.00356	test-error:0.35021+0.00850
[1]	train-error:0.34242+0.00348	test-error:0.34694+0.01157
[2]	train-error:0.33590+0.00651	test-error:0.34257+0.00894
[3]	train-error:0.33235+0.00343	test-error:0.33525+0.01369
[4]	train-error:0.32994+0.00409	test-error:0.33596+0.00954
[5]	train-error:0.32719+0.00269	test-error:0.33222+0.01084
[6]	train-error:0.32605+0.00258	test-error:0.33191+0.00894
[7]	train-error:0.32540+0.00152	test-error:0.32973+0.01075
[8]	train-error:0.32404+0.00173	test-error:0.32972+0.00819
[9]	train-error:0.32313+0.00142	test-error:0.32902+0.00993
[10]	train-error:0.32112+0.00161	test-error:0.32887+0.00861
[11]	train-error:0.31925+0.00157	test-error:0.32832+0.00851
[12]	train-error:0.31976+0.00107	test-error:0.32677+0.00939
[13]	train-error:0.31915+0.00109	test-error:0.32568+0.00929
[14]	train-error:0.31799+0.00118	test-error:0.32474+0.00996
[15]	train-error:0.31691+0.00177	test-error:0.32451+0.00963
[16]	train-error:0.31713+0.00189	test-error:0.3249

[137]	train-error:0.27531+0.00165	test-error:0.31259+0.00865
[138]	train-error:0.27478+0.00165	test-error:0.31197+0.00825
[139]	train-error:0.27441+0.00149	test-error:0.31205+0.00770
[140]	train-error:0.27408+0.00142	test-error:0.31205+0.00767
[141]	train-error:0.27401+0.00165	test-error:0.31181+0.00754
[142]	train-error:0.27387+0.00152	test-error:0.31189+0.00758
[143]	train-error:0.27358+0.00162	test-error:0.31189+0.00808
[144]	train-error:0.27327+0.00175	test-error:0.31150+0.00807
[145]	train-error:0.27291+0.00142	test-error:0.31135+0.00885
[146]	train-error:0.27291+0.00165	test-error:0.31127+0.00882
[147]	train-error:0.27268+0.00178	test-error:0.31174+0.00943
[148]	train-error:0.27233+0.00166	test-error:0.31220+0.00978
[149]	train-error:0.27216+0.00200	test-error:0.31283+0.00940
[150]	train-error:0.27179+0.00182	test-error:0.31259+0.00978
[151]	train-error:0.27167+0.00171	test-error:0.31275+0.00963
[152]	train-error:0.27173+0.00180	test-error:0.31298+0.00893
[153]	train-error:0.2711

[272]	train-error:0.24722+0.00240	test-error:0.31150+0.00916
[273]	train-error:0.24679+0.00230	test-error:0.31080+0.00949
[274]	train-error:0.24653+0.00195	test-error:0.31127+0.00896
[275]	train-error:0.24628+0.00188	test-error:0.31111+0.00966
[276]	train-error:0.24609+0.00211	test-error:0.31158+0.00910
[277]	train-error:0.24626+0.00195	test-error:0.31189+0.00904
[278]	train-error:0.24566+0.00206	test-error:0.31142+0.00968
[279]	train-error:0.24554+0.00184	test-error:0.31213+0.00881
[280]	train-error:0.24556+0.00184	test-error:0.31158+0.00903
[281]	train-error:0.24535+0.00170	test-error:0.31142+0.00923
[282]	train-error:0.24517+0.00175	test-error:0.31072+0.00961
[283]	train-error:0.24504+0.00189	test-error:0.31002+0.00932
[284]	train-error:0.24445+0.00204	test-error:0.31018+0.00976
[285]	train-error:0.24435+0.00209	test-error:0.31033+0.00956
[286]	train-error:0.24451+0.00220	test-error:0.31096+0.00948
[287]	train-error:0.24400+0.00220	test-error:0.31096+0.01008
[288]	train-error:0.2441

In [328]:
# 5-fold grid over the row and column subsampling fractions
param_test4 = {
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.7, 0.8],
}

# Hyperparameters held fixed during this search
xgb_fixed_4 = dict(learning_rate=0.1, n_estimators=289,
                   max_depth=2, min_child_weight=4, gamma=0.0,
                   objective='binary:logistic', nthread=4,
                   scale_pos_weight=1, seed=27)

# NOTE(review): `iid` was removed from GridSearchCV in scikit-learn 0.24 -- confirm the pinned version.
gsearch4 = GridSearchCV(
    estimator=XGBClassifier(**xgb_fixed_4),
    param_grid=param_test4,
    scoring='accuracy', n_jobs=-1, iid=False, cv=5, verbose=3,
)
gsearch4.fit(train[predictors], train[target])
gsearch4.best_params_, gsearch4.best_score_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  20 | elapsed:  2.1min remaining: 11.8min
[Parallel(n_jobs=-1)]: Done  10 out of  20 | elapsed:  2.2min remaining:  2.2min
[Parallel(n_jobs=-1)]: Done  17 out of  20 | elapsed:  2.8min remaining:   29.9s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  2.8min finished




({'colsample_bytree': 0.7, 'subsample': 0.7}, 0.6544674461498164)

In [329]:
# Coarse 5-fold grid over reg_alpha, spanning several orders of magnitude
param_test6 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
}

# Hyperparameters held fixed during this search
xgb_fixed_6 = dict(learning_rate=0.1, n_estimators=289,
                   max_depth=2, min_child_weight=4, gamma=0.0,
                   subsample=0.7, colsample_bytree=0.7,
                   objective='binary:logistic', nthread=4,
                   scale_pos_weight=1, seed=27)

# NOTE(review): `iid` was removed from GridSearchCV in scikit-learn 0.24 -- confirm the pinned version.
gsearch6 = GridSearchCV(
    estimator=XGBClassifier(**xgb_fixed_6),
    param_grid=param_test6,
    scoring='accuracy', n_jobs=-1, iid=False, cv=5, verbose=3,
)
gsearch6.fit(train[predictors], train[target])
gsearch6.best_params_, gsearch6.best_score_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  25 | elapsed:  2.0min remaining: 15.0min
[Parallel(n_jobs=-1)]: Done  12 out of  25 | elapsed:  2.0min remaining:  2.2min
[Parallel(n_jobs=-1)]: Done  21 out of  25 | elapsed:  3.0min remaining:   34.0s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  3.1min finished




({'reg_alpha': 100}, 0.6755701819804548)

In [331]:
# Narrower 5-fold grid over reg_alpha, centred on 100 (overwrites param_test6 / gsearch6)
param_test6 = {
    'reg_alpha': [50, 100, 150],
}

# Hyperparameters held fixed during this search
xgb_fixed_6b = dict(learning_rate=0.1, n_estimators=289,
                    max_depth=2, min_child_weight=4, gamma=0.0,
                    subsample=0.7, colsample_bytree=0.7,
                    objective='binary:logistic', nthread=4,
                    scale_pos_weight=1, seed=27)

# NOTE(review): `iid` was removed from GridSearchCV in scikit-learn 0.24 -- confirm the pinned version.
gsearch6 = GridSearchCV(
    estimator=XGBClassifier(**xgb_fixed_6b),
    param_grid=param_test6,
    scoring='accuracy', n_jobs=-1, iid=False, cv=5, verbose=3,
)
gsearch6.fit(train[predictors], train[target])
gsearch6.best_params_, gsearch6.best_score_

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed:  1.3min remaining:  8.3min
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:  1.4min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.5min finished




({'reg_alpha': 100}, 0.6755701819804548)

We then decrease the learning rate to 0.005 and recalculate the optimal number of trees for our final set of hyperparameters, and run the final model with these settings.

In [332]:
# Re-run the CV step at the lower learning rate (0.005) with the tuned hyperparameters

xgb3 = XGBClassifier(
    objective='binary:logistic', learning_rate=0.005, n_estimators=5000,
    max_depth=2, min_child_weight=4, gamma=0.0,
    subsample=0.7, colsample_bytree=0.7, reg_alpha=100,
    scale_pos_weight=1, nthread=4, seed=69,
)
modelfit(xgb3, train, predictors, useTrainCV=True)

[0]	train-error:0.35611+0.00759	test-error:0.36321+0.01368
[1]	train-error:0.34729+0.00540	test-error:0.35208+0.01187
[2]	train-error:0.34178+0.00800	test-error:0.34725+0.01427
[3]	train-error:0.33975+0.00624	test-error:0.34655+0.01388
[4]	train-error:0.33732+0.00264	test-error:0.34530+0.01364
[5]	train-error:0.33599+0.00301	test-error:0.34522+0.01395
[6]	train-error:0.33529+0.00270	test-error:0.34491+0.01446
[7]	train-error:0.33337+0.00169	test-error:0.34172+0.01755
[8]	train-error:0.33292+0.00135	test-error:0.34141+0.01555
[9]	train-error:0.33202+0.00135	test-error:0.34156+0.01610
[10]	train-error:0.33220+0.00138	test-error:0.34180+0.01615
[11]	train-error:0.33226+0.00094	test-error:0.34125+0.01588
[12]	train-error:0.33165+0.00111	test-error:0.34071+0.01417
[13]	train-error:0.33173+0.00129	test-error:0.33907+0.01395
[14]	train-error:0.33144+0.00177	test-error:0.34008+0.01154
[15]	train-error:0.33167+0.00140	test-error:0.33876+0.01034
[16]	train-error:0.33173+0.00155	test-error:0.3387

In [333]:
# Final XGBoost model on the full feature set (xgb5 and model_3 name the same estimator)
xgb5 = XGBClassifier(
    objective='binary:logistic', learning_rate=0.005, n_estimators=120,
    max_depth=2, min_child_weight=4, gamma=0.0,
    subsample=0.7, colsample_bytree=0.7, reg_alpha=100,
    scale_pos_weight=1, nthread=4, seed=69,
)
model_3 = xgb5
model_3.fit(X_train_3, y_train_3)

# Predict 1 when the probability in column 1 of predict_proba exceeds 0.5
y_prob_3 = model_3.predict_proba(X_test)
y_pred_3 = pd.Series((y_prob_3[:, 1] > 0.5).astype('int64'), index=y_test.index)

# Confusion matrix and summary metrics on the test rows
cm_3 = confusion_matrix(y_test, y_pred_3)
outputs_3 = outputs(cm_3)
print("\nConfusion Matrix : \n", cm_3)
print("\nAccuracy : ", outputs_3[0])


Confusion Matrix : 
 [[ 497  582]
 [ 316 1229]]

Accuracy :  0.6578


In [359]:
# Number of feature-importance scores held by the fitted model
model_3.feature_importances_.shape[0]
# model_3.get_booster().get_scores(importance = 'gain')

833

In [361]:
# Rank the XGBoost model's features by importance and keep the 30 highest.
# Feature names are taken from X_train_3, the frame model_3 was fitted on, so each
# score is paired with its own column rather than with whatever X_train currently holds.
out = pd.DataFrame({'Feature' : list(X_train_3.columns), 
                   'Importance Score': model_3.feature_importances_}).sort_values('Importance Score', ascending = False)
top_3 = out.iloc[:30,:]['Feature'].values
# top = out[out['Importance Score'] > 0.0018]['Feature'].values
# len(top)

In [363]:
# Features that appear in both top-30 lists, kept in top_3 order
[feature for feature in top_3 if feature in set(top)]

['cum_10_G_PLUS_MINUS_away_H2H',
 'cum_10_G_PLUS_MINUS_home_H2H',
 'cum_5_G_PLUS_MINUS_away_H2H',
 'cum_5_G_PLUS_MINUS_home_H2H',
 'cum_5_F_PLUS_MINUS_away_H2H',
 'cum_10_F_PLUS_MINUS_away_H2H',
 'cum_5_F_PLUS_MINUS_home_H2H',
 'cum_10_G_PLUS_MINUS_home',
 'cum_5_G_PLUS_MINUS_home',
 'cum_10_C_PLUS_MINUS_home',
 'cum_10_G_PLUS_MINUS_away',
 'cum_10_F_PLUS_MINUS_home',
 'cum_10_F_PLUS_MINUS_away',
 'HOME_TEAM_home_win_pct_past4yrs']

In [414]:
# Final XGBoost configuration refitted on a reduced feature set.
# NOTE(review): this uses `top` (the Random Forest ranking), not `top_3` -- confirm that is intended.
xgb5 = XGBClassifier(
    objective='binary:logistic', learning_rate=0.005, n_estimators=120,
    max_depth=2, min_child_weight=4, gamma=0.0,
    subsample=0.7, colsample_bytree=0.7, reg_alpha=100,
    scale_pos_weight=1, nthread=4, seed=69,
)
model_3 = xgb5
model_3.fit(X_train_3[top], y_train_3)

# Predict 1 when the probability in column 1 of predict_proba exceeds 0.5
y_prob_3 = model_3.predict_proba(X_test[top])
y_pred_3 = pd.Series((y_prob_3[:, 1] > 0.5).astype('int64'), index=y_test.index)

# Confusion matrix and summary metrics on the test rows
cm_3 = confusion_matrix(y_test, y_pred_3)
outputs_3 = outputs(cm_3)
print("\nConfusion Matrix : \n", cm_3)
print("\nAccuracy : ", outputs_3[0])


Confusion Matrix : 
 [[ 497  582]
 [ 324 1221]]

Accuracy :  0.6547


## Boosting (round2)

In [353]:
# # Running our Final XgBoost Model
# grid = {'n_estimators': [500, 1000, 2500], 
#         'max_leaf_nodes': [4, 5, 6]}

# model_3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.005, objective= 'binary:logistic', seed=6969), 
#                         param_grid = grid, scoring='accuracy', n_jobs=-1, iid=False, cv=3, verbose = 3)
# model_3.fit(train[predictors],train[target])
# print('Optimal Hyperparameters: ', model_3.best_params_, model_3.best_score_)



# y_prob_3 = model_3.best_estimator_.predict_proba(X_test)
# y_pred_3 = pd.Series([1 if x > 0.5 else 0 for x in y_prob_3[:,1]], index = y_test.index)

# # Confusion Matrix & Outputs 
# cm_3 = confusion_matrix(y_test, y_pred_3)
# outputs_3 = outputs(cm_3)
# print ("\nConfusion Matrix : \n", cm_3) 
# print ("\nAccuracy : ", outputs_3[0]) 

In [209]:
# # Running our Final XgBoost Model
# grid = {'n_estimators': [485], 
#         'max_leaf_nodes': [2,3,4]}

# model_3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.005, objective= 'binary:logistic', seed=6969), 
#                         param_grid = grid, scoring='accuracy', n_jobs=-1, iid=False, cv=3, verbose = 3)
# model_3.fit(train[predictors],train[target])
# print('Optimal Hyperparameters: ', model_3.best_params_, model_3.best_score_)



# y_prob_3 = model_3.best_estimator_.predict_proba(X_test)
# y_pred_3 = pd.Series([1 if x > 0.5 else 0 for x in y_prob_3[:,1]], index = y_test.index)

# # Confusion Matrix & Outputs 
# cm_3 = confusion_matrix(y_test, y_pred_3)
# outputs_3 = outputs(cm_3)
# print ("\nConfusion Matrix : \n", cm_3) 
# print ("\nAccuracy : ", outputs_3[0]) 

## Model \#4: Neural Network

The cells below create and fit our neural network model using scikit-learn's MLPClassifier; the grid-search cross-validation over hidden-layer sizes is left commented out.

In [349]:
# Neural Network
from sklearn.neural_network import MLPClassifier

# Model 4: multi-layer perceptron with two hidden layers (100 and 25 units).
# A grid over hidden_layer_sizes -- (50,), (100,), (200,), (400,), (800,), (50,50),
# (100,50) -- used to sit here wrapped in GridSearchCV; it is not run any more and the
# architecture below is fixed by hand.
# The tiny `tol` combined with the large `max_iter` cap presumably lets the optimiser
# keep going until the loss has all but stopped improving -- TODO confirm intent.
model_4 = MLPClassifier(
    hidden_layer_sizes=(100, 25),
    activation='relu',
    solver='adam',
    alpha=0.5,
    learning_rate_init=0.001,
    max_iter=10000,
    tol=1e-16,
    random_state=69,
)

# NOTE(review): the model is fitted on X_train_3 / y_train_3 (defined elsewhere in the
# notebook) but scored on X_test -- confirm both carry the same feature columns.
model_4.fit(X_train_3, y_train_3)

# Column 1 of predict_proba is the probability of the second class (assumes 0/1 labels);
# anything strictly above 0.5 is labelled 1, everything else 0.
y_prob_4 = model_4.predict_proba(X_test)
y_pred_4 = pd.Series((y_prob_4[:, 1] > 0.5).astype('int64'), index=y_test.index)

# Confusion Matrix & Outputs
cm_4 = confusion_matrix(y_test, y_pred_4)
outputs_4 = outputs(cm_4)
print("\nConfusion Matrix : \n", cm_4)
print("\nAccuracy : ", outputs_4[0])


Confusion Matrix : 
 [[ 500  579]
 [ 322 1223]]

Accuracy :  0.6566


# Blending Model

In [408]:
# Splitting Data
# Rebuild the merged game table and carve it into three season-based splits for the
# blending stage: 2007-2014 (train), 2015-2016 (validation), 2017-2018 (test).
# NOTE(review): this rebinds `df`, `X_train`, `y_train`, `X_test` and `y_test`, which
# other cells of the notebook also use -- confirm the column set produced here matches
# what the already-fitted base models were trained on.
FP1 = pd.read_csv('FP1.csv')
FP2 = pd.read_csv('FP2.csv')
df = (
    pd.merge(FP1, FP2, on='GAME_ID')
    .rename(columns={'SEASON_y': 'SEASON'})
    .drop(columns=['TIMESTAMP', 'HOME_TEAM_NAME', 'SEASON_x'])
)

# Season ranges are inclusive on both ends; SEASON and GAME_ID are only needed for the
# split itself, so they are removed from each subset afterwards.
train_df = df.loc[df['SEASON'].between(2007, 2014)].drop(columns=['SEASON', 'GAME_ID'])
val_df = df.loc[df['SEASON'].between(2015, 2016)].drop(columns=['SEASON', 'GAME_ID'])
test_df = df.loc[df['SEASON'].between(2017, 2018)].drop(columns=['SEASON', 'GAME_ID'])

# Features are every remaining column except the HOME_TEAM_WINS label.
X_train = train_df.drop(columns='HOME_TEAM_WINS')
y_train = train_df['HOME_TEAM_WINS']
X_val = val_df.drop(columns='HOME_TEAM_WINS')
y_val = val_df['HOME_TEAM_WINS']
X_test = test_df.drop(columns='HOME_TEAM_WINS')
y_test = test_df['HOME_TEAM_WINS']

# Share of the full merged table that lands in each split (fraction, 3 decimals).
train_prop = np.round(len(X_train) / len(df), 3)
val_prop = np.round(len(X_val) / len(df), 3)
test_prop = np.round(len(X_test) / len(df), 3)

print(f"Training Data: {len(X_train)} rows -- {np.round(train_prop*100,2)}%")
print(f"Validation Data: {len(X_val)} rows -- {np.round(val_prop*100,2)}%")
print(f"Testing Data: {len(X_test)} rows -- {np.round(test_prop*100,2)}%")

Training Data: 10216 rows -- 66.1%
Validation Data: 2625 rows -- 17.0%
Testing Data: 2624 rows -- 17.0%


In [409]:
# Creating blend_df
# Build the inputs for the blending model: one column per base model holding column 1
# of that model's predict_proba output, plus the HOME_TEAM_WINS label.
# NOTE(review): the saved output of this cell is an XGBoostError ("832 vs. 833"
# columns), presumably raised by model_3.predict_proba(X_val) -- X_val appears to have
# one column fewer than the data model_3 was fitted on. Reconcile the feature sets
# before relying on anything downstream of this cell.

# Validation-set probabilities from each base model (model_2 only sees the `top` columns).
y_prob_1_val = model_1.predict_proba(X_val)
y_prob_2_val = model_2.predict_proba(X_val[top])
y_prob_3_val = model_3.predict_proba(X_val)
y_prob_4_val = model_4.predict_proba(X_val)

# The label Series supplies the row index; the probability arrays are placed
# positionally against it.
val_df_blend = pd.DataFrame({
    'Logistic Regression': y_prob_1_val[:, 1],
    'Random Forest': y_prob_2_val[:, 1],
    'XGBoost': y_prob_3_val[:, 1],
    'Neural Network': y_prob_4_val[:, 1],
    'HOME_TEAM_WINS': y_val,
})

# Test-set frame reuses the y_prob_1 .. y_prob_4 arrays computed by the model cells.
test_df_blend = pd.DataFrame({
    'Logistic Regression': y_prob_1[:, 1],
    'Random Forest': y_prob_2[:, 1],
    'XGBoost': y_prob_3[:, 1],
    'Neural Network': y_prob_4[:, 1],
    'HOME_TEAM_WINS': y_test,
})

X_val_blend = val_df_blend.drop(columns='HOME_TEAM_WINS')
y_val_blend = val_df_blend['HOME_TEAM_WINS']
X_test_blend = test_df_blend.drop(columns='HOME_TEAM_WINS')
y_test_blend = test_df_blend['HOME_TEAM_WINS']


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 500 out of 500 | elapsed:    0.1s finished


XGBoostError: [03:13:59] /Users/travis/build/dmlc/xgboost/src/predictor/cpu_predictor.cc:258: Check failed: m->NumColumns() == model.learner_model_param->num_feature (832 vs. 833) : Number of columns in data must equal to trained model.
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000000114936074 dmlc::LogMessageFatal::~LogMessageFatal() + 116
  [bt] (1) 2   libxgboost.dylib                    0x0000000114a5157d void xgboost::predictor::CPUPredictor::DispatchedInplacePredict<xgboost::data::ArrayAdapter>(dmlc::any const&, std::__1::shared_ptr<xgboost::DMatrix>, xgboost::gbm::GBTreeModel const&, float, xgboost::PredictionCacheEntry*, unsigned int, unsigned int) const + 365
  [bt] (2) 3   libxgboost.dylib                    0x0000000114a4d8b1 xgboost::predictor::CPUPredictor::InplacePredict(dmlc::any const&, std::__1::shared_ptr<xgboost::DMatrix>, xgboost::gbm::GBTreeModel const&, float, xgboost::PredictionCacheEntry*, unsigned int, unsigned int) const + 401
  [bt] (3) 4   libxgboost.dylib                    0x00000001149d48a8 xgboost::gbm::GBTree::InplacePredict(dmlc::any const&, std::__1::shared_ptr<xgboost::DMatrix>, float, xgboost::PredictionCacheEntry*, unsigned int, unsigned int) const + 424
  [bt] (4) 5   libxgboost.dylib                    0x00000001149e932b xgboost::LearnerImpl::InplacePredict(dmlc::any const&, std::__1::shared_ptr<xgboost::DMatrix>, xgboost::PredictionType, float, xgboost::HostDeviceVector<float>**, unsigned int, unsigned int) + 123
  [bt] (5) 6   libxgboost.dylib                    0x000000011492fb5b void InplacePredictImpl<xgboost::data::ArrayAdapter>(std::__1::shared_ptr<xgboost::data::ArrayAdapter>, std::__1::shared_ptr<xgboost::DMatrix>, char const*, xgboost::Learner*, unsigned long, unsigned long, unsigned long long const**, unsigned long long*, float const**) + 843
  [bt] (6) 7   libxgboost.dylib                    0x000000011492f4e3 XGBoosterPredictFromDense + 339
  [bt] (7) 8   libffi.7.dylib                      0x000000010182fead ffi_call_unix64 + 85



In [352]:
# Blended Logistic Regression
# Meta-model: a logistic regression fitted on the base models' validation-set
# probabilities (X_val_blend) and evaluated on their test-set probabilities.
# NOTE(review): this cell's execution count is lower than that of the cells that build
# the blend frames, so the recorded output may come from an earlier kernel state --
# re-run top to bottom to confirm the reported accuracy.
model_5 = LogisticRegression(n_jobs=-1, verbose=3, random_state=69)
model_5.fit(X_val_blend, y_val_blend)

# Label 1 when the second predict_proba column is strictly above 0.5, otherwise 0.
y_prob_5 = model_5.predict_proba(X_test_blend)
y_pred_5 = pd.Series((y_prob_5[:, 1] > 0.5).astype('int64'), index=y_test_blend.index)

# Confusion Matrix & Outputs
cm_5 = confusion_matrix(y_test_blend, y_pred_5)
outputs_5 = outputs(cm_5)
print("\nConfusion Matrix : \n", cm_5)
print("\nAccuracy : ", outputs_5[0])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.



Confusion Matrix : 
 [[ 482  597]
 [ 294 1251]]

Accuracy :  0.6604


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.0s finished


## Final Model Results

The cell below tabulates the accuracy (our primary performance metric), true positive rate (TPR), and false positive rate (FPR) for the baseline model and each of the four models above.

In [None]:
# NOTE(review): this cell is fully commented out and has no execution count, so the
# summary table is not currently produced.
# NOTE(review): as written it would rebind `df`, the name the data-splitting cells use
# for the merged game table -- consider a distinct name before re-enabling.
# NOTE(review): `outputs_0` .. `outputs_3` are defined in other cells; the blended
# model's `outputs_5` is not part of this table -- confirm whether that is intended.
# # Generating the Output Table
# df = pd.DataFrame({'Baseline Model': np.round(outputs_0,4),
#                   'Logistic Regression': np.round(outputs_1,4),
#                   'Random Forest': np.round(outputs_2,4),
#                   'XGBoost': np.round(outputs_3,4),
#                   'Neural Network': np.round(outputs_4,4)}, index = ['Accuracy', 'TPR', 'FPR'], ).T
# df