In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

In [14]:
def join_feature_name_with_importance_value(features, importances):
    """
    Join via a list of tuples, feature names with their importance values
    :param features: data frame whose features are represented by columns used by classifier
    :param importances: feature importance scores assigned by classifier
    :return: sorted list (highest importances first) of feature,importance tuples
    """
    if features.columns.shape[0] != importances.shape[0]:
        return []

    feature_importances = []
    for item in range(features.columns.shape[0]):
        feature_importances.append((features.columns[item], importances[item]))
    feature_importances_sorted = sorted(feature_importances, reverse=True, key=lambda kv: kv[1])

    return feature_importances_sorted

In [53]:
game_data = pd.read_csv('D1_2015_Processed_Stats.csv')


game_data = game_data[game_data['game_date'] > '2015-12-15']
print("Number of records= ", game_data.shape[0])
game_data.info()

Number of records=  3927
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3927 entries, 1222 to 5148
Data columns (total 15 columns):
game_id                      3927 non-null object
game_date                    3927 non-null object
principal_team               3927 non-null object
opponent_team                3927 non-null object
home_team                    3927 non-null bool
principal_score              3927 non-null int64
opponent_score               3927 non-null int64
delta_avg_points_per_game    3927 non-null float64
delta_field_goals_pct        3927 non-null float64
delta_avg_off_rebounds       3927 non-null float64
delta_avg_free_throws_att    3927 non-null float64
delta_avg_free_throws_pct    3927 non-null float64
delta_avg_turnovers          3927 non-null float64
delta_win_pct                3927 non-null float64
game_result                  3927 non-null object
dtypes: bool(1), float64(7), int64(2), object(5)
memory usage: 464.0+ KB


In [54]:
features = ['delta_avg_points_per_game','delta_field_goals_pct','delta_avg_off_rebounds','delta_avg_free_throws_att',
                'delta_avg_free_throws_pct','delta_avg_turnovers','delta_win_pct','home_team']

X = game_data[features].copy()
X = pd.get_dummies(X, columns=['home_team'], drop_first=True)
y= game_data['game_result'].copy()
y = pd.get_dummies(y, columns=['game_result'], drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 10)
print(X_test.shape)

(786, 8)


In [55]:
tree_count =2000

rf_classifier: RandomForestClassifier = RandomForestClassifier(tree_count, max_features='sqrt',
                                                                   min_samples_split=10, min_samples_leaf=2,
                                                                   max_depth=10, random_state=0)
    
rf_classifier.fit(X_train, y_train.WIN)
score = rf_classifier.score(X_train, y_train.WIN)
print("Training Model Score= ", score)

Training Model Score=  0.8685132123527539


In [56]:
y_pred = rf_classifier.predict(X_test)
print("RF model accuracy is %2.2f" % metrics.accuracy_score(y_test, y_pred))

RF model accuracy is 0.69


In [57]:
prediction_probabilities = rf_classifier.predict_proba(X_test)
prediction_probabilities[:,1]
pred_probs = pd.Series(prediction_probabilities[:,1], index=X_test.index)

In [58]:
predictions = pd.Series(y_pred, index=y_test.index)
test_games = game_data[game_data.index.isin(X_test.index)].copy()
test_games.shape

test_games['predicted_result'] = predictions
test_games['pred_win_prob'] = pred_probs
test_games.head()


Unnamed: 0,game_id,game_date,principal_team,opponent_team,home_team,principal_score,opponent_score,delta_avg_points_per_game,delta_field_goals_pct,delta_avg_off_rebounds,delta_avg_free_throws_att,delta_avg_free_throws_pct,delta_avg_turnovers,delta_win_pct,game_result,predicted_result,pred_win_prob
1223,44baa74b-74e6-42a6-8dc2-f581d3c32e9b,2015-12-16,UC Davis,San Diego,False,55,61,18.0,0.103625,-1.041667,4.5,0.017083,1.333333,0.25,LOSS,1,0.635985
1231,4b551d48-7774-4365-a95c-3fb12d339da7,2015-12-16,UC Riverside,Houston Baptist,False,59,72,9.410714,0.016714,2.125,-1.964286,-0.041429,-0.964286,0.357143,LOSS,1,0.795741
1238,23ac6027-7c73-4acf-820c-b0d170ccf342,2015-12-16,Stanford,DePaul,True,79,60,-0.063492,-0.020349,4.634921,4.349206,-0.042571,0.079365,0.015873,WIN,1,0.530162
1239,94ef20cb-0fd5-4b8c-87cc-84db0a5fb902,2015-12-16,Northern Kentucky,Michigan,False,62,77,-9.333333,-0.037167,2.388889,1.222222,-0.091,7.0,-0.5,LOSS,0,0.087908
1246,d3a3dccd-b480-4330-a592-d0a795b49ab2,2015-12-16,Memphis,Southern University,True,72,67,7.666667,-0.010042,2.0,11.25,0.050917,0.791667,0.083333,WIN,1,0.639763


In [59]:
encode_pred_result = {"predicted_result": {0: "LOSS", 1: "WIN"}}
test_games.replace(encode_pred_result, inplace=True)
incorrrect_predictions =test_games[test_games['game_result'] != test_games['predicted_result']].sort_values(by='pred_win_prob', 
                                                                                                            ascending=False)
incorrrect_predictions

Unnamed: 0,game_id,game_date,principal_team,opponent_team,home_team,principal_score,opponent_score,delta_avg_points_per_game,delta_field_goals_pct,delta_avg_off_rebounds,delta_avg_free_throws_att,delta_avg_free_throws_pct,delta_avg_turnovers,delta_win_pct,game_result,predicted_result,pred_win_prob
4965,f1424672-3ec5-4822-af70-a4759bf77e43,2016-03-12,Houston,Tulane,True,69,72,12.104167,0.068852,1.097917,2.643750,0.024115,-2.704167,0.389583,LOSS,WIN,0.959668
4365,52f185d5-9ca3-4a00-b2de-a6a1e17fdd34,2016-02-27,Radford,Longwood,True,81,92,0.962963,0.002593,2.333333,-1.222222,-0.029741,-1.296296,0.333333,LOSS,WIN,0.869548
2131,9d73eb4a-b890-40cb-b803-a2e6ebf16a19,2016-01-09,LIU Brooklyn,St. Francis (PA),True,65,72,7.769231,0.006242,1.917582,3.131868,0.033225,1.851648,0.115385,LOSS,WIN,0.859534
2819,d9d7aa26-9551-4d67-9af2-df808cdbb68d,2016-01-23,High Point,Liberty,False,67,69,15.479167,0.034472,4.972222,5.638889,0.048014,-2.777778,0.458333,LOSS,WIN,0.856827
1855,295f7a2b-377c-49eb-b216-a1b83cc4bb1b,2016-01-02,Alabama A&M,Grambling State,False,78,81,13.666667,0.073028,2.833333,-1.527778,-0.039194,-4.638889,0.375000,LOSS,WIN,0.854159
3745,c287964c-e703-4592-b84c-0e8e669c0d85,2016-02-13,Maryland,Wisconsin,True,57,70,6.583333,0.055542,-2.416667,-2.125000,0.070583,1.875000,0.250000,LOSS,WIN,0.846217
2082,f122419f-7dda-4318-b0e9-1e2787302d20,2016-01-08,LIU Brooklyn,Robert Morris,True,60,70,7.666667,0.002210,3.153846,6.769231,0.042318,-0.533333,0.405128,LOSS,WIN,0.836970
4300,508716fe-cf09-4250-b4c2-a77e6abfba29,2016-02-26,Hawaii,UC Riverside,True,71,77,9.392977,0.030634,0.809365,6.734114,-0.032982,-1.192308,0.446488,LOSS,WIN,0.826147
4708,3fcdf401-eb94-4bc2-be03-254f45893a63,2016-03-05,Montana,Northern Colorado,True,72,78,-0.520000,0.013167,0.127407,-3.325926,-0.021785,-1.118519,0.380741,LOSS,WIN,0.825170
3962,0dbadc5c-0450-4020-9c8c-fba68bb3c68e,2016-02-18,Arkansas,Auburn,True,86,90,8.063333,0.052880,-0.145000,-0.351667,0.081503,-2.086667,0.105000,LOSS,WIN,0.821879


In [60]:
incorrrect_predictions.shape

(240, 17)

In [61]:
importances = rf_classifier.feature_importances_
feature_importances = join_feature_name_with_importance_value(X_train, importances)
print(feature_importances)   

[('delta_win_pct', 0.2647121780281331), ('delta_field_goals_pct', 0.13607676913544728), ('delta_avg_points_per_game', 0.11781221490135306), ('home_team_True', 0.10913465201400077), ('delta_avg_off_rebounds', 0.10803620216364766), ('delta_avg_turnovers', 0.10657433561067883), ('delta_avg_free_throws_pct', 0.07996059059653968), ('delta_avg_free_throws_att', 0.07769305755019955)]


In [18]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

In [19]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}


In [32]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2,
                               random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train,  y_train.WIN)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.4min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [33]:
rf_random.best_params_

{'n_estimators': 1800,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_depth': 10}