In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

In [5]:
def join_feature_name_with_importance_value(features, importances):
    """
    Join via a list of tuples, feature names with their importance values
    :param features: data frame whose features are represented by columns used by classifier
    :param importances: feature importance scores assigned by classifier
    :return: sorted list (highest importances first) of feature,importance tuples
    """
    if features.columns.shape[0] != importances.shape[0]:
        return []

    feature_importances = []
    for item in range(features.columns.shape[0]):
        feature_importances.append((features.columns[item], importances[item]))
    feature_importances_sorted = sorted(feature_importances, reverse=True, key=lambda kv: kv[1])

    return feature_importances_sorted

In [7]:
game_data = pd.read_csv('D1_2015_Training_Data.csv')


game_data = game_data[game_data['scheduled_date'] > '2015-12-15']
encode_game_result = {"game_result": {False: "LOSS", True: "WIN"}}
game_data.replace(encode_game_result, inplace=True)
print("Number of records= ", game_data.shape[0])
game_data.info()

Number of records=  4061
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4061 entries, 1454 to 5514
Data columns (total 36 columns):
scheduled_date                  4061 non-null object
game_id                         4061 non-null object
home_team                       4061 non-null bool
market                          4061 non-null object
opp_market                      4061 non-null object
points_game                     4061 non-null int64
game_result                     4061 non-null object
opp_points_game                 4061 non-null int64
prn_enter_pts_avg               4061 non-null float64
prn_enter_allow_pts_avg         4061 non-null float64
prn_enter_possesion_avg         4061 non-null float64
prn_enter_fg_pct                4061 non-null float64
prn_enter_allow_fg_pct          4061 non-null float64
prn_enter_off_rebs_avg          4061 non-null float64
prn_enter_allow_off_rebs_avg    4061 non-null float64
prn_enter_def_rebs_avg          4061 non-null float64
prn_enter_ft_

In [7]:
features = ['home_team','prn_enter_pts_avg','prn_enter_fg_pct','prn_enter_off_rebs_avg','prn_enter_ft_att_avg',
            'prn_enter_ft_pct','prn_enter_turnover_avg','prn_enter_win_pct','prn_enter_allow_pts_avg',
            'prn_enter_allow_fg_pct','prn_enter_allow_off_rebs_avg','prn_enter_allow_ft_att_avg','prn_enter_take_away_avg',
            'opp_enter_pts_avg','opp_enter_fg_pct','opp_enter_off_rebs_avg','opp_enter_ft_att_avg',
            'opp_enter_ft_pct','opp_enter_turnover_avg','opp_enter_win_pct','opp_enter_allow_pts_avg',
            'opp_enter_allow_fg_pct','opp_enter_allow_off_rebs_avg','opp_enter_allow_ft_att_avg','opp_enter_take_away_avg',
            'prn_enter_possesion_avg','']

X = game_data[features].copy()
X = pd.get_dummies(X, columns=['home_team'], drop_first=True)
y= game_data['game_result'].copy()
y = pd.get_dummies(y, columns=['game_result'], drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 10)
print(X_test.shape)

(786, 25)


In [8]:
tree_count =2000

rf_classifier: RandomForestClassifier = RandomForestClassifier(tree_count, max_features='sqrt',
                                                                   min_samples_split=10, min_samples_leaf=2,
                                                                   max_depth=10, random_state=0)
    
rf_classifier.fit(X_train, y_train.WIN)
score = rf_classifier.score(X_train, y_train.WIN)
print("Training Model Score= ", score)

Training Model Score=  0.9340974212034384


In [9]:
y_pred = rf_classifier.predict(X_test)
print("RF model accuracy is %2.2f" % metrics.accuracy_score(y_test, y_pred))

RF model accuracy is 0.70


In [10]:
prediction_probabilities = rf_classifier.predict_proba(X_test)
prediction_probabilities[:,1]
pred_probs = pd.Series(prediction_probabilities[:,1], index=X_test.index)

In [11]:
predictions = pd.Series(y_pred, index=y_test.index)
test_games = game_data[game_data.index.isin(X_test.index)].copy()
test_games.shape

test_games['predicted_result'] = predictions
test_games['pred_win_prob'] = pred_probs
test_games.head()


Unnamed: 0,game_id,home_team,principal_team,opponent_team,game_result,game_date,principal_score,opponent_score,prn_pts_avg,prn_fg_pct,...,opp_ft_pct,opp_turnover_avg,opp_win_pct,opp_allow_pts_avg,opp_allow_fg_pct,opp_allow_off_rebs_avg,opp_allow_ft_att_avg,opp_take_away_avg,predicted_result,pred_win_prob
1,000bc8c4-042e-4986-90c3-64e60bf1566d,True,Presbyterian,Coastal Carolina,LOSS,2016-02-04,66,69,61.42105,0.429735,...,0.666667,12.157895,0.555556,64.52631,0.407795,8.368421,19.31579,9.473684,0,0.293726
17,00be2316-61e0-4451-8ad6-f867e77b1ee1,False,Sam Houston State,Stephen F. Austin,LOSS,2016-01-24,64,76,65.6875,0.400447,...,0.716157,10.785714,0.615385,65.0,0.484067,6.785714,19.642857,15.714286,0,0.283798
27,0149515a-045f-4c64-8223-e86ad86b6912,False,Memphis,Connecticut,LOSS,2016-03-13,58,72,74.85294,0.413927,...,0.78456,10.294118,0.69697,61.382355,0.381867,8.617647,19.117647,11.294118,0,0.272154
28,01565384-ecae-4cca-8b94-ecbf471a2973,True,Loyola (IL),Cleveland State,LOSS,2015-12-17,54,60,57.11111,0.427918,...,0.666667,11.333333,0.125,62.77778,0.510363,7.222222,21.222221,12.777778,1,0.627725
36,01ece451-d68c-4918-b704-a87c0a201134,False,Wichita State,Bradley,WIN,2016-01-03,85,58,64.75,0.401562,...,0.65748,15.214286,0.076923,67.14286,0.432133,9.5,26.571428,10.214286,1,0.659669


In [12]:
encode_pred_result = {"predicted_result": {0: "LOSS", 1: "WIN"}}
test_games.replace(encode_pred_result, inplace=True)
incorrrect_predictions =test_games[test_games['game_result'] != test_games['predicted_result']].sort_values(by='pred_win_prob', 
                                                                                                            ascending=False)
incorrrect_predictions

Unnamed: 0,game_id,home_team,principal_team,opponent_team,game_result,game_date,principal_score,opponent_score,prn_pts_avg,prn_fg_pct,...,opp_ft_pct,opp_turnover_avg,opp_win_pct,opp_allow_pts_avg,opp_allow_fg_pct,opp_allow_off_rebs_avg,opp_allow_ft_att_avg,opp_take_away_avg,predicted_result,pred_win_prob
256,0c397460-7c98-4264-acb3-47750c2f5757,True,UC Irvine,UC Santa Barbara,LOSS,2016-01-31,60,76,64.619050,0.436929,...,0.726592,12.473684,0.388889,67.736840,0.419741,7.947369,24.210526,11.000000,WIN,0.846316
4269,d717fbbb-d711-4e4d-9f35-f4b326f8ec5d,True,Coastal Carolina,Gardner-Webb,LOSS,2016-03-05,65,69,68.615390,0.449038,...,0.649329,12.793103,0.464286,71.896550,0.453224,8.448276,21.827587,12.862069,WIN,0.845923
547,1bb99b67-32fe-4f47-969e-daea45f6bb04,False,Dayton,La Salle,LOSS,2016-01-09,57,61,69.600000,0.471895,...,0.671875,11.083333,0.272727,69.833336,0.455118,9.500000,19.833334,12.083333,WIN,0.827885
2610,81d95e71-d925-40f2-b63b-9a5002156b91,True,UC Riverside,Cal State Northridge,LOSS,2016-02-04,71,73,66.260870,0.428571,...,0.716553,13.100000,0.263158,72.700000,0.447826,8.650000,19.200000,11.350000,WIN,0.787284
70,03727982-6637-4d13-b3f5-5685c460ecf9,True,Cincinnati,Temple,LOSS,2015-12-29,70,77,72.571430,0.459318,...,0.751295,7.727273,0.500000,64.272730,0.411864,8.818182,19.181818,8.909091,WIN,0.775779
772,2568c966-969c-4295-83b3-c30f59516259,True,Villanova,Providence,LOSS,2016-01-24,76,82,71.800000,0.465741,...,0.714286,10.150000,0.842105,63.250000,0.423913,7.800000,15.200000,13.650000,WIN,0.775374
4042,cc2225c3-7e6c-4e18-8b97-6f38a99957e8,True,Cal State Fullerton,Cal State Northridge,LOSS,2016-01-10,75,85,67.076920,0.417889,...,0.714765,12.928572,0.230769,72.857140,0.445409,8.500000,18.785715,9.642858,WIN,0.767134
2222,6f476e01-0cf8-4636-a82e-7700863e89e5,True,Samford,South Alabama,LOSS,2015-12-19,70,72,63.900000,0.423695,...,0.643836,11.625000,0.142857,65.750000,0.476440,6.000000,19.500000,12.125000,WIN,0.760279
3818,c1504d0c-3024-471d-8832-03d4ec08a615,False,Princeton,Harvard,LOSS,2016-03-05,71,73,75.440000,0.464968,...,0.583519,13.407408,0.384615,65.259260,0.414993,8.185185,18.666666,10.777778,WIN,0.755200
3206,a16cd5df-fdb5-4794-8dd3-1cee79322b50,True,Winthrop,Campbell,LOSS,2015-12-31,83,90,72.818184,0.445393,...,0.669202,10.727273,0.500000,68.545456,0.453782,8.272727,20.363636,11.909091,WIN,0.754618


In [13]:
incorrrect_predictions.shape

(238, 34)

In [14]:
importances = rf_classifier.feature_importances_
feature_importances = join_feature_name_with_importance_value(X_train, importances)
print(feature_importances)   

[('home_team_True', 0.09002898103989619), ('prn_win_pct', 0.09000185808212349), ('opp_win_pct', 0.07281393795532239), ('prn_pts_avg', 0.0486810152291656), ('opp_fg_pct', 0.04682153353702121), ('prn_fg_pct', 0.045706785574935974), ('prn_allow_pts_avg', 0.04454944815122255), ('opp_allow_pts_avg', 0.043359993675971466), ('opp_pts_avg', 0.04244777681490297), ('prn_turnover_avg', 0.037280357553260354), ('prn_off_rebs_avg', 0.03700298368565242), ('prn_allow_fg_pct', 0.03659307844708432), ('opp_allow_fg_pct', 0.03407789176653632), ('opp_allow_off_rebs_avg', 0.03331006394172894), ('prn_allow_ft_att_avg', 0.028958486612102204), ('prn_allow_off_rebs_avg', 0.028606184662534263), ('opp_take_away_avg', 0.028228772306313828), ('opp_allow_ft_att_avg', 0.028209715677315762), ('opp_turnover_avg', 0.02811146377253532), ('prn_ft_att_avg', 0.02773046570423836), ('opp_off_rebs_avg', 0.02748959932741904), ('prn_take_away_avg', 0.025943924580933617), ('opp_ft_att_avg', 0.02528877453017781), ('opp_ft_pct', 0.

In [18]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

In [19]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}


In [32]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2,
                               random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train,  y_train.WIN)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.4min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [33]:
rf_random.best_params_

{'n_estimators': 1800,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_depth': 10}