In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

In [5]:
def join_feature_name_with_importance_value(features, importances):
    """
    Join via a list of tuples, feature names with their importance values
    :param features: data frame whose features are represented by columns used by classifier
    :param importances: feature importance scores assigned by classifier
    :return: sorted list (highest importances first) of feature,importance tuples
    """
    if features.columns.shape[0] != importances.shape[0]:
        return []

    feature_importances = []
    for item in range(features.columns.shape[0]):
        feature_importances.append((features.columns[item], importances[item]))
    feature_importances_sorted = sorted(feature_importances, reverse=True, key=lambda kv: kv[1])

    return feature_importances_sorted

In [7]:
game_data = pd.read_csv('D1_2015_Training_Data.csv')


game_data = game_data[game_data['scheduled_date'] > '2015-12-15']
encode_game_result = {"game_result": {False: "LOSS", True: "WIN"}}
game_data.replace(encode_game_result, inplace=True)
print("Number of records= ", game_data.shape[0])
game_data.info()

Number of records=  4061
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4061 entries, 1454 to 5514
Data columns (total 36 columns):
scheduled_date                  4061 non-null object
game_id                         4061 non-null object
home_team                       4061 non-null bool
market                          4061 non-null object
opp_market                      4061 non-null object
points_game                     4061 non-null int64
game_result                     4061 non-null object
opp_points_game                 4061 non-null int64
prn_enter_pts_avg               4061 non-null float64
prn_enter_allow_pts_avg         4061 non-null float64
prn_enter_possesion_avg         4061 non-null float64
prn_enter_fg_pct                4061 non-null float64
prn_enter_allow_fg_pct          4061 non-null float64
prn_enter_off_rebs_avg          4061 non-null float64
prn_enter_allow_off_rebs_avg    4061 non-null float64
prn_enter_def_rebs_avg          4061 non-null float64
prn_enter_ft_

In [8]:
features = ['home_team','prn_enter_pts_avg','prn_enter_fg_pct','prn_enter_off_rebs_avg','prn_enter_ft_att_avg',
            'prn_enter_ft_pct','prn_enter_turnover_avg','prn_enter_win_pct','prn_enter_allow_pts_avg',
            'prn_enter_allow_fg_pct','prn_enter_allow_off_rebs_avg','prn_enter_allow_ft_att_avg','prn_enter_take_away_avg',
            'opp_enter_pts_avg','opp_enter_fg_pct','opp_enter_off_rebs_avg','opp_enter_ft_att_avg',
            'opp_enter_ft_pct','opp_enter_turnover_avg','opp_enter_win_pct','opp_enter_allow_pts_avg',
            'opp_enter_allow_fg_pct','opp_enter_allow_off_rebs_avg','opp_enter_allow_ft_att_avg','opp_enter_take_away_avg',
            'prn_enter_possesion_avg','opp_enter_possesion_avg']

X = game_data[features].copy()
X = pd.get_dummies(X, columns=['home_team'], drop_first=True)
y= game_data['game_result'].copy()
y = pd.get_dummies(y, columns=['game_result'], drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 10)
print(X_test.shape)

(813, 27)


In [9]:
tree_count =2000

rf_classifier: RandomForestClassifier = RandomForestClassifier(tree_count, max_features='sqrt',
                                                                   min_samples_split=10, min_samples_leaf=2,
                                                                   max_depth=10, random_state=0)
    
rf_classifier.fit(X_train, y_train.WIN)
score = rf_classifier.score(X_train, y_train.WIN)
print("Training Model Score= ", score)

Training Model Score=  0.9384236453201971


In [10]:
y_pred = rf_classifier.predict(X_test)
print("RF model accuracy is %2.2f" % metrics.accuracy_score(y_test, y_pred))

RF model accuracy is 0.71


In [11]:
prediction_probabilities = rf_classifier.predict_proba(X_test)
prediction_probabilities[:,1]
pred_probs = pd.Series(prediction_probabilities[:,1], index=X_test.index)

In [12]:
predictions = pd.Series(y_pred, index=y_test.index)
test_games = game_data[game_data.index.isin(X_test.index)].copy()
test_games.shape

test_games['predicted_result'] = predictions
test_games['pred_win_prob'] = pred_probs
test_games.head()


Unnamed: 0,scheduled_date,game_id,home_team,market,opp_market,points_game,game_result,opp_points_game,prn_enter_pts_avg,prn_enter_allow_pts_avg,...,opp_enter_allow_off_rebs_avg,opp_enter_def_rebs_avg,opp_enter_ft_att_avg,opp_enter_allow_ft_att_avg,opp_enter_ft_pct,opp_enter_turnover_avg,opp_enter_take_away_avg,opp_enter_win_pct,predicted_result,pred_win_prob
1455,2015-12-15 00:00:00.0000000,85f37e83-6b6f-4512-b97a-ea06dca185cd,False,Coastal Carolina,Wofford,71,WIN,63,66.67,71.0,...,7.14,20.71,19.0,22.43,0.744361,10.86,11.0,0.29,0,0.407124
1463,2015-12-15 00:00:00.0000000,a2064df4-beee-4969-94d5-9b2c08160342,False,Prairie View A&M,Tulane,49,LOSS,63,57.25,79.75,...,9.56,24.89,21.89,18.67,0.654822,14.56,13.78,0.56,0,0.173984
1470,2015-12-16 00:00:00.0000000,23ac6027-7c73-4acf-820c-b0d170ccf342,False,DePaul,Stanford,60,LOSS,79,70.78,70.22,...,9.86,22.43,29.57,18.14,0.63285,13.86,10.86,0.57,0,0.334117
1471,2015-12-16 00:00:00.0000000,3e705f68-d06a-4c1b-8b0f-06e87be50430,False,Drexel,South Carolina,54,LOSS,79,68.67,73.17,...,8.63,26.13,28.88,20.63,0.688312,12.25,16.13,1.0,0,0.136769
1479,2015-12-16 00:00:00.0000000,6966e23e-f98e-48b1-a24e-70c7245dd67e,False,Jackson State,Miami (OH),53,LOSS,64,69.88,74.75,...,9.25,19.25,20.5,19.75,0.756098,12.0,15.88,0.5,0,0.358594


In [13]:
encode_pred_result = {"predicted_result": {0: "LOSS", 1: "WIN"}}
test_games.replace(encode_pred_result, inplace=True)
incorrrect_predictions =test_games[test_games['game_result'] != test_games['predicted_result']].sort_values(by='pred_win_prob', 
                                                                                                            ascending=False)
incorrrect_predictions

Unnamed: 0,scheduled_date,game_id,home_team,market,opp_market,points_game,game_result,opp_points_game,prn_enter_pts_avg,prn_enter_allow_pts_avg,...,opp_enter_allow_off_rebs_avg,opp_enter_def_rebs_avg,opp_enter_ft_att_avg,opp_enter_allow_ft_att_avg,opp_enter_ft_pct,opp_enter_turnover_avg,opp_enter_take_away_avg,opp_enter_win_pct,predicted_result,pred_win_prob
4382,2016-02-20 00:00:00.0000000,13e6d46f-60e3-4983-a5a1-eac010c849f5,True,Chattanooga,UNC Greensboro,64,LOSS,79,76.15,67.19,...,7.52,22.48,16.48,22.04,0.689320,12.16,11.92,0.32,WIN,0.914823
5327,2016-03-12 00:00:00.0000000,f1424672-3ec5-4822-af70-a4759bf77e43,True,Houston,Tulane,69,LOSS,72,78.17,68.77,...,9.03,23.48,18.12,20.97,0.672241,13.09,12.67,0.33,WIN,0.905208
3735,2016-02-06 00:00:00.0000000,2ef775d5-f331-4694-b9e7-b9fdb001a949,True,Jackson State,Alcorn State,64,LOSS,72,68.85,68.00,...,9.11,20.39,21.56,23.28,0.654639,14.61,11.94,0.33,WIN,0.848416
5234,2016-03-10 00:00:00.0000000,7a4d0eb9-f9f1-4f0d-a5f6-911c293b7ba8,True,Iowa,Illinois,66,LOSS,68,78.50,68.73,...,8.16,22.50,18.28,16.06,0.747009,10.09,12.81,0.44,WIN,0.798883
2043,2016-01-01 00:00:00.0000000,070bb82b-cc23-4944-bc81-8711e0e8398d,True,South Dakota,IUPUI,66,LOSS,77,78.00,82.23,...,8.64,22.79,19.00,24.79,0.687970,14.36,13.14,0.21,WIN,0.793645
4885,2016-03-02 00:00:00.0000000,c0c81cc5-1025-4b8c-83a4-a2ae5a630e7e,True,Buffalo,Miami (OH),59,LOSS,67,76.29,76.43,...,8.79,20.54,19.29,20.61,0.733333,12.25,13.79,0.36,WIN,0.777024
2921,2016-01-20 00:00:00.0000000,b4562323-5bd2-4e10-8f61-e9718fa72cfe,True,Pittsburgh,North Carolina State,61,LOSS,78,82.56,66.13,...,10.44,24.78,25.89,17.28,0.675966,10.39,10.67,0.56,WIN,0.757229
4357,2016-02-19 00:00:00.0000000,cb7b1b6e-4377-46f1-8732-f2ca34d6cdef,True,St. Francis (PA),St. Francis (BKN),58,LOSS,70,69.00,71.42,...,8.73,22.00,20.38,20.00,0.654717,13.00,12.88,0.38,WIN,0.753947
2544,2016-01-12 00:00:00.0000000,7262b2d9-548b-4a52-9f54-9709c9c7bf05,True,Alabama A&M,Mississippi Valley State,73,LOSS,79,69.55,71.00,...,11.56,21.94,18.06,22.06,0.636678,12.00,14.44,0.06,WIN,0.749233
2583,2016-01-13 00:00:00.0000000,68e00449-79ed-4a3f-a946-b528274aa916,True,Ohio,Bowling Green,75,LOSS,91,78.23,72.92,...,9.31,26.46,23.46,20.54,0.672131,12.92,12.00,0.62,WIN,0.743070


In [14]:
incorrrect_predictions.shape

(237, 38)

In [15]:
importances = rf_classifier.feature_importances_
feature_importances = join_feature_name_with_importance_value(X_train, importances)
print(feature_importances)   

[('opp_enter_win_pct', 0.0746909945619098), ('home_team_True', 0.0737513208473952), ('prn_enter_win_pct', 0.07104770574449341), ('opp_enter_allow_pts_avg', 0.05036030810647399), ('opp_enter_fg_pct', 0.049802106168305355), ('prn_enter_pts_avg', 0.04328483651829173), ('opp_enter_pts_avg', 0.04304809858237933), ('prn_enter_allow_pts_avg', 0.04166863061327338), ('prn_enter_allow_fg_pct', 0.03894762200488538), ('opp_enter_allow_fg_pct', 0.03782633461561301), ('prn_enter_fg_pct', 0.03770227435180364), ('prn_enter_turnover_avg', 0.03488318146033689), ('opp_enter_allow_off_rebs_avg', 0.03222676208393415), ('opp_enter_off_rebs_avg', 0.030699734531647125), ('prn_enter_allow_ft_att_avg', 0.030567374680136594), ('prn_enter_off_rebs_avg', 0.029434871567043774), ('opp_enter_turnover_avg', 0.02814219749734867), ('opp_enter_allow_ft_att_avg', 0.027637753171134424), ('opp_enter_ft_pct', 0.02611997866167957), ('opp_enter_possesion_avg', 0.025989170759758738), ('prn_enter_take_away_avg', 0.02550884440341

In [18]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

In [19]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}


In [32]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2,
                               random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train,  y_train.WIN)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.4min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [33]:
rf_random.best_params_

{'n_estimators': 1800,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_depth': 10}