## Decision Tree, Random Forest, XGBoost Models on Diff Data
USA World Series Results,
Run on "Diff" data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
#Import Data - only want USA matches
df = pd.read_csv('../data/output/final_diffs_all.csv')
df.head()

In [6]:
# Replace NaN's with zero
df.fillna(value=0, inplace=True)

In [7]:
# create result label: change from 1,0,2 to win, loss, tie
# if 'Score_Diff' is > 0 = win, if < 0, else tie
def createResult(x):
    if x > 0:
        return "W"
    elif x < 0:
        return "L"
    elif x == 0:
        return "T"

df.loc[:,'Result'] = df.loc[:,'Score_Diff'].apply(createResult)

In [8]:
df.head()

Unnamed: 0,Opp,Tournament,Poss_Time_Diff,Score_Diff,Conv_Diff,Tries_Diff,Passes_Diff,Contestable_KO_Win_pct_Diff,PenFK_Against_Diff,RuckMaul_Diff,Ruck_Win_pct_Diff,Cards_diff,Lineout_Win_Pct_Diff,Scrum_Win_Pct_Diff,Result
0,AUSTRALIA,2015_Cape_Town,13.96648,-10.638298,-14.285714,0.25,25.925926,-50.0,0.0,0.0,0.083333,50.0,0.333333,1.0,L
1,WALES,2015_Cape_Town,7.471264,15.555556,14.285714,0.083333,27.868852,25.0,-20.0,-100.0,0.25,0.0,-1.0,0.0,W
2,KENYA,2015_Cape_Town,-33.136095,-44.444444,-33.333333,-0.75,-10.638298,-16.666667,66.666667,60.0,-0.55,0.0,-1.0,0.0,L
3,NEW ZEALAND,2015_Cape_Town,51.758794,33.333333,33.333333,0.0,76.119403,-75.0,-50.0,-100.0,0.25,0.0,0.0,-1.0,W
4,FIJI,2015_Cape_Town,12.880562,-20.833333,-25.0,0.266667,38.461538,-66.666667,-33.333333,-33.333333,0.208333,0.0,-1.0,0.0,L


In [None]:
df.Contestable_KO_Win_pct_Diff.describe()

In [None]:
df.Contestable_KO_Win_pct_Diff

In [None]:
#Create columns for KO win % bands based on Contestable_KO_Win_pct_Diff
# Tmp DF to hold values
tmp = pd.DataFrame(columns=['-175 : -150', '-149 : -125', '-124 : -100', '-99 : -75', '-74 : -25', '-24 : -1', '0 : 25', '26 : 50', '51 : 75', '76 : 100', '101 : 125', '126 : 150'])

#Iterate through rows and create classification for KO Win%
for index, row in df.iterrows():
    
    if row['Contestable_KO_Win_pct_Diff'] > -175.0 and row['Contestable_KO_Win_pct_Diff'] <= -150.0:
        bin0 = row['-175 : -150'] = float(-1.50*50)
        bin1 = row['-149 : -125'] = float(0)
        bin2 = row['-124 : -100'] = float(0)
        bin3 = row['-99 : -75'] = float(0)
        bin4 = row['-74 : -25'] = float(0)
        bin5 = row['-24 : -1'] = float(0)
        bin6 = row['0 : 25'] = float(0)
        bin7 = row['26 : 50'] = float(0)
        bin8 = row['51 : 75'] = float(0)
        bin9 = row['76 : 100'] = float(0)
        bin10 = row['101 : 125'] = float(0)
        bin11 = row['126 : 150'] = float(0)
        
    elif row['Contestable_KO_Win_pct_Diff'] > -150.0 and row['Contestable_KO_Win_pct_Diff'] <= -125.0:
        bin0 = row['-175 : -150'] = float(0)
        bin1 = row['-149 : -125'] = float(-1.25*50)
        bin2 = row['-124 : -100'] = float(0)
        bin3 = row['-99 : -75'] = float(0)
        bin4 = row['-74 : -25'] = float(0)
        bin5 = row['-24 : -1'] = float(0)
        bin6 = row['0 : 25'] = float(0)
        bin7 = row['26 : 50'] = float(0)
        bin8 = row['51 : 75'] = float(0)
        bin9 = row['76 : 100'] = float(0)
        bin10 = row['101 : 125'] = float(0)
        bin11 = row['126 : 150'] = float(0)  
      
    elif row['Contestable_KO_Win_pct_Diff'] > -125.0 and row['Contestable_KO_Win_pct_Diff'] <= -100.0:
        bin0 = row['-175 : -150'] = float(0)
        bin1 = row['-149 : -125'] = float(0)
        bin2 = row['-124 : -100'] = float(-1.00*50)
        bin3 = row['-99 : -75'] = float(0)
        bin4 = row['-74 : -25'] = float(0)
        bin5 = row['-24 : -1'] = float(0)
        bin6 = row['0 : 25'] = float(0)
        bin7 = row['26 : 50'] = float(0)
        bin8 = row['51 : 75'] = float(0)
        bin9 = row['76 : 100'] = float(0)
        bin10 = row['101 : 125'] = float(0)
        bin11 = row['126 : 150'] = float(0)         
         
    elif row['Contestable_KO_Win_pct_Diff'] > -100.0 and row['Contestable_KO_Win_pct_Diff'] <= -75.0:
        bin0 = row['-175 : -150'] = float(0)
        bin1 = row['-149 : -125'] = float(0)
        bin2 = row['-124 : -100'] = float(0)
        bin3 = row['-99 : -75'] = float(-0.75*50)
        bin4 = row['-74 : -25'] = float(0)
        bin5 = row['-24 : -1'] = float(0)
        bin6 = row['0 : 25'] = float(0)
        bin7 = row['26 : 50'] = float(0)
        bin8 = row['51 : 75'] = float(0)
        bin9 = row['76 : 100'] = float(0)
        bin10 = row['101 : 125'] = float(0)
        bin11 = row['126 : 150'] = float(0)        
          
    elif row['Contestable_KO_Win_pct_Diff'] > -75.0 and row['Contestable_KO_Win_pct_Diff'] <= -25.0:
        bin0 = row['-175 : -150'] = float(0)
        bin1 = row['-149 : -125'] = float(0)
        bin2 = row['-124 : -100'] = float(0)
        bin3 = row['-99 : -75'] = float(0)
        bin4 = row['-74 : -25'] = float(-0.25*50)
        bin5 = row['-24 : -1'] = float(0)
        bin6 = row['0 : 25'] = float(0)
        bin7 = row['26 : 50'] = float(0)
        bin8 = row['51 : 75'] = float(0)
        bin9 = row['76 : 100'] = float(0)
        bin10 = row['101 : 125'] = float(0)
        bin11 = row['126 : 150'] = float(0)        
          
    elif row['Contestable_KO_Win_pct_Diff'] > -25.0 and row['Contestable_KO_Win_pct_Diff'] <= -1.0:
        bin0 = row['-175 : -150'] = float(0)
        bin1 = row['-149 : -125'] = float(0)
        bin2 = row['-124 : -100'] = float(0)
        bin3 = row['-99 : -75'] = float(0)
        bin4 = row['-74 : -25'] = float(0)
        bin5 = row['-24 : -1'] = float(-0.10*50)
        bin6 = row['0 : 25'] = float(0)
        bin7 = row['26 : 50'] = float(0)
        bin8 = row['51 : 75'] = float(0)
        bin9 = row['76 : 100'] = float(0)
        bin10 = row['101 : 125'] = float(0)
        bin11 = row['126 : 150'] = float(0)          
        
    elif row['Contestable_KO_Win_pct_Diff'] >= 0 and row['Contestable_KO_Win_pct_Diff'] <= 25.0:
        bin0 = row['-175 : -150'] = float(0)
        bin1 = row['-149 : -125'] = float(0)
        bin2 = row['-124 : -100'] = float(0)
        bin3 = row['-99 : -75'] = float(0)
        bin4 = row['-74 : -25'] = float(0)
        bin5 = row['-24 : -1'] = float(0)
        bin6 = row['0 : 25'] = float(0.25*50)
        bin7 = row['26 : 50'] = float(0)
        bin8 = row['51 : 75'] = float(0)
        bin9 = row['76 : 100'] = float(0)
        bin10 = row['101 : 125'] = float(0)
        bin11 = row['126 : 150'] = float(0)

    elif row['Contestable_KO_Win_pct_Diff'] > 25.0 and row['Contestable_KO_Win_pct_Diff'] <= 50.0:
        bin0 = row['-175 : -150'] = float(0)
        bin1 = row['-149 : -125'] = float(0)
        bin2 = row['-124 : -100'] = float(0)
        bin3 = row['-99 : -75'] = float(0)
        bin4 = row['-74 : -25'] = float(0)
        bin5 = row['-24 : -1'] = float(0)
        bin6 = row['0 : 25'] = float(0)
        bin7 = row['26 : 50'] = float(0.50*50)
        bin8 = row['51 : 75'] = float(0)
        bin9 = row['76 : 100'] = float(0)
        bin10 = row['101 : 125'] = float(0)
        bin11 = row['126 : 150'] = float(0)

    elif row['Contestable_KO_Win_pct_Diff'] > 50.0 and row['Contestable_KO_Win_pct_Diff'] <= 75.0:
        bin0 = row['-175 : -150'] = float(0)
        bin1 = row['-149 : -125'] = float(0)
        bin2 = row['-124 : -100'] = float(0)
        bin3 = row['-99 : -75'] = float(0)
        bin4 = row['-74 : -25'] = float(0)
        bin5 = row['-24 : -1'] = float(0)
        bin6 = row['0 : 25'] = float(0)
        bin7 = row['26 : 50'] = float(0)
        bin8 = row['51 : 75'] = float(0.75*50)
        bin9 = row['76 : 100'] = float(0)
        bin10 = row['101 : 125'] = float(0)
        bin11 = row['126 : 150'] = float(0)

    elif row['Contestable_KO_Win_pct_Diff'] > 75.0 and row['Contestable_KO_Win_pct_Diff'] <= 100.0:
        #zero = row['0'] = float(0)
        bin0 = row['-175 : -150'] = float(0)
        bin1 = row['-149 : -125'] = float(0)
        bin2 = row['-124 : -100'] = float(0)
        bin3 = row['-99 : -75'] = float(0)
        bin4 = row['-74 : -25'] = float(0)
        bin5 = row['-24 : -1'] = float(0)
        bin6 = row['0 : 25'] = float(0)
        bin7 = row['26 : 50'] = float(0)
        bin8 = row['51 : 75'] = float(0)
        bin9 = row['76 : 100'] = float(1.00*50)
        bin10 = row['101 : 125'] = float(0)
        bin11 = row['126 : 150'] = float(0)

    elif row['Contestable_KO_Win_pct_Diff'] > 100.0 and row['Contestable_KO_Win_pct_Diff'] <= 125.0:
        bin0 = row['-175 : -150'] = float(0)
        bin1 = row['-149 : -125'] = float(0)
        bin2 = row['-124 : -100'] = float(0)
        bin3 = row['-99 : -75'] = float(0)
        bin4 = row['-74 : -25'] = float(0)
        bin5 = row['-24 : -1'] = float(0)
        bin6 = row['0 : 25'] = float(0)
        bin7 = row['26 : 50'] = float(0)
        bin8 = row['51 : 75'] = float(0)
        bin9 = row['76 : 100'] = float(0)
        bin10 = row['101 : 125'] = float(1.25*50)
        bin11 = row['126 : 150'] = float(0)        
        

    elif row['Contestable_KO_Win_pct_Diff'] > 125.0 and row['Contestable_KO_Win_pct_Diff'] <= 150.0:
        #zero = row['0'] = float(0)
        bin0 = row['-175 : -150'] = float(0)
        bin1 = row['-149 : -125'] = float(0)
        bin2 = row['-124 : -100'] = float(0)
        bin3 = row['-99 : -75'] = float(0)
        bin4 = row['-74 : -25'] = float(0)
        bin5 = row['-24 : -1'] = float(0)
        bin6 = row['0 - 25'] = float(0)
        bin7 = row['26 - 50'] = float(0)
        bin8 = row['51 - 75'] = float(0)
        bin9 = row['76 - 100'] = float(0)
        bin10 = row['101 - 125'] = float(0)
        bin11 = row['126 - 150'] = float(1.50*50) 
        
    tmp.loc[index] = (bin0, bin1, bin2, bin3, bin4, bin5, bin6, bin7, bin8, bin9, bin10, bin11)
    
#df.info()
#type(df.Result[1])
df = pd.concat([df, tmp], axis=1)

df.to_csv("../data/output/matchdata_ko_bands.csv", header=True, index=False)

#print(list(df.columns))
# ['Team', 'Date', 'Tournament', 'Match', 'Possession Time', 'Scores', 'Tries', 'Conversions', 'Passes', 'Contestable_KO_Win_pct_Diff', 'Pens_Frees Against', 'Ruck_Maul', 'Yellow_Red Cards', 'TurnoversConceded', 'Ruck_retention', 'Lineout_Win_Pct', 'Scrum_Win_Pct', 'Result', '0', '0 - 25', '25 - 50', '50 - 75', '75 - 100']
df

In [16]:
df.info()
list(df.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 27 columns):
Opp                            156 non-null object
Tournament                     156 non-null object
Poss_Time_Diff                 156 non-null float64
Score_Diff                     156 non-null float64
Conv_Diff                      156 non-null float64
Tries_Diff                     156 non-null float64
Passes_Diff                    156 non-null float64
Contestable_KO_Win_pct_Diff    156 non-null float64
PenFK_Against_Diff             156 non-null float64
RuckMaul_Diff                  156 non-null float64
Ruck_Win_pct_Diff              156 non-null float64
Cards_diff                     156 non-null float64
Lineout_Win_Pct_Diff           156 non-null float64
Scrum_Win_Pct_Diff             156 non-null float64
Result                         156 non-null object
-175 : -150                    156 non-null float64
-149 : -125                    156 non-null float64
-124 : -100   

['Opp',
 'Tournament',
 'Poss_Time_Diff',
 'Score_Diff',
 'Conv_Diff',
 'Tries_Diff',
 'Passes_Diff',
 'Contestable_KO_Win_pct_Diff',
 'PenFK_Against_Diff',
 'RuckMaul_Diff',
 'Ruck_Win_pct_Diff',
 'Cards_diff',
 'Lineout_Win_Pct_Diff',
 'Scrum_Win_Pct_Diff',
 'Result',
 '-175 : -150',
 '-149 : -125',
 '-124 : -100',
 '-99 : -75',
 '-74 : -25',
 '-24 : -1',
 '0 : 25',
 '26 : 50',
 '51 : 75',
 '76 : 100',
 '101 : 125',
 '126 : 150']

In [18]:
# reorder columns
df = df[['Opp','Tournament','Poss_Time_Diff','Score_Diff','Conv_Diff','Tries_Diff','Passes_Diff','Contestable_KO_Win_pct_Diff','PenFK_Against_Diff','RuckMaul_Diff','Ruck_Win_pct_Diff','Cards_diff','Lineout_Win_Pct_Diff','Scrum_Win_Pct_Diff','-175 : -150','-149 : -125','-124 : -100','-99 : -75','-74 : -25','-24 : -1','0 : 25','26 : 50','51 : 75','76 : 100', '101 : 125','126 : 150','Result']]
df.head()

Unnamed: 0,Opp,Tournament,Poss_Time_Diff,Score_Diff,Conv_Diff,Tries_Diff,Passes_Diff,Contestable_KO_Win_pct_Diff,PenFK_Against_Diff,RuckMaul_Diff,...,-99 : -75,-74 : -25,-24 : -1,0 : 25,26 : 50,51 : 75,76 : 100,101 : 125,126 : 150,Result
0,AUSTRALIA,2015_Cape_Town,13.96648,-10.638298,-14.285714,0.25,25.925926,-50.0,0.0,0.0,...,0.0,-12.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,L
1,WALES,2015_Cape_Town,7.471264,15.555556,14.285714,0.083333,27.868852,25.0,-20.0,-100.0,...,0.0,0.0,0.0,12.5,0.0,0.0,0.0,0.0,0.0,W
2,KENYA,2015_Cape_Town,-33.136095,-44.444444,-33.333333,-0.75,-10.638298,-16.666667,66.666667,60.0,...,0.0,0.0,-5.0,0.0,0.0,0.0,0.0,0.0,0.0,L
3,NEW ZEALAND,2015_Cape_Town,51.758794,33.333333,33.333333,0.0,76.119403,-75.0,-50.0,-100.0,...,-37.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,W
4,FIJI,2015_Cape_Town,12.880562,-20.833333,-25.0,0.266667,38.461538,-66.666667,-33.333333,-33.333333,...,0.0,-12.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,L


In [12]:
from sklearn.model_selection import train_test_split

In [19]:
#Drop features that are unneccessary/str or will bias the prediction
rf_data = df.drop((['Opp', 'Score_Diff', 'Tries_Diff','Tournament', 'Conv_Diff']), axis=1)

#Pull out the variable we're trying to predict: 'Result'
X = rf_data.drop('Result',axis=1)
y = rf_data['Result']

In [20]:
rf_data.head()

Unnamed: 0,Poss_Time_Diff,Passes_Diff,Contestable_KO_Win_pct_Diff,PenFK_Against_Diff,RuckMaul_Diff,Ruck_Win_pct_Diff,Cards_diff,Lineout_Win_Pct_Diff,Scrum_Win_Pct_Diff,-175 : -150,...,-99 : -75,-74 : -25,-24 : -1,0 : 25,26 : 50,51 : 75,76 : 100,101 : 125,126 : 150,Result
0,13.96648,25.925926,-50.0,0.0,0.0,0.083333,50.0,0.333333,1.0,0.0,...,0.0,-12.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,L
1,7.471264,27.868852,25.0,-20.0,-100.0,0.25,0.0,-1.0,0.0,0.0,...,0.0,0.0,0.0,12.5,0.0,0.0,0.0,0.0,0.0,W
2,-33.136095,-10.638298,-16.666667,66.666667,60.0,-0.55,0.0,-1.0,0.0,0.0,...,0.0,0.0,-5.0,0.0,0.0,0.0,0.0,0.0,0.0,L
3,51.758794,76.119403,-75.0,-50.0,-100.0,0.25,0.0,0.0,-1.0,0.0,...,-37.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,W
4,12.880562,38.461538,-66.666667,-33.333333,-33.333333,0.208333,0.0,-1.0,0.0,0.0,...,0.0,-12.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,L


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

## Decision Tree

In [None]:
#y_test

In [22]:
from sklearn.tree import DecisionTreeClassifier

In [23]:
dtree = DecisionTreeClassifier()

In [24]:
dtree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [25]:
predictions = dtree.predict(X_test)

In [26]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [27]:
#get the model's accuracy score
accuracy_score(y_test, predictions)

0.34042553191489361

In [28]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          L       0.30      0.33      0.32        21
          T       0.00      0.00      0.00         1
          W       0.39      0.36      0.37        25

avg / total       0.34      0.34      0.34        47



In [29]:
print(confusion_matrix(y_test,predictions))

[[ 7  1 13]
 [ 0  0  1]
 [16  0  9]]


## Tree Visualization

In [None]:
from IPython.display import Image  
from sklearn.externals.six import StringIO  
from sklearn.tree import export_graphviz
import pydot 

features = list(rf_data.columns[0:-1])
features

In [None]:
dot_data = StringIO()  
export_graphviz(dtree, out_file=dot_data,feature_names=features,filled=True,rounded=True)

graph = pydot.graph_from_dot_data(dot_data.getvalue())  
Image(graph[0].create_png())

## Random Forest

In [30]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=200, verbose=0)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [31]:
rfc_pred = rfc.predict(X_test)

In [None]:
#import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
conf = confusion_matrix(y_test,rfc_pred)
plt.imshow(conf, cmap='binary', interpolation='None')
plt.show()

In [38]:
impt = df[['Poss_Time_Diff','Passes_Diff','Contestable_KO_Win_pct_Diff','PenFK_Against_Diff',
             'Ruck_Win_pct_Diff', 'Result']]

In [None]:
sns.pairplot(impt,hue='Result', palette='Set1') #hue='Result'

## Random Forest Model Eval

In [41]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

#Output confusion matrix
print("Confusion Matrix")
print(confusion_matrix(y_test,rfc_pred))

#import matplotlib.pyplot as plt
#print("\n")
#print("Confusion Matrix Plot")
#from sklearn.metrics import confusion_matrix
#conf = confusion_matrix(y_test,rfc_pred)
#plt.imshow(conf, cmap='binary', interpolation='None')
#plt.show()

#import libraries to ignore UndefinedMetricWarning
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

#get the model's accuracy score
accuracy_score(y_test, rfc_pred)
print("\n")
print("Classification Report")
print(classification_report(y_test,rfc_pred))

#print accuracy score
print("\n")
print("Accuracy Score")
print(rfc.score(X_test, y_test))

Confusion Matrix
[[ 9  0 12]
 [ 0  0  1]
 [ 7  0 18]]


Classification Report
             precision    recall  f1-score   support

          L       0.56      0.43      0.49        21
          T       0.00      0.00      0.00         1
          W       0.58      0.72      0.64        25

avg / total       0.56      0.57      0.56        47



Accuracy Score
0.574468085106


In [42]:
feature_importances = pd.DataFrame(rfc.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)

print("Feature Importance")
print(feature_importances)

Feature Importance
                             importance
Poss_Time_Diff                 0.162604
Ruck_Win_pct_Diff              0.136296
Passes_Diff                    0.131532
Contestable_KO_Win_pct_Diff    0.119748
PenFK_Against_Diff             0.113640
RuckMaul_Diff                  0.084305
Lineout_Win_Pct_Diff           0.065398
Scrum_Win_Pct_Diff             0.048053
Cards_diff                     0.030054
0 : 25                         0.026790
-74 : -25                      0.021165
51 : 75                        0.018365
26 : 50                        0.014886
-24 : -1                       0.010186
76 : 100                       0.004941
-99 : -75                      0.003673
-175 : -150                    0.003500
126 : 150                      0.003093
-124 : -100                    0.001770
101 : 125                      0.000000
-149 : -125                    0.000000


## Train an XGBoost Classifier Model
Info from https://jessesw.com/XG-Boost/

In [61]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

#grid_search

## Set up hyperparameter tuning/Grid Search¶

In [62]:
#Set parameters for first grid search. Start tuning on the maximum depth of the trees first, 
#along with the min_child_weight, which is very similar to min_samples_split in sklearn’s version 
#of gradient boosted trees. We set the objective to ‘binary:logistic’ since this is a binary 
#classification problem
cv_params = {'max_depth': [3,5,7], 'min_child_weight': [1,3,5]}
ind_params = {'learning_rate': 0.1, 'n_estimators': 1000, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic'}
optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), 
                            cv_params, 
                             scoring = 'accuracy', cv = 4, n_jobs = -1) 
# Optimize for accuracy


ERROR: /Users/admin/anaconda3/lib/python3.6/site-packages/sklearn/cross_validation.py:553: Warning: The least populated class in y has only 4 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=5.
  % (min_labels, self.n_folds)), Warning)
  
  The Tie ('T') result is likely the cause of this.  For now, change to CV = 4

In [49]:
# Run the grid search
optimized_GBM.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=0.8),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_depth': [3, 5, 7], 'min_child_weight': [1, 3, 5]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [50]:
# Check grid scores
optimized_GBM.grid_scores_
# increased score to 61% - over 57% of Random Forest:
# mean: 0.61468, std: 0.05469, params: {'max_depth': 3, 'min_child_weight': 1}

[mean: 0.61468, std: 0.05469, params: {'max_depth': 3, 'min_child_weight': 1},
 mean: 0.58716, std: 0.01909, params: {'max_depth': 3, 'min_child_weight': 3},
 mean: 0.55963, std: 0.05080, params: {'max_depth': 3, 'min_child_weight': 5},
 mean: 0.60550, std: 0.01515, params: {'max_depth': 5, 'min_child_weight': 1},
 mean: 0.59633, std: 0.00630, params: {'max_depth': 5, 'min_child_weight': 3},
 mean: 0.55046, std: 0.03262, params: {'max_depth': 5, 'min_child_weight': 5},
 mean: 0.60550, std: 0.04002, params: {'max_depth': 7, 'min_child_weight': 1},
 mean: 0.59633, std: 0.00630, params: {'max_depth': 7, 'min_child_weight': 3},
 mean: 0.55046, std: 0.03262, params: {'max_depth': 7, 'min_child_weight': 5}]

use these parameters (resulting in mean: 0.61468):

** params: {'max_depth': 3, 'min_child_weight': 1} **

Adjust subsampling along with lowering the learning rate to see if that helps

In [56]:
cv_params = {'learning_rate': [0.1, 0.01], 'subsample': [0.7,0.8,0.9]}
ind_params = {'n_estimators': 100, 'seed':0, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic', 'max_depth': 3, 'min_child_weight': 1}


optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), 
                            cv_params, 
                             scoring = 'accuracy', cv = 4, n_jobs = -1)
# Run the grid search again
optimized_GBM.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=1),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'learning_rate': [0.1, 0.01], 'subsample': [0.7, 0.8, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [57]:
# check the Grid Scores again - increased to 0.63303
# mean: 0.63303, std: 0.05657, params: {'learning_rate': 0.1, 'subsample': 0.7}
optimized_GBM.grid_scores_

[mean: 0.63303, std: 0.05657, params: {'learning_rate': 0.1, 'subsample': 0.7},
 mean: 0.59633, std: 0.04255, params: {'learning_rate': 0.1, 'subsample': 0.8},
 mean: 0.62385, std: 0.08564, params: {'learning_rate': 0.1, 'subsample': 0.9},
 mean: 0.54128, std: 0.07335, params: {'learning_rate': 0.01, 'subsample': 0.7},
 mean: 0.53211, std: 0.05764, params: {'learning_rate': 0.01, 'subsample': 0.8},
 mean: 0.52294, std: 0.06194, params: {'learning_rate': 0.01, 'subsample': 0.9}]

### Final parameters:
Got to 63% (mean: 0.63303, std: 0.05657) with the parameters below:

params: {'learning_rate': 0.1, 'subsample': 0.7, 'max_depth': 3, 'min_child_weight': 1}

ind_params = {'n_estimators': 100, 'seed':0, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic', 'max_depth': 3, 'min_child_weight': 1}

## Create a DMatrix
To increase the performance of XGBoost's speed through many iterations of the training set, and since we are using only XGBoost's API and not sklearn's anymore, we can create a DMatrix. This sorts the data initially to optimize for XGBoost when it builds trees, making the algorithm more efficient. This is especially helpful when you have a very large number of training examples. To create a DMatrix:

In [60]:
# Won't work because labels are not 0,1
# Create our DMatrix to make XGBoost more efficient
xgdmat = xgb.DMatrix(X_train, y_train) 

ValueError: could not convert string to float: 'W'