In [36]:
from numpy import loadtxt 
import numpy as np
import pandas as pd
from xgboost import XGBClassifier 
import pickle 
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import StratifiedKFold 
from sklearn.model_selection import cross_val_score 
from sklearn.metrics import accuracy_score 
import warnings

In [6]:
dataset = loadtxt('pima.txt', delimiter=",")

In [7]:
X = dataset[:,0:8] 
Y = dataset[:,8]

In [19]:
X[:5]

array([[  6.00000000e+00,   1.48000000e+02,   7.20000000e+01,
          3.50000000e+01,   0.00000000e+00,   3.36000000e+01,
          6.27000000e-01,   5.00000000e+01],
       [  1.00000000e+00,   8.50000000e+01,   6.60000000e+01,
          2.90000000e+01,   0.00000000e+00,   2.66000000e+01,
          3.51000000e-01,   3.10000000e+01],
       [  8.00000000e+00,   1.83000000e+02,   6.40000000e+01,
          0.00000000e+00,   0.00000000e+00,   2.33000000e+01,
          6.72000000e-01,   3.20000000e+01],
       [  1.00000000e+00,   8.90000000e+01,   6.60000000e+01,
          2.30000000e+01,   9.40000000e+01,   2.81000000e+01,
          1.67000000e-01,   2.10000000e+01],
       [  0.00000000e+00,   1.37000000e+02,   4.00000000e+01,
          3.50000000e+01,   1.68000000e+02,   4.31000000e+01,
          2.28800000e+00,   3.30000000e+01]])

In [20]:
Y[:5]

array([ 1.,  0.,  1.,  0.,  1.])

In [21]:
seed = 7 
test_size = 0.20

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

In [10]:
model = XGBClassifier()  

In [11]:
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [52]:
#Evaluate model errors with test set in each iteration during model training - check how many iterations are needed

eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, eval_metric="error", eval_set=eval_set, verbose=True)

[0]	validation_0-error:0.214286
[1]	validation_0-error:0.214286
[2]	validation_0-error:0.253247
[3]	validation_0-error:0.227273
[4]	validation_0-error:0.220779
[5]	validation_0-error:0.220779
[6]	validation_0-error:0.220779
[7]	validation_0-error:0.227273
[8]	validation_0-error:0.233766
[9]	validation_0-error:0.233766
[10]	validation_0-error:0.233766
[11]	validation_0-error:0.24026
[12]	validation_0-error:0.24026
[13]	validation_0-error:0.233766
[14]	validation_0-error:0.227273
[15]	validation_0-error:0.233766
[16]	validation_0-error:0.233766
[17]	validation_0-error:0.227273
[18]	validation_0-error:0.227273
[19]	validation_0-error:0.220779
[20]	validation_0-error:0.227273
[21]	validation_0-error:0.233766
[22]	validation_0-error:0.227273
[23]	validation_0-error:0.220779
[24]	validation_0-error:0.233766
[25]	validation_0-error:0.214286
[26]	validation_0-error:0.214286
[27]	validation_0-error:0.214286
[28]	validation_0-error:0.207792
[29]	validation_0-error:0.207792
[30]	validation_0-erro

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
              n_jobs=1, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=True, subsample=1)

In [12]:
# K fold Cross Validation

kfold = StratifiedKFold(n_splits=10, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold)
print(results)



[ 0.76623377  0.81818182  0.77922078  0.64935065  0.74025974  0.79220779
  0.80519481  0.84415584  0.68421053  0.81578947]


In [56]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

n_estimate = list(np.arange(50,400,50)) #n_estimators means no. of trees needed
sub_sample = list(np.arange(0.1,1.1,0.1))

In [57]:
print("n_estimate:", n_estimate)
print("sub_sample:", sub_sample)

n_estimate: [50, 100, 150, 200, 250, 300, 350]
sub_sample: [0.10000000000000001, 0.20000000000000001, 0.30000000000000004, 0.40000000000000002, 0.5, 0.59999999999999998, 0.70000000000000007, 0.80000000000000004, 0.90000000000000002, 1.0]


In [64]:
## model hyperparameter tuning with Gridsearch

#initialize model
clf = XGBClassifier()
 
#Create the parameters list you wish to tune.
parameters = {'max_depth':[2,3,4,5],'learning_rate':[0.001,0.01,0.1,0.2,0.3], 'n_estimators':n_estimate, 'subsample': sub_sample}
# parameters = {'max_depth':[2,3,4,5,6,7],'learning_rate':[0.001,0.01,0.1,0.2,0.3]}

#Make an recall_score scoring object.
scorer = make_scorer(recall_score)

#create StratifiedKFold
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

#Perform grid search on the classifier using 'scorer' as the scoring method.
grid_obj = GridSearchCV(clf, parameters, scoring=scorer, cv=kfold)

#Fit the grid search object to the training data and find the optimal parameters.
grid_fit = grid_obj.fit(X_train, y_train)

#Get the estimator.
best_clf = grid_fit.best_estimator_

#Fit the new model.
best_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.3, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=None, n_estimators=50,
              n_jobs=1, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=True, subsample=0.90000000000000002)

In [59]:
grid_fit.cv_results_['mean_test_score'][:10]

array([ 0.51151717,  0.48372093,  0.46478405,  0.45526024,  0.44562569,
        0.46434109,  0.45049834,  0.45049834,  0.42668882,  0.42668882])

In [72]:
df_cv = pd.DataFrame(grid_fit.cv_results_)

In [38]:
df_cv.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.010167,0.000908,0.000987,0.000114,0.05,2,50,"{'learning_rate': 0.05, 'max_depth': 2, 'n_est...",0.619048,0.571429,0.404762,0.465116,0.5,0.512071,0.075938,126
1,0.014398,0.000807,0.000742,3.9e-05,0.05,2,100,"{'learning_rate': 0.05, 'max_depth': 2, 'n_est...",0.595238,0.547619,0.452381,0.55814,0.571429,0.544961,0.048949,123
2,0.019909,0.000475,0.000791,1.6e-05,0.05,2,150,"{'learning_rate': 0.05, 'max_depth': 2, 'n_est...",0.619048,0.571429,0.47619,0.55814,0.547619,0.554485,0.046143,114
3,0.026076,0.000718,0.00085,9e-06,0.05,2,200,"{'learning_rate': 0.05, 'max_depth': 2, 'n_est...",0.642857,0.571429,0.47619,0.534884,0.52381,0.549834,0.055577,119
4,0.031943,0.000229,0.000914,7e-06,0.05,2,250,"{'learning_rate': 0.05, 'max_depth': 2, 'n_est...",0.666667,0.547619,0.5,0.581395,0.52381,0.563898,0.058004,99


In [67]:
df_cv.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_learning_rate', 'param_max_depth', 'param_n_estimators',
       'param_subsample', 'params', 'split0_test_score', 'split1_test_score',
       'split2_test_score', 'split3_test_score', 'split4_test_score',
       'mean_test_score', 'std_test_score', 'rank_test_score'],
      dtype='object')

In [73]:
# keep the best 10 parameter settings from GridSearchCV

df_cv[['param_max_depth', 'param_learning_rate', 'param_n_estimators', 'param_subsample', 'mean_test_score']].sort_values(by=['mean_test_score'], ascending=False)[:10]

Unnamed: 0,param_max_depth,param_learning_rate,param_n_estimators,param_subsample,mean_test_score
1198,3,0.3,50,0.9,0.634848
914,3,0.2,50,0.5,0.625758
558,5,0.01,350,0.9,0.625541
1220,3,0.3,200,0.1,0.625325
1210,3,0.3,150,0.1,0.625108
612,2,0.1,300,0.3,0.620996
918,3,0.2,50,0.9,0.620996
730,4,0.1,200,0.1,0.620996
778,5,0.1,50,0.9,0.620779
992,4,0.2,100,0.3,0.620779


In [70]:
grid_fit.best_score_

0.63484848484848488

In [71]:
grid_fit.best_params_

{'learning_rate': 0.3,
 'max_depth': 3,
 'n_estimators': 50,
 'subsample': 0.90000000000000002}

In [25]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

y_pred = best_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("Recall: %.2f%%" % (recall * 100.0))
print("Precision: %.2f%%" % (precision * 100.0))
print("F1: %.2f%%" % (f1 * 100.0))


Accuracy: 81.17%
Recall: 70.18%
Precision: 76.92%
F1: 73.39%


In [28]:
confusion_matrix(y_test, y_pred)

array([[85, 12],
       [17, 40]])

In [7]:
y_pred = model.predict(X_test) 
predictions = [round(value) for value in y_pred] 

In [8]:
accuracy = accuracy_score(y_test, predictions) 
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 81.17%


In [27]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("Recall: %.2f%%" % (recall * 100.0))
print("Precision: %.2f%%" % (precision * 100.0))
print("F1: %.2f%%" % (f1 * 100.0))


Accuracy: 81.17%
Recall: 70.18%
Precision: 76.92%
F1: 73.39%


In [10]:
# save model in a local disk
pickle.dump(model, open("production_pima_model.dat", "wb"))
print("Production model saved as: production_pima_model.dat")

Production model saved as: production_pima_model.dat
