In [16]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from xgboost import XGBClassifier
!pip install xgboost

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [17]:
#import and split the data

In [18]:
diabetes = pd.read_csv("../input/diabetes/diabetes.csv")
df = diabetes.copy()
df = df.dropna()
y = df["Outcome"]
X = df.drop(['Outcome'], axis=1)
X = pd.DataFrame(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.30, 
                                                    random_state=42)

In [19]:
#set and fit the model

In [20]:
xgb_model=XGBClassifier().fit(X_train,y_train)

In [21]:
#test error without tuning

In [22]:
y_pred=xgb_model.predict(X_test)
accuracy_score(y_test,y_pred)

0.7359307359307359

In [23]:
#model tuning

In [24]:
#important parameters
#max_depth
#learning_rate
#n_estimators (number of iterations)
#booster: gbtree
#subsample
#colsample_bytree

In [25]:
xgb_params = {
        'n_estimators': [100, 500, 1000, 2000],
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5,6],
        'learning_rate': [0.1,0.01,0.02,0.05],
        "min_samples_split": [2,5,10]}

In [26]:
xgb_cv=GridSearchCV(xgb_model,xgb_params, cv=10, n_jobs=-1, verbose=2)
xgb_cv.fit(X_train,y_train)

Fitting 10 folds for each of 576 candidates, totalling 5760 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 182 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done 385 tasks      | elapsed:   53.1s
[Parallel(n_jobs=-1)]: Done 668 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1033 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1478 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 2005 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 2612 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 3329 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 4098 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 4949 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done 5760 out of 5760 | elapsed: 17.0min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster=None,
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0, gpu_id=-1,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.300000012,
                                     max_delta_step=0, max_depth=6,
                                     min_child_weight=1, missing=nan,
                                     monotone_constraints=None,
                                     n_estimators=100, n_jobs...
                                     scale_pos_weight=1, subsample=1,
                                     tree_method=None,
                                     validate_parameters=False,
                                     verbosity=None),
             iid='deprecated', n_jobs=-

In [27]:
xgb_cv.best_params_

{'learning_rate': 0.02,
 'max_depth': 3,
 'min_samples_split': 2,
 'n_estimators': 100,
 'subsample': 0.6}

In [28]:
xgb_tuned=XGBClassifier(learning_rate=0.02, max_depth=3, n_estimators=100, subsample=0.6).fit(X_train,y_train)

In [29]:
y_pred=xgb_tuned.predict(X_test)
accuracy_score(y_test,y_pred)

0.7575757575757576

In [30]:
# We found 0.766 by Logistic Regression
#          0.775 by Naive Bayes 
#          0.731 by KNN
#          0.744 by Linear SVC
#          0.735 by Nonlinear SVC Steps
#          0.735 by ANN
#          0.753 by CART
#          0.735 by Random Forest
#          0.735 by GBM
#And now,  0.757 by XG Boost