In [12]:
import zipfile
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier, plot_importance
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# Getting the Data

Link of the dataset - https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database

In [None]:
!kaggle datasets download -d uciml/pima-indians-diabetes-database

404 - Not Found


In [None]:
# extract the files in temp folder
with zipfile.ZipFile("pima-indians-diabetes-database.zip", 'r') as zip_ref:
    zip_ref.extractall('temp')

In [2]:
df = pd.read_csv('../temp/diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
X = df.drop('Outcome', axis=1).values
y = df['Outcome']

In [22]:
seed = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

### GridSearch CV

In [9]:
model = XGBClassifier(use_label_encoder=False)

In [10]:
n_estimators = [50, 100, 150, 200]
max_depth = [2, 4, 6, 8]
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators, learning_rate=learning_rate)

In [19]:
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold, verbose=1)
grid_result = grid_search.fit(X,y)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


In [20]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
# 	print("%f (%f) with: %r" % (mean, stdev, param))

Best: -0.468283 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100}


In [21]:
best_model = XGBClassifier(learning_rate=0.1, max_depth=2, n_estimators=100, use_label_encoder=False)

In [25]:
best_model.fit(X_train, y_train, eval_metric='logloss', early_stopping_rounds=10, eval_set=[(X_test, y_test)], verbose=True)

[0]	validation_0-logloss:0.66739
[1]	validation_0-logloss:0.64646
[2]	validation_0-logloss:0.62454
[3]	validation_0-logloss:0.60999
[4]	validation_0-logloss:0.59684
[5]	validation_0-logloss:0.58396
[6]	validation_0-logloss:0.57465
[7]	validation_0-logloss:0.56465
[8]	validation_0-logloss:0.55654
[9]	validation_0-logloss:0.55032
[10]	validation_0-logloss:0.54424
[11]	validation_0-logloss:0.53928
[12]	validation_0-logloss:0.53479
[13]	validation_0-logloss:0.53072
[14]	validation_0-logloss:0.52740
[15]	validation_0-logloss:0.52433
[16]	validation_0-logloss:0.52240
[17]	validation_0-logloss:0.52010
[18]	validation_0-logloss:0.51890
[19]	validation_0-logloss:0.51668
[20]	validation_0-logloss:0.51597
[21]	validation_0-logloss:0.51458
[22]	validation_0-logloss:0.51360
[23]	validation_0-logloss:0.51122
[24]	validation_0-logloss:0.51118
[25]	validation_0-logloss:0.51080
[26]	validation_0-logloss:0.51070
[27]	validation_0-logloss:0.50996
[28]	validation_0-logloss:0.50931
[29]	validation_0-loglos

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=2, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [26]:
best_model.score(X_test, y_test)

0.7677165354330708