#### Gradient Boosting classifier

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [5]:
df=pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
# Features and target
x=df.drop('class',axis=1)
y=df['class']

# Standardize features
scaler=StandardScaler()
x_scaled=scaler.fit_transform(x)
x_scaled

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

In [9]:
# perform train, test, spliton the dataset
x_train,x_test,y_train,y_test=train_test_split(x_scaled,y,test_size=0.8,random_state=42)

In [11]:
# Instantiate the model and defne and parameters

gbc=GradientBoostingClassifier(random_state=42)

# set up KFold cross-validation
kfold=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# hyperparameter grid

param_grid={
    'n_estimators':[50,100,150], 
    'learning_rate':[0.01,0.1,0.2],
    'max_depth':[3,4,5],
    'subsample':[0.8,1.0]
}

# Grid search with cross validation
grid_search=GridSearchCV(estimator=gbc,
                        param_grid=param_grid,
                        cv=kfold,
                        scoring='recall',
                        n_jobs=-1,
                        verbose=1)

In [13]:
# fit the model
grid_search.fit(x_train,y_train)

# best parameters and score
print('Best Parameters:',grid_search.best_params_)
print('Best cross-validation Recall',grid_search.best_score_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best cross-validation Recall 0.6257575757575757


In [17]:
# Evaluate on the test data suing best estimators

best_model=grid_search.best_estimator_
y_pred=best_model.predict(x_test)

print('\n Confusion Matrix:\n',confusion_matrix(y_test,y_pred))
print('\nClassification Report:\n',classification_report(y_test,y_pred))


 Confusion Matrix:
 [[310  95]
 [ 82 128]]

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.77      0.78       405
           1       0.57      0.61      0.59       210

    accuracy                           0.71       615
   macro avg       0.68      0.69      0.68       615
weighted avg       0.72      0.71      0.71       615



## identify feature importance scores using XGBClassifier

In [20]:
best_model.feature_importances_

array([0.08968868, 0.26073071, 0.05878982, 0.05376621, 0.08816734,
       0.16270506, 0.17732721, 0.10882496])