# Diabetes prediction: gradient boosting

## Notebook set-up

In [None]:
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import GradientBoostingClassifier

import configuration as config
import functions as funcs

## 1. Data preparation

### 1.1. Load data from disk

In [None]:
# Un-pickle the dataset object stored at the path given by config.DATA_FILE
with open(config.DATA_FILE, 'rb') as data_file:
    dataset = pickle.load(data_file)

# The loaded object is indexed by split name: pull out both partitions at once
training_df, testing_df = dataset['training'], dataset['testing']

### 1.2. Inspect

In [None]:
# Preview the first five rows of the training data
training_df.head(n=5)

## 3. Baseline model

In [None]:
# Separate the predictors from the 'Outcome' label column once, instead of
# rebuilding the feature frame for every call
training_features = training_df.drop('Outcome', axis=1)
training_labels = training_df['Outcome']

# Fit a gradient boosting classifier using scikit-learn's default settings
model = GradientBoostingClassifier()
model.fit(training_features, training_labels)

# The model is scored on the same rows it was fitted on, so this is training-set
# accuracy. accuracy_score expects (y_true, y_pred), in that order.
accuracy = accuracy_score(training_labels, model.predict(training_features))*100
print(f'Accuracy of gradient boosting model: {accuracy:.1f}%')

In [None]:
# Running collection of per-fold scores; each score is paired with a model label
# so the results can later be turned into a long-format table
cross_val_scores = {'Model': [], 'Score': []}

# 7-fold cross-validation of the baseline model on the training data
# (n_jobs=-1 lets scikit-learn use every available processor)
baseline_features = training_df.drop('Outcome', axis=1)
baseline_labels = training_df['Outcome']
scores = cross_val_score(model, baseline_features, baseline_labels, cv=7, n_jobs=-1)

# Record one 'Baseline' label per fold score
cross_val_scores['Model'].extend('Baseline' for _ in scores)
cross_val_scores['Score'].extend(scores)

print(f'Cross-validation accuracy: {np.mean(scores)*100:.1f} +/- {np.std(scores)*100:.1f}%')

## 4. Hyperparameter optimization

### 4.1. Hyperparameter grid search

In [None]:
%%time

hyperparameters={
    'n_estimators':[50,100,200],
    'criterion':['gini','entropy','log_loss'],
    'max_depth':[4,8,16],
    'min_samples_split':[2,4,8],
    'min_samples_leaf':[1,2,4],
    'min_weight_fraction_leaf':[0.001,0.01,0.1],
    'max_features':[0.25,0.5,0.75],
    'min_impurity_decrease':[0,0.0001,0.001],
    'ccp_alpha':[0.0,0.0001,0.001,0.01],
    'class_weight':['balanced',None]
}

search=GridSearchCV(
    GradientBoostingClassifier(),
    hyperparameters,
    return_train_score=True,
    cv=7,
    n_jobs=-1
)

search_results=search.fit(training_df.drop('Outcome', axis=1), training_df['Outcome'])
model=search_results.best_estimator_
hyperparameters=search_results.best_params_

print('Best hyperparameters:\n')

for key, value in search_results.best_params_.items():
    print(f' {key}: {value}')

print()


### 4.2. Hyperparameter optimization results

In [None]:
# Pass the fitted grid search object to the project's plotting helper.
# NOTE(review): presumably this visualises the cross-validation scores recorded
# during the search; confirm against the functions module.
funcs.plot_cross_validation(search_results)

### 4.3. Cross-validation of optimized model

In [None]:
# Cross-validate a fresh classifier built from the best hyperparameters.
# cv=7 matches the fold count used for the baseline model; leaving it out
# falls back to scikit-learn's default of 5 folds, which would make the
# side-by-side comparison of the two score distributions uneven.
scores = cross_val_score(
    GradientBoostingClassifier(**search_results.best_params_, random_state=315),
    training_df.drop('Outcome', axis=1),
    training_df['Outcome'],
    cv=7,
    n_jobs=-1
)

# Append the per-fold scores under the 'Optimized' label
cross_val_scores['Model'].extend(['Optimized']*len(scores))
cross_val_scores['Score'].extend(scores)

print(f'Cross-validation accuracy: {np.mean(scores)*100:.1f} +/- {np.std(scores)*100:.1f}%')

## 5. Evaluation

### 5.1. Model comparison

In [None]:
# Turn the collected per-fold scores into a table with one row per fold, then
# draw one box per model label
scores_df = pd.DataFrame(cross_val_scores)
sns.boxplot(data=scores_df, x='Model', y='Score')
plt.show()

### 5.2. Test set performance

In [None]:
# Predict on the held-out test rows with the current model
testing_labels = testing_df['Outcome']
testing_predictions = model.predict(testing_df.drop('Outcome', axis=1))

# accuracy_score takes (y_true, y_pred), the same order confusion_matrix uses below
accuracy = accuracy_score(testing_labels, testing_predictions)*100

# Plot the confusion matrix; normalize='true' scales each row (true class) to sum to 1
cm = confusion_matrix(testing_labels, testing_predictions, normalize='true')
cm_disp = ConfusionMatrixDisplay(confusion_matrix=cm)
_ = cm_disp.plot()

plt.title(f'Optimized model test set performance\noverall accuracy: {accuracy:.1f}%')
plt.xlabel('Predicted outcome')
plt.ylabel('True outcome')
plt.show()

## 6. Save

### 6.1. Optimized hyperparameters

In [None]:
# Persist the winning hyperparameter dictionary. It is read straight from the
# fitted search object: by this point `hyperparameters` has been rebound to that
# plain dict, which has no `best_params_` attribute.
with open(config.GRADIENT_BOOSTING_HYPERPARAMETERS, 'wb') as output_file:
    pickle.dump(search_results.best_params_, output_file)

### 6.2. Model

In [None]:
# Serialise the fitted model to the path given by config.GRADIENT_BOOSTING_MODEL
with open(config.GRADIENT_BOOSTING_MODEL, 'wb') as model_file:
    pickle.dump(model, model_file)