### Configuration and imports

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os

### Data import

In [None]:
# Relative location of the dataset; resolved against the notebook's working directory.
dir_path= './data'
data_path = os.path.join(dir_path, 'diabetes_simple.csv')

# Load the CSV into the DataFrame that the following cells inspect and model.
df = pd.read_csv(data_path)

# Last expression of the cell: shows the first rows as the cell output.
df.head()

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes, memory usage).
df.info()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Name of the label column the classifiers will predict.
target_column = 'diabetes'

# All column names of the loaded frame, kept as a plain Python list.
columns = list(df.columns)

# Show the list as the cell output.
columns

### Creating pipelines for model classification

In [None]:
# Metric names handed to grid_search_pipe in the model-selection loop.
measures = ['accuracy', 'f1', 'precision', 'recall']

In [None]:
# Smoking-history categories listed in a fixed order.
# NOTE(review): not referenced by any cell visible here — presumably intended as the
# ordinal order for the 'smoking_history' column; confirm it is still needed.
ordered_smoking_labels = ['No info', 'never', 'former', 'not current', 'current', 'ever']

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.svm import SVC

import matplotlib.pyplot as plt
from diabetes_model_creation import create_data_pipelines, separate_data, encoding_data, grid_search_pipe, get_measures_values, save_measures
import joblib


# Categorical features that the preprocessing pipeline has to encode.
cols_to_encode = ['gender', 'smoking_history']

# Every non-object column except the target is treated as a numeric feature.
numerical_cols = [
    col
    for col in df.select_dtypes(exclude=['object']).columns.tolist()
    if col != target_column
]

In [None]:
# data pipeline creation
# Builds the preprocessing object from the numeric and to-be-encoded column lists; the
# result is passed to grid_search_pipe for every candidate model.
# NOTE(review): the concrete transformers live in diabetes_model_creation and are not
# visible from this notebook — confirm there.
preprocessor = create_data_pipelines(numerical_cols, cols_to_encode)


In [None]:
# Split the frame into train/test features and targets using the project helper.
# NOTE(review): split ratio, stratification and seeding are decided inside separate_data —
# confirm in diabetes_model_creation.
x_train, x_test, y_train, y_test = separate_data(df, target_column)

# Re-encode both target vectors with the project helper.
# NOTE(review): presumably a label encoding fitted on y_train and applied to y_test — verify.
y_train, y_test = encoding_data(y_train, y_test)

#### Grid search over all models

In [None]:
# Candidate classifiers for the grid search, as (display name, file-name stem, estimator).
# Still to try (from the original note): MLP, SVC and AdaBoosting.
_candidate_models = (
    ('Decision Tree', 'dt_clf', DecisionTreeClassifier(random_state=0)),
    ('Gradient Boosting', 'gbm_clf', GradientBoostingClassifier(random_state=0)),
    ('Hist Gradient Boosting', 'hgbm_clf', HistGradientBoostingClassifier(random_state=42)),
    ('Random Forest', 'rf_clf', RandomForestClassifier(random_state=0)),
    ('SVC', 'svc_clf', SVC(random_state=42)),
)

# One record per model: display name, file-name stem, unfitted estimator and a slot
# that the training loop fills with the tuned model.
clf_dicts = [
    {
        'model_name': display_name,
        'saving_name': file_stem,
        'model_base': estimator,
        'best_model': None,
    }
    for display_name, file_stem, estimator in _candidate_models
]

In [None]:
# Output folders are created once, before any model is trained, so that every write made
# inside the loop has a destination and later cells can rely on both names being defined.
# NOTE(review): save_measures presumably writes under `results_dir` — confirm its path.
model_folder = 'models'
results_dir = 'results'
os.makedirs(model_folder, exist_ok=True)
os.makedirs(results_dir, exist_ok=True)

# Single shared axes: each classifier adds its precision-recall curve to it.
fig, ax = plt.subplots()
for est in clf_dicts:
    name = est['model_name']
    save = est['saving_name']
    model = est['model_base']

    print(f'Grid Search on {name} classifier')

    # NOTE(review): grid_search_pipe comes from diabetes_model_creation; it is assumed to
    # return an estimator/pipeline exposing get_params, fit and predict — confirm.
    best = grid_search_pipe(model, name, x_train, y_train, preprocessor, measures)

    # print the best model and its hyperparameters
    print(f'best {name}: {best.get_params()}')

    # fit the best model on the training data
    best.fit(x_train, y_train)

    # make predictions on the testing data
    pred = best.predict(x_test)

    # Keep the tuned model next to its definition for later cells.
    est['best_model'] = best

    acc_, recall_, f1, precision = get_measures_values(y_test, pred)

    # print the performance measures
    print(f'Accuracy score: {acc_:.2%}')
    print(f'Recall score: {recall_:.2%}')
    print(f'F1-score: {f1:.2%}')
    print(f'Precision score: {precision:.2%}')

    # Plot the precision-recall curve from the model's continuous scores
    # (predict_proba / decision_function). Hard class labels from predict() carry a
    # single threshold, which collapses the curve to one operating point.
    PrecisionRecallDisplay.from_estimator(best, x_test, y_test, name=name, ax=ax)

    # Save measures under the model's file-name stem
    save_measures(acc_, recall_, f1, precision, save)

    # Save pipeline
    joblib.dump(best, os.path.join(model_folder, f'{save}_pipeline.pkl'))

# Address the figure explicitly: pyplot's "current figure" may have changed if a helper
# opened its own figure during the loop.
ax.set_title('Precision Recall curve of classifiers')
fig.savefig(os.path.join(results_dir, 'precision_recall_curve_clfs.png'), dpi=300)
plt.show()

### Visualizing results

In [None]:

# Load the per-model metric table; the first CSV column becomes the index.
# NOTE(review): presumably written by save_measures during the grid-search loop — confirm
# that it writes to this path.
scores = pd.read_csv(os.path.join(results_dir, 'scores.csv'), index_col=0)

# Display the table as the cell output.
scores

#### Visualizing classification metric performance

In [None]:
# One bar chart per metric column of `scores`, arranged on a rows x cols grid.
rows = 2
cols = 2
fig, ax = plt.subplots(rows, cols, sharex=True, constrained_layout=True, figsize=(12, 8))
measures_cols = scores.columns.tolist()

# Pair each panel (row-major order) with one metric column. zip stops at the shorter
# sequence, so a table with fewer than rows * cols metrics no longer raises IndexError.
for axis, measure in zip(ax.flat, measures_cols):
    scores[measure].plot.bar(rot=0, color='green', ax=axis)
    axis.set_ylabel(measure)

    # Write each value centred just above its bar. Anchoring at the bar centre with a
    # fixed point offset keeps labels aligned regardless of the bar's x position or height
    # (a multiplicative offset shifts unevenly and the wrong way for negative x).
    for pat in axis.patches:
        axis.annotate(
            f'{pat.get_height():.5f}',
            (pat.get_x() + pat.get_width() / 2, pat.get_height()),
            xytext=(0, 2),
            textcoords='offset points',
            ha='center',
            va='bottom',
        )

fig.suptitle('Performance Measurement')
fig.supxlabel('Models')
fig.savefig(os.path.join(results_dir, 'performance_models.png'), dpi=300)
plt.show()

The best model was Hist Gradient Boosting, almost tied with Gradient Boosting!


### Feature Importance on the best model

In [None]:
from sklearn.inspection import permutation_importance

# Reload the Hist Gradient Boosting pipeline persisted by the training loop.
# NOTE: joblib.load unpickles arbitrary objects — only load files produced by this notebook.
hgbm_clf_pipeline = joblib.load(os.path.join(model_folder, 'hgbm_clf_pipeline.pkl'))

# Permutation importance shuffles feature columns at random; a fixed seed keeps the
# resulting ranking reproducible between runs.
perm_importance = permutation_importance(hgbm_clf_pipeline, x_test, y_test, random_state=42)

# Take the labels from the frame that was actually permuted so names and scores line up.
feature_names = x_test.columns.tolist()
features = np.array(feature_names)

# Ascending order of mean importance: the most important feature ends up at the top of
# the horizontal bar chart.
sorted_idx = perm_importance.importances_mean.argsort()

fig, ax = plt.subplots()
ax.barh(features[sorted_idx], perm_importance.importances_mean[sorted_idx])
ax.set_xlabel("Permutation Importance")
ax.set_title('Feature importance on Hist Gradient Boosting Classifier')

fig.savefig(os.path.join(results_dir, 'feature_importance_hgbm.png'), dpi=300)
plt.show()