I have just discovered that there's another method of dealing with imbalanced data. A lot of the models in sklearn have a "class_weight" parameter that changes the penalties associated with different label values. Let's give it a try (as well as fine-tuning the model).

In [2]:
import pandas as pd
import dill
from sklearnex import patch_sklearn
# Switch on the Intel(R) Extension for Scikit-learn; the call prints a confirmation banner.
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Import data and apply the same preprocessing as in exploration_2.ipynb and exploration_3.ipynb

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the cleaned heart-disease data set.
data = pd.read_csv('heart_2020_cleaned.csv')

# One-hot encode every object-typed column.
category_cols = [name for name, dtype in data.dtypes.items() if dtype == 'object']
data = pd.get_dummies(data=data, columns=category_cols, drop_first=True)

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['HeartDisease_Yes'], axis=1),
    data['HeartDisease_Yes'],
    test_size=0.2,
    random_state=42,
)

# Same feature scaling as in the 1st exploration:
# the scaler is fitted on the training split only and then applied to both splits.
std_scaler = StandardScaler()
X_train = std_scaler.fit_transform(X_train)
X_test = std_scaler.transform(X_test)

In [4]:
# Class frequencies of the training labels (kept for inspection).
counts = y_train.value_counts()

# Ratio of negative to positive training samples, used later as the positive-class weight.
# Each class is counted by comparing values instead of indexing `counts` with the
# integers 0 and 1: an integer key on a Series may be resolved by label or by position
# depending on the index dtype (e.g. boolean dummy columns), which makes
# counts[0] / counts[1] fragile.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
scale_pos_weight = n_negative / n_positive

Let's first fine tune XGBClassifier.

We optimize for the F1 score because the ROC AUC score is not affected by the imbalance of the data as much as the F1 score is. We fine-tune the parameters scale_pos_weight, max_depth, and min_child_weight first because they have the most impact on the model.

In [5]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Metrics recorded for every candidate; the search refits on the best F1.
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# First tuning stage: class weighting and tree-shape parameters.
params = {
    'scale_pos_weight': [scale_pos_weight, 1, 100],
    'max_depth': [3, 5, 10],
    'min_child_weight': [1, 3, 6],
}
xgb_grid = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=params,
    cv=5,
    scoring=scoring,
    refit='f1',
    verbose=2,
)

In [14]:
# Run the first grid search (27 candidates x 5 folds = 135 fits); verbose=2 logs each fit.
xgb_grid.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END max_depth=3, min_child_weight=1, scale_pos_weight=10.745833524631559; total time=   6.9s
[CV] END max_depth=3, min_child_weight=1, scale_pos_weight=10.745833524631559; total time=   7.2s
[CV] END max_depth=3, min_child_weight=1, scale_pos_weight=10.745833524631559; total time=   8.7s
[CV] END max_depth=3, min_child_weight=1, scale_pos_weight=10.745833524631559; total time=   8.3s
[CV] END max_depth=3, min_child_weight=1, scale_pos_weight=10.745833524631559; total time=   9.1s
[CV] END max_depth=3, min_child_weight=1, scale_pos_weight=1; total time=   9.0s
[CV] END max_depth=3, min_child_weight=1, scale_pos_weight=1; total time=  10.1s
[CV] END max_depth=3, min_child_weight=1, scale_pos_weight=1; total time=   9.2s
[CV] END max_depth=3, min_child_weight=1, scale_pos_weight=1; total time=   7.1s
[CV] END max_depth=3, min_child_weight=1, scale_pos_weight=1; total time=   6.9s
[CV] END max_depth=3, min_child_weight=1, s

GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=None, gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_bin=None,
                                     max_ca...
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                        

In [22]:
# Report the winning parameter combination and its mean cross-validated F1.
print(xgb_grid.best_params_)
print(xgb_grid.best_score_)

# Show every tracked metric for the best candidate.
xgb_grid_result_df = pd.DataFrame(xgb_grid.cv_results_)
print(
    xgb_grid_result_df.loc[
        xgb_grid.best_index_,
        [
            'mean_test_accuracy',
            'mean_test_precision',
            'mean_test_recall',
            'mean_test_f1',
            'mean_test_roc_auc',
        ],
    ]
)

# Keep the selected values for the later tuning stages.
optimal_scale_pos_weight, optimal_max_depth, optimal_min_child_weight = (
    xgb_grid.best_params_[key]
    for key in ('scale_pos_weight', 'max_depth', 'min_child_weight')
)

{'max_depth': 5, 'min_child_weight': 3, 'scale_pos_weight': 10.745833524631559}
0.34779100552244785
mean_test_accuracy     0.753295
mean_test_precision    0.224405
mean_test_recall         0.7726
mean_test_f1           0.347791
mean_test_roc_auc       0.83846
Name: 12, dtype: object


Now let's fine tune the parameter gamma

In [24]:
# Second tuning stage: search gamma over 0.0 .. 0.9 with the stage-one parameters fixed.
xgb_grid2 = GridSearchCV(
    estimator=XGBClassifier(
        scale_pos_weight=optimal_scale_pos_weight,
        max_depth=optimal_max_depth,
        min_child_weight=optimal_min_child_weight,
    ),
    param_grid={'gamma': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]},
    cv=5,
    scoring=scoring,
    refit='f1',
    verbose=2,
)

xgb_grid2.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END ..........................................gamma=0.0; total time=  16.0s
[CV] END ..........................................gamma=0.0; total time=  16.0s
[CV] END ..........................................gamma=0.0; total time=  18.2s
[CV] END ..........................................gamma=0.0; total time=  17.5s
[CV] END ..........................................gamma=0.0; total time=  17.8s
[CV] END ..........................................gamma=0.1; total time=  16.3s
[CV] END ..........................................gamma=0.1; total time=  15.1s
[CV] END ..........................................gamma=0.1; total time=  15.3s
[CV] END ..........................................gamma=0.1; total time=  15.0s
[CV] END ..........................................gamma=0.1; total time=  14.5s
[CV] END ..........................................gamma=0.2; total time=  15.9s
[CV] END .......................................

GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=None, gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_bin=None,
                                     max_ca...
                                     max_delta_step=None, max_depth=5,
                                     max_leaves=None, min_child_weight=3,
                                     missing=nan, monotone_constraints=None,
                        

In [27]:
# Report the best gamma and its mean cross-validated F1.
print(xgb_grid2.best_params_)
print(xgb_grid2.best_score_)

# Show every tracked metric for the best candidate.
xgb_grid_result_df2 = pd.DataFrame(xgb_grid2.cv_results_)
print(
    xgb_grid_result_df2.loc[
        xgb_grid2.best_index_,
        [
            'mean_test_accuracy',
            'mean_test_precision',
            'mean_test_recall',
            'mean_test_f1',
            'mean_test_roc_auc',
        ],
    ]
)

# Keep the selected value for the later tuning stages.
optimal_gamma = xgb_grid2.best_params_['gamma']

{'gamma': 0.0}
0.34779100552244785
mean_test_accuracy     0.753295
mean_test_precision    0.224405
mean_test_recall         0.7726
mean_test_f1           0.347791
mean_test_roc_auc       0.83846
Name: 0, dtype: object


Next, fine-tune parameters subsample and colsample_bytree.

In [28]:
# Third tuning stage: row and column sampling ratios, each over 0.5 .. 0.9.
xgb_grid3 = GridSearchCV(
    estimator=XGBClassifier(
        scale_pos_weight=optimal_scale_pos_weight,
        max_depth=optimal_max_depth,
        min_child_weight=optimal_min_child_weight,
        gamma=optimal_gamma,
    ),
    param_grid={
        'subsample': [0.5, 0.6, 0.7, 0.8, 0.9],
        'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
    },
    cv=5,
    scoring=scoring,
    refit='f1',
    verbose=2,
)

xgb_grid3.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] END ................colsample_bytree=0.5, subsample=0.5; total time=  16.0s
[CV] END ................colsample_bytree=0.5, subsample=0.5; total time=  15.6s
[CV] END ................colsample_bytree=0.5, subsample=0.5; total time=  16.8s
[CV] END ................colsample_bytree=0.5, subsample=0.5; total time=  17.7s
[CV] END ................colsample_bytree=0.5, subsample=0.5; total time=  17.7s
[CV] END ................colsample_bytree=0.5, subsample=0.6; total time=  16.8s
[CV] END ................colsample_bytree=0.5, subsample=0.6; total time=  17.0s
[CV] END ................colsample_bytree=0.5, subsample=0.6; total time=  18.2s
[CV] END ................colsample_bytree=0.5, subsample=0.6; total time=  17.5s
[CV] END ................colsample_bytree=0.5, subsample=0.6; total time=  11.0s
[CV] END ................colsample_bytree=0.5, subsample=0.7; total time=  12.7s
[CV] END ................colsample_bytree=0.5, 

GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=0.0, gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_bin=None,
                                     max_cat...
                                     max_leaves=None, min_child_weight=3,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                           

In [31]:
# Report the best sampling ratios and their mean cross-validated F1.
print(xgb_grid3.best_params_)
print(xgb_grid3.best_score_)

# Show every tracked metric for the best candidate.
xgb_grid_result_df3 = pd.DataFrame(xgb_grid3.cv_results_)
print(
    xgb_grid_result_df3.loc[
        xgb_grid3.best_index_,
        [
            'mean_test_accuracy',
            'mean_test_precision',
            'mean_test_recall',
            'mean_test_f1',
            'mean_test_roc_auc',
        ],
    ]
)

# Keep the selected values for the later tuning stages.
optimal_subsample, optimal_colsample_bytree = (
    xgb_grid3.best_params_[key] for key in ('subsample', 'colsample_bytree')
)

{'colsample_bytree': 0.8, 'subsample': 0.9}
0.34768505068007693
mean_test_accuracy     0.752517
mean_test_precision    0.224144
mean_test_recall       0.774666
mean_test_f1           0.347685
mean_test_roc_auc      0.838594
Name: 19, dtype: object


Next, fine-tune the parameter reg_alpha.

In [35]:
# Fourth tuning stage: reg_alpha, with all previously tuned parameters fixed.
xgb_grid4 = GridSearchCV(
    estimator=XGBClassifier(
        scale_pos_weight=optimal_scale_pos_weight,
        max_depth=optimal_max_depth,
        min_child_weight=optimal_min_child_weight,
        gamma=optimal_gamma,
        subsample=optimal_subsample,
        colsample_bytree=optimal_colsample_bytree,
    ),
    param_grid={'reg_alpha': [0, 0.0001, 0.01, 1, 100]},
    cv=5,
    scoring=scoring,
    refit='f1',
    verbose=2,
)

xgb_grid4.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END ........................................reg_alpha=0; total time=  18.3s
[CV] END ........................................reg_alpha=0; total time=  18.2s
[CV] END ........................................reg_alpha=0; total time=  20.6s
[CV] END ........................................reg_alpha=0; total time=  18.7s
[CV] END ........................................reg_alpha=0; total time=  18.2s
[CV] END ...................................reg_alpha=0.0001; total time=  18.4s
[CV] END ...................................reg_alpha=0.0001; total time=  18.6s
[CV] END ...................................reg_alpha=0.0001; total time=  18.6s
[CV] END ...................................reg_alpha=0.0001; total time=  18.7s
[CV] END ...................................reg_alpha=0.0001; total time=  18.5s
[CV] END .....................................reg_alpha=0.01; total time=  18.3s
[CV] END .....................................reg

GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=0.0, gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_bin=None,
                                     max_cat_...
                                     max_delta_step=None, max_depth=5,
                                     max_leaves=None, min_child_weight=3,
                                     missing=nan, monotone_constraints=None,
                        

In [36]:
# Report the best reg_alpha and its mean cross-validated F1.
print(xgb_grid4.best_params_)
print(xgb_grid4.best_score_)

# Show every tracked metric for the best candidate.
xgb_grid_result_df4 = pd.DataFrame(xgb_grid4.cv_results_)
print(
    xgb_grid_result_df4.loc[
        xgb_grid4.best_index_,
        [
            'mean_test_accuracy',
            'mean_test_precision',
            'mean_test_recall',
            'mean_test_f1',
            'mean_test_roc_auc',
        ],
    ]
)

# Keep the selected value for the final model.
optimal_alpha = xgb_grid4.best_params_['reg_alpha']

{'reg_alpha': 0}
0.34768505068007693
mean_test_accuracy     0.752517
mean_test_precision    0.224144
mean_test_recall       0.774666
mean_test_f1           0.347685
mean_test_roc_auc      0.838594
Name: 0, dtype: object


Finally, we increase the number of trees (100 to 1000) and decrease the learning rate (from XGBoost's default of 0.3, which the grid searches above used, to 0.01).

In [37]:
# Final XGBoost model: the tuned parameters from the grid searches above,
# combined with more trees and a smaller learning rate.
final_xgb_clf = XGBClassifier(
    # capacity / step size
    n_estimators=1000,
    learning_rate=0.01,
    # class weighting and tree shape
    scale_pos_weight=optimal_scale_pos_weight,
    max_depth=optimal_max_depth,
    min_child_weight=optimal_min_child_weight,
    gamma=optimal_gamma,
    # sampling and regularisation
    subsample=optimal_subsample,
    colsample_bytree=optimal_colsample_bytree,
    reg_alpha=optimal_alpha,
)

final_xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.8,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0.0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.01, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=3,
              missing=nan, monotone_constraints='()', n_estimators=1000,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

Next, let's train a weighted Logistic Regression model.

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# Metrics recorded for every candidate; the search refits on the best F1.
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Grid over regularisation strength, positive-class weight and penalty type.
# The liblinear solver is used because it supports both the 'l1' and 'l2' penalties.
lr_grid = GridSearchCV(
    LogisticRegression(solver='liblinear'),
    {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'class_weight': [
            {0: 1, 1: 1},
            {0: 1, 1: scale_pos_weight},
            {0: 1, 1: scale_pos_weight * 10},
        ],
        'penalty': ['l1', 'l2'],
    },
    cv=5,
    scoring=scoring,
    refit='f1',  # a stray `0` after this string literal made the cell a SyntaxError
    verbose=1,
)

lr_grid.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


GridSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'class_weight': [{0: 1, 1: 1},
                                          {0: 1, 1: 10.745833524631559},
                                          {0: 1, 1: 107.45833524631558}],
                         'penalty': ['l1', 'l2']},
             refit='f1',
             scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
             verbose=1)

In [22]:
# Report the best logistic-regression configuration and its mean cross-validated F1.
print(lr_grid.best_params_)
print(lr_grid.best_score_)

# Show every tracked metric for the best candidate.
lr_grid_df = pd.DataFrame(lr_grid.cv_results_)
print(
    lr_grid_df.loc[
        lr_grid.best_index_,
        [
            'mean_test_accuracy',
            'mean_test_precision',
            'mean_test_recall',
            'mean_test_f1',
            'mean_test_roc_auc',
        ],
    ]
)

# Keep the selected values for later use.
optimal_C, optimal_class_weight, optimal_penalty = (
    lr_grid.best_params_[key] for key in ('C', 'class_weight', 'penalty')
)

{'C': 0.001, 'class_weight': {0: 1, 1: 10.745833524631559}, 'penalty': 'l1'}
0.350163674973566
mean_test_accuracy     0.755171
mean_test_precision    0.226201
mean_test_recall       0.774758
mean_test_f1           0.350164
mean_test_roc_auc      0.840916
Name: 2, dtype: object


Grid search for BalancedBaggingClassifier with 100 estimators takes a long time, so I will rely on intuition and set both max_features and max_samples to 0.8.

In [35]:
from imblearn.ensemble import BalancedBaggingClassifier

# Balanced bagging over logistic-regression base learners.
# max_features / max_samples are hand-picked at 0.8 rather than grid-searched.
bbc = BalancedBaggingClassifier(
    base_estimator=LogisticRegression(random_state=42),
    n_estimators=100,
    max_samples=0.8,
    max_features=0.8,
    random_state=42,
)

In [36]:
# Train the balanced bagging ensemble on the training split.
bbc.fit(X_train, y_train)

BalancedBaggingClassifier(base_estimator=LogisticRegression(random_state=42),
                          max_features=0.8, max_samples=0.8, n_estimators=100,
                          random_state=42)

Finally, let's put the 3 models together and train the soft VotingClassifier.

In [39]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of the three models, all weighted equally.
voting_clf = VotingClassifier(
    estimators=[
        ('xgb', final_xgb_clf),
        ('lr', lr_grid.best_estimator_),
        ('bbc', bbc),
    ],
    voting='soft',
    weights=[1, 1, 1],
)

voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('xgb',
                              XGBClassifier(base_score=0.5, booster='gbtree',
                                            callbacks=None, colsample_bylevel=1,
                                            colsample_bynode=1,
                                            colsample_bytree=0.8,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None, gamma=0.0,
                                            gpu_id=-1, grow_policy='depthwise',
                                            importance_type=None,
                                            interaction_constraints='',
                                            learning_rate=0.01, max_bin=256,
                                            m...
                                            predictor='auto', random_state=0,
                                     

In [42]:
# Imported here because no earlier cell brings classification_report into scope;
# without it this cell raises NameError on a fresh kernel.
from sklearn.metrics import classification_report

# Evaluate the soft-voting ensemble on the held-out test split.
y_pred = voting_clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.88      0.92     58367
           1       0.31      0.55      0.39      5592

    accuracy                           0.85     63959
   macro avg       0.63      0.71      0.65     63959
weighted avg       0.90      0.85      0.87     63959



In [41]:
# Serialise the trained ensemble; the context manager guarantees the file is
# flushed and closed even if dill.dump raises.
with open('voting_clf.pkl', 'wb') as model_file:
    dill.dump(voting_clf, model_file)

In [43]:
# Save all the models and results by dumping the whole session to 'exploration_4.db'.
dill.dump_session('exploration_4.db')