https://machinelearningmastery.com/bagging-and-random-forest-for-imbalanced-classification/

In [52]:
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = 'all'

In [53]:
import pandas as pd

pd.set_option( 'display.max_columns' , None ) 

In [54]:
from sklearn.datasets import make_classification

In [104]:
X , y = make_classification( n_samples = 10000 ,
                             n_features = 10 ,
                             n_redundant = 2 ,
                             n_clusters_per_class = 1 ,
                             weights = [ 0.99 ] ,
                             flip_y = 0.01 ,
                             random_state = 999 )

# 1. Bagging for Imbalanced Classification

## 1.1 Standard Bagging

In [105]:
from sklearn.ensemble import BaggingClassifier

In [106]:
model = BaggingClassifier()

In [107]:
from sklearn.model_selection import RepeatedStratifiedKFold

In [108]:
cv = RepeatedStratifiedKFold( n_splits = 10 , n_repeats = 3 , random_state = 777 )

In [109]:
from sklearn.model_selection import cross_val_score

from sklearn.metrics import roc_auc_score

In [110]:
scores = cross_val_score( model , X , y , scoring = 'roc_auc' , cv = cv , n_jobs = -1 )

In [111]:
from numpy import mean

from numpy import std

In [112]:
print( f'standard bagging | roc-auc : {mean(scores):.3f} ( {std(scores):.3f} )' )

standard bagging | roc-auc : 0.837 ( 0.058 )


## 1.2 Bagging With Random Undersampling

In [113]:
from imblearn.ensemble import BalancedBaggingClassifier

In [114]:
model = BalancedBaggingClassifier()

In [115]:
cv = RepeatedStratifiedKFold( n_splits = 10 , n_repeats = 3 , random_state = 777 )

In [116]:
scores = cross_val_score( model , X , y , scoring = 'roc_auc' , cv = cv , n_jobs = -1 )

In [117]:
print( f'bagging with random undersample | roc-auc : {mean(scores):.3f} ( {std(scores):.3f} )' )

bagging with random undersample | roc-auc : 0.850 ( 0.069 )


# 2. Random Forest for Imbalanced Classification

## 2.1 Standard Random Forest

In [118]:
from sklearn.ensemble import RandomForestClassifier

In [119]:
model = RandomForestClassifier( n_estimators = 10 )

In [120]:
cv = RepeatedStratifiedKFold( n_splits = 10 , n_repeats = 3 , random_state = 777 )

In [121]:
scores = cross_val_score( model , X , y , scoring = 'roc_auc' , cv = cv , n_jobs = -1 )

In [122]:
print( f'standard random forest | roc-auc : {mean(scores):.3f} ( {std(scores):.3f} )' )

standard random forest | roc-auc : 0.834 ( 0.060 )


## 2.2 Random Forest With Class Weighting

In [123]:
model = RandomForestClassifier( n_estimators = 10 , class_weight = 'balanced' )

In [124]:
cv = RepeatedStratifiedKFold( n_splits = 10 , n_repeats = 3 , random_state = 777 )

In [125]:
scores = cross_val_score( model , X , y , scoring = 'roc_auc' , cv = cv , n_jobs = -1 )

In [126]:
print( f'standard random forest | roc-auc : {mean(scores):.3f} ( {std(scores):.3f} )' )

standard random forest | roc-auc : 0.836 ( 0.054 )


## 2.3 Random Forest With Bootstrap Class Weighting

*Given that each decision tree is constructed from a bootstrap sample (e.g. random selection with replacement), the class distribution in the data sample will be different for each tree.
As such, it might be interesting to change the class weighting based on the class distribution in each bootstrap sample, instead of the entire training dataset.*

In [127]:
model = RandomForestClassifier( n_estimators = 10 , class_weight = 'balanced_subsample' )

In [128]:
cv = RepeatedStratifiedKFold( n_splits = 10 , n_repeats = 3 , random_state = 777 )

In [129]:
scores = cross_val_score( model , X , y , scoring = 'roc_auc' , cv = cv , n_jobs = -1 )

In [130]:
print( f'standard random forest | roc-auc : {mean(scores):.3f} ( {std(scores):.3f} )' )

standard random forest | roc-auc : 0.831 ( 0.056 )


## 2.4 Random Forest With Random Undersampling

In [131]:
from imblearn.ensemble import BalancedRandomForestClassifier

In [132]:
model = BalancedRandomForestClassifier( n_estimators = 10 )

In [133]:
cv = RepeatedStratifiedKFold( n_splits = 10 , n_repeats = 3 , random_state = 777 )

In [134]:
scores = cross_val_score( model , X , y , scoring = 'roc_auc' , cv = cv , n_jobs = -1 )

In [135]:
print( f'standard random forest | roc-auc : {mean(scores):.3f} ( {std(scores):.3f} )' )

standard random forest | roc-auc : nan ( nan )


# 3. Easy Ensemble for Imbalanced Classification

*The **Easy Ensemble** involves creating balanced samples of the training dataset by selecting all examples from the minority class and a subset from the majority class.*

In [99]:
from imblearn.ensemble import EasyEnsembleClassifier

In [100]:
model = EasyEnsembleClassifier( n_estimators = 10 )

In [101]:
cv = RepeatedStratifiedKFold( n_splits = 10 , n_repeats = 3 , random_state = 777 )

In [102]:
scores = cross_val_score( model , X , y , scoring = 'roc_auc' , cv = cv , n_jobs = -1 )

In [103]:
print( f'standard random forest | roc-auc : {mean(scores):.3f} ( {std(scores):.3f} )' )

standard random forest | roc-auc : 0.963 ( 0.043 )
