In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import seaborn as sns
import importlib
import matplotlib as mpl
importlib.reload(mpl); importlib.reload(plt); importlib.reload(sns)

%matplotlib inline

## Problem definition

In [None]:
from sklearn.datasets import fetch_openml

In [None]:
df, y = fetch_openml('adult', version=2, as_frame=True, return_X_y=True)
df = df.drop(columns=['fnlwgt', 'education-num'])

In [None]:
df.head()

In [None]:
y.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier

X_train, X_test, y_train, y_test = train_test_split(
    df, y, random_state=42
)

clf = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)

In [None]:
# Report the accuracy on the held-out split, like every other score in this
# notebook; a training-set score is not an estimate of generalisation.
print(f"The accuracy of my classifier is "
      f"{clf.score(X_test, y_test):.3f}")

In [None]:
target_counts = y.value_counts()
target_counts.plot(kind='barh', legend=True)
_ = plt.title(f"Class balance ratio "
              f"{target_counts.min() / target_counts.max():.3f}")

### Let's make a baseline

In [None]:
X_train.info()

In [None]:
# Split the feature names by dtype: pandas 'category' columns are handled as
# categorical features, every other column as a numerical feature.
cat_cols = df.select_dtypes(include='category').columns.tolist()
num_cols = df.select_dtypes(exclude='category').columns.tolist()

In [None]:
print(f"The numerical columns are\n {num_cols}")
print(f"The categorical columns are\n {cat_cols}")

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Categorical columns: replace missing values with a constant placeholder
# category (and append missing-value indicator columns), then one-hot encode.
# ``handle_unknown='ignore'`` asks the encoder to ignore categories it did not
# see during ``fit``.
# NOTE(review): 'unknwon' looks like a misspelling of 'unknown'. It is only a
# placeholder label, but the same spelling is reused by the other
# preprocessors in this notebook -- change them together if at all.
cat_preprocessor = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='unknwon',
                  add_indicator=True),
    OneHotEncoder(handle_unknown='ignore')
)
# Numerical columns: standardise first, then fill missing values with the
# column mean and append missing-value indicator columns.
num_preprocessor = make_pipeline(
    StandardScaler(),
    SimpleImputer(strategy='mean', add_indicator=True)
)

In [None]:
preprocessor = make_column_transformer(
    (cat_preprocessor, cat_cols),
    (num_preprocessor, num_cols)
)

In [None]:
from sklearn.linear_model import LogisticRegression

model = make_pipeline(
    preprocessor, LogisticRegression(max_iter=10000)
)

In [None]:
model.fit(X_train, y_train).score(X_test, y_test)

## The metrics 

### From scikit-learn

In [None]:
def print_metric(metric, clf, X, y_true, metric_params=None):
    """Print the value of ``metric`` for the predictions of ``clf`` on ``X``.

    Parameters
    ----------
    metric : callable
        Called as ``metric(y_true, y_pred, **metric_params)``. Its
        ``__name__`` (underscores replaced by spaces) is used in the message.
    clf : object
        Fitted estimator exposing ``predict``.
    X : array-like with a ``shape`` attribute
        Samples to predict on.
    y_true : array-like with a ``shape`` attribute
        Ground-truth targets; must have as many entries as ``X`` has rows.
    metric_params : dict, optional
        Extra keyword arguments forwarded to ``metric``.

    Raises
    ------
    AssertionError
        If ``X`` and ``y_true`` do not contain the same number of samples.
    """
    n_samples, n_targets = X.shape[0], y_true.shape[0]
    assert n_samples == n_targets, "Different samples size!!!"
    extra_kwargs = {} if metric_params is None else metric_params
    value = metric(y_true, clf.predict(X), **extra_kwargs)
    label = metric.__name__.replace('_', ' ')
    print(f"The {label} is {value:.3f}")

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
print_metric(accuracy_score, model, X_test, y_test)

In [None]:
from sklearn.metrics import balanced_accuracy_score

In [None]:
print_metric(balanced_accuracy_score, model, X_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Take the class labels from the model that produced ``y_pred`` (not from the
# unrelated dummy baseline ``clf``) and pass them explicitly to
# ``confusion_matrix`` so rows and columns are guaranteed to follow that order.
labels = model.classes_
cm_df = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=labels),
    columns=labels,
    index=labels
)
sns.set(font_scale=1.4) # for label size
# ``fmt='d'`` prints the raw integer counts instead of scientific notation.
ax = sns.heatmap(cm_df, annot=True, fmt='d', annot_kws={"size": 16},
                 cmap='Oranges')
# Rows of a confusion matrix are the true classes, columns the predictions.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")

# Explicit axis limits, derived from the number of classes rather than a
# hard-coded 2.
# NOTE(review): presumably a workaround for heatmap cells being cropped with
# some matplotlib versions -- confirm it is still needed.
ax.set_xlim(0, len(labels))
_ = ax.set_ylim(0, len(labels))

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

for metric in (precision_score, recall_score, f1_score):
    print_metric(metric, model, X_test, y_test,
                 {'pos_label': '>50K'})

In [None]:
for metric in (precision_score, recall_score, f1_score):
    print_metric(metric, model, X_test, y_test,
                 {'pos_label': '<=50K'})

In [None]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import plot_roc_curve

# The pipeline is split by hand: its last step (the classifier) is handed to
# the plotting helper together with the test features already transformed by
# its first step (the preprocessor).
# NOTE(review): ``plot_roc_curve`` is reported as removed from recent
# scikit-learn releases in favour of ``RocCurveDisplay.from_estimator`` --
# confirm against the version this notebook is pinned to.
_ = plot_roc_curve(model[-1], model[0].transform(X_test), y_test)

In [None]:
from sklearn.metrics import roc_auc_score

y_pred = model.predict_proba(X_test)
print(f"The ROC-AUC score is "
      f"{roc_auc_score(y_test, y_pred[:, 1]):.3f}")

### From imbalanced-learn

In [None]:
from imblearn.metrics import sensitivity_score
from imblearn.metrics import specificity_score

for metric in (specificity_score, sensitivity_score):
    print_metric(metric, model, X_test, y_test,
                 {'pos_label': '>50K'})

In [None]:
from imblearn.metrics import classification_report_imbalanced

y_pred = model.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
from imblearn.metrics import geometric_mean_score

print_metric(geometric_mean_score, model, X_test, y_test)

## The influence of imbalanced dataset on machine-learning model

#### Linear classifier

In [None]:
def plot_decision_function(X, y, clf, ax=None):
    """Draw the predicted regions of ``clf`` with the samples on top.

    Parameters
    ----------
    X : ndarray
        Samples; only columns 0 and 1 are read.
    y : ndarray
        Targets; samples equal to 1 are drawn in yellow, samples equal to 0
        in indigo.
    clf : object
        Fitted estimator exposing ``predict`` on two-column input.
    ax : matplotlib axes, optional
        Axes to draw on. A new figure is created when omitted.
    """
    if ax is None:
        ax = plt.subplots()[1]

    # Regular grid covering the data range, padded by one unit on each side.
    step = 0.02
    feat0, feat1 = X[:, 0], X[:, 1]
    grid_x = np.arange(feat0.min() - 1, feat0.max() + 1, step)
    grid_y = np.arange(feat1.min() - 1, feat1.max() + 1, step)
    xx, yy = np.meshgrid(grid_x, grid_y)

    # Predict every grid node, then fold the flat result back onto the grid.
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = clf.predict(grid_points).reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.4)

    # Class 1 is drawn first, then class 0, each with its own colour.
    for label, colour in ((1, 'yellow'), (0, 'indigo')):
        rows = np.flatnonzero(y == label)
        ax.scatter(X[rows, 0], X[rows, 1],
                   color=colour, alpha=0.8, edgecolor='k')

In [None]:
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=1000, n_features=2,
                           n_informative=2, n_redundant=0, n_repeated=0,
                           n_classes=2,
                           n_clusters_per_class=1,
                           weights=[0.5, 0.5],
                           class_sep=1.2, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0
)
model = make_pipeline(StandardScaler(), LinearSVC(max_iter=10000))
_ = model.fit(X_train, y_train)

In [None]:
fig, ax = plt.subplots(ncols=3, figsize=(15, 4))
plot_decision_function(X, y, model, ax=ax[0])
plot_decision_function(X_train, y_train, model, ax=ax[1])
plot_decision_function(X_test, y_test, model, ax=ax[2])

In [None]:
X, y = make_classification(n_samples=1000, n_features=2,
                           n_informative=2, n_redundant=0, n_repeated=0,
                           n_classes=2,
                           n_clusters_per_class=1,
                           weights=[0.05, 0.95],
                           class_sep=1.2, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0
)

In [None]:
_ = model.fit(X_train, y_train)

In [None]:
fig, ax = plt.subplots(ncols=3, figsize=(15, 4))
plot_decision_function(X, y, model, ax=ax[0])
plot_decision_function(X_train, y_train, model, ax=ax[1])
plot_decision_function(X_test, y_test, model, ax=ax[2])

Let's recall the loss function for the logistic regression in the binary case:

$L(\theta) = - \frac{1}{m} \sum_{i=1}^{m}\left[ y^{(i)} \log \left( h_{\theta} (x^{(i)}) \right) + \left( 1 - y^{(i)} \right) \log \left( 1 - h_{\theta} (x^{(i)}) \right) \right]$

We sum over the samples without applying any weights, so every sample contributes equally to the loss and the majority class dominates it.

#### Tree-based model

![](hellinger.png)

![](proba.png)

## What can we do about it?

In [None]:
df, y = fetch_openml('adult', version=2, as_frame=True, return_X_y=True)
df = df.drop(columns=['fnlwgt', 'education-num'])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier

X_train, X_test, y_train, y_test = train_test_split(
    df, y, random_state=42
)

In [None]:
# Helper function to evaluate the different models
def evaluate_classifier(clf, df_all_scores):
    """Fit ``clf`` and append its test scores as a new column.

    The notebook-level ``X_train``/``y_train`` are used for fitting and
    ``X_test``/``y_test`` for scoring.

    Parameters
    ----------
    clf : estimator
        Classifier to fit. Its ``name`` attribute, when present, is used as
        the column label; otherwise the class name is used.
    df_all_scores : pandas.DataFrame
        Scores collected so far, one column per classifier.

    Returns
    -------
    pandas.DataFrame
        ``df_all_scores`` with one extra column holding the test accuracy and
        the balanced accuracy of ``clf``, all values rounded to 3 decimals.
    """
    label = getattr(clf, 'name', clf.__class__.__name__)
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    balanced = balanced_accuracy_score(y_test, clf.predict(X_test))
    new_column = pd.DataFrame(
        {label: [accuracy, balanced]},
        index=['Test accuracy', 'Balanced accuracy']
    )
    combined = pd.concat([df_all_scores, new_column], axis=1)
    return combined.round(decimals=3)

### From scikit-learn

#### Baseline classifier

As we have seen before, we will compare the new classifiers with a dummy baseline which predicts the most frequent label in the dataset. This baseline will highlight the improvement compared to naive strategies.

In [None]:
dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.name = "Most Frequent Classifier"

df_all_scores = pd.DataFrame()
df_all_scores = evaluate_classifier(dummy_clf, df_all_scores)
df_all_scores

#### Make use of the `class_weight` parameter

A first class of methods relies on sample weights to correct the imbalance. The core idea here is to weight prediction mistakes on the minority class higher than mistakes on the most common class.

##### In linear model

In `scikit-learn`, some estimators have a `class_weight` parameter that makes this possible. The idea is that the empirical risk minimization (ERM) problem is changed to
$$
    \arg\min_\theta \frac{1}{\sum_i w_i} \sum_i w_i 1\{f_\theta(X_i) \neq y_i\}
$$
with weights $w_i = \frac{n}{kn_i}$, where $n$ is the total number of samples, $k$ the number of classes and $n_i$ the number of samples from class $y_i$. This effectively rebalances the training so that the model learns from both positive and negative examples.



In [None]:
cat_preprocessor = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='unknwon',
                  add_indicator=True),
    OneHotEncoder(handle_unknown='ignore')
)

num_preprocessor = make_pipeline(
    StandardScaler(),
    SimpleImputer(strategy='mean', add_indicator=True)
)

In [None]:
preprocessor_lr = make_column_transformer(
    (cat_preprocessor, cat_cols),
    (num_preprocessor, num_cols)
)

In [None]:
from sklearn.linear_model import LogisticRegression

model_lr = make_pipeline(
    preprocessor_lr, LogisticRegression(max_iter=10000)
)
model_lr.name = 'Logistic Regression'

In [None]:
df_all_scores = evaluate_classifier(model_lr, df_all_scores)
df_all_scores

Setting `class_weight='balanced'` uses the values of `y` to automatically adjust weights inversely proportional to class frequencies in the input data.

In [None]:
from sklearn.base import clone

model_lr_balanced = clone(model_lr)
model_lr_balanced.set_params(
    logisticregression__class_weight='balanced')
model_lr_balanced.name = "Logistic Regression with balanced weights"
df_all_scores = evaluate_classifier(model_lr_balanced, df_all_scores)
df_all_scores

#### In tree-based model

In tree-based models, the `class_weight` option is used to choose the splits. Indeed, the impurity criterion (which is minimized to choose the split) is computed using these weights. In the leaves, the weights are used to compute the class to output.

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier

cat_preprocessor = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='unknwon',
                  add_indicator=True),
    OrdinalEncoder()
)

num_preprocessor = SimpleImputer(strategy='mean', add_indicator=True)

preprocessor_rf = make_column_transformer(
    (cat_preprocessor, cat_cols),
    (num_preprocessor, num_cols)
)
model_rf = make_pipeline(
    preprocessor_rf,
    RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
)
model_rf.name = "Random Forest"

In [None]:
df_all_scores = evaluate_classifier(model_rf, df_all_scores)
df_all_scores

In [None]:
model_rf_balanced = clone(model_rf)
model_rf_balanced.set_params(
    randomforestclassifier__class_weight='balanced')
# The label must be unique in the score table: "Balanced Random Forest" is
# also the label of the imbalanced-learn ``BalancedRandomForestClassifier``
# pipeline evaluated later, which would yield two identically named columns.
# This wording mirrors "Logistic Regression with balanced weights".
model_rf_balanced.name = "Random Forest with balanced weights"

df_all_scores = evaluate_classifier(model_rf_balanced, df_all_scores)
df_all_scores

For ensemble models, `class_weight` can also take the value `'balanced_subsample'`. This option is equivalent to the `'balanced'` one except that the weights are computed from the bootstrap sample of each tree instead of being computed globally.

In [None]:
model_rf_subbalanced = clone(model_rf)
model_rf_subbalanced.set_params(
    randomforestclassifier__class_weight='balanced_subsample')
model_rf_subbalanced.name = "Balanced Subsample Random Forest"

df_all_scores = evaluate_classifier(model_rf_subbalanced, df_all_scores)
df_all_scores

### 2.3 Resample the training set to have balanced classes

A second option to learn on unbalanced data is to reweight the classes by sampling a new training set with balanced classes. This can be done by either subsampling, oversampling or more complicated schemes demonstrated in this section.

#### From imbalanced-learn

#### Random under-sampling during training

In [None]:
from imblearn.pipeline import make_pipeline as make_pipeline_imblearn
from imblearn.under_sampling import RandomUnderSampler

In [None]:
model_lr_undersampled = make_pipeline_imblearn(
    preprocessor_lr,
    # Seed the sampler so that the resampled training set, and therefore the
    # reported scores, are identical from one run of the notebook to the next.
    RandomUnderSampler(random_state=42),
    LogisticRegression(solver='lbfgs', max_iter=1000)
)
model_lr_undersampled.name = "Logistic Regression from rebalanced undersampled data"

In [None]:
df_all_scores = evaluate_classifier(model_lr_undersampled, df_all_scores)
df_all_scores

#### Random over-sampling during training

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
model_lr_oversampled = make_pipeline_imblearn(
    preprocessor_lr,
    # Seed the sampler so that the resampled training set, and therefore the
    # reported scores, are identical from one run of the notebook to the next.
    RandomOverSampler(random_state=42),
    LogisticRegression(solver='lbfgs', max_iter=1000)
)
model_lr_oversampled.name = "Logistic Regression from rebalanced oversampled data"

In [None]:
df_all_scores = evaluate_classifier(model_lr_oversampled, df_all_scores)
df_all_scores

#### More fancy methods


There exist fancier methods to re-balance the dataset. One example is the SMOTE method, where extra points are generated by creating synthetic points for the minority class. See more info in the [original paper](https://arxiv.org/pdf/1106.1813.pdf) or in this [blog post](http://rikunert.com/SMOTE_explained).

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
model_lr_smote = make_pipeline_imblearn(
    preprocessor_lr,
    # Seed the sampler so that the synthetic samples, and therefore the
    # reported scores, are identical from one run of the notebook to the next.
    SMOTE(random_state=42),
    LogisticRegression(solver='lbfgs', max_iter=1000)
)
model_lr_smote.name = "Logistic Regression from SMOTE sampled data"

In [None]:
df_all_scores = evaluate_classifier(model_lr_smote, df_all_scores)
df_all_scores

You can look in [imbalanced-learn documentation](https://imbalanced-learn.readthedocs.io/en/stable/api.html#module-imblearn.over_sampling) for more sampling strategies.

### 2.4 Use balanced algorithms: `BalancedRandomForestClassifier` and `BalancedBaggingClassifier`

Instead of just sampling the training set to rebalance the classes, it is also possible to use _balanced_ classifiers to fit the unbalanced dataset. The core idea is to use ensemble techniques with specific bootstrap sampling strategies that make sure that each bootstrap sample is balanced.

#### Example of `BalancedRandomForestClassifier`

Here, a random forest is learned on the full dataset. Each tree is constructed using a balanced sub-sample of the dataset.  
This idea has been proposed by [Chen et al. (2004)](https://statistics.berkeley.edu/sites/default/files/tech-reports/666.pdf).

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier

model_balanced_rf = make_pipeline(
    preprocessor_rf,
    BalancedRandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
)
model_balanced_rf.name = "Balanced Random Forest"

In [None]:
df_all_scores = evaluate_classifier(model_balanced_rf, df_all_scores)
df_all_scores

#### Example of `BalancedBaggingClassifier`

In ensemble classifiers, bagging methods build several estimators on different randomly selected subsets of the data. In scikit-learn, this classifier is named `BaggingClassifier`. However, this classifier does not allow balancing each subset of data. Therefore, when training on an imbalanced dataset, this classifier will favor the majority classes.

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier

In [None]:
model_bagging = make_pipeline(
    preprocessor_rf,
    BaggingClassifier(base_estimator=HistGradientBoostingClassifier(),
                      n_estimators=10, random_state=42, n_jobs=-1)
)
model_bagging.name = "Bagging Model"

In [None]:
df_all_scores = evaluate_classifier(model_bagging, df_all_scores)
df_all_scores

`BalancedBaggingClassifier` resamples each subset of data before training each estimator of the ensemble. In short, it combines the output of an `EasyEnsemble` sampler with an ensemble of classifiers (i.e. `BaggingClassifier`). Therefore, `BalancedBaggingClassifier` takes the same parameters as the scikit-learn `BaggingClassifier`. In addition, there are two extra parameters, `sampling_strategy` and `replacement`, to control the behaviour of the random under-sampler.

In [None]:
from imblearn.ensemble import BalancedBaggingClassifier
model_balanced_bagging = make_pipeline(
    preprocessor_rf,
    BalancedBaggingClassifier(base_estimator=HistGradientBoostingClassifier(),
                              n_estimators=10, random_state=42, n_jobs=-1)
)
model_balanced_bagging.name = "Balanced Bagging Model"

In [None]:
df_all_scores = evaluate_classifier(model_balanced_bagging, df_all_scores)
df_all_scores

In [None]:
from sklearn.svm import OneClassSVM
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# One-class approach: the pipeline is fitted only on the training rows
# labelled '>50K'.
# NOTE(review): this reuses ``preprocessor`` (the first column transformer
# built in this notebook) and rebinds the name ``model`` used by earlier cells.
model = make_pipeline(
    preprocessor, OneClassSVM(kernel='linear', nu=0.1)
)
# Boolean mask selecting the '>50K' training samples.
mask_single_class = y_train == '>50K'
_ = model.fit(X_train[mask_single_class], y_train[mask_single_class])

In [None]:
y_pred = model.predict(X_test)

In [None]:
# Encode the ground truth as +1 for '>50K' and -1 otherwise so that it can be
# compared with ``y_pred``.
# NOTE(review): presumably this mirrors the +1 (inlier) / -1 (outlier) output
# convention of ``OneClassSVM.predict`` -- confirm.
y_test_converted = (2 * (y_test == '>50K').astype(int)) - 1
score = balanced_accuracy_score(y_test_converted, y_pred)
print(f"Balanced accuracy {score:.2f}")