In [None]:
import warnings
warnings.simplefilter('ignore', FutureWarning)
warnings.simplefilter('ignore', UserWarning)

# Imbalanced data

Imbalanced data occurs in classification when the number of instances in each class is not the same. Some care is required to learn to predict the *rare* classes effectively.

There is no one-size-fits-all approach to handling imbalanced data. A reasonable strategy is to consider this as a model selection problem, and use cross-validation to find an approach that works well for your data sets. We will show how to do this in the hyper-parameter optimization notebook. 

**Warning**: Like most things in ML, techniques should not be applied blindly, but considered carefully with the problem goal in mind. In many cases, there is a decision-theoretic problem of assigning the appropriate costs to minority and majority case mistakes that requires domain knowledge to model correctly. As you will see in this example, blind application of a technique does not necessarily improve performance.

## Simulate an imbalanced data set

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the four pre-split CSV files (features and labels for train and test).
X_train, X_test = pd.read_csv('data/X_train.csv'), pd.read_csv('data/X_test.csv')
y_train, y_test = pd.read_csv('data/y_train.csv'), pd.read_csv('data/y_test.csv')

# Stack train on top of test to recover one full data set; it is re-split
# later, after the class imbalance has been simulated.
X = pd.concat((X_train, X_test))
# squeeze() collapses the label frame to a Series
# (assumes the label files hold a single column — TODO confirm).
y = pd.concat((y_train, y_test)).squeeze()

In [None]:
y.value_counts()

In [None]:
np.random.seed(0)

In [None]:
# Keep every class-0 row, but only the class-1 rows whose uniform draw falls
# below 0.2 (roughly 20% of them), which makes class 1 artificially rare.
keep_draw = np.random.uniform(0, 1, y.shape) < 0.2
idx = ((y == 0) | ((y == 1) & keep_draw)).squeeze()

In [None]:
# Apply the same boolean mask to features and labels so the rows stay paired.
X_im = X.loc[idx, :]
y_im = y[idx]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Stratify on the labels so train and test keep the same (imbalanced) class
# ratio; with a plain random split the rare class can end up over- or
# under-represented in the test set purely by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X_im, y_im, stratify=y_im, random_state=0
)

In [None]:
y_test.value_counts(), y_train.value_counts()

## Collect more data

This is the best but often impractical solution. Synthetic data generation may also be an option.

## Use evaluation metrics that are less sensitive to imbalance

For example, the `F1` score (harmonic mean of precision and recall) is less sensitive than the accuracy score.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.utils import class_weight
from sklearn.metrics import roc_auc_score, confusion_matrix

In [None]:
from sklearn.dummy import DummyClassifier

In [None]:
clf = DummyClassifier(strategy='prior')

In [None]:
clf.fit(X_train, y_train)

In [None]:
clf.score(X_test, y_test)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score

In [None]:
# sklearn metrics take (y_true, y_pred): ground truth first, predictions second.
accuracy_score(y_test, clf.predict(X_test))

In [None]:
# sklearn metrics take (y_true, y_pred): ground truth first, predictions second.
f1_score(y_test, clf.predict(X_test))

In [None]:
lr = LogisticRegression()

In [None]:
lr.fit(X_train, y_train)

In [None]:
# sklearn metrics take (y_true, y_pred): ground truth first, predictions second.
accuracy_score(y_test, lr.predict(X_test))

In [None]:
# Balanced accuracy is the mean of per-class recall, so the argument order
# matters: ground truth must come first, predictions second.
balanced_accuracy_score(y_test, lr.predict(X_test))

In [None]:
# sklearn metrics take (y_true, y_pred): ground truth first, predictions second.
f1_score(y_test, lr.predict(X_test))

## Over-sample the minority class

There are many ways to over-sample the minority class. A popular algorithm is SMOTE (Synthetic Minority Over-sampling Technique).

![img](https://ars.els-cdn.com/content/image/1-s2.0-S0020025517310083-gr3.jpg)

In [None]:
! python3 -m pip install --quiet imbalanced-learn

In [None]:
import imblearn

In [None]:
# SMOTE synthesises new minority samples at random; fix the seed so re-running
# this cell on its own reproduces the same resampled training set.
X_train_resampled, y_train_resampled = (
    imblearn.over_sampling.SMOTE(random_state=0).fit_resample(X_train, y_train)
)

In [None]:
X_train.shape

In [None]:
X_train_resampled.shape

In [None]:
# Show class counts before and after over-sampling side by side.
y_train.value_counts(), y_train_resampled.value_counts()

### Evaluate if this helps

In [None]:
lr = LogisticRegression()

In [None]:
lr.fit(X_train, y_train)

In [None]:
# Baseline (no resampling). Ground truth first, predictions second.
f1_score(y_test, lr.predict(X_test))

In [None]:
# Called as (y_true, y_pred) so rows are true classes and columns are predictions.
confusion_matrix(y_test, lr.predict(X_test))

In [None]:
lr.fit(X_train_resampled, y_train_resampled)

In [None]:
# After fitting on the resampled data. Ground truth first, predictions second.
f1_score(y_test, lr.predict(X_test))

In [None]:
# Called as (y_true, y_pred) so rows are true classes and columns are predictions.
confusion_matrix(y_test, lr.predict(X_test))

## Under-sample the majority class

Tomek links are pairs of instances from different classes that are each other's nearest neighbors. Under-sampling is done by removing the majority-class member of each pair.

![img](https://miro.medium.com/max/2788/1*pR35KsLpz7-_zvbvdm0frg.png)

In [None]:
# Under-sample the training set with Tomek links; the test set is left untouched.
X_train_resampled, y_train_resampled = (
    imblearn.under_sampling.TomekLinks().fit_resample(X_train, y_train)
)

In [None]:
X_train.shape

In [None]:
X_train_resampled.shape

In [None]:
y_train.value_counts()

In [None]:
y_train_resampled.value_counts()

### Evaluate if this helps

In [None]:
lr = LogisticRegression()

In [None]:
lr.fit(X_train, y_train)

In [None]:
# Baseline (no resampling). Ground truth first, predictions second.
f1_score(y_test, lr.predict(X_test))

In [None]:
# Called as (y_true, y_pred) so rows are true classes and columns are predictions.
confusion_matrix(y_test, lr.predict(X_test))

In [None]:
lr.fit(X_train_resampled, y_train_resampled)

In [None]:
# After fitting on the resampled data. Ground truth first, predictions second.
f1_score(y_test, lr.predict(X_test))

In [None]:
# Called as (y_true, y_pred) so rows are true classes and columns are predictions.
confusion_matrix(y_test, lr.predict(X_test))

## Combine over- and under-sampling

For example, over-sample using SMOTE then clean using Tomek.

In [None]:
# The SMOTE step is random; fix the seed so re-running this cell on its own
# reproduces the same resampled training set.
X_train_resampled, y_train_resampled = (
    imblearn.combine.SMOTETomek(random_state=0).fit_resample(X_train, y_train)
)

In [None]:
X_train.shape

In [None]:
X_train_resampled.shape

In [None]:
y_train.value_counts()

In [None]:
y_train_resampled.value_counts()

### Evaluate if this helps

In [None]:
lr = LogisticRegression()

In [None]:
lr.fit(X_train, y_train)

In [None]:
# Baseline (no resampling). Ground truth first, predictions second.
f1_score(y_test, lr.predict(X_test))

In [None]:
# Called as (y_true, y_pred) so rows are true classes and columns are predictions.
confusion_matrix(y_test, lr.predict(X_test))

In [None]:
lr.fit(X_train_resampled, y_train_resampled)

In [None]:
# After fitting on the resampled data. Ground truth first, predictions second.
f1_score(y_test, lr.predict(X_test))

In [None]:
# Called as (y_true, y_pred) so rows are true classes and columns are predictions.
confusion_matrix(y_test, lr.predict(X_test))

## Use class weights to adjust the loss function

We make prediction errors in the minority class more costly than prediction errors in the majority class.

In [None]:
# `classes` and `y` must be passed by keyword in current scikit-learn.
# The returned weights are ordered like np.unique(y_train).
wts = class_weight.compute_class_weight(
    'balanced', classes=np.unique(y_train), y=y_train
)

In [None]:
wts

You can then pass in the class weights. Note that there are several alternative ways to calculate possible class weights to use, and you can also do a GridSearch on weights.

This is actually built into most classifiers. By default, every class gets equal weight.

In [None]:
# class_weight must be a {label: weight} dict (or 'balanced'), not a bare array.
# wts is ordered like np.unique(y_train), so zip the two together.
lr = LogisticRegression(class_weight=dict(zip(np.unique(y_train), wts)))

In [None]:
lr.fit(X_train, y_train)

In [None]:
lr.class_weight

In [None]:
# sklearn metrics take (y_true, y_pred): ground truth first, predictions second.
f1_score(y_test, lr.predict(X_test))

In [None]:
# ROC AUC takes (y_true, y_score) and ranks continuous scores, so pass the
# predicted probability of the positive class (column 1) instead of hard labels.
roc_auc_score(y_test, lr.predict_proba(X_test)[:, 1])

In [None]:
# Called as (y_true, y_pred) so rows are true classes and columns are predictions.
confusion_matrix(y_test, lr.predict(X_test))

In [None]:
# 'balanced' asks the estimator to derive the class weights from the training labels.
lr_balanced = LogisticRegression(class_weight='balanced')

In [None]:
lr_balanced.class_weight

In [None]:
lr_balanced.fit(X_train, y_train)

In [None]:
# ROC AUC takes (y_true, y_score) and ranks continuous scores, so pass the
# predicted probability of the positive class (column 1) instead of hard labels.
roc_auc_score(y_test, lr_balanced.predict_proba(X_test)[:, 1])

In [None]:
# Called as (y_true, y_pred) so rows are true classes and columns are predictions.
confusion_matrix(y_test, lr_balanced.predict(X_test))

In [None]:
# sklearn metrics take (y_true, y_pred): ground truth first, predictions second.
f1_score(y_test, lr_balanced.predict(X_test))

## Use a classifier that is less sensitive to imbalance

Boosted trees are generally good because of their sequential nature.

In [None]:
from catboost import CatBoostClassifier

In [None]:
cb = CatBoostClassifier()

In [None]:
cb.fit(X_train, y_train, verbose=0)

In [None]:
# sklearn metrics take (y_true, y_pred): ground truth first, predictions second.
f1_score(y_test, cb.predict(X_test))

In [None]:
# Called as (y_true, y_pred) so rows are true classes and columns are predictions.
confusion_matrix(y_test, cb.predict(X_test))

### Imbalanced learn has classifiers that balance the data automatically

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier

In [None]:
# The forest bootstraps and under-samples at random; fix the seed so the
# reported scores are reproducible when this cell is re-run.
brf = BalancedRandomForestClassifier(random_state=0)

In [None]:
brf.fit(X_train, y_train)

In [None]:
# Called as (y_true, y_pred) so rows are true classes and columns are predictions.
confusion_matrix(y_test, brf.predict(X_test))

In [None]:
# sklearn metrics take (y_true, y_pred): ground truth first, predictions second.
f1_score(y_test, brf.predict(X_test))