In [61]:
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd

## Generate a toy dataset for binary classification

In [138]:
def generate_data(n_samples=5000, n_features=20, weights=(0.95, 0.05),
                  random_state=2202):
    """Build a synthetic, imbalanced binary-classification dataset.

    Thin wrapper around ``sklearn.datasets.make_classification`` with
    ``n_classes=2``. Called without arguments it produces exactly the
    dataset this notebook was written for (5000 samples, 20 features,
    a 95% / 5% class split and a fixed seed).

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    n_features : int
        Number of feature columns.
    weights : sequence of float
        Proportion of samples requested for each of the two classes.
    random_state : int or None
        Seed forwarded to ``make_classification``.

    Returns
    -------
    tuple
        ``(x, y)`` as returned by ``make_classification``: the feature
        matrix and the label vector.
    """
    x, y = make_classification(n_samples=n_samples, n_features=n_features,
                               n_classes=2, weights=list(weights),
                               random_state=random_state)
    return x, y

# Module-level feature matrix and labels. The experiment functions further
# down in this notebook read these two names as globals, so this cell has to
# run before any of them is called.
x, y = generate_data()

## Metrics

In [131]:
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold

def classify(x, y, sampler=None, random_state=2202):
    """Cross-validate a depth-3 decision tree and print the averaged metrics.

    Prints the number of samples per class in ``y``, runs a 10-fold
    stratified cross-validation of ``DecisionTreeClassifier(max_depth=3)``
    and prints a one-column table with the mean accuracy, precision,
    recall and F1-score over the folds.

    Parameters
    ----------
    x : numpy.ndarray
        Feature matrix; it is indexed with the integer index arrays
        produced by ``StratifiedKFold.split``.
    y : numpy.ndarray
        Label vector. The class counts are derived from ``np.sum(y)``, so
        the labels are expected to be 0 / 1.
    sampler : object with ``fit_resample(x, y)``, optional
        When given, it is applied to the *training* part of every fold
        only; the held-out part is left untouched. This keeps resampled
        rows out of the data the metrics are computed on. With the
        default ``None`` no resampling happens inside this function.
    random_state : int or None
        Seed for the decision tree, so that repeated runs on the same
        data give the same table. The default matches the seed used for
        dataset generation in this notebook.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    n_positive = np.sum(y)
    print(f"Samples of class 0: {y.shape[0] - n_positive}")
    print(f"Samples of class 1: {n_positive}")

    # Defines the model to be used
    model = DecisionTreeClassifier(max_depth=3, random_state=random_state)

    # Row label -> metric function; insertion order is the row order of the
    # printed table.
    metric_fns = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1-score': f1_score,
    }
    fold_scores = {name: [] for name in metric_fns}

    # Stratified K-fold keeps the class balance of `y` in every fold
    st_k_fold = StratifiedKFold(n_splits=10)

    for train_idx, test_idx in st_k_fold.split(x, y):
        x_train, y_train = x[train_idx], y[train_idx]
        x_test, y_test = x[test_idx], y[test_idx]

        # Optional resampling touches the training data only, never the
        # held-out fold.
        if sampler is not None:
            x_train, y_train = sampler.fit_resample(x_train, y_train)

        model.fit(x_train, y_train)

        # Predict once per fold and reuse the predictions for every metric
        y_pred = model.predict(x_test)
        for name, metric in metric_fns.items():
            fold_scores[name].append(metric(y_test, y_pred))

    result_df = pd.DataFrame(
        [np.mean(scores) for scores in fold_scores.values()],
        index=list(fold_scores))
    result_df.index.name = 'avg.'
    print(result_df)

## IMBALANCED-LEARN IN PRACTICE

In [132]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

In [133]:
# Applies DT without fixing the class imbalance problem.
def dummy_decision_tree():
    """Baseline run: evaluate the decision tree on the data as generated.

    Passes the module-level ``x`` and ``y`` to ``classify`` unchanged,
    i.e. without any resampling step in front of it.
    """
    classify(x=x, y=y)

# Applies Random Undersampling
def under_sampler(random_state=2202):
    """Evaluate the decision tree on a randomly under-sampled dataset.

    Resamples the module-level ``x`` and ``y`` with ``RandomUnderSampler``
    and passes the result to ``classify``.

    Parameters
    ----------
    random_state : int or None
        Seed for the sampler, so that repeated runs select the same rows.
        The default matches the seed used for dataset generation in this
        notebook; pass ``None`` for a different draw on every call.

    Notes
    -----
    The whole dataset is resampled *before* ``classify`` splits it into
    folds, so the metrics are measured on resampled (balanced) folds
    rather than on data with the original class distribution.
    """
    sampler = RandomUnderSampler(random_state=random_state)
    x_resampled, y_resampled = sampler.fit_resample(x, y)
    classify(x_resampled, y_resampled)

# Applies Random Oversampling
def over_sampler(random_state=2202):
    """Evaluate the decision tree on a randomly over-sampled dataset.

    Resamples the module-level ``x`` and ``y`` with ``RandomOverSampler``
    and passes the result to ``classify``.

    Parameters
    ----------
    random_state : int or None
        Seed for the sampler, so that repeated runs duplicate the same
        rows. The default matches the seed used for dataset generation in
        this notebook; pass ``None`` for a different draw on every call.

    Notes
    -----
    The whole dataset is resampled *before* ``classify`` splits it into
    folds. Copies of one original minority row can therefore end up in
    both the training and the held-out part of a fold, which tends to
    make the reported metrics optimistic.
    """
    sampler = RandomOverSampler(random_state=random_state)
    x_resampled, y_resampled = sampler.fit_resample(x, y)
    classify(x_resampled, y_resampled)

# Applies Synthetic Data Augmentation through SMOTE
def smote(random_state=2202):
    """Evaluate the decision tree on a SMOTE-augmented dataset.

    Resamples the module-level ``x`` and ``y`` with ``SMOTE`` and passes
    the result to ``classify``.

    Parameters
    ----------
    random_state : int or None
        Seed for the sampler, so that repeated runs synthesise the same
        samples. The default matches the seed used for dataset generation
        in this notebook; pass ``None`` for a different draw on every call.

    Notes
    -----
    The whole dataset is resampled *before* ``classify`` splits it into
    folds, so the held-out folds contain synthetic samples and the metrics
    are not measured on the original class distribution.
    """
    # Local name deliberately differs from the function name so it does not
    # shadow `smote` inside its own body.
    sampler = SMOTE(random_state=random_state)
    x_smote, y_smote = sampler.fit_resample(x, y)
    classify(x_smote, y_smote)

### Without fixing the imbalance problem

In [134]:
dummy_decision_tree()

Samples of class 0: 4722
Samples of class 1: 278
                  0
avg.               
accuracy   0.962000
precision  0.774298
recall     0.456217
f1-score   0.569593


### Random undersampling

In [135]:
under_sampler()

Samples of class 0: 278
Samples of class 1: 278
                  0
avg.               
accuracy   0.786006
precision  0.849343
recall     0.701984
f1-score   0.764575


### Random oversampling

In [136]:
over_sampler()

Samples of class 0: 4722
Samples of class 1: 4722
                  0
avg.               
accuracy   0.810040
precision  0.892537
recall     0.707750
f1-score   0.788088


### SMOTE (Synthetic Minority Oversampling Technique)

In [137]:
smote()

Samples of class 0: 4722
Samples of class 1: 4722
                  0
avg.               
accuracy   0.829205
precision  0.870447
recall     0.776166
f1-score   0.819323
