# Import Libraries

Reference article: https://towardsdatascience.com/imbalanced-classification-in-python-smote-enn-method-db5db06b8d50

In [None]:
# Import the resampling utilities (under-, over- and combined sampling) and other necessary libraries
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load Dataset

In [None]:
# Remote CSV of the bank-marketing data (file name indicates it is already encoded).
url = "https://raw.githubusercontent.com/jackty9/Handling_Imbalanced_Data_in_Python/master/bank-full-encoded.csv"
df = pd.read_csv(url)

# Separating the independent variables from dependent variables:
# every column except the last is a feature, the last column is the target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Split train-test data (70% train / 30% test).
# - stratify=y keeps the class ratio the same in both splits, which matters
#   because the target is heavily imbalanced.
# - random_state pins the shuffle so the split (and every count printed from
#   it) is reproducible across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Summarize class distribution of the training labels before any resampling
print("Before sampling: ", Counter(y_train))

Before sampling:  Counter({0: 28024, 1: 3623})


In [None]:
# Absolute number of training samples per class (displayed as the cell output)
y_train.value_counts()

0    28024
1     3623
Name: y, dtype: int64

In [None]:
# Share of each class in the training labels (normalize=True returns fractions instead of counts)
y_train.value_counts(normalize=True)

0    0.885518
1    0.114482
Name: y, dtype: float64

In [None]:
# Display the full loaded DataFrame (features plus the target in the last column)
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,40,4,1,2,0,3036,1,0,2,4,8,261,0,0,0,3,0
1,26,9,2,1,0,945,1,0,2,4,8,151,0,0,0,3,0
2,15,2,1,1,0,918,1,1,2,4,8,76,0,0,0,3,0
3,29,1,1,3,0,2420,1,0,2,4,8,92,0,0,0,3,0
4,15,11,2,3,0,917,0,0,2,4,8,198,0,0,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,33,9,1,2,0,1741,0,0,0,16,9,975,2,0,0,3,1
45207,53,5,0,0,0,2639,0,0,0,16,9,456,1,0,0,3,1
45208,54,5,1,1,0,5455,0,0,0,16,9,1116,4,181,3,2,1
45209,39,1,1,1,0,1584,0,0,1,16,9,508,3,0,0,3,0


# Undersampling Strategy

In [None]:
# Define undersampling strategy: randomly drop majority-class rows until the
# minority/majority ratio is 1.0 (fully balanced).
# random_state pins which rows are kept so reruns give the same sample.
undersample = RandomUnderSampler(sampling_strategy=1.0, random_state=42)

# Fit and apply the transform (only the training split is resampled)
X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)

# Summarize class distribution
print("After undersampling: ", Counter(y_train_under))

After undersampling:  Counter({0: 3623, 1: 3623})


# Oversampling Strategy

In [None]:
# Define oversampling strategy: randomly duplicate minority-class rows until
# the minority/majority ratio reaches 0.5 (one minority row per two majority rows).
# random_state pins which rows are duplicated so reruns give the same sample.
oversample = RandomOverSampler(sampling_strategy=0.5, random_state=42)

# Fit and apply the transform (only the training split is resampled)
X_train_over, y_train_over = oversample.fit_resample(X_train, y_train)

# Summarize class distribution
print("After oversampling: ", Counter(y_train_over))

After oversampling:  Counter({0: 28024, 1: 14012})


In [None]:
# Share of each class in the resampled training labels (fractions, not counts)
y_train_over.value_counts(normalize=True)

0    0.666667
1    0.333333
Name: y, dtype: float64

# SMOTE: Oversampling

In [None]:
# Define oversampling strategy: SMOTE creates synthetic minority-class rows
# until the minority/majority ratio reaches 0.5.
# random_state pins the synthetic samples so reruns are reproducible.
# NOTE(review): several feature columns (e.g. job, marital, education) look
# like label-encoded categories; plain SMOTE interpolates between rows, so
# consider SMOTENC for such columns -- confirm against the dataset encoding.
oversample = SMOTE(sampling_strategy=0.5, random_state=42)

# Fit and apply the transform (only the training split is resampled).
# This rebinds X_train_over / y_train_over from any earlier resampling cell.
X_train_over, y_train_over = oversample.fit_resample(X_train, y_train)

# Summarize class distribution
print("After oversampling: ", Counter(y_train_over))

After oversampling:  Counter({0: 28024, 1: 14012})


In [None]:
# Share of each class in the resampled training labels (fractions, not counts)
y_train_over.value_counts(normalize=True)

0    0.666667
1    0.333333
Name: y, dtype: float64

In [None]:
# Display the resampled training features (original rows plus the rows added by resampling)
X_train_over

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,14,0,2,1,0,2381,0,0,1,29,4,118,4,0,0,3
1,30,4,1,2,0,2063,0,0,0,4,1,389,0,61,2,2
2,10,1,1,1,0,2154,1,0,2,13,8,246,0,0,0,3
3,30,0,1,1,0,3552,0,0,0,16,9,212,1,0,0,3
4,41,4,1,0,0,6243,0,0,0,16,2,141,3,179,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42031,24,1,1,1,0,5320,0,0,1,11,6,869,1,0,0,3
42032,32,0,1,1,0,980,1,0,0,12,9,165,0,0,0,3
42033,14,4,0,1,0,1190,1,0,0,13,1,1077,0,281,4,2
42034,13,6,2,0,0,1555,0,0,0,16,4,206,0,247,3,1


# SMOTEENN: Combined Over- and Undersampling

In [None]:
# Define combined resampling strategy: SMOTEENN first oversamples with SMOTE
# and then cleans the result with Edited Nearest Neighbours, so it both adds
# and removes rows (the name `oversample` is kept for continuity with the
# other cells even though this is not pure oversampling).
# random_state pins the synthetic samples so reruns are reproducible.
oversample = SMOTEENN(random_state=42)

# Fit and apply the transform (only the training split is resampled).
# This rebinds X_train_over / y_train_over from any earlier resampling cell.
X_train_over, y_train_over = oversample.fit_resample(X_train, y_train)

# Summarize class distribution
print("After oversampling: ", Counter(y_train_over))

After oversampling:  Counter({1: 24596, 0: 19614})


In [None]:
# Share of each class in the resampled training labels (fractions, not counts)
y_train_over.value_counts(normalize=True)

1    0.556345
0    0.443655
Name: y, dtype: float64