# Import Library

In [3]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load Dataset

In [7]:
url = "https://raw.githubusercontent.com/jackty9/Handling_Imbalanced_Data_in_Python/master/bank-full-encoded.csv"
df = pd.read_csv(url)

X = df.iloc[:, :-1]
Y = df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30
)

print("Before sampling :", Counter(y_train))

Before sampling : Counter({0: 27915, 1: 3732})


In [8]:
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,40,4,1,2,0,3036,1,0,2,4,8,261,0,0,0,3,0
1,26,9,2,1,0,945,1,0,2,4,8,151,0,0,0,3,0
2,15,2,1,1,0,918,1,1,2,4,8,76,0,0,0,3,0
3,29,1,1,3,0,2420,1,0,2,4,8,92,0,0,0,3,0
4,15,11,2,3,0,917,0,0,2,4,8,198,0,0,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,33,9,1,2,0,1741,0,0,0,16,9,975,2,0,0,3,1
45207,53,5,0,0,0,2639,0,0,0,16,9,456,1,0,0,3,1
45208,54,5,1,1,0,5455,0,0,0,16,9,1116,4,181,3,2,1
45209,39,1,1,1,0,1584,0,0,1,16,9,508,3,0,0,3,0


In [9]:
df["y"].value_counts()

0    39922
1     5289
Name: y, dtype: int64

In [10]:
y_train.value_counts()

0    27915
1     3732
Name: y, dtype: int64

In [11]:
y_train.value_counts(normalize=True)

0    0.882074
1    0.117926
Name: y, dtype: float64

# Undersampling Strategy

In [17]:
# Define undersample strategy (1 mean 100%)
undersample = RandomUnderSampler(sampling_strategy=1)

# Fit and apply the transform
X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)

# Summarize class distribution
print("After undersampling: ", Counter(y_train_under))

After undersampling:  Counter({0: 3732, 1: 3732})


In [19]:
y_train_under.value_counts(normalize=True)

0    0.5
1    0.5
Name: y, dtype: float64

# Oversampling Strategy

In [18]:
oversample = RandomOverSampler(sampling_strategy=0.5)

X_train_over, y_train_over = oversample.fit_resample(X_train, y_train)

print("After oversampling: ", Counter(y_train_over))

After oversampling:  Counter({0: 27915, 1: 13957})


In [20]:
y_train_over.value_counts(normalize=True)

0    0.666675
1    0.333325
Name: y, dtype: float64

# SMOTE Strategy

In [21]:
smote = SMOTE(sampling_strategy=0.5)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print("After smote: ", Counter(y_train_smote))

After smote:  Counter({0: 27915, 1: 13957})


In [22]:
y_train_smote.value_counts(normalize=True)

0    0.666675
1    0.333325
Name: y, dtype: float64

# SMOTEENN Strategy

In [23]:
smoteenn = SMOTEENN()
X_train_s, y_train_s = smoteenn.fit_resample(X_train, y_train)
print("After smoteenn: ", Counter(y_train_s))

After smoteenn:  Counter({1: 24468, 0: 19515})


In [24]:
y_train_s.value_counts(normalize=True)

1    0.556306
0    0.443694
Name: y, dtype: float64