#### Objective

In [None]:
# Attach Google Drive to the Colab runtime at /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


### Applying **Undersampling** and **Oversampling** Techniques to an Imbalanced Dataset

#### General Libraries

In [None]:
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

In [None]:
import sklearn
import imblearn

# Record the library versions this notebook was executed with.
print(f"scikit-learn version: {sklearn.__version__}")
print(f"imbalanced-learn version: {imblearn.__version__}")

scikit-learn version: 1.2.2
imbalanced-learn version: 0.10.1


#### Data Exploration

In [None]:
# Load the stroke dataset from the mounted Drive folder.
STROKE_CSV_PATH = "/content/drive/MyDrive/Colab Notebooks/Maze/Handling Imbalanced Data/stroke.csv"
df = pd.read_csv(STROKE_CSV_PATH)

# Show (rows, columns) as the cell output.
df.shape

(43400, 12)

In [None]:
# Preview the column names together with the first five rows.
df.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0


In [None]:
# Inputs and target for random undersampling and random oversampling:
# every column except the label, plus the label itself.
# `columns=` is used instead of a positional axis argument, which pandas
# deprecated in 1.x and no longer accepts from 2.0 onwards.
X = df.drop(columns=["stroke"])
y = df["stroke"]

# Numerical inputs and target for cluster centroids, Tomek links and SMOTE.
# NOTE(review): dropna() without `subset` drops a row when ANY column is
# missing, not only the three numeric features selected below; consider
# dropna(subset=[...]) if more rows should be kept.
df_n = df.dropna()

X_n = df_n[["age", "avg_glucose_level", "bmi"]]
y_n = df_n["stroke"]

In [None]:
# check distribution of complete data target variable
y.value_counts()

0    42617
1      783
Name: stroke, dtype: int64

In [None]:
# check distribution of numerical data target variable
y_n.value_counts()

0    28524
1      548
Name: stroke, dtype: int64

#### Random Undersampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly discard majority-class rows until both classes have the same size.
# A fixed random_state makes the selected rows reproducible across runs
# (same seed as the SMOTE cell).
Random_undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_random_under, y_random_under = Random_undersampler.fit_resample(X, y)

# check new distribution
y_random_under.value_counts()

0    783
1    783
Name: stroke, dtype: int64

#### Cluster Centroids Undersampling

In [None]:
from imblearn.under_sampling import ClusterCentroids

# Replace the majority class with cluster centroids computed on the numeric
# features. The clustering starts from a random initialisation, so the seed
# is fixed to make the generated centroids reproducible across runs.
CC_undersampler = ClusterCentroids(random_state=42)
X_cc_under, y_cc_under = CC_undersampler.fit_resample(X_n, y_n)

# check new distribution
y_cc_under.value_counts()

0    548
1    548
Name: stroke, dtype: int64

#### Tomek Links Undersampling

In [None]:
from imblearn.under_sampling import TomekLinks

# Build the Tomek links cleaner with its default resampling strategy and
# apply it to the numeric features.
TL_undersampler = TomekLinks(sampling_strategy='auto')
X_tl_under, y_tl_under = TL_undersampler.fit_resample(X=X_n, y=y_n)

# Class counts after removing Tomek links.
y_tl_under.value_counts()

0    28219
1      548
Name: stroke, dtype: int64

Only a small reduction in the majority class (28,524 → 28,219 samples), so Tomek Links on its own might not be a good fit for this dataset.

#### Random Oversampling

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Randomly duplicate minority-class rows until both classes have the same size.
# A fixed random_state makes the duplicated rows reproducible across runs
# (same seed as the SMOTE cell).
Random_oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_random_over, y_random_over = Random_oversampler.fit_resample(X, y)

# check new distribution
y_random_over.value_counts()

0    42617
1    42617
Name: stroke, dtype: int64

#### SMOTE (Synthetic Minority Over-sampling Technique)

In [None]:
from imblearn.over_sampling import SMOTE

# Seeded SMOTE instance applied to the numeric features; synthetic
# minority-class samples are added to the resampled set.
Smote = SMOTE(random_state=42)
X_smote_over, y_smote_over = Smote.fit_resample(X=X_n, y=y_n)

# Class counts after oversampling.
y_smote_over.value_counts()

0    28524
1    28524
Name: stroke, dtype: int64