In [1]:
from IPython.display import Image

# Show the remote logo inline; width/height are the display size requested for it.
Image(
    url="https://ernesto.net/wp-content/uploads/2020/11/ernesto-logo-205x46-1.png",
    width=400,
    height=400,
)

In [None]:
!pip install imbalanced-learn

In [None]:
# Load the dataset and count how many rows fall into each class.
import matplotlib.pyplot as plt
import pandas as pd

df = pd.read_csv("./Dataset/diabetes.csv")

# 'y' is the target column used throughout this notebook - replace it with the
# name of your own target variable.
# NOTE(review): the original note said "replace Outcome", but the code reads 'y';
# confirm which column name the CSV actually contains.
df["y"].value_counts()

In [None]:
# Bar chart of the class balance.
# 'y' is the target column - replace it with the name of your own target.
# Series.value_counts() is used instead of the top-level pd.value_counts(),
# which is deprecated in recent pandas releases.
count_classes = df['y'].value_counts(sort=True)
count_classes.plot(kind='bar', rot=0)
plt.title("Class Distribution")
plt.xlabel("Class")
# Trailing semicolon keeps the cell from echoing the Text(...) repr under the figure.
plt.ylabel("Frequency");

In [None]:
# Separate the target vector from the feature matrix (every column except 'y').
Y = df['y']
X = df.drop(columns=['y'])

In [None]:
# Sanity check: features and target come from the same frame, so the row counts should match.
(X.shape, Y.shape)

# These Resampling techniques assume numeric arrays
## Identify ALL categorical columns with text and encode them
## .get_dummies works well
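## for example:  X = pd.get_dummies(X, columns=['job','marital','education','default','housing','loan','contact','month','poutcome'])  (passing the whole frame with columns=... encodes only the listed columns and keeps the remaining numeric ones)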
### the diabetes dataset is all numeric so it does not apply

# Undersampling Techniques
## Reduce the majority class down to the size of the minority class

In [None]:
from imblearn.under_sampling import NearMiss

# Under-sample with NearMiss using its default settings.
nm = NearMiss()
X_res, y_res = nm.fit_resample(X, Y)

# Shapes of the resampled features and target.
(X_res.shape, y_res.shape)

In [None]:
from collections import Counter

# Per-class row counts before and after resampling.
print(f'Original dataset shape {Counter(Y)}')
print(f'Resampled dataset shape {Counter(y_res)}')

# How to save the resampled data to a CSV

In [None]:
# Store the resampled features and target as separate CSV files with pandas' ".to_csv".
# index=False leaves the row index out of the files; with the default (index=True)
# it is written as an extra unnamed first column, which comes back as a spurious
# "Unnamed: 0" column when the file is re-loaded with pd.read_csv.
X_res.to_csv('X_undersampled.csv', index=False)
y_res.to_csv('Y_undersampled.csv', index=False)

In [None]:
# Put features and target side by side in one frame; rows are matched on the
# index and join='inner' keeps only index labels present in both inputs.
# NOTE: X_res / y_res are reassigned by the hybrid-sampling cells further down,
# so run this cell right after the NearMiss cell to save the undersampled data.
undersampled_df = pd.concat([X_res, y_res], axis=1, join='inner')

# index=False keeps the row index from being written as an extra unnamed column.
undersampled_df.to_csv('undersampled_data.csv', index=False)
undersampled_df.head()

# Oversampling Techniques
## Increase the minority class up to the size of the majority class by adding data points (random duplicates of existing rows, or synthetic points, depending on the technique)

In [None]:
from imblearn.over_sampling import RandomOverSampler

# The sampler is named `ros` rather than `os` so it does not shadow the
# standard-library `os` module for the rest of the kernel session.
# A fixed random_state makes the random resampling reproducible between runs.
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X, Y)

# Shapes of the oversampled features and target.
(X_train_res.shape, y_train_res.shape)

In [None]:
# Per-class row counts before and after random oversampling.
print(f'Original dataset shape {Counter(Y)}')
print(f'Resampled dataset shape {Counter(y_train_res)}')

# Hybrid Over/Under sampling Techniques
## Combination of Oversampling and Undersampling

In [None]:
from imblearn.combine import SMOTETomek

# Hybrid resampling with SMOTETomek.
# A fixed random_state makes the generated samples reproducible between runs.
# NOTE: this reassigns X_res / y_res, replacing the NearMiss results from earlier.
smk = SMOTETomek(random_state=42)
X_res, y_res = smk.fit_resample(X, Y)

# Shapes of the resampled features and target.
(X_res.shape, y_res.shape)

In [None]:
# Per-class row counts before and after the most recent resampling stored in y_res.
print(f'Original dataset shape {Counter(Y)}')
print(f'Resampled dataset shape {Counter(y_res)}')

In [None]:
from imblearn.combine import SMOTEENN

# Hybrid resampling with SMOTEENN.
# A fixed random_state makes the generated samples reproducible between runs.
# NOTE: this reuses the names smk / X_res / y_res, replacing the SMOTETomek
# sampler and its results from the previous cells.
smk = SMOTEENN(random_state=42)
X_res, y_res = smk.fit_resample(X, Y)

# Shapes of the resampled features and target.
(X_res.shape, y_res.shape)

# Applied Resampling Techniques

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier

# Hold out 30% of the rows for evaluation.
# stratify=Y keeps the class proportions the same in the train and test splits,
# which matters for imbalanced data where a plain random split can leave the
# test set with very few minority-class rows.
# random_state fixes the shuffle so the split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, stratify=Y, random_state=42
)

# Seed the classifier as well so the fitted model and its scores are reproducible.
brf = BalancedRandomForestClassifier(random_state=42)
brf.fit(X_train, Y_train)

# Score on the data the model was trained on; compare with the held-out score
# in the next cell rather than reading this number on its own.
brf.score(X_train, Y_train)

In [None]:
# Score of the fitted model on the held-out split.
# NOTE(review): score() is presumably plain accuracy (scikit-learn classifier
# convention), which can look good on imbalanced data - confirm and consider
# per-class metrics.
brf.score(X_test, Y_test)