1. Undersampling
2. Oversampling/SMOTE
3. Cost Sensitive Learning
4. Ensemble Method

In [8]:
#Import necessary libraries
import pandas as pd
import numpy as np

In [9]:
#Creating imbalanced dataset
data = {
    'Feature1': np.random.randint(1, 100, 100),
    'Feature2': np.random.rand(100),
    'Label': np.random.choice(['A','B','C'], size=100, p = [0.7, 0.2, 0.1])
}
df = pd.DataFrame(data)


In [3]:
df

Unnamed: 0,Feature1,Feature2,Label
0,7,0.518193,A
1,40,0.501147,B
2,47,0.062595,A
3,38,0.215300,A
4,14,0.754391,A
...,...,...,...
95,61,0.834793,A
96,6,0.120615,A
97,34,0.079443,B
98,58,0.895560,A


In [4]:
print("original Class Distribution")
df['Label'].value_counts()

original Class Distribution


Label
A    68
B    26
C     6
Name: count, dtype: int64

### 1. Oversampling

In [11]:
from imblearn.over_sampling import RandomOverSampler

In [10]:
X = df[['Feature1', 'Feature2']]
y = df['Label']

In [12]:
#Create the instance for RandomOverSampler
ros = RandomOverSampler(random_state=42)

#Resample the dataset
X_resampled, y_resampled = ros.fit_resample(X, y)

#Convert the resampled data back to a dataframe if needed
oversampled_df = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame({'Label': y_resampled})],axis=1)

print('Class Distribution after Random Overampling')
print(oversampled_df['Label'].value_counts())

Class Distribution after Random Overampling
Label
A    75
B    75
C    75
Name: count, dtype: int64


### Cost Sensitive Learning

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import class_weight

In [15]:
#Initialize the label encoder
label_encoder = LabelEncoder()

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
#Encode the target labels
y_train_encoded = label_encoder.fit_transform(y_train)

#Calculate class weight
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

In [25]:
#Create a dict with class weights for each label
class_Weights_dict = dict(zip(np.unique(y_train), class_weights))

In [26]:
#Train the model
clf = RandomForestClassifier(class_weight=class_Weights_dict, random_state=42)
clf.fit(X_train, y_train)

#Evaluate the model
accuracy = clf.score(X_test, y_test)
print('Accuracy', accuracy)

Accuracy 0.75


### Ensemble Method

In [27]:
#Import Necessary libraries
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [28]:
#Define the base estimator (DecisionTreeClassifier)
base_estimator = DecisionTreeClassifier(random_state=42)

In [30]:
#Train the ensemble model using Bagging Classifier
bagging_clf = BaggingClassifier(base_estimator, n_estimators=10, random_state=42)
bagging_clf.fit(X_train, y_train)

#Evaluate the model
accuracy = bagging_clf.score(X_test, y_test)
print('Accuracy', accuracy)

Accuracy 0.6
