<a href="https://colab.research.google.com/github/goelnikhils-lgtm/languagemodels/blob/main/Handlingclassimbalance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#example notebook on handling class imbalance using various sampling techniques such as SMOTE (minority-class oversampling) and stratified sampling
#F1 is used as the metric under class imbalance because it combines Precision = tp/(tp+fp) and Recall = tp/(tp+fn)
#Credit - https://www.geeksforgeeks.org/machine-learning/handling-imbalanced-data-for-classification/

In [None]:
#class imbalance needs to be handled because machine learning models become biased toward the majority class, which hampers generalization and can lead to overfitting
#overfitting should be avoided and addressed

In [None]:
#let's code

#case for RandomOverSampler / RandomUnderSampler: balancing the majority and minority classes
import numpy as np
from sklearn.datasets import  make_classification
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Build a synthetic binary dataset with a 10% / 90% class split
# (weights=[0.1, 0.9], so class 0 is the minority). flip_y=0 turns off
# random label flipping, keeping the class counts at the requested ratio.
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=1000, random_state=42)
print("Original Class Distribution:", Counter(y))

# Random oversampling: draw minority-class rows with replacement until the
# minority class matches the majority class in size.
# random_state is fixed so the resampled rows are reproducible, in line with
# the seeded dataset generation above.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)
x_over, y_over = oversample.fit_resample(X, y)
print("Random Oversampling Class Distribution:", Counter(y_over))

# Random undersampling: keep only a random subset of the majority class so
# that it shrinks to the size of the minority class (seeded for the same
# reproducibility reason).
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=42)
x_under, y_under = undersample.fit_resample(X, y)
# Either approach yields a balanced class distribution.
print("Random Undersampling Class Distribution:", Counter(y_under))

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.metrics import accuracy_score , classification_report


# Build a synthetic binary dataset with a 10% / 90% class split
# (weights=[0.1, 0.9], so class 0 is the minority).
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=1000, random_state=42)

# Hold out 20% for testing. stratify=y keeps the 10/90 class ratio in both
# partitions; an unstratified split of an imbalanced dataset can leave the
# test set with too few (or too many) minority samples and skew the metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Base learner that each bagging member will train a copy of.
base_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Balanced Bagging: an ensemble of 10 base learners, each trained on a
# resampled, class-balanced subset of the training data.
# The base learner is passed positionally because its keyword name differs
# between imbalanced-learn releases.
balanced_bagging_classifier = BalancedBaggingClassifier(
    base_classifier,
    n_estimators=10,
    sampling_strategy='auto',
    replacement=False,
    random_state=42,
)

# Fit on the training partition only.
balanced_bagging_classifier.fit(X_train, y_train)

# Predict hard labels for the held-out partition.
y_pred = balanced_bagging_classifier.predict(X_test)

# Accuracy alone is misleading under class imbalance, so the per-class
# precision / recall / F1 report is printed alongside it.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print("Classification Report:\n", classification_report(y_test, y_pred))



In [None]:
#SMOTE
#SMOTE uses k-NN for synthetically oversampling the minority class

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter

# Build a synthetic binary dataset with a 10% / 90% class split
# (weights=[0.1, 0.9], so class 0 is the minority).
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=1000, random_state=42)

# Hold out 20% for testing. stratify=y preserves the class ratio in both
# partitions so the training set keeps a representative share of minority
# samples for SMOTE to interpolate between.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Class counts in the training partition before resampling.
print("Original Class Distribution before SMOTE:", Counter(y_train))

# SMOTE is fitted on the training partition only, so no synthetic samples
# derived from test data leak into training. sampling_strategy='auto'
# oversamples the minority class up to the majority class size.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Class counts in the training partition after resampling.
print("Class Distribution after SMOTE:", Counter(y_train_smote))



In [None]:
#Threshold Moving to handle class imbalance

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score , roc_auc_score

# Build a synthetic binary dataset with a 10% / 90% class split
# (weights=[0.1, 0.9], so class 0 is the minority and class 1 the majority).
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=1000, random_state=42)

# Hold out 20% for testing; stratify=y keeps the class ratio identical in
# the train and test partitions so the F1 values below are not distorted by
# a lucky or unlucky split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Train a classification model (Random Forest as an example).
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predicted probability of class 1 for every test sample.
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Sweep the decision threshold from 0.50 down to 0.00 in steps of 0.02 and
# report the F1 score at each value.
# The thresholds are derived from integer steps instead of repeatedly
# subtracting 0.02 from a float: accumulated rounding error made the final
# 0.00 threshold unreliable (it could be skipped, or evaluated as a tiny
# positive number that excludes samples whose probability is exactly 0).
# NOTE(review): the sweep is scored against the test set; for real model
# selection pick the threshold on a separate validation split.
# NOTE(review): f1_score uses pos_label=1 by default, which is the majority
# class in this dataset - confirm that is the class of interest.
for step in range(50, -1, -2):
    threshold = step / 100
    y_pred = (y_pred_proba >= threshold).astype(int)
    f1 = f1_score(y_test, y_pred)
    print(f"Threshold:{threshold:.2f} - F1 Score:{f1:.4f}")