---------------------
### Using the Synthetic Minority Over-sampling Technique (SMOTE)
- Goal: handle class imbalance when training a logistic regression classifier.
----------------------

In [None]:
# Optional one-time setup, intentionally left commented out so a normal run does not
# touch the environment. Uncomment to install/upgrade the two required packages.
#!pip install --upgrade scikit-learn imbalanced-learn

In [1]:
import os

import numpy as np
import pandas as pd

# ML model
from sklearn.linear_model import LogisticRegression

# synthetic data generation
from imblearn.over_sampling import SMOTE

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [2]:
# Build a synthetic two-class problem with requested class weights of 0.95 / 0.05
# (swap in your own dataset here). The fixed seed keeps the data repeatable.
X, y = make_classification(
    n_samples=1000,
    n_classes=2,
    weights=[0.95, 0.05],
    random_state=42,
)

In [3]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split repeatable.
# NOTE(review): no `stratify=` is passed, so the minority share in each split is left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [5]:
# Show how many samples of each class are in the training split (before any resampling)
print(
    "Class distribution in the original dataset:",
    pd.Series(y_train).value_counts(),
    sep="\n",
)

Class distribution in the original dataset:
0    663
1     37
dtype: int64


#### Baseline: model performance on the imbalanced dataset (before SMOTE)

In [6]:
# Baseline: fit logistic regression on the original (imbalanced) training split,
# i.e. without any resampling
logreg = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out test set
y_pred = logreg.predict(X_test)

# Rows are true classes, columns are predicted classes
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[280,   4],
       [ 13,   3]], dtype=int64)

#### Apply SMOTE to the training split

In [7]:
# Configure the SMOTE oversampler ('auto' sampling strategy); the seed makes the
# generated synthetic samples repeatable. Resampling itself happens in the next cell.
smote = SMOTE(random_state=42, sampling_strategy='auto')

In [8]:
# Oversample the training split only; X_test / y_test are not passed to SMOTE
X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train,
    y_train,
)

In [10]:
# Per-class sample counts of the resampled training labels
print(
    "\nClass distribution after SMOTE:",
    pd.Series(y_train_resampled).value_counts(),
    sep="\n",
)


Class distribution after SMOTE:
0    663
1    663
dtype: int64


#### Check model performance after SMOTE

In [25]:
# Fit a fresh logistic regression on the SMOTE-resampled training data
logreg = LogisticRegression().fit(X_train_resampled, y_train_resampled)

# Predict on the untouched (not resampled) test set
y_pred = logreg.predict(X_test)

# Rows are true classes, columns are predicted classes
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[242,  42],
       [  6,  10]], dtype=int64)

In [26]:
# Per-class precision / recall / F1 for the test-set predictions
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)


Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.85      0.91       284
           1       0.19      0.62      0.29        16

    accuracy                           0.84       300
   macro avg       0.58      0.74      0.60       300
weighted avg       0.93      0.84      0.88       300



#### Example 02 (creditcard.csv)

In [34]:
# Location of the credit-card CSV. Set the CREDITCARD_CSV environment variable to point
# at your own copy; when it is unset, the original hard-coded local path is used so the
# notebook keeps working on the machine it was written on.
CREDITCARD_CSV = os.environ.get(
    "CREDITCARD_CSV",
    r'D:\AI-DATASETS\02-MISC-large\creditcard.csv',
)
df = pd.read_csv(CREDITCARD_CSV)

In [35]:
# (number of rows, number of columns) of the loaded frame
df.shape

(284807, 31)

In [36]:
# Peek at five random rows; the fixed seed makes the same rows appear on every run,
# in line with the random_state=42 used elsewhere in this notebook
df.sample(5, random_state=42)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
244477,152372.0,2.032084,-0.038046,-1.434243,0.13451,0.332904,-0.546143,0.10745,-0.159242,0.267287,...,0.30228,0.989175,0.029482,0.75039,0.250701,-0.159923,-0.02275,-0.061403,1.0,0
18116,29200.0,1.152481,0.200501,0.70071,0.703845,-0.530779,-0.812785,0.042175,-0.1281,-0.058919,...,-0.181479,-0.505874,0.214386,0.613156,0.082207,0.088383,-0.01188,0.024362,16.0,0
83418,59841.0,1.320125,-0.665264,0.04594,-0.664811,-0.742951,-0.339117,-0.538817,0.068727,-0.801768,...,0.218575,0.593415,-0.138575,0.030392,0.633261,-0.04864,-0.007835,-0.011247,20.0,0
176996,122996.0,2.060038,-0.044358,-1.059489,0.418157,-0.138852,-1.216017,0.193947,-0.342407,0.523278,...,-0.286807,-0.688435,0.339139,0.049527,-0.295983,0.194899,-0.070133,-0.059442,2.69,0
141526,84376.0,-0.835906,1.29839,1.284521,0.46727,0.463769,0.763769,0.124683,0.616315,-0.801928,...,-0.148763,-0.53236,-0.269017,-1.344259,0.280494,-0.373317,-0.010818,0.006589,5.0,0


In [37]:
# Count the rows carrying each value of the `Class` label column
df['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [46]:
# Features: every column except the label; target: the `Class` column
X = df.drop(columns=['Class'])
y = df['Class']

In [47]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): no `stratify=` is passed, so the minority share in each split is left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

#### Apply SMOTE to the training split

In [48]:
# Configure the SMOTE oversampler to resample the minority class; the seed makes the
# generated synthetic samples repeatable. Resampling itself happens in the next cell.
smote = SMOTE(random_state=42, sampling_strategy='minority')

In [49]:
# Oversample the training split only; X_test / y_test are not passed to SMOTE
X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train,
    y_train,
)

In [50]:
# Per-class sample counts of the resampled training labels
print(
    "\nClass distribution after SMOTE:",
    pd.Series(y_train_resampled).value_counts(),
    sep="\n",
)


Class distribution after SMOTE:
0    199008
1    199008
Name: Class, dtype: int64


#### Check model performance after SMOTE

In [51]:
# Fit a fresh logistic regression on the SMOTE-resampled training data.
# NOTE(review): the features are passed in as loaded, with no scaling step in this
# notebook — confirm the default solver converges on this data.
logreg = LogisticRegression().fit(X_train_resampled, y_train_resampled)

# Predict on the untouched (not resampled) test set
y_pred = logreg.predict(X_test)

# Rows are true classes, columns are predicted classes
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[83828,  1479],
       [   12,   124]], dtype=int64)

In [52]:
# Per-class precision / recall / F1 for the test-set predictions
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)


Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     85307
           1       0.08      0.91      0.14       136

    accuracy                           0.98     85443
   macro avg       0.54      0.95      0.57     85443
weighted avg       1.00      0.98      0.99     85443

