<a href="https://colab.research.google.com/github/danammanisha/Imbalanced-Learning-Using-ML-Models-and-Sampling-Techniques/blob/main/imbalanced_learning_using_ml_models_and_sampling_techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#import required libraries
import pandas as pd
import numpy as np

In [2]:
#read the csv file
df = pd.read_csv("/content/train .csv")

In [3]:
#read first 5 rows
df.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [4]:
#check for null values
df.isnull().sum()

Unnamed: 0,0
id,0
target,0
ps_ind_01,0
ps_ind_02_cat,0
ps_ind_03,0
ps_ind_04_cat,0
ps_ind_05_cat,0
ps_ind_06_bin,0
ps_ind_07_bin,0
ps_ind_08_bin,0


In [5]:
target_count = df.target.value_counts()
print('Class 0:', target_count[0])
print('Class 1:', target_count[1])
print('Proportion of class 0 is ', round(target_count[0] * 100 / (target_count[1] + target_count[0]), 2),'%')


Class 0: 573518
Class 1: 21694
Proportion of class 0 is  96.36 %


In [6]:
print(df)

             id  target  ps_ind_01  ps_ind_02_cat  ps_ind_03  ps_ind_04_cat  \
0             7       0          2              2          5              1   
1             9       0          1              1          7              0   
2            13       0          5              4          9              1   
3            16       0          0              1          2              0   
4            17       0          0              2          0              1   
...         ...     ...        ...            ...        ...            ...   
595207  1488013       0          3              1         10              0   
595208  1488016       0          5              1          3              0   
595209  1488017       0          1              1         10              0   
595210  1488021       0          5              2          3              1   
595211  1488027       0          0              1          8              0   

        ps_ind_05_cat  ps_ind_06_bin  ps_ind_07_bin

**install imblearn**

In [7]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


In [8]:
import imblearn

**Split features & target**

In [9]:
X = df.drop("target", axis=1)
y = df["target"]


In [10]:
X.head()
y.head()

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0


**Splitting the data**

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

**Scaling the data**

In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

**RandomUnderSampler**

In [13]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

print("Before Undersampling:")
print(y.value_counts())

print("\nAfter Undersampling:")
print(pd.Series(y_resampled).value_counts())


Before Undersampling:
target
0    573518
1     21694
Name: count, dtype: int64

After Undersampling:
target
0    21694
1    21694
Name: count, dtype: int64


**ClusterCentroids**

In [15]:
from imblearn.under_sampling import ClusterCentroids

cc = ClusterCentroids(sampling_strategy={0:200},random_state=42)
X_cc, y_cc = cc.fit_resample(X_train_scaled, y_train)

print("After Cluster Centroids:")
print(pd.Series(y_cc).value_counts())


After Cluster Centroids:
target
1    17355
0      200
Name: count, dtype: int64


**Applying ML model**

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, roc_auc_score

models = {
    "Logistic Regression": LogisticRegression(max_iter=300),
    "Decision Tree": DecisionTreeClassifier()
}

print("\n===== UNDERSAMPLING RESULTS ====")
for name, model in models.items():
    model.fit(X_cc, y_cc)
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    print("\nModel:", name)
    print("F1 Score:", f1_score(y_test, y_pred))
    print("AUC Score:", roc_auc_score(y_test, y_prob))


===== UNDERSAMPLING RESULTS ====

Model: Logistic Regression
F1 Score: 0.07033497864338918
AUC Score: 0.606559648577196

Model: Decision Tree
F1 Score: 0.07033440858472062
AUC Score: 0.5


**RandomOverSampler**

In [19]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

print("Before Oversampling:")
print(y.value_counts())

print("\nAfter Oversampling:")
print(pd.Series(y_resampled).value_counts())


Before Oversampling:
target
0    573518
1     21694
Name: count, dtype: int64

After Oversampling:
target
0    573518
1    573518
Name: count, dtype: int64


**SMOTE**

In [21]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X_train_scaled, y_train)

print("After SMOTE:")
print(pd.Series(y_sm).value_counts())


After SMOTE:
target
0    458814
1    458814
Name: count, dtype: int64


**Applying ML Models**

In [22]:

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, roc_auc_score

models = {
    "Logistic Regression": LogisticRegression(max_iter=300),
    "Decision Tree": DecisionTreeClassifier()
}

print("\n===== SMOTE RESULTS =====")
for name, model in models.items():
    model.fit(X_sm, y_sm)
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    print("\nModel:", name)
    print("F1 Score:", f1_score(y_test, y_pred))
    print("AUC Score:", roc_auc_score(y_test, y_prob))



===== SMOTE RESULTS =====

Model: Logistic Regression
F1 Score: 0.09015456674473069
AUC Score: 0.60611757964008

Model: Decision Tree
F1 Score: 0.05730769230769231
AUC Score: 0.5092185281347108
