<a href="https://colab.research.google.com/github/dhruvi003/ML-Learning/blob/main/handling_imbalanced_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install imbalanced-learn



In [2]:
import pandas as pd

In [3]:
data = pd.read_csv('/content/creditcard.csv')

In [5]:
data.shape

(49610, 31)

In [6]:
# checking if there is any null values
data.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,1
V5,1
V6,1
V7,1
V8,1
V9,1


In [7]:
data.fillna(data.mean(), inplace=True)

In [9]:
# now we will check for dependent feature output value
# there is class feature, which is output, so we check how many different values are there in class
data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,49461
1.0,148
0.002983,1


In [17]:
# there is very huge differece in 0 and 1
# let's first separate x and y
x = data.drop('Class',axis=1)
# Convert the target variable to integer type
Y = data.Class.astype(int)

In [11]:
# imbalance dataset don't impact random forest, so we will just check for this time
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import KFold, GridSearchCV
import numpy as np



In [16]:
# Hyperparameter Tuning
log_class = LogisticRegression(solver='liblinear') # Use a solver that supports l1 penalty
grid = {'C': 10.0 ** np.arange(-2,3), 'penalty':['l1','l2']}
cv = KFold(n_splits=5, random_state=None, shuffle=False)

In [19]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(x,Y,train_size=0.7)

# Filter out rows where Y_train is not 0 or 1
valid_indices = Y_train.isin([0, 1])
X_train = X_train[valid_indices]
Y_train = Y_train[valid_indices]

In [20]:
clf = GridSearchCV(log_class, grid, cv=cv, n_jobs=-1, scoring='f1_macro')
clf.fit(X_train,Y_train)

In [21]:
# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(Y_test, y_pred))
print(confusion_matrix(Y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     14838
           1       0.68      0.62      0.65        45

    accuracy                           1.00     14883
   macro avg       0.84      0.81      0.83     14883
weighted avg       1.00      1.00      1.00     14883

[[14825    13]
 [   17    28]]
