# Undersampling

1. Random Undersampling : Randomly remove samples from a majority class to balance them.

In [5]:
#Importing Credit card Data
# Step 1: Set up Kaggle API
from google.colab import files
files.upload()  # Upload kaggle.json

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Step 2: Download the dataset
!kaggle datasets download -d nelgiriyewithana/credit-card-fraud-detection-dataset-2023
!unzip credit-card-fraud-detection-dataset-2023.zip

# Step 3: Load the dataset
import pandas as pd
data1 = pd.read_csv('creditcard_2023.csv')
print(data1.head())

df = data1.copy()
df1 = df.sample(n=500)
df1.info()
df1.describe()


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/nelgiriyewithana/credit-card-fraud-detection-dataset-2023
License(s): other
Downloading credit-card-fraud-detection-dataset-2023.zip to /content
 96% 137M/143M [00:01<00:00, 124MB/s]
100% 143M/143M [00:01<00:00, 120MB/s]
Archive:  credit-card-fraud-detection-dataset-2023.zip
  inflating: creditcard_2023.csv     
   id        V1        V2        V3        V4        V5        V6        V7  \
0   0 -0.260648 -0.469648  2.496266 -0.083724  0.129681  0.732898  0.519014   
1   1  0.985100 -0.356045  0.558056 -0.429654  0.277140  0.428605  0.406466   
2   2 -0.260272 -0.949385  1.728538 -0.457986  0.074062  1.419481  0.743511   
3   3 -0.152152 -0.508959  1.746840 -1.090178  0.249486  1.143312  0.518269   
4   4 -0.206820 -0.165280  1.527053 -0.448293  0.106125  0.530549  0.658849   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0 -0.130006  0.727159  ... -0.110552  0.217606 -0.13

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,...,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,284587.882,0.039493,-0.01582,0.009277,-0.037234,0.030783,0.017612,0.042434,-0.037185,0.000218,...,-0.023113,0.026225,0.039758,-0.057084,0.047084,0.047441,-0.019397,0.02891,12223.326,0.482
std,166903.899357,1.005301,1.009973,0.976394,1.023588,0.948412,0.992326,1.094488,1.028495,0.968973,...,1.07822,1.073676,0.855829,0.975499,0.965235,0.995156,0.999125,0.91673,6655.457234,0.500176
min,3732.0,-2.567916,-6.268234,-2.476251,-3.586057,-3.18813,-2.831449,-4.272496,-6.381213,-3.740071,...,-11.741749,-6.450353,-6.778899,-2.610151,-3.316503,-3.420628,-4.947034,-4.846945,113.26,0.0
25%,135820.5,-0.482777,-0.544812,-0.602875,-0.746171,-0.234676,-0.368918,-0.201677,-0.193321,-0.539864,...,-0.165725,-0.464228,-0.219936,-0.664296,-0.493382,-0.594518,-0.307095,-0.199165,7085.3625,0.0
50%,276704.5,-0.078564,-0.132849,0.044333,-0.140271,0.10802,0.098266,0.233776,-0.109261,0.096187,...,-0.04629,-0.018065,-0.064499,-0.037929,0.036349,0.084848,-0.177761,-0.002757,12477.88,0.0
75%,432580.0,0.88667,0.33969,0.666814,0.70906,0.440101,0.484103,0.514153,0.016902,0.51556,...,0.12564,0.484896,0.132899,0.545881,0.558994,0.70634,0.350278,0.433595,17619.185,1.0
max,568559.0,2.112819,4.303169,3.412377,2.35265,3.583745,4.047343,12.883107,5.611369,3.467223,...,8.08708,9.743776,6.343132,2.841048,3.620224,3.973061,3.682522,4.37897,24034.2,1.0


In [6]:
# Step 4: Perform Random Undersampling
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

df1 = df1[['id', 'V1', 'V2', 'V3', 'V4', 'V5', 'Amount', 'Class']]

# Separate features and target
X = df1.drop('Class', axis=1)
y = df1['Class']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Apply Random Undersampling
rus = RandomUnderSampler(random_state=42)
X_train_res, y_train_res = rus.fit_resample(X_train, y_train)

# Train the Random Forest model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_res, y_train_res)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        78
           1       1.00      1.00      1.00        72

    accuracy                           1.00       150
   macro avg       1.00      1.00      1.00       150
weighted avg       1.00      1.00      1.00       150

[[78  0]
 [ 0 72]]


Tomek Links : This method removes pair of samples that are closest together but belongs to different classes, aiming to balance the dataset.

In [7]:

from imblearn.under_sampling import TomekLinks
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Separate features and target
X = df1.drop('Class', axis=1)
y = df1['Class']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Apply Tomek Links undersampling
tl = TomekLinks()
X_train_res, y_train_res = tl.fit_resample(X_train, y_train)

# Train the Random Forest model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_res, y_train_res)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[78  0]
 [ 0 72]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        78
           1       1.00      1.00      1.00        72

    accuracy                           1.00       150
   macro avg       1.00      1.00      1.00       150
weighted avg       1.00      1.00      1.00       150



Cluster Centroid : Identify the clusters in the majority class and replace each cluster with its centroid.

In [8]:
# Step 4: Perform Cluster Centroid undersampling
import numpy as np
from imblearn.under_sampling import ClusterCentroids
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Separate features and target
X = df1.drop('Class', axis=1)
y = df1['Class']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Apply Cluster Centroid undersampling
cc = ClusterCentroids(random_state=42)
X_train_res, y_train_res = cc.fit_resample(X_train, y_train)

# Train the Random Forest model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_res, y_train_res)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Confusion Matrix:
[[78  0]
 [ 0 72]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        78
           1       1.00      1.00      1.00        72

    accuracy                           1.00       150
   macro avg       1.00      1.00      1.00       150
weighted avg       1.00      1.00      1.00       150



# Oversampling Techniques :

Oversampling is a technique used to handle imbalanced datasets by increasing the number of instances in the minority class. This can be done by duplicating existing minority class instances or generating new synthetic instances. The goal is to balance the class distribution to improve the performance of machine learning models.

Random Oversampling: This technique involves randomly duplicating examples from the minority class.


In [9]:
# Step 4: Perform Random Oversampling
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Separate features and target
X = df1.drop('Class', axis=1)
y = df1['Class']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Apply Random Oversampling
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

# Train the Random Forest model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_res, y_train_res)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[78  0]
 [ 0 72]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        78
           1       1.00      1.00      1.00        72

    accuracy                           1.00       150
   macro avg       1.00      1.00      1.00       150
weighted avg       1.00      1.00      1.00       150



SMOTE (Synthetic Minority Over-sampling Technique): This method generates synthetic examples by interpolating between existing minority class examples.

In [11]:
# Step 4: Perform SMOTE Oversampling
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Separate features and target
X = df1.drop('Class', axis=1)
y = df1['Class']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Train the Random Forest model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_res, y_train_res)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[78  0]
 [ 0 72]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        78
           1       1.00      1.00      1.00        72

    accuracy                           1.00       150
   macro avg       1.00      1.00      1.00       150
weighted avg       1.00      1.00      1.00       150



In [12]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Generate a synthetic imbalanced dataset
# Separate features and target
X = df1.drop('Class', axis=1)
y = df1['Class']

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Calculate class weights
class_counts = np.bincount(y_train)
total_samples = len(y_train)
class_weights = {0: total_samples / (2 * class_counts[0]), 1: total_samples / (2 * class_counts[1])}

# Initialize Logistic Regression model with class weights
model = LogisticRegression(class_weight=class_weights, random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.94      0.97        62
           1       0.90      1.00      0.95        38

    accuracy                           0.96       100
   macro avg       0.95      0.97      0.96       100
weighted avg       0.96      0.96      0.96       100

