In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score

# Function to split data and scale features
def prepare_data(df):
    """Split a transactions frame into train/test sets and min-max scale the features.

    The 'Time' and 'Class' columns are removed from the feature matrix and
    'Class' is used as the target. The split uses ``train_test_split`` with
    its default test size and ``random_state=1``. The scaler is fitted on the
    training features only and then applied to both splits, so no statistics
    from the test split influence the scaling.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the 'Time' and 'Class' columns.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test)``; the first two
        are the scaler's outputs, the last two are the target splits.
    """
    target = df['Class']
    features = df.drop(columns=['Time', 'Class'])

    features_train, features_test, y_train, y_test = train_test_split(
        features, target, random_state=1
    )

    # Learn the per-feature min/max from the training split only.
    min_max = MinMaxScaler().fit(features_train)

    return (
        min_max.transform(features_train),
        min_max.transform(features_test),
        y_train,
        y_test,
    )

# Loading and preparing data
df = pd.read_csv("./data/creditcard.csv")
X_train_scaled, X_test_scaled, y_train, y_test = prepare_data(df)


def _fit_and_report(model, X_fit, y_fit, X_eval, y_eval, label):
    """Fit `model`, predict on the evaluation split and print its report.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
    X_fit, y_fit : training features / labels the model is fitted on
    X_eval, y_eval : held-out features / labels used for the printed report
    label : str
        Text inserted into the two printed headings.

    Returns
    -------
    The predictions of the fitted model on ``X_eval``.
    """
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_eval)
    print(f"Classification Report - {label}:")
    print(classification_report(y_eval, y_pred))
    print(f"Accuracy Score - {label}:")
    print(accuracy_score(y_eval, y_pred))
    return y_pred


# Resample the training split once per strategy. Both samplers use a fixed
# integer seed, so resampling the same training split again yields the same
# sample; the XGBoost names are therefore aliases of the Decision Tree ones
# rather than a second (costly) resampling pass.
rus = RandomUnderSampler(random_state=1)
ros = RandomOverSampler(random_state=1)
X_undersampled_dt, y_undersampled_dt = rus.fit_resample(X_train_scaled, y_train)
X_oversampled_dt, y_oversampled_dt = ros.fit_resample(X_train_scaled, y_train)
X_undersampled_xgb, y_undersampled_xgb = X_undersampled_dt, y_undersampled_dt
X_oversampled_xgb, y_oversampled_xgb = X_oversampled_dt, y_oversampled_dt

# One estimator instance per experiment: re-fitting a single instance would
# overwrite the undersampled model and leave only the oversampled one available.

# Applying undersampling and training Decision Tree
dt_model_us = DecisionTreeClassifier(random_state=1)
y_pred_dt_us = _fit_and_report(
    dt_model_us, X_undersampled_dt, y_undersampled_dt,
    X_test_scaled, y_test, "Decision Tree Undersampled Data",
)

# Applying oversampling and training Decision Tree
dt_model_os = DecisionTreeClassifier(random_state=1)
y_pred_dt_os = _fit_and_report(
    dt_model_os, X_oversampled_dt, y_oversampled_dt,
    X_test_scaled, y_test, "Decision Tree Oversampled Data",
)

# Applying undersampling and training XGBoost
# 'logloss' is the binary log-loss metric; 'mlogloss' is its multiclass variant
# and does not match the two-class 'Class' target.
xgb_model_us = XGBClassifier(random_state=1, use_label_encoder=False, eval_metric='logloss')
y_pred_xgb_us = _fit_and_report(
    xgb_model_us, X_undersampled_xgb, y_undersampled_xgb,
    X_test_scaled, y_test, "XGBoost Undersampled Data",
)

# Applying oversampling and training XGBoost
xgb_model_os = XGBClassifier(random_state=1, use_label_encoder=False, eval_metric='logloss')
y_pred_xgb_os = _fit_and_report(
    xgb_model_os, X_oversampled_xgb, y_oversampled_xgb,
    X_test_scaled, y_test, "XGBoost Oversampled Data",
)

# Backward-compatible names: as before, `dt_model` / `xgb_model` refer to the
# models that were fitted last, i.e. on the oversampled data.
dt_model = dt_model_os
xgb_model = xgb_model_os


Classification Report - Decision Tree Undersampled Data:
              precision    recall  f1-score   support

           0       1.00      0.93      0.96     71091
           1       0.02      0.84      0.03       111

    accuracy                           0.93     71202
   macro avg       0.51      0.88      0.50     71202
weighted avg       1.00      0.93      0.96     71202

Accuracy Score - Decision Tree Undersampled Data:
0.9261256706272296
Classification Report - Decision Tree Oversampled Data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     71091
           1       0.70      0.72      0.71       111

    accuracy                           1.00     71202
   macro avg       0.85      0.86      0.85     71202
weighted avg       1.00      1.00      1.00     71202

Accuracy Score - Decision Tree Oversampled Data:
0.9990730597455127
Classification Report - XGBoost Undersampled Data:
              precision    recall  f1-score  

In [11]:
# Test-set predictions for every model / sampling combination, in display order
predictions_by_run = {
    'Decision Tree - Undersampled': y_pred_dt_us,
    'Decision Tree - Oversampled': y_pred_dt_os,
    'XGBoost - Undersampled': y_pred_xgb_us,
    'XGBoost - Oversampled': y_pred_xgb_os,
}

# Column title -> scoring function; every function is called as fn(y_test, y_pred)
metric_functions = {
    'Balanced Accuracy': balanced_accuracy_score,
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
}

# Assembling the column-oriented dictionary: one list per metric, one entry per run
results = {'Model & Sampling': list(predictions_by_run)}
for metric_name, metric_fn in metric_functions.items():
    results[metric_name] = [
        metric_fn(y_test, run_predictions)
        for run_predictions in predictions_by_run.values()
    ]

# Turning the dictionary into a DataFrame so it prints as a table
results_df = pd.DataFrame(results)

# Displaying the DataFrame
print(results_df)


               Model & Sampling  Balanced Accuracy  Accuracy  Precision  \
0  Decision Tree - Undersampled           0.882051  0.926126   0.017432   
1   Decision Tree - Oversampled           0.860114  0.999073   0.695652   
2        XGBoost - Undersampled           0.925993  0.977950   0.058681   
3         XGBoost - Oversampled           0.891843  0.999565   0.925532   

     Recall  F1 Score  
0  0.837838  0.034154  
1  0.720721  0.707965  
2  0.873874  0.109977  
3  0.783784  0.848780  


In [12]:
from sklearn.metrics import balanced_accuracy_score

# Compare balanced accuracy on the (resampled) training data with the test data
# for each model / sampling combination, to see how much each model overfits.
#
# `dt_model` and `xgb_model` were last fitted on the OVERSAMPLED data, so they
# cannot be used to score the undersampled training sets: that would compare a
# training score from one model with test predictions (`y_pred_*_us`) made by
# another. The undersampled models are therefore rebuilt here from the same
# hyperparameters (get_params() carries no fitted state) and the same data.

# Decision Tree with undersampled data
dt_model_undersampled = DecisionTreeClassifier(**dt_model.get_params())
dt_model_undersampled.fit(X_undersampled_dt, y_undersampled_dt)
y_train_pred_dt_us = dt_model_undersampled.predict(X_undersampled_dt)
print("Balanced Accuracy - Decision Tree Undersampled Training Data:", balanced_accuracy_score(y_undersampled_dt, y_train_pred_dt_us))
print("Balanced Accuracy - Decision Tree Undersampled Test Data:", balanced_accuracy_score(y_test, y_pred_dt_us))

# Decision Tree with oversampled data (`dt_model` holds the oversampled fit)
y_train_pred_dt_os = dt_model.predict(X_oversampled_dt)
print("Balanced Accuracy - Decision Tree Oversampled Training Data:", balanced_accuracy_score(y_oversampled_dt, y_train_pred_dt_os))
print("Balanced Accuracy - Decision Tree Oversampled Test Data:", balanced_accuracy_score(y_test, y_pred_dt_os))

# XGBoost with undersampled data
xgb_model_undersampled = XGBClassifier(**xgb_model.get_params())
xgb_model_undersampled.fit(X_undersampled_xgb, y_undersampled_xgb)
y_train_pred_xgb_us = xgb_model_undersampled.predict(X_undersampled_xgb)
print("Balanced Accuracy - XGBoost Undersampled Training Data:", balanced_accuracy_score(y_undersampled_xgb, y_train_pred_xgb_us))
print("Balanced Accuracy - XGBoost Undersampled Test Data:", balanced_accuracy_score(y_test, y_pred_xgb_us))

# XGBoost with oversampled data (`xgb_model` holds the oversampled fit)
y_train_pred_xgb_os = xgb_model.predict(X_oversampled_xgb)
print("Balanced Accuracy - XGBoost Oversampled Training Data:", balanced_accuracy_score(y_oversampled_xgb, y_train_pred_xgb_os))
print("Balanced Accuracy - XGBoost Oversampled Test Data:", balanced_accuracy_score(y_test, y_pred_xgb_os))


Balanced Accuracy - Decision Tree Undersampled Training Data: 1.0
Balanced Accuracy - Decision Tree Undersampled Test Data: 0.8820506796199922
Balanced Accuracy - Decision Tree Oversampled Training Data: 1.0
Balanced Accuracy - Decision Tree Oversampled Test Data: 0.8601141969922828
Balanced Accuracy - XGBoost Undersampled Training Data: 1.0
Balanced Accuracy - XGBoost Undersampled Test Data: 0.9259932169161185
Balanced Accuracy - XGBoost Oversampled Training Data: 1.0
Balanced Accuracy - XGBoost Oversampled Test Data: 0.8918426592182764
