# Preprocessing the Data

In [48]:
#Let us start with Importing packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from src.data_preprocessing import load_data, preprocess_data

In [49]:
#Load and preprocess data
df = load_data('data/creditcard.csv')
df = preprocess_data(df)
df = df.dropna()
df = add_features(df)

# Model Training and Evaluation

In [50]:
#Importing required packages
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [51]:
#Train/Test Split the Data
X = df.drop('Class', axis=1)
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Print class distribution
print("Training set class distribution:\n", y_train.value_counts())
print("Test set class distribution:\n", y_test.value_counts())

Training set class distribution:
 Class
0.0    4776
1.0       2
Name: count, dtype: int64
Test set class distribution:
 Class
0.0    1194
1.0       1
Name: count, dtype: int64


In [52]:
# Apply SMOTE
smote = SMOTE(random_state=42, k_neighbors=1)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [53]:
# Model training
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score

In [54]:
# Evaluation function
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
    else:
        y_proba = model.decision_function(X_test)

    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred, zero_division=0))
    print("ROC AUC Score:", roc_auc_score(y_test, y_proba))
    print("Average Precision Score (PR-AUC):", average_precision_score(y_test, y_proba))

In [55]:
# Logistic Regression
lr = LogisticRegression()
lr.fit(X_resampled, y_resampled)
print("\nTraining: Logistic Regression")
evaluate_model(lr, X_test, y_test)


Training: Logistic Regression
[[1193    1]
 [   1    0]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1194
         1.0       0.00      0.00      0.00         1

    accuracy                           1.00      1195
   macro avg       0.50      0.50      0.50      1195
weighted avg       1.00      1.00      1.00      1195

ROC AUC Score: 0.9623115577889447
Average Precision Score (PR-AUC): 0.021739130434782608


In [56]:
# Random Forest
rf = RandomForestClassifier()
rf.fit(X_resampled, y_resampled)
print("\nTraining: Random Forest")
evaluate_model(rf, X_test, y_test)


Training: Random Forest
[[1194    0]
 [   1    0]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1194
         1.0       0.00      0.00      0.00         1

    accuracy                           1.00      1195
   macro avg       0.50      0.50      0.50      1195
weighted avg       1.00      1.00      1.00      1195

ROC AUC Score: 0.9958123953098827
Average Precision Score (PR-AUC): 0.14285714285714285


In [57]:
# XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_resampled, y_resampled)
print("\nTraining: XGBoost")
evaluate_model(xgb, X_test, y_test)


Training: XGBoost
[[1194    0]
 [   1    0]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1194
         1.0       0.00      0.00      0.00         1

    accuracy                           1.00      1195
   macro avg       0.50      0.50      0.50      1195
weighted avg       1.00      1.00      1.00      1195

ROC AUC Score: 0.9472361809045227
Average Precision Score (PR-AUC): 0.015625


Parameters: { "use_label_encoder" } are not used.

