In [2]:
import pandas as pd
import sys
import numpy as np
import os
sys.path.append(os.path.abspath(".."))
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import roc_auc_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore', 'futurewarning')
from src.woe import calculate_woe_iv, apply_woe_binning

In [None]:
# Load the data
# The directory defaults to the original project location but can be
# redirected through the BATIBANK_DATA_DIR environment variable, so the
# notebook is not tied to one machine's folder layout.
DATA_DIR = os.environ.get('BATIBANK_DATA_DIR', r'C:\Users\user\Desktop\BatiBank_SmartCredit\data')
train_balanced = pd.read_csv(os.path.join(DATA_DIR, 'train_balanced.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [4]:
# Split each loaded frame into its label column ('FraudResult') and the
# remaining feature columns. `drop` returns a new DataFrame, so the loaded
# frames themselves are left untouched.
y_train = train_balanced['FraudResult']
X_train = train_balanced.drop(columns=['FraudResult'])
y_test = test['FraudResult']
X_test = test.drop(columns=['FraudResult'])

# **Default Estimator on Original Data**

In [5]:
# Majority-class baseline: always predicts the most frequent training label,
# giving a floor that any real model has to beat.
dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(X_train, y_train)
dummy_predictions = dummy_clf.predict(X_test)
# A constant predictor leaves precision undefined for the class it never
# predicts. zero_division=0 reports that case as 0 explicitly instead of
# emitting UndefinedMetricWarning; it is passed to f1_score as well so both
# metric calls treat the undefined case the same way.
print("Dummy Classifier (Baseline):\n", classification_report(y_test, dummy_predictions, zero_division=0))
print("ROC-AUC:", roc_auc_score(y_test, dummy_clf.predict_proba(X_test)[:, 1]))
print("F1-score:", f1_score(y_test, dummy_predictions, zero_division=0))

Dummy Classifier (Baseline):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19097
           1       0.00      0.00      0.00        36

    accuracy                           1.00     19133
   macro avg       0.50      0.50      0.50     19133
weighted avg       1.00      1.00      1.00     19133

ROC-AUC: 0.5
F1-score: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
# Logistic Regression (Default)
# Untuned liblinear logistic regression fitted on the untransformed features.
lr_default = LogisticRegression(solver='liblinear', random_state=42).fit(X_train, y_train)
lr_default_predictions = lr_default.predict(X_test)
# Second predict_proba column is used as the ranking score for ROC-AUC.
lr_default_scores = lr_default.predict_proba(X_test)[:, 1]
lr_default_report = classification_report(y_test, lr_default_predictions)
print("\nLogistic Regression (Default):\n", lr_default_report)
print("ROC-AUC:", roc_auc_score(y_test, lr_default_scores))
print("F1-score:", f1_score(y_test, lr_default_predictions))


Logistic Regression (Default):
               precision    recall  f1-score   support

           0       1.00      0.95      0.97     19097
           1       0.00      0.00      0.00        36

    accuracy                           0.95     19133
   macro avg       0.50      0.47      0.49     19133
weighted avg       1.00      0.95      0.97     19133

ROC-AUC: 0.4737131486620935
F1-score: 0.0


# **WoE Binning**

In [7]:
# Quantile intervals are learned from the training 'Amount' values only and
# then reused for the test split, so both splits share identical intervals.
bins = pd.qcut(X_train['Amount'], q=10, duplicates='drop')
amount_intervals = bins.cat.categories
for frame in (X_train, X_test):
    frame['Amount_bins'] = pd.cut(frame['Amount'], bins=amount_intervals, include_lowest=True)

# calculate_woe_iv / apply_woe_binning are project helpers from src.woe; the
# table is computed from the training rows joined with their labels and then
# applied to both splits.
train_with_target = pd.concat([X_train, y_train], axis=1)
woe_amount = calculate_woe_iv(train_with_target, 'Amount_bins', 'FraudResult')
X_train = apply_woe_binning(X_train, 'Amount_bins', 'FraudResult', woe_amount)
X_test = apply_woe_binning(X_test, 'Amount_bins', 'FraudResult', woe_amount)

In [8]:
# Collect every numeric column currently in X_train, then take 'Amount' out
# because it has already been binned above.
# NOTE(review): any numeric column that apply_woe_binning added for 'Amount'
# would also be picked up here - confirm against src.woe.
numerical_cols = list(X_train.select_dtypes(include=np.number).columns)
numerical_cols.remove('Amount')

In [9]:
# Repeat the bin -> WoE table -> apply sequence for each remaining numeric
# column. Intervals come from the training split and are reused for test.
for col in numerical_cols:
    bin_col = col + '_bins'
    bins = pd.qcut(X_train[col], q=10, duplicates='drop')
    edges = bins.cat.categories
    X_train[bin_col] = pd.cut(X_train[col], bins=edges, include_lowest=True)
    X_test[bin_col] = pd.cut(X_test[col], bins=edges, include_lowest=True)
    # The helper receives the full training frame joined with its labels.
    labelled_train = pd.concat([X_train, y_train], axis=1)
    woe_col = calculate_woe_iv(labelled_train, bin_col, 'FraudResult')
    X_train = apply_woe_binning(X_train, bin_col, 'FraudResult', woe_col)
    X_test = apply_woe_binning(X_test, bin_col, 'FraudResult', woe_col)

In [10]:
# Drop original columns and bins
# A column is removed when its name ends in '_bins' or it is listed in
# numerical_cols. The list is built from X_train's columns and the same list
# is applied to X_test.
# NOTE(review): 'Amount' was removed from numerical_cols earlier, so the raw
# 'Amount' column is not matched by this filter - confirm that is intended.
numeric_sources = set(numerical_cols)
cols_to_drop = [name for name in X_train.columns if name.endswith('_bins') or name in numeric_sources]
X_train = X_train.drop(columns=cols_to_drop)
X_test = X_test.drop(columns=cols_to_drop)

In [11]:
# Impute NaN values
# Column means are learned from the training split only and then reused to
# fill the test split; both results are rebuilt as DataFrames with the
# original column names.
imputer = SimpleImputer(strategy='mean')
train_filled = imputer.fit_transform(X_train)
test_filled = imputer.transform(X_test)
X_train = pd.DataFrame(train_filled, columns=X_train.columns)
X_test = pd.DataFrame(test_filled, columns=X_test.columns)

In [12]:
# Apply SMOTE to balance the training data
# Only the training split is resampled; X_test / y_test are not touched here.
smote = SMOTE(random_state=42)
resampled_features, resampled_target = smote.fit_resample(X_train, y_train)
X_train, y_train = resampled_features, resampled_target

In [13]:
# Debugging: Print shapes and class distribution AFTER SMOTE
# Each heading is paired with the object whose shape it reports; dict order
# fixes the print order.
shape_report = {
    'X_train shape AFTER SMOTE': X_train,
    'X_test shape AFTER SMOTE': X_test,
    'y_train shape AFTER SMOTE': y_train,
    'y_test shape AFTER SMOTE': y_test,
}
for heading, obj in shape_report.items():
    print(heading, obj.shape)
print('y_train value counts AFTER SMOTE:\n', pd.Series(y_train).value_counts())

X_train shape AFTER SMOTE (152744, 20)
X_test shape AFTER SMOTE (19133, 20)
y_train shape AFTER SMOTE (152744,)
y_test shape AFTER SMOTE (19133,)
y_train value counts AFTER SMOTE:
 FraudResult
0    76372
1    76372
Name: count, dtype: int64


In [14]:
# Save WoE-transformed data (ONLY ONCE, AFTER SMOTE)
# The output directory defaults to the original project location but can be
# redirected through the BATIBANK_DATA_DIR environment variable, so the
# notebook is not tied to one machine's folder layout.
DATA_DIR = os.environ.get('BATIBANK_DATA_DIR', r'C:\Users\user\Desktop\BatiBank_SmartCredit\data')
X_train.to_csv(os.path.join(DATA_DIR, 'X_train_woe.csv'), index=False)
X_test.to_csv(os.path.join(DATA_DIR, 'X_test_woe.csv'), index=False)
y_train.to_csv(os.path.join(DATA_DIR, 'y_train.csv'), index=False)
y_test.to_csv(os.path.join(DATA_DIR, 'y_test.csv'), index=False)


# **Default Estimator on WoE Transformed Data**

In [15]:
# Majority-class baseline refitted on the WoE-transformed, resampled training
# data, evaluated on the WoE-transformed test split.
dummy_clf_woe = DummyClassifier(strategy='most_frequent')
dummy_clf_woe.fit(X_train, y_train)
dummy_predictions_woe = dummy_clf_woe.predict(X_test)
# A constant predictor leaves precision undefined for the class it never
# predicts. zero_division=0 reports that case as 0 explicitly instead of
# emitting UndefinedMetricWarning; it is passed to f1_score as well so both
# metric calls treat the undefined case the same way.
print("\nDummy Classifier (Baseline - WoE Data):\n", classification_report(y_test, dummy_predictions_woe, zero_division=0))
print("ROC-AUC:", roc_auc_score(y_test, dummy_clf_woe.predict_proba(X_test)[:, 1]))
print("F1-score:", f1_score(y_test, dummy_predictions_woe, zero_division=0))


Dummy Classifier (Baseline - WoE Data):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19097
           1       0.00      0.00      0.00        36

    accuracy                           1.00     19133
   macro avg       0.50      0.50      0.50     19133
weighted avg       1.00      1.00      1.00     19133

ROC-AUC: 0.5
F1-score: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# Untuned liblinear logistic regression fitted on the WoE-transformed,
# resampled training data and scored on the WoE-transformed test split.
lr_default_woe = LogisticRegression(solver='liblinear', random_state=42).fit(X_train, y_train)
lr_default_predictions_woe = lr_default_woe.predict(X_test)
# Second predict_proba column is used as the ranking score for ROC-AUC.
lr_default_scores_woe = lr_default_woe.predict_proba(X_test)[:, 1]
lr_default_report_woe = classification_report(y_test, lr_default_predictions_woe)
print("\nLogistic Regression (Default - WoE Data):\n", lr_default_report_woe)
print("ROC-AUC:", roc_auc_score(y_test, lr_default_scores_woe))
print("F1-score:", f1_score(y_test, lr_default_predictions_woe))


Logistic Regression (Default - WoE Data):
               precision    recall  f1-score   support

           0       1.00      0.40      0.57     19097
           1       0.00      1.00      0.01        36

    accuracy                           0.40     19133
   macro avg       0.50      0.70      0.29     19133
weighted avg       1.00      0.40      0.57     19133

ROC-AUC: 0.7000837827931088
F1-score: 0.006244037811117856


# **Summary of Results**:

# **Default Estimator (Original Data)**:

The Dummy Classifier and default Logistic Regression both demonstrated a strong bias towards the majority class (Non-Fraud).
They achieved high accuracy, but this was misleading due to the severe class imbalance.
Critically, they failed to detect any fraud cases, resulting in zero precision, recall, and F1-score for the Fraud class.
ROC-AUC was at or below 0.5, indicating no predictive power.

# **WoE Transformation**:

The WoE transformation successfully converted numerical features into a format that reflects their predictive power concerning fraud.
The calculated IV values provided insights into the importance of each feature bin.
The WoE transformation had a dramatic effect on the logistic regression.

# **Default Estimator (WoE Data)**:

The Logistic Regression model trained on WoE-transformed data showed a significant shift in behavior.
It achieved perfect recall for the Fraud class, meaning it captured all fraud cases.
However, this came at the cost of extremely low precision, indicating a high number of false positives.
ROC-AUC improved greatly (from about 0.47 to 0.70).
Taken together with the extremely low precision, this indicates the model is now over-predicting fraud.