In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 


In [10]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from imblearn.combine import SMOTEENN
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline

In [3]:
# Read in data
data = pd.read_csv("../data/processed/fraud.csv")


In [8]:
X = data.drop("FraudFound_P", axis=1)
y = data["FraudFound_P"]

# Split data into train (60%), val(20%), test (20%)
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.25, random_state=42, stratify=y_train_full
)

print(f"Train size: {len(X_train)}")
print(f"Validation size: {len(X_val)}")
print(f"Test size: {len(X_test)}")

Train size: 9252
Validation size: 3084
Test size: 3084


In [20]:
# Pipeline: SMOTEENN + Scaler + RF
pipeline = Pipeline([
    ('smote', SMOTEENN(random_state=42)),
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=42, n_jobs=-1))
])

# Baseline RF

In [21]:
pipeline.fit(X_train, y_train)

# Evaluate on validation set
y_val_pred = pipeline.predict(X_val)
y_val_prob = pipeline.predict_proba(X_val)[:, 1]

print("\n--- Validation Results ---")
print("ROC-AUC:", roc_auc_score(y_val, y_val_prob))
print("Classification Report:")
print(classification_report(y_val, y_val_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))


--- Validation Results ---
ROC-AUC: 0.802972263868066
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      2900
           1       0.23      0.22      0.22       184

    accuracy                           0.91      3084
   macro avg       0.59      0.59      0.59      3084
weighted avg       0.91      0.91      0.91      3084

Confusion Matrix:
[[2765  135]
 [ 144   40]]


# Hyperparameter Tuning

In [13]:
# StratifiedKFold to preserve class balance in folds
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

param_grid = {
    "rf__n_estimators": [100, 200, 300],
    "rf__max_depth": [None, 10, 20, 30],
    "rf__min_samples_split": [2, 5, 10],
    "rf__min_samples_leaf": [1, 2, 4],
    "rf__max_features": ["sqrt", "log2"]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_

print("Best parameters:", grid_search.best_params_)
print("Best ROC-AUC:", grid_search.best_score_)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best parameters: {'rf__max_depth': None, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 200}
Best ROC-AUC: 0.7970337661863608


In [24]:
# Evaluate on validation set
y_val_prob = best_rf.predict_proba(X_val)[:, 1]
y_val_pred = best_rf.predict(X_val)

print("\n--- Validation Results ---")
print("ROC-AUC:", roc_auc_score(y_val, y_val_prob))
print("Classification Report:")
print(classification_report(y_val, y_val_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))


--- Validation Results ---
ROC-AUC: 0.80779047976012
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.95      2900
           1       0.25      0.23      0.24       184

    accuracy                           0.91      3084
   macro avg       0.60      0.59      0.60      3084
weighted avg       0.91      0.91      0.91      3084

Confusion Matrix:
[[2770  130]
 [ 141   43]]
