In [9]:
import pandas as pd

### Reading in data and splitting into train, validation, test sets

In [32]:
# Load the dataset; "fraud.csv" is a relative path, so it is resolved against the current working directory.
df = pd.read_csv("fraud.csv")

In [33]:
# Report (n_rows, n_columns) of the loaded frame as a quick sanity check.
print(df.shape)

(15420, 45)


In [34]:
# Pull the response column out of the frame; every other column is a feature.
target_column = 'FraudFound_P'
y = df[target_column]
X = df.drop(columns=[target_column])

In [35]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the whole dataset for testing; the remaining 80% is the
# full training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,        # keep the class ratio the same in both parts
    random_state=42,   # fixed seed so the split is repeatable
)

for split_label, split_X, split_y in [
    ("Training set shape:", X_train, y_train),
    ("Testing set shape:", X_test, y_test),
]:
    print(split_label, split_X.shape, split_y.shape)

Training set shape: (12336, 44) (12336,)
Testing set shape: (3084, 44) (3084,)


In [36]:
# Carve the validation set out of the 80% training portion: 25% of it is
# 20% of the whole dataset, which leaves 60% of the whole dataset for fitting.
X_subtrain, X_val, y_subtrain, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    stratify=y_train,  # keep the class ratio the same in both parts
    random_state=42,   # fixed seed so the split is repeatable
)

for split_label, split_X, split_y in [
    ("Training set shape:", X_subtrain, y_subtrain),
    ("Validation set shape:", X_val, y_val),
    ("Testing set shape:", X_test, y_test),
]:
    print(split_label, split_X.shape, split_y.shape)

Training set shape: (9252, 44) (9252,)
Validation set shape: (3084, 44) (3084,)
Testing set shape: (3084, 44) (3084,)


Above, we have effectively split the data into 60% train, 20% validation, and 20% test. The training and validation sets will be used for the train-validation set approach, while the full training set (train and validation combined) will be used to perform cross-validation later on.

### Function to check class proportions

In [37]:
def check_class_proportions(y):
    """
    Summarise how often each class label occurs.

    Parameters
    ----------
    y : pandas Series or array-like
        Class labels.

    Returns
    -------
    pandas.DataFrame
        One row per distinct label (sorted by label) with a "count" column
        and a "proportion" column rounded to 4 decimal places. The table is
        also printed.
    """
    labels = pd.Series(y, name="label")

    summary = pd.DataFrame(
        {
            "count": labels.value_counts().sort_index(),
            "proportion": labels.value_counts(normalize=True).sort_index().round(4),
        }
    )

    print(summary)
    return summary

## Logistic Regression (Base Model)

### Train-validation set approach

Now, we use the train-validation set approach to conduct hyperparameter tuning on the logistic regression model. The model with the combination of hyperparameters which produces the highest validation F1-score will be selected.

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    precision_recall_curve, auc
)
from itertools import product
import pandas as pd
import numpy as np

# --- Train/validation hyperparameter search (logistic regression) ---
# Each combination in `param_grid` is fitted on the sub-training split and
# scored on the validation split. The combination with the highest validation
# F1 is kept; on ties the earliest combination wins.

# Start below any attainable F1 so that a model is still selected when every
# candidate scores exactly 0 (e.g. no positives predicted). Starting at 0 with
# a strict ">" comparison would leave best_model as None in that case.
best_f1 = float("-inf")
best_params = None
best_metrics = {}
best_model = None

# --- Hyperparameter grid ---
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],      # supports both L1 and L2
    'class_weight': [None, 'balanced']
}

# --- Loop through all combinations ---
for C, penalty, solver, class_weight in product(
    param_grid['C'],
    param_grid['penalty'],
    param_grid['solver'],
    param_grid['class_weight']
):
    try:
        model = LogisticRegression(
            C=C,
            penalty=penalty,
            solver=solver,
            class_weight=class_weight,
            max_iter=1000,
            random_state=42
        )
        model.fit(X_subtrain, y_subtrain)

        # Predict on validation set (hard labels and positive-class probability)
        y_val_pred = model.predict(X_val)
        y_val_prob = model.predict_proba(X_val)[:, 1]

        # Compute metrics; zero_division=0 returns 0 instead of warning when
        # a ratio is undefined (e.g. no positive predictions)
        acc = accuracy_score(y_val, y_val_pred)
        prec = precision_score(y_val, y_val_pred, zero_division=0)
        rec = recall_score(y_val, y_val_pred, zero_division=0)
        f1 = f1_score(y_val, y_val_pred, zero_division=0)
        roc_auc = roc_auc_score(y_val, y_val_prob)

        # --- Compute PR-AUC ---
        precision_vals, recall_vals, _ = precision_recall_curve(y_val, y_val_prob)
        pr_auc = auc(recall_vals, precision_vals)

        # Track best model (based on F1)
        if f1 > best_f1:
            best_f1 = f1
            best_params = {
                'C': C,
                'penalty': penalty,
                'solver': solver,
                'class_weight': class_weight
            }
            best_metrics = {
                'Accuracy': acc,
                'Precision': prec,
                'Recall': rec,
                'F1': f1,
                'ROC-AUC': roc_auc,
                'PR-AUC': pr_auc
            }
            best_model = model

    except Exception as e:
        # Best-effort search: a failing combination is reported and skipped.
        # The message names every varied hyperparameter so two failing
        # combinations cannot print identical text.
        print(
            f"Skipped combination (C={C}, penalty={penalty}, solver={solver}, "
            f"class_weight={class_weight}) due to error: {e}"
        )
        continue

# If every combination was skipped there is nothing to summarise; fail with a
# clear message instead of an AttributeError on None further down.
if best_model is None:
    raise RuntimeError(
        "No hyperparameter combination could be fitted and scored; "
        "see the 'Skipped combination' messages above."
    )

# --- Summary of tuning results ---
print("\n=== Train–Validation Hyperparameter Tuning Results ===")
print("Best Parameters:", best_params)
print(f"Best F1 Score (Validation): {best_f1:.4f}\n")

print("=== Metrics for Selected Model (Optimal Hyperparameters) ===")
for metric, value in best_metrics.items():
    print(f"{metric:10s}: {value:.4f}")

# --- Classification summary  ---
print("\n=== Classification Summary for Best Model ===")
# Re-predict with the selected model: after the loop y_val_pred holds the
# predictions of the last combination tried, not of the best one.
y_val_pred = best_model.predict(X_val)
cm = confusion_matrix(y_val, y_val_pred)
print("Confusion Matrix:\n", cm)

report_dict = classification_report(y_val, y_val_pred, output_dict=True, zero_division=0)
report_df = pd.DataFrame(report_dict).transpose()
print("\nClassification Report:\n")
display(report_df.style.set_table_styles(
    [{'selector': 'th', 'props': [('text-align', 'center')]},
     {'selector': 'td', 'props': [('text-align', 'center')]}]
).format(precision=6))





=== Train–Validation Hyperparameter Tuning Results ===
Best Parameters: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear', 'class_weight': 'balanced'}
Best F1 Score (Validation): 0.2241

=== Metrics for Selected Model (Optimal Hyperparameters) ===
Accuracy  : 0.6767
Precision : 0.1308
Recall    : 0.7826
F1        : 0.2241
ROC-AUC   : 0.7831
PR-AUC    : 0.1390

=== Classification Summary for Best Model ===
Confusion Matrix:
 [[1943  957]
 [  40  144]]

Classification Report:



Unnamed: 0,precision,recall,f1-score,support
0,0.979829,0.67,0.795822,2900.0
1,0.13079,0.782609,0.224125,184.0
accuracy,0.676719,0.676719,0.676719,0.676719
macro avg,0.555309,0.726304,0.509973,3084.0
weighted avg,0.929173,0.676719,0.761713,3084.0


The results of the train-validation approach suggest that the model with the hyperparameter combination: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear', 'class_weight': 'balanced'} is the optimal one which produces the highest validation F1-score.

### 5-fold Cross Validation

Here, 5-fold CV is performed on the full training set to conduct hyperparameter tuning. We keep the actual test set separate, reserving it for the final model evaluation. The model with the combination of hyperparameters which yields the highest average cross-validated F1-score will be selected.

In [46]:
from sklearn.metrics import (
    make_scorer, accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, classification_report,
    precision_recall_curve, auc, average_precision_score   
)
import pandas as pd
import numpy as np

# Imported here so this cell does not depend on imports made by other cells;
# without them the names below raise NameError on a fresh kernel.
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict

# --- Logistic Regression base model ---
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# --- Hyperparameter grid ---
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
    'class_weight': [None, 'balanced']
}

# --- 5-fold Stratified CV setup ---
# shuffle=True with a fixed random_state makes the fold assignment repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# --- Optimize for F1-score ---
grid = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=2,
    error_score='raise'   # surface fitting errors instead of scoring them
)
# Search on the full training portion (sub-train + validation); the test set
# is not touched here.
grid.fit(X_train, y_train)

# --- Best hyperparameters ---
best_model = grid.best_estimator_
print("\n=== Cross-Validation Hyperparameter Tuning Results ===")
print("Best Parameters:", grid.best_params_)
print(f"Best Mean F1 (CV): {grid.best_score_:.4f}")

# --- Cross-validation predictions ---
# Predictions for the selected configuration, obtained with the same `cv`
# splitter: hard labels first, then positive-class probabilities.
y_train_pred = cross_val_predict(best_model, X_train, y_train, cv=cv)
y_train_prob = cross_val_predict(best_model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]

Fitting 5 folds for each of 20 candidates, totalling 100 fits

=== Cross-Validation Hyperparameter Tuning Results ===
Best Parameters: {'C': 10, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'}
Best Mean F1 (CV): 0.2211


In [47]:
from sklearn.metrics import (
    make_scorer, accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, classification_report,
    precision_recall_curve, auc, average_precision_score   
)
import pandas as pd
import numpy as np
# --- Score the cross-validation predictions against the training labels ---
# Label-based metrics; zero_division=0 yields 0 where a ratio is undefined.
acc = accuracy_score(y_train, y_train_pred)
prec, rec, f1 = (
    metric_fn(y_train, y_train_pred, zero_division=0)
    for metric_fn in (precision_score, recall_score, f1_score)
)

# Probability-based metrics
roc_auc = roc_auc_score(y_train, y_train_prob)

# --- PR curve & areas ---
prec_curve, rec_curve, _ = precision_recall_curve(y_train, y_train_prob)
pr_auc = auc(rec_curve, prec_curve)                        # area under PR curve
avg_prec = average_precision_score(y_train, y_train_prob)  # AP (threshold-free)

print("\n=== Metrics for Selected Model (Optimal Hyperparameters) ===")
print(f"Accuracy  : {acc:.4f}")
print(f"Precision : {prec:.4f}")
print(f"Recall    : {rec:.4f}")
print(f"F1        : {f1:.4f}")
print(f"ROC-AUC   : {roc_auc:.4f}")
print(f"PR-AUC    : {pr_auc:.4f}")
print(f"Avg Precision (AP): {avg_prec:.4f}")

# --- Confusion matrix and classification report ---
print("\n=== Classification Summary for Best Model ===")
cm = confusion_matrix(y_train, y_train_pred)
print("Confusion Matrix:\n", cm)

report_dict = classification_report(
    y_train, y_train_pred, output_dict=True, zero_division=0
)
report_df = pd.DataFrame(report_dict).transpose()

print("\nClassification Report:\n")
# Centre both header and data cells, then show six decimal places.
centered_cells = [
    {'selector': tag, 'props': [('text-align', 'center')]}
    for tag in ('th', 'td')
]
display(report_df.style.set_table_styles(centered_cells).format(precision=6))


=== Metrics for Selected Model (Optimal Hyperparameters) ===
Accuracy  : 0.6612
Precision : 0.1281
Recall    : 0.8035
F1        : 0.2210
ROC-AUC   : 0.7799
PR-AUC    : 0.1354
Avg Precision (AP): 0.1361

=== Classification Summary for Best Model ===
Confusion Matrix:
 [[7563 4035]
 [ 145  593]]

Classification Report:



Unnamed: 0,precision,recall,f1-score,support
0,0.981188,0.652095,0.783487,11598.0
1,0.128133,0.803523,0.221021,738.0
accuracy,0.661154,0.661154,0.661154,0.661154
macro avg,0.554661,0.727809,0.502254,12336.0
weighted avg,0.930154,0.661154,0.749838,12336.0


The 5-fold CV results suggest that the model with the hyperparameter combination: {'C': 10, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'} is the optimal one which produces the highest average cross-validated F1-score. This combination is close to the one chosen by the train-validation approach earlier: the penalty ('l1'), solver ('liblinear') and class weighting ('balanced') are identical, and only the regularisation parameter differs (C = 10 here versus C = 1 there).