In [2]:
import pandas as pd

### Reading in data and splitting into train, validation, test sets

In [3]:
# Load the dataset from fraud.csv (path is relative to the notebook's working directory)
df = pd.read_csv("fraud.csv")

In [4]:
# Check the dataset dimensions, printed as (rows, columns)
print(df.shape)

(15420, 45)


In [5]:
# 'FraudFound_P' is the response; every remaining column is kept as a feature
y = df['FraudFound_P']
X = df.drop(columns='FraudFound_P')

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the whole dataset for testing; the remaining 80% is kept for training
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,        # keep the class balance the same in both partitions
    random_state=42,   # fixed seed so the split is reproducible
    test_size=0.20,
)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (12336, 44) (12336,)
Testing set shape: (3084, 44) (3084,)


In [9]:
# Carve the validation set out of the 80% training portion: 25% of it equals 20% of the
# whole dataset, which leaves 60% of the whole dataset for fitting
X_subtrain, X_val, y_subtrain, y_val = train_test_split(
    X_train,
    y_train,
    stratify=y_train,  # keep the class balance the same in both partitions
    random_state=42,   # fixed seed so the split is reproducible
    test_size=0.25,
)

print("Training set shape:", X_subtrain.shape, y_subtrain.shape)
print("Validation set shape:", X_val.shape, y_val.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (9252, 44) (9252,)
Validation set shape: (3084, 44) (3084,)
Testing set shape: (3084, 44) (3084,)


Above, we have effectively split the data into 60% train, 20% validation, and 20% test. The training and validation sets will be used for the train-validation set approach, while the full training set (train and validation combined) will be used to perform cross-validation later on.

### Function to check class proportions

In [36]:
def check_class_proportions(y):
    """
    Summarise how the labels in `y` are distributed across classes.

    Parameters
    ----------
    y : pandas Series or array-like
        Class labels.

    Returns
    -------
    pandas.DataFrame
        One row per class (sorted by class label) with two columns:
        "count" (number of occurrences) and "proportion" (share of all
        labels, rounded to 4 decimal places). The same table is also printed.
    """
    labels = pd.Series(y, name="label")

    summary = pd.DataFrame({
        "count": labels.value_counts().sort_index(),
        "proportion": labels.value_counts(normalize=True).sort_index().round(4),
    })

    print(summary)
    return summary

## Logistic Regression (Base Model)

### Train-validation set approach

Now, we use the train-validation set approach to conduct hyperparameter tuning on the logistic regression model. The model with the combination of hyperparameters which produces the highest validation F1-score will be selected.

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)
from itertools import product
import numpy as np

# Start below any attainable F1 (F1 is never negative) so that the first candidate that
# fits successfully always becomes the incumbent, even if every candidate scores exactly 0.
best_f1 = float('-inf')
best_params = None
best_metrics = {}

# --- Hyperparameter grid ---
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],      # supports both L1 and L2
    'class_weight': [None, 'balanced']
}

# --- Loop through all combinations ---
# Each candidate is fitted on the sub-training set and scored on the validation set.
for C, penalty, solver, class_weight in product(
    param_grid['C'],
    param_grid['penalty'],
    param_grid['solver'],
    param_grid['class_weight']
):
    # Built once per candidate so the fitted model and the reported parameters cannot diverge
    params = {
        'C': C,
        'penalty': penalty,
        'solver': solver,
        'class_weight': class_weight
    }

    try:
        # Train model
        model = LogisticRegression(max_iter=1000, random_state=42, **params)
        model.fit(X_subtrain, y_subtrain)

        # Predict on validation set (hard labels, plus probability of the positive class)
        y_val_pred = model.predict(X_val)
        y_val_prob = model.predict_proba(X_val)[:, 1]

        # Compute metrics; zero_division=0 scores a candidate with an undefined
        # precision/recall/F1 as 0 instead of emitting a warning
        acc = accuracy_score(y_val, y_val_pred)
        prec = precision_score(y_val, y_val_pred, zero_division=0)
        rec = recall_score(y_val, y_val_pred, zero_division=0)
        f1 = f1_score(y_val, y_val_pred, zero_division=0)
        roc_auc = roc_auc_score(y_val, y_val_prob)
    except Exception as e:
        # Best-effort search: report the full failing combination and move on
        print(
            f"Skipped combination (C={C}, penalty={penalty}, solver={solver}, "
            f"class_weight={class_weight}) due to error: {e}"
        )
        continue

    # Track the best model (based on F1); strict '>' keeps the earliest candidate on ties
    if f1 > best_f1:
        best_f1 = f1
        best_params = params
        best_metrics = {
            'Accuracy': acc,
            'Precision': prec,
            'Recall': rec,
            'F1': f1,
            'ROC-AUC': roc_auc
        }

print("\n=== Train–Validation Hyperparameter Tuning Results ===")
if best_params is None:
    # Every candidate raised during fitting or scoring, so there is nothing to report
    print("No hyperparameter combination could be fitted and scored; see the skipped combinations above.")
else:
    print("Best Parameters:", best_params)
    print(f"Best F1 Score (Validation): {best_f1:.4f}\n")

    print("=== Metrics for Selected Model (Optimal Hyperparameters) ===")
    for metric, value in best_metrics.items():
        print(f"{metric:10s}: {value:.4f}")



=== Train–Validation Hyperparameter Tuning Results ===
Best Parameters: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear', 'class_weight': 'balanced'}
Best F1 Score (Validation): 0.2241

=== Metrics for Selected Model (Optimal Hyperparameters) ===
Accuracy  : 0.6767
Precision : 0.1308
Recall    : 0.7826
F1        : 0.2241
ROC-AUC   : 0.7831


The train-validation approach selects the hyperparameter combination {'C': 1, 'penalty': 'l1', 'solver': 'liblinear', 'class_weight': 'balanced'}, which produces the highest validation F1-score (0.2241) among the candidates tried.

### 5-fold Cross Validation

Here, 5-fold CV is performed on the full training set to conduct hyperparameter tuning. We keep the actual test set separate, reserving it for the final model evaluation. The model with the combination of hyperparameters which yields the highest average cross-validated F1-score will be selected.

In [78]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate
from sklearn.metrics import (
    make_scorer, accuracy_score, precision_score, recall_score, 
    f1_score, roc_auc_score
)

# Base estimator; C, penalty, solver and class_weight are supplied by the grid search
log_reg = LogisticRegression(random_state=42, max_iter=1000)

# Candidate values for each tuned hyperparameter
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear'],      # liblinear handles both the l1 and l2 penalties
    class_weight=[None, 'balanced'],
)

# --- Metrics reported for the selected model ---
scoring = {
    name: make_scorer(metric_fn)
    for name, metric_fn in [
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    ]
}
# ROC-AUC is scored from predicted probabilities rather than hard labels
scoring['roc_auc'] = make_scorer(roc_auc_score, response_method='predict_proba')

# --- 5 stratified, shuffled folds; the same splitter is used for tuning and for reporting ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# --- Exhaustive search over param_grid, ranking candidates by mean F1 across the folds ---
grid = GridSearchCV(
    log_reg,
    param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=2,
    error_score='raise',  # surface a failing combination instead of scoring it
)
grid.fit(X_train, y_train)

best_model = grid.best_estimator_

print("\n=== Model with Optimal Hyperparameters from Grid Search ===")
print("Optimal Parameters:", grid.best_params_)
print(f"Best F1 Score (CV mean): {grid.best_score_:.4f}")

# Score the selected configuration on the same folds to obtain every metric in `scoring`
cv_results = cross_validate(best_model, X_train, y_train, scoring=scoring, cv=cv)

print("\n=== 5-Fold Cross-Validation (Logistic Regression with Optimal Hyperparameters) ===")
for metric in scoring:
    print(f"{metric:10s}: {cv_results[f'test_{metric}'].mean():.4f} ± {cv_results[f'test_{metric}'].std():.4f}")



Fitting 5 folds for each of 20 candidates, totalling 100 fits

=== Model with Optimal Hyperparameters from Grid Search ===
Optimal Parameters: {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'}
Best F1 Score (CV mean): 0.2158

=== 5-Fold Cross-Validation (Logistic Regression with Optimal Hyperparameters) ===
accuracy  : 0.6579 ± 0.0104
precision : 0.1251 ± 0.0036
recall    : 0.7877 ± 0.0542
f1        : 0.2158 ± 0.0073
roc_auc   : 0.7826 ± 0.0149


The 5-fold CV results select the hyperparameter combination {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'}, which produces the highest average cross-validated F1-score (0.2158). This combination differs from the one chosen by the train-validation approach only in the regularization strength (C = 0.1 instead of C = 1); the penalty, solver, and class weighting are the same. As 5-fold CV provides a more robust performance estimate by averaging across multiple folds, we will proceed with the model chosen by 5-fold CV as our representative logistic regression model to compare against other candidate models.