In [1]:
import pandas as pd

In [65]:
# Load the dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv("fraud.csv")

In [64]:
# Report the frame's dimensions as (rows, columns).
print(df.shape)

(15420, 45)


In [67]:
# Response vector: the `FraudFound_P` column.
y = df['FraudFound_P']
# Feature matrix: every column except the response.
X = df.drop(columns='FraudFound_P')

In [69]:
# `train_test_split` is not imported by any earlier cell, so import it here;
# without this the cell raises NameError after Restart Kernel -> Run All.
from sklearn.model_selection import train_test_split

# Split into training (75%) and testing (25%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,   # ensures reproducibility
    stratify=y         # preserves class balance
)

# Report (rows, columns) of each feature matrix and the length of each label vector.
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (11565, 44) (11565,)
Testing set shape: (3855, 44) (3855,)


### Function to check class proportions

In [36]:
def check_class_proportions(y):
    """
    Summarise how often each class label occurs in ``y``.

    The summary table is printed and also returned to the caller.

    Parameters
    ----------
    y : pandas Series or array-like
        Class labels.

    Returns
    -------
    pandas.DataFrame
        One row per distinct label, ordered by label, with columns
        ``count`` (number of occurrences) and ``proportion`` (share of
        the counted labels, rounded to 4 decimal places).
    """
    labels = pd.Series(y, name="label")

    # Start from the per-class counts, then attach the matching proportions;
    # both are sorted by label so the rows line up.
    summary = (
        labels.value_counts()
        .sort_index()
        .to_frame("count")
        .assign(
            proportion=labels.value_counts(normalize=True).sort_index().round(4)
        )
    )

    print(summary)
    return summary

## Logistic Regression (Base Model)

### 5-fold Cross Validation

Here, 5-fold cross-validation is performed on the full training set to tune the hyperparameters. The test set is kept separate and reserved for the final model evaluation. The hyperparameter combination that yields the highest mean cross-validated F1-score is selected.

In [78]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate
from sklearn.metrics import (
    make_scorer, accuracy_score, precision_score, recall_score, 
    f1_score, roc_auc_score
)

# Base estimator; C, penalty, solver and class_weight are supplied by the grid below.
log_reg = LogisticRegression(random_state=42, max_iter=1000)

# Search space: 5 C values x 2 penalties x 1 solver x 2 class weightings = 20 candidates.
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear'],             # supports both l1 and l2 regularization
    class_weight=[None, 'balanced'],
)

# Metrics reported for the selected model. The label-based metrics are built
# the same way; ROC AUC is added last because it scores `predict_proba` output.
scoring = {
    label: make_scorer(score_fn)
    for label, score_fn in [
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    ]
}
scoring['roc_auc'] = make_scorer(roc_auc_score, response_method='predict_proba')

# Shuffled, stratified 5-fold splitter, shared by the search and the final report
# so both are evaluated on the same folds.
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Exhaustive search over the grid, selecting the candidate with the best mean F1.
grid = GridSearchCV(
    log_reg,
    param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=2,
    error_score='raise',  # fail loudly on an invalid combination instead of scoring it NaN
)
grid.fit(X_train, y_train)

# Estimator refit on the whole training set with the winning parameters.
best_model = grid.best_estimator_

print("\n=== Model with Optimal Hyperparameters from Grid Search ===")
print("Optimal Parameters:", grid.best_params_)
print(f"Best F1 Score (CV mean): {grid.best_score_:.4f}")

# Re-run cross-validation on the winner to collect every metric in `scoring`.
cv_results = cross_validate(best_model, X_train, y_train, scoring=scoring, cv=cv)

print("\n=== 5-Fold Cross-Validation (Logistic Regression with Optimal Hyperparameters) ===")
for metric in scoring:
    # One line per metric: mean ± standard deviation across the folds.
    print(
        f"{metric:10s}: "
        f"{cv_results[f'test_{metric}'].mean():.4f} ± "
        f"{cv_results[f'test_{metric}'].std():.4f}"
    )



Fitting 5 folds for each of 20 candidates, totalling 100 fits

=== Model with Optimal Hyperparameters from Grid Search ===
Optimal Parameters: {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'}
Best F1 Score (CV mean): 0.2158

=== 5-Fold Cross-Validation (Logistic Regression with Optimal Hyperparameters) ===
accuracy  : 0.6579 ± 0.0104
precision : 0.1251 ± 0.0036
recall    : 0.7877 ± 0.0542
f1        : 0.2158 ± 0.0073
roc_auc   : 0.7826 ± 0.0149


Among the 20 hyperparameter combinations searched, {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'} yields the highest mean cross-validated F1-score (0.2158) and is therefore selected. Note that this F1-score reflects high recall (0.7877) but low precision (0.1251), so most of the model's positive predictions are false positives.