In [1]:

import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Project root. Defaults to the original workstation location so existing runs
# are unchanged; set the FRAUD_PROJECT_DIR environment variable to point the
# notebook at a checkout that lives somewhere else.
PROJECT_DIR = Path(
    os.environ.get('FRAUD_PROJECT_DIR')
    or '/home/danial/Data Science/Fraud Detection'
)
MODELS_DIR = PROJECT_DIR / 'Models'
TRAIN_CSV = PROJECT_DIR / 'Data' / 'splits' / 'train.csv'
TARGET_COL = 'Class'

# Load saved pipelines
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model artifacts that come from a trusted source.
lr_pipeline = joblib.load(MODELS_DIR / 'lr_pipeline.pkl')
dt_pipeline = joblib.load(MODELS_DIR / 'dt_pipeline.pkl')

# Load train data
train_df = pd.read_csv(TRAIN_CSV)

# Separate features and target
X_train = train_df.drop(columns=TARGET_COL)
y_train = train_df[TARGET_COL]


In [2]:
# Shared splitter for every cross-validation run below: five stratified folds,
# shuffled with a fixed seed so both models are scored on the same splits.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)


In [3]:
# Logistic Regression: one F1 score (scoring='f1') per fold of the shared `skf` splitter
lr_f1_scores = cross_val_score(
    lr_pipeline,
    X_train,
    y_train,
    scoring='f1',
    cv=skf,
    n_jobs=-1,
)
print("Logistic Regression CV F1 scores:", lr_f1_scores)
# np.mean / np.std use the same defaults as the ndarray methods (std with ddof=0)
print("Mean F1:", np.mean(lr_f1_scores))
print("Std F1:", np.std(lr_f1_scores))


Logistic Regression CV F1 scores: [0.796875   0.75555556 0.74193548 0.8        0.77862595]
Mean F1: 0.7745983987249992
Std F1: 0.022747331261179355


In [4]:
# Decision Tree: one F1 score (scoring='f1') per fold of the shared `skf` splitter
dt_f1_scores = cross_val_score(
    dt_pipeline,
    X_train,
    y_train,
    scoring='f1',
    cv=skf,
    n_jobs=-1,
)
print("Decision Tree CV F1 scores:", dt_f1_scores)
# np.mean / np.std use the same defaults as the ndarray methods (std with ddof=0)
print("Mean F1:", np.mean(dt_f1_scores))
print("Std F1:", np.std(dt_f1_scores))


Decision Tree CV F1 scores: [0.80555556 0.76712329 0.76190476 0.77333333 0.81632653]
Mean F1: 0.7848486938154258
Std F1: 0.021876224052895164


In [5]:
# Side-by-side recap of both models' cross-validated F1, formatted as mean ± std to 4 decimals
print("Summary:")
for label, scores in (
    ("LR F1 mean ± std:", lr_f1_scores),
    ("DT F1 mean ± std:", dt_f1_scores),
):
    print(label, f"{scores.mean():.4f} ± {scores.std():.4f}")


Summary:
LR F1 mean ± std: 0.7746 ± 0.0227
DT F1 mean ± std: 0.7848 ± 0.0219
