Credit Card Fraud Detection Model

Context
It is important that credit card companies are able to recognize fraudulent credit card transactions so that customers are not charged for items that they did not purchase


In [11]:
# Importing the libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, confusion_matrix, classification_report, f1_score, roc_auc_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Loading the dataset
df = pd.read_csv('creditcard.csv')

# Splitting the dataset into features and target
X = df.drop(['Class'], axis=1)
y = df['Class']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Balancing the dataset
oversampler = RandomOverSampler(sampling_strategy='minority')
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Fitting the Logistic Regression model to the training data
logreg = LogisticRegression(max_iter=1000)
param_grid = {'C': [0.1, 1, 10], 'penalty': ['l2']}
grid_search = GridSearchCV(logreg, param_grid, cv=5, error_score='raise', scoring=make_scorer(f1_score))
grid_search.fit(X_train_resampled, y_train_resampled)

# Evaluating the performance of the model using cross-validation
cv_f1_scores = cross_val_score(grid_search.best_estimator_, X_train_resampled, y_train_resampled, cv=5, scoring=make_scorer(f1_score))
cv_roc_auc_scores = cross_val_score(grid_search.best_estimator_, X_train_resampled, y_train_resampled, cv=5, scoring=make_scorer(roc_auc_score))

print("Best parameters:", grid_search.best_params_)
print("Cross-validation F1 scores:", cv_f1_scores)
print("Mean F1 score:", np.mean(cv_f1_scores))
print("Cross-validation ROC AUC scores:", cv_roc_auc_scores)
print("Mean ROC AUC score:", np.mean(cv_roc_auc_scores))

# Predicting on the test data
y_pred = grid_search.predict(X_test)

# Evaluating the performance of the model using confusion matrix and classification report
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", cm)
print("Classification Report:\n", cr)

# Evaluating the performance of the model using F1 score and ROC AUC score
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)


Best parameters: {'C': 0.1, 'penalty': 'l2'}
Cross-validation F1 scores: [0.94472293 0.94458498 0.94557492 0.94569131 0.94473923]
Mean F1 score: 0.9450626752198931
Cross-validation ROC AUC scores: [0.94642804 0.94630781 0.94723016 0.94728512 0.94642779]
Mean ROC AUC score: 0.9467357826242522
Confusion Matrix:
 [[55534  1330]
 [    8    90]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99     56864
           1       0.06      0.92      0.12        98

    accuracy                           0.98     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.98      0.99     56962

F1 Score: 0.11857707509881422
ROC AUC Score: 0.9474891039702319
