In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('creditcardcsvpresent.csv')

In [4]:
### Exploratory Data Analysis ###

In [5]:
# How many Fraud Samples?
df.isFradulent.value_counts()

N    2627
Y     448
Name: isFradulent, dtype: int64

In [6]:
### Feature Engineering ###

In [7]:
# turn Y/N into booleans
df['Is declined'] = df['Is declined'].map(dict(Y=1, N=0))
df['isFradulent'] = df['isFradulent'].map(dict(Y=1, N=0))
df['isForeignTransaction'] = df['isForeignTransaction'].map(dict(Y=1, N=0))
df['isHighRiskCountry'] = df['isHighRiskCountry'].map(dict(Y=1, N=0))

In [8]:
# Dropping isFradulent because it's our target. Dropping Transaction date because it's empty.
X = df.drop(columns=['isFradulent', 'Transaction date'])

In [9]:
y = df[['isFradulent']]

In [10]:
### Analysis ###

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [24]:
model = LogisticRegression(penalty='l1', solver='liblinear')

In [25]:
model.fit(X_train, np.ravel(y_train))

LogisticRegression(penalty='l1', solver='liblinear')

In [26]:
predicted = model.predict(X_test)

In [27]:
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       773
           1       0.97      0.95      0.96       150

    accuracy                           0.99       923
   macro avg       0.98      0.97      0.98       923
weighted avg       0.99      0.99      0.99       923



According to the above results, this model classifies 99% of results correctly

In [30]:
results = confusion_matrix(y_true=y_test, y_pred=predicted)
print(confusion_matrix(y_true=y_test, y_pred=predicted))

[[769   4]
 [  7 143]]


In [33]:
print(f"True Positive: {results[0][0]} \n False Positive: {results[0][1]} \
      \n False Negative: {results[1][0]} \n True Negative: {results[1][1]}")

True Positive: 769 
 False Positive: 4       
 False Negative: 7 
 True Negative: 143
