In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('creditcardcsvpresent.csv')

In [4]:
### Exploratory Data Analysis ###

In [5]:
# How many Fraud Samples?
df.isFradulent.value_counts()

N    2627
Y     448
Name: isFradulent, dtype: int64

In [6]:
### Feature Engineering ###

In [7]:
# turn Y/N into booleans
df['Is declined'] = df['Is declined'].map(dict(Y=1, N=0))
df['isFradulent'] = df['isFradulent'].map(dict(Y=1, N=0))
df['isForeignTransaction'] = df['isForeignTransaction'].map(dict(Y=1, N=0))
df['isHighRiskCountry'] = df['isHighRiskCountry'].map(dict(Y=1, N=0))

In [8]:
# Dropping isFradulent because it's our target. Dropping Transaction date because it's empty.
X = df.drop(columns=['isFradulent', 'Transaction date'])

In [9]:
y = df[['isFradulent']]

In [10]:
### Analysis ###

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [12]:
model = LogisticRegression(penalty='l1')

In [13]:
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [14]:
predicted = model.predict(X_test)

In [15]:
print(classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.99      0.99      0.99       773
          1       0.97      0.96      0.96       150

avg / total       0.99      0.99      0.99       923



According to the above results, this model classifies 99% of results correctly

In [16]:
print(confusion_matrix(y_true=y_test, y_pred=predicted))

[[768   5]
 [  6 144]]


Based on the above results:

-768 test samples true positive (normal transactions correctly identified as normal transactions)

-6 test samples false positive (fraud incorrectly identified as normal transactions)

-143 samples true negative (fraud correctly identified as fraud)

-5 samples false negative (normal transactions incorrectly identified as fraud)

Note that re-running the fit and predict steps of this model will result in small amount of deviation.
