# Credit Card Fraud Detection

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# credit_data = pd.read_csv(".\\input\\creditcard.csv")
credit_data = pd.read_csv("./input/creditcard.csv")

## Data Exploration

In [3]:
credit_data.head(n=10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0
5,2.0,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,3.67,0
6,4.0,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,...,-0.167716,-0.27071,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,4.99,0
7,7.0,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,40.8,0
8,7.0,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,...,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,93.2,0
9,9.0,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0


In [4]:
credit_data[["Class", "Time"]].groupby("Class").count()

Unnamed: 0_level_0,Time
Class,Unnamed: 1_level_1
0,284315
1,492


- unbalanced dataset, most transactions are legitimate

In [5]:
credit_data.shape

(284807, 31)

In [6]:
credit_data["Amount"].describe()

count    284807.000000
mean         88.349619
std         250.120109
min           0.000000
25%           5.600000
50%          22.000000
75%          77.165000
max       25691.160000
Name: Amount, dtype: float64

## Data Manipulation
### Standardize `Amount`

In [7]:
from sklearn import preprocessing
## https://www.askpython.com/python/examples/standardize-data-in-python
credit_data["Amount"] = preprocessing.scale(credit_data["Amount"])
credit_data.drop("Time", axis=1, inplace=True)
credit_data.head(10)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.244964,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.342475,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.160686,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.140534,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,-0.073403,0
5,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,-0.371407,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,-0.338556,0
6,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,-0.099254,...,-0.167716,-0.27071,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,-0.333279,0
7,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,1.249376,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,-0.190107,0
8,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,-0.41043,...,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,0.019392,0
9,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,-0.366846,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,-0.338516,0


## Data Modeling
### Split the dataset into training and testing sets

In [8]:
from sklearn.model_selection import train_test_split
x = credit_data.drop("Class", axis=1)
y = credit_data["Class"]
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2)
X_train

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
102286,-0.309470,1.088486,0.691242,0.363091,-0.188216,-0.535290,0.353941,0.376982,-0.758531,-0.518232,...,-0.105188,-0.325074,-1.105408,0.362213,0.043889,-0.813692,0.008267,-0.095433,-0.024301,-0.273268
160588,1.546799,-0.802704,-2.460077,0.279238,0.548518,-0.119153,0.411733,-0.167277,0.718058,-0.871018,...,0.472819,-0.119265,-0.688846,-0.245924,-1.480736,0.011634,-0.054652,-0.057466,0.006911,0.776230
12846,1.136495,0.070236,0.853399,1.131064,-0.511741,-0.261490,-0.244554,-0.096453,1.806242,-0.648022,...,-0.153312,-0.176551,0.013376,-0.024399,0.413740,0.456740,0.406348,-0.025182,0.007617,-0.273308
99314,1.116465,-0.035347,0.100050,0.411819,0.785268,1.826907,-0.291966,0.441493,0.303455,-0.419856,...,-0.156677,-0.155715,-0.031044,0.025598,-1.735424,0.298088,0.446149,0.057849,-0.008577,-0.349231
278705,0.289786,0.267991,0.887726,-0.827709,0.544296,0.618742,0.222701,0.066304,0.680779,-0.448083,...,-0.056803,0.210649,0.934327,-0.239023,0.118133,-0.256721,-0.329553,-0.084863,-0.217314,-0.353189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39822,-1.889375,2.413828,0.912510,2.811219,-1.262497,-0.110527,-0.826664,1.399482,-1.348281,0.776083,...,0.238419,0.262776,0.654782,-0.049716,0.410152,-0.022336,0.340003,0.241754,0.122618,-0.310650
127094,-1.330365,1.232545,1.191169,-0.368288,0.240284,0.260937,0.360943,0.414537,-0.002946,0.716961,...,0.358023,-0.232454,-0.350697,0.132859,-0.331264,-0.212282,0.132122,0.610743,0.317620,-0.328121
234297,2.108108,-0.207740,-3.386067,-0.720760,2.905339,2.932171,0.054796,0.562129,0.017654,0.190507,...,-0.174293,0.095248,0.367526,0.010535,0.784185,0.412944,0.734475,-0.093999,-0.091945,-0.350191
148273,-0.724439,-0.122047,1.239030,-1.003733,1.170696,1.456970,0.091333,0.397337,0.647886,-0.563635,...,0.236893,0.305360,1.235444,-0.044052,-0.491608,-0.493830,0.754843,0.225042,0.003735,-0.173675


### Logistic Regression Model

In [9]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log

99.92

- Coeficients

In [10]:
coeff = pd.DataFrame(credit_data.columns.delete(0))
coeff.columns = ["Feature"]
coeff["Correlation"] = pd.Series(logreg.coef_[0])
coeff

Unnamed: 0,Feature,Correlation
0,V2,0.083114
1,V3,0.043818
2,V4,0.050405
3,V5,0.717618
4,V6,0.113524
5,V7,-0.122972
6,V8,-0.118688
7,V9,-0.162161
8,V10,-0.318669
9,V11,-0.858944


### Decision Tree Model

In [11]:
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_dt = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_dt

100.0