## Credit Card Fraud Detection

#### Work Flow
Credit Card Data ---> Data pre processing ---> Data Analysis ---> Train Test Split ---> Logistic Regression Model ---> Evaluation

### Modules/Libraries

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

### Credit Card Data

In [4]:
# loading the dataset
creditcard = pd.read_csv('creditcard.csv')
creditcard.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
# checkingt the size of the data
creditcard.shape

(284807, 31)

### Data Pre-processing

In [6]:
# checking for missing values
creditcard.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [7]:
# checking the data types
creditcard.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

### Data Analysis

In [8]:
# checking the statistical information
creditcard.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,3.918649e-15,5.682686e-16,-8.761736e-15,2.811118e-15,-1.552103e-15,2.04013e-15,-1.698953e-15,-1.893285e-16,-3.14764e-15,...,1.47312e-16,8.042109e-16,5.282512e-16,4.456271e-15,1.426896e-15,1.70164e-15,-3.662252e-16,-1.217809e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [10]:
# distribution of legit transactions and fraudulent transactions
creditcard['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

0 - Normal transaction        
1 - Fraudulent transaction     
The data ishighly unbalanced. Reason being that the amount of data for normal transaction is more than that for fraudulent transaction, thus when fed into our machine learning model, it's going to be biased towards normal transaction and thus we cannot predict accurately any fraudulent transaction

In [11]:
# separating the data for more analysis
legit = creditcard[creditcard.Class == 0]
fraud = creditcard[creditcard.Class == 1]

In [12]:
# checking the size of the separated data
legit_shape = legit.shape
fraud_shape = fraud.shape
print(f"Size of normal transaction: {legit_shape}")
print(f"Size of fraudulent transaction: {fraud_shape}")

Size of normal transaction: (284315, 31)
Size of fraudulent transaction: (492, 31)


In [13]:
# statistical measure of the data
# legit
legit.Amount.describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [14]:
# fraud
fraud.Amount.describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [15]:
# compare the value for both transaction
creditcard.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


#### Under-Sampling

Build a sample dataset containing similar distribution of normal transactions and Fraudulent Transactions

##### Number of Fraudulent Transactions -> 492

In [16]:
# sample 492 random data
legit_sample = legit.sample(n=492)

##### Concatenating two dataframes

In [17]:
# concatenate 2 dataframes
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [18]:
# check the first five rows
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
264499,161458.0,-0.740109,1.766555,-0.683117,3.176792,3.095678,4.739461,0.14365,0.894376,-1.69017,...,-0.179924,-0.462455,0.154498,0.647705,-0.983848,-0.061035,-0.654681,-0.21215,0.75,0
250968,155148.0,-0.32825,0.836647,-1.783849,-1.640771,3.989082,2.952116,1.563025,0.260346,-0.773841,...,0.324035,1.129778,-0.626787,0.774617,1.017503,-0.121418,-0.19433,-0.142987,13.83,0
30184,35814.0,-0.196671,-0.06799,2.079449,-0.983374,-0.600528,0.544679,-0.477525,0.06192,-1.125595,...,0.513537,1.596911,-0.243516,-0.24604,-0.483561,0.014363,-0.098892,-0.08681,11.0,0
270545,164132.0,-1.804062,0.585676,0.136439,-2.228741,-1.291037,-0.356862,-0.949186,1.251502,-1.427569,...,0.566126,1.131402,-0.203957,-0.285265,0.053134,-0.202753,-0.469931,-0.160964,39.95,0
42817,41259.0,1.190232,0.283568,0.107623,0.9949,0.117527,-0.273007,0.192243,-0.105199,-0.074866,...,0.048758,0.206112,-0.136618,-0.259047,0.672821,-0.280123,0.02946,0.017689,23.32,0


In [19]:
# checking the distribution of data in the new dataset
new_dataset['Class'].value_counts()

0    492
1    492
Name: Class, dtype: int64

In [20]:
# comparing the values for both transaction
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,97258.563008,0.051312,-0.002064,-0.010246,0.027133,0.06899,-0.073331,0.000895,-0.019338,-0.011531,...,0.000797,-0.004503,0.01125,-0.036938,0.016315,-0.012527,-0.011764,-0.0234,0.003369,83.542967
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


### Train Test Split

In [21]:
# splitting the data into Features and Target
X = new_dataset.drop(columns='Class', axis=1)
y = new_dataset['Class']

In [22]:
# checking X
X.head(3)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
264499,161458.0,-0.740109,1.766555,-0.683117,3.176792,3.095678,4.739461,0.14365,0.894376,-1.69017,...,0.285949,-0.179924,-0.462455,0.154498,0.647705,-0.983848,-0.061035,-0.654681,-0.21215,0.75
250968,155148.0,-0.32825,0.836647,-1.783849,-1.640771,3.989082,2.952116,1.563025,0.260346,-0.773841,...,-0.000724,0.324035,1.129778,-0.626787,0.774617,1.017503,-0.121418,-0.19433,-0.142987,13.83
30184,35814.0,-0.196671,-0.06799,2.079449,-0.983374,-0.600528,0.544679,-0.477525,0.06192,-1.125595,...,0.298909,0.513537,1.596911,-0.243516,-0.24604,-0.483561,0.014363,-0.098892,-0.08681,11.0


In [23]:
# checking y
y.head(3)

264499    0
250968    0
30184     0
Name: Class, dtype: int64

###### Training  data

In [26]:
# split the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2)

# check the size for the training data
print(X.shape, X_train.shape, X_test.shape)

(984, 30) (787, 30) (197, 30)


##### Training the model

In [27]:
# model training
model = LogisticRegression()
model.fit(X,y)

LogisticRegression()

### Model Evaluation

##### Accuracy Score

In [32]:
# accuracy score on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, y_train)
print(f"Accuracy score on training data: {training_data_accuracy*100}")

Accuracy score on training data: 94.0279542566709


In [31]:
# accuracy score on testing data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, y_test)
print(f"Accuracy score on training data: {test_data_accuracy*100}")

Accuracy score on training data: 93.4010152284264
