## Fraud Detection using Logistic Regression

In [44]:
# Importing the Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [74]:
# Loading the Dataset
dataset = pd.read_csv('Fraud.csv')

In [75]:
#Checking the shape of the dataset
dataset.shape

(6362620, 11)

In [4]:
dataset.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


### Check the number of missing values

In [5]:
# Number of missing values
dataset.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

### Check whether the dataset is balanced or imbalanced

In [6]:
dataset['isFraud'].value_counts()

isFraud
0    6354407
1       8213
Name: count, dtype: int64

#### It is highly imbalanced dataset, We need to balance it using UNDERSAMPLING technique.

In [7]:
# Separating the legit and fraudulent transactions
legit = dataset[dataset.isFraud == 0]
fraud = dataset[dataset.isFraud == 1]

In [8]:
print(legit.shape)
print(fraud.shape)

(6354407, 11)
(8213, 11)


In [9]:
legit_sample = legit.sample(n=8213)

In [10]:
legit_sample.shape

(8213, 11)

In [11]:
#concatenate 
d = pd.concat([legit_sample, fraud], axis = 0)

In [12]:
d.shape

(16426, 11)

In [49]:
d['isFraud'].value_counts()

isFraud
0    8213
1    8213
Name: count, dtype: int64

#### Now the data is balanced

### Use Label Encoding to deal with categorical data

In [14]:
#Import the Label Encoder

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [15]:
#label encode the 'type' column
d['type_encoded'] = label_encoder.fit_transform(d['type'])

#label encode the 'nameOrig' column
d['nameOrig_encoded'] = label_encoder.fit_transform(d['nameOrig'])

#label encode the 'nameDest' column
d['nameDest_encoded'] = label_encoder.fit_transform(d['nameDest'])

In [16]:
d.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_encoded,nameOrig_encoded,nameDest_encoded
1868849,164,CASH_IN,175967.36,C773964108,282.0,176249.36,C1340271509,57140.84,0.0,0,0,0,14468,2322
751312,38,TRANSFER,454549.05,C2037050050,0.0,0.0,C852518538,1122791.36,1577340.42,0,0,4,8871,12470
1270879,135,CASH_IN,324313.56,C1238662062,438388.95,762702.5,C899699848,454966.98,130653.42,0,0,0,2098,12807
5593810,394,CASH_IN,291466.42,C1878529523,30265.0,321731.42,C1606469602,0.0,0.0,0,0,0,7571,4201
1468864,140,PAYMENT,3061.77,C1212397383,31600.0,28538.23,M1517661863,0.0,0.0,0,0,3,1877,14211


In [23]:
a = d.drop(['type','nameOrig','nameDest'], axis=1)

### Split the Dataset into Features and Target Variable

In [24]:
# X -> Input features ; Y -> Target Variable

X = a.drop(['isFraud'], axis=1)
y = a['isFraud']

### Split the dataset into training and testing data

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
X_train.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFlaggedFraud,type_encoded,nameOrig_encoded,nameDest_encoded
292640,15,PAYMENT,2351.82,C1454362657,10474.0,8122.18,M573720011,0.0,0.0,0,3,3889,15602
4937906,350,CASH_IN,95096.18,C1905056888,20958.0,116054.18,C1790831319,10805230.34,10710134.16,0,0,7796,5472
6201836,579,CASH_OUT,1540624.7,C1904414737,1540624.7,0.0,C2037994906,78012.35,1618637.05,0,1,7789,7191
176634,12,CASH_IN,25947.5,C2128048578,158.0,26105.5,C1688537809,0.0,0.0,0,0,9621,4791
6265699,614,CASH_OUT,5980.8,C1827873700,0.0,0.0,C350130544,81932.05,87912.85,0,1,7131,8979


In [26]:
y_train.head()

292640     0
4937906    0
6201836    1
176634     0
6265699    0
Name: isFraud, dtype: int64

## Initialize and train the Random Forest classifier

In [34]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

### Predict 

In [72]:
x_train_pred = model.predict(X_train)

#Accuracy score on training data
print(accuracy_score(x_train_pred, y_train))

1.0


In [68]:
pd.DataFrame(y_pred)

Unnamed: 0,0
0,0
1,0
2,1
3,0
4,0
...,...
13135,1
13136,1
13137,0
13138,0


### Accuracy score

In [73]:
#Accuracy score on testing data

y_pred = model.predict(X_test)
print(accuracy_score(y_pred, y_test))

0.9917833231892879


In [59]:
#Classification report
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1630
           1       1.00      0.99      0.99      1656

    accuracy                           0.99      3286
   macro avg       0.99      0.99      0.99      3286
weighted avg       0.99      0.99      0.99      3286



In [60]:
#Confusion Matrix
print(confusion_matrix(y_test, y_pred))

[[1626   23]
 [   4 1633]]
