In [77]:
# Importing Dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


data = pd.read_csv("fraud_detection_dataset.csv")
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [46]:
# Checking information about the data
data.info()
data.describe()

# Checking for null data
data.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [35]:
data.type.value_counts()

CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: type, dtype: int64

In [47]:
# Distribution of Fraud and Non-Fraud transactions
# Fraudulent transactions (1) and Non-Fraudulent (0)

data['isFraud'].value_counts()

0    6354407
1       8213
Name: isFraud, dtype: int64

In [62]:
# Converting categories into numeric  

data["type"] = data["type"].map({"CASH_OUT": 1, "PAYMENT": 2, "CASH_IN": 3, "TRANSFER": 4, "DEBIT": 5})

print(data.head())

   step  type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1     2   9839.64  C1231006815       170136.0       160296.36   
1     1     2   1864.28  C1666544295        21249.0        19384.72   
2     1     4    181.00  C1305486145          181.0            0.00   
3     1     1    181.00   C840083671          181.0            0.00   
4     1     2  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  


In [None]:
# Dataset is unbalanced hence requires processing
# Seperating data for analysis
non_fraud = data[data.isFraud == 0]
fraud = data[data.isFraud == 1]

print(non_fraud.shape)
print(fraud.shape)

In [57]:
fraud.amount.describe()

count    8.213000e+03
mean     1.467967e+06
std      2.404253e+06
min      0.000000e+00
25%      1.270913e+05
50%      4.414234e+05
75%      1.517771e+06
max      1.000000e+07
Name: amount, dtype: float64

In [60]:
non_fraud.amount.describe()

count    6.354407e+06
mean     1.781970e+05
std      5.962370e+05
min      1.000000e-02
25%      1.336840e+04
50%      7.468472e+04
75%      2.083648e+05
max      9.244552e+07
Name: amount, dtype: float64

In [63]:
# Mean value of Fraud vs Non-Fraud

data.groupby("isFraud").mean(numeric_only=True)

Unnamed: 0_level_0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,243.235663,2.054737,178197.0,832828.7,855970.228109,1101421.0,1224926.0,0.0
1,368.413856,2.49653,1467967.0,1649668.0,192392.631836,544249.6,1279708.0,0.001948


In [64]:
# Under-Sampling
# Building sample dataset of Fraud and Non-Fraud transactions

non_fraud_sample = non_fraud.sample(n=8213)

In [65]:
# Adding Non-Fraud sample to Fraud sample
new_data = pd.concat([non_fraud_sample, fraud], axis=0)

In [66]:
# Random Non-Fraud(0) gets added to the top
new_data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
516280,20,CASH_OUT,52004.46,C971982899,10006.0,0.0,C1848372882,336179.39,388183.85,0,0
318698,16,CASH_IN,224765.83,C769264146,634286.72,859052.55,C2109293916,1751229.01,2060756.25,0,0
4778677,335,PAYMENT,2929.96,C1695184496,53886.69,50956.73,M1264491669,0.0,0.0,0,0
741267,38,PAYMENT,8590.0,C26873422,79517.84,70927.84,M2104558888,0.0,0.0,0,0
3878948,283,PAYMENT,9760.6,C1549907890,0.0,0.0,M1969110197,0.0,0.0,0,0


In [67]:
# Random Fraud(1) gets added to the bottom
new_data.tail()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0
6362619,743,CASH_OUT,850002.52,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,1,0


In [68]:
new_data['isFraud'].value_counts()

0    8213
1    8213
Name: isFraud, dtype: int64

In [69]:
# Mean value of Fraud vs Non-Fraud in new dataset

new_data.groupby("isFraud").mean(numeric_only=True)

Unnamed: 0_level_0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,243.982345,180543.3,801535.6,825330.086536,1078210.0,1203055.0,0.0
1,368.413856,1467967.0,1649668.0,192392.631836,544249.6,1279708.0,0.001948


In [85]:
X = new_data.drop(columns=['isFraud','type','nameOrig', 'nameDest'], axis = 1)
y = new_data['isFraud']
print(X)

         step      amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
516280     20    52004.46       10006.00            0.00       336179.39   
318698     16   224765.83      634286.72       859052.55      1751229.01   
4778677   335     2929.96       53886.69        50956.73            0.00   
741267     38     8590.00       79517.84        70927.84            0.00   
3878948   283     9760.60           0.00            0.00            0.00   
...       ...         ...            ...             ...             ...   
6362615   743   339682.13      339682.13            0.00            0.00   
6362616   743  6311409.28     6311409.28            0.00            0.00   
6362617   743  6311409.28     6311409.28            0.00        68488.84   
6362618   743   850002.52      850002.52            0.00            0.00   
6362619   743   850002.52      850002.52            0.00      6510099.11   

         newbalanceDest  isFlaggedFraud  
516280        388183.85               0  
318

In [72]:
print(y)

516280     0
318698     0
4778677    0
741267     0
3878948    0
          ..
6362615    1
6362616    1
6362617    1
6362618    1
6362619    1
Name: isFraud, Length: 16426, dtype: int64


In [86]:
# Splitting Data into training and testing data
# stratify = y for even distribution of data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 2)


In [87]:
print(X.shape, X_train.shape, X_test.shape)

(16426, 7) (13140, 7) (3286, 7)


In [88]:
# Training the model

model = LogisticRegression()
model.fit(X_train,y_train)

In [92]:
# Accuracy_score on Training Data
X_train_prediction = model.predict(X_train)

train_score = accuracy_score(X_train_prediction, y_train)
print(train_score)

0.9035007610350076


In [93]:
# Accuracy_score on Testing Data
X_test_prediction = model.predict(X_test)

test_score = accuracy_score(X_test_prediction, y_test)
print(test_score)

0.8980523432744979


In [94]:
# Testing with DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train,y_train)

In [97]:
# Accuracy_score on Training Data
X_train_prediction = model.predict(X_train)

train_score = accuracy_score(X_train_prediction, y_train)
print(train_score)

1.0


In [98]:
# Accuracy_score on Testing Data
X_test_prediction = model.predict(X_test)

test_score = accuracy_score(X_test_prediction, y_test)
print(test_score)

0.9923919659160073
