In [2]:
# Importing necessary modules
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read Fraud.csv into a pandas DataFrame
dataset = pd.read_csv(filepath_or_buffer='Fraud.csv')

In [4]:
# Preview the first five rows of the loaded data
dataset.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [5]:
# Count the missing values in each column
dataset.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [6]:
# Class distribution of the target label
dataset.isFraud.value_counts()

0    6354407
1       8213
Name: isFraud, dtype: int64

In [7]:
# Distribution of the isFlaggedFraud column
dataset.isFlaggedFraud.value_counts()

0    6362604
1         16
Name: isFlaggedFraud, dtype: int64

In [16]:
# Separate the rows by label: isFraud == 1 goes to `fraud`, isFraud == 0 to `right`
fraud = dataset.loc[dataset['isFraud'] == 1]
right = dataset.loc[dataset['isFraud'] == 0]
# Show (rows, columns) of each subset, one per line
print(fraud.shape, right.shape, sep='\n')

(8213, 11)
(6354407, 11)


In [17]:
# Summary statistics of the transaction amount for the fraud rows
fraud['amount'].describe()

count    8.213000e+03
mean     1.467967e+06
std      2.404253e+06
min      0.000000e+00
25%      1.270913e+05
50%      4.414234e+05
75%      1.517771e+06
max      1.000000e+07
Name: amount, dtype: float64

In [18]:
# Summary statistics of the transaction amount for the non-fraud rows
right['amount'].describe()

count    6.354407e+06
mean     1.781970e+05
std      5.962370e+05
min      1.000000e-02
25%      1.336840e+04
50%      7.468472e+04
75%      2.083648e+05
max      9.244552e+07
Name: amount, dtype: float64

In [29]:
# Compare the per-class mean of every numeric column (isFraud == 0 vs. isFraud == 1).
# Only numeric columns are kept before grouping, so the text columns are excluded from the mean.
dataset.select_dtypes(include='number').groupby(by='isFraud').mean()

Unnamed: 0_level_0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,243.235663,178197.0,832828.7,855970.228109,1101421.0,1224926.0,0.0
1,368.413856,1467967.0,1649668.0,192392.631836,544249.6,1279708.0,0.001948


In [20]:
# Undersample the majority class: draw exactly as many non-fraud rows as there
# are fraud rows. The count is derived from `fraud` rather than hard-coded, and
# the seed is fixed so that re-running the notebook yields the same sample.
right_sample = right.sample(n=len(fraud), random_state=1)

In [21]:
# Stack the sampled non-fraud rows and the fraud rows into one DataFrame (row-wise)
new_dataset = pd.concat(objs=[right_sample, fraud])

In [25]:
# Class distribution after undersampling
new_dataset.isFraud.value_counts()

0    8213
1    8213
Name: isFraud, dtype: int64

In [30]:
# Per-class mean of every numeric column in the undersampled dataset
new_dataset.select_dtypes(include='number').groupby(by='isFraud').mean()

Unnamed: 0_level_0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,242.60246,186748.8,842605.8,866571.032366,1131571.0,1257676.0,0.0
1,368.413856,1467967.0,1649668.0,192392.631836,544249.6,1279708.0,0.001948


In [34]:
# Separate the features (X) from the target label (y).
# The label, the text columns and isFlaggedFraud are removed from the features.
X = new_dataset.drop(
    columns=['isFraud', 'type', 'nameOrig', 'nameDest', 'isFlaggedFraud'],
)
y = new_dataset.isFraud
print(X)

         step      amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
629677     34     2947.20         301.00         3248.20            0.00   
1467581   140   418493.57           0.00            0.00      3983461.83   
1206267   133   109897.82       13539.08            0.00       441853.54   
5048563   354    19602.64      320630.05       301027.41            0.00   
5574646   393    11484.92       84835.00        73350.08            0.00   
...       ...         ...            ...             ...             ...   
6362615   743   339682.13      339682.13            0.00            0.00   
6362616   743  6311409.28     6311409.28            0.00            0.00   
6362617   743  6311409.28     6311409.28            0.00        68488.84   
6362618   743   850002.52      850002.52            0.00            0.00   
6362619   743   850002.52      850002.52            0.00      6510099.11   

         newbalanceDest  
629677             0.00  
1467581      4401955.39  
1206267  

In [35]:
# Print the target Series y
print(y)

629677     0
1467581    0
1206267    0
5048563    0
5574646    0
          ..
6362615    1
6362616    1
6362617    1
6362618    1
6362619    1
Name: isFraud, Length: 16426, dtype: int64


In [39]:
# Split into train and test sets: 25% of the rows are held out for testing and
# the split is stratified on y so both parts keep the same class ratio.
# random_state is an explicit integer seed (not a boolean) for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=1)

In [41]:
# Show the shapes of the full, training and test feature sets
print(*(frame.shape for frame in (X, X_train, X_test)))

(16426, 6) (12319, 6) (4107, 6)


In [50]:
# Create the logistic regression classifier with its default hyperparameters
# NOTE(review): the columns in X are raw, unscaled amounts and balances — consider
# scaling them or raising max_iter if fit() reports a ConvergenceWarning; confirm.
model = LogisticRegression()

In [51]:
# Fit the classifier on the training split
model.fit(X=X_train, y=y_train)

In [52]:
# Accuracy on the training split
X_training_prediction = model.predict(X_train)  # predicted labels for X_train
# accuracy_score takes the ground-truth labels first and the predictions second
training_accuracy = accuracy_score(y_train, X_training_prediction)

In [53]:
# Report the accuracy measured on the training split
print("Training accuracy score is : ",training_accuracy)

Training accuracy score is :  0.9044565305625457


In [54]:
# Accuracy on the held-out test split
X_test_prediction = model.predict(X_test)  # predicted labels for X_test
# accuracy_score takes the ground-truth labels first and the predictions second.
# The variable name (with its spelling) is kept because the print cell reads it.
test_acccuracy = accuracy_score(y_test, X_test_prediction)

In [56]:
# Report the accuracy measured on the held-out test split
print("Test accuracy score is : ",test_acccuracy)

Test accuracy score is :  0.9079620160701242
