# Credit Card Fraud Detection Using Deep Learning - Project Code
# Submitted By: Devran Dey Sarkar (1181700008)

## Importing Dependencies from various Python Libraries

In [None]:
import pandas as pd
import numpy as np
import sklearn.utils
from sklearn.metrics import accuracy_score, classification_report 
from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef
from keras import callbacks
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import confusion_matrix

## Importing the pre-processed training dataset from local computer storage

In [None]:
# Location of the pre-processed training CSV on the author's local machine
training_csv_path = r"D:\Project Related(ML)\project dataSet\CreditCardNewDataSet\Credit Card Final V1\Training Data Set.csv"
df = pd.read_csv(training_csv_path)

In [None]:
# Count the null (missing) values in each column of the training dataset
df.isnull().sum()

In [None]:
# Number of distinct values in the 'Class' column, i.e. the fraudulent and genuine classes
df['Class'].nunique()

In [None]:
# Count how many rows of the training dataset fall into each class (fraud vs. legitimate)
df['Class'].value_counts() 

In [None]:
# Heatmap of the pairwise correlations between all columns of the training
# data (features and 'Class'), drawn before any class re-balancing.
corr = df.corr()

f, ax1 = plt.subplots(figsize=(24, 10))
# NOTE(review): annot_kws only takes effect when annot=True is also passed.
sns.heatmap(
    data=corr,
    ax=ax1,
    cmap='coolwarm_r',
    annot_kws={'size': 20},
)
ax1.set_title("Imbalanced Correlation Matrix", fontsize=14)

In [None]:
# Separate the label from the predictors: 'Class' is the target, and both
# 'Time' and 'Class' are removed from the feature matrix.
y = df['Class']
X = df.drop(columns=['Time', 'Class'])

In [None]:
# Hold out 15% of the rows for validation; the data is shuffled before the
# split, with a fixed seed so the partition is reproducible.
# NOTE(review): the split is not stratified - consider stratify=y if the
# fraud class is rare.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
    shuffle=True,
)

In [None]:
# Show the shapes of the training and validation feature sets
print(X_train.shape, X_val.shape)

## Using the early-stopping technique

In [None]:
# Stop training once the validation loss has not improved for 2 consecutive
# epochs, then restore the weights from the best epoch.
# `mode` must be one of the strings 'auto', 'min' or 'max'. The original code
# passed the builtin function `min`, which is not a valid mode, so Keras
# warned and fell back to 'auto'. 'min' states the intent explicitly:
# a lower val_loss is better.
earlystopping = callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=2,
    verbose=2,
    mode='min',
    restore_best_weights=True,
)

## Developing the model 

In [None]:
# Network structure of proposed model-1: a fully connected stack that narrows
# 128 -> 64 -> 32 -> 16 -> 8 with ReLU activations and ends in a single
# sigmoid unit for the binary fraud/legitimate output.
# The 29 inputs are presumably the columns left after dropping 'Time' and
# 'Class' - verify against X_train.shape.
clf = Sequential()
clf.add(Dense(units=128, kernel_initializer='uniform', input_dim=29, activation='relu'))
for hidden_units in (64, 32, 16, 8):
    clf.add(Dense(units=hidden_units, kernel_initializer='uniform', activation='relu'))
clf.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
clf.summary()

In [None]:
# Configure training: binary cross-entropy loss (matching the single sigmoid
# output), the Adam optimizer, and accuracy tracked as a metric.
clf.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

## Training the model and validating 

In [None]:
# Training runs for at most 30 epochs with a batch size of 700; shuffling of the training data is enabled

In [None]:
# Train on the training split and evaluate on the validation split after
# every epoch; the early-stopping callback may end the run before epoch 30.
hist = clf.fit(
    X_train,
    y_train,
    batch_size=700,
    epochs=30,
    shuffle=True,
    verbose=2,
    validation_data=(X_val, y_val),
    callbacks=[earlystopping],
)

## For visualising the learning curves

In [None]:
# Loss curves: training loss in red and validation loss in blue, per epoch
plt.figure(figsize=[8, 6])
for history_key, line_format in (('loss', 'r'), ('val_loss', 'b')):
    plt.plot(hist.history[history_key], line_format, linewidth=1.0)
plt.legend(['Training loss', 'Validation Loss'], fontsize=12)
plt.xlabel('Epochs ', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.title('Loss Curves', fontsize=12)

In [None]:
# Accuracy curves: training accuracy in red and validation accuracy in blue, per epoch
plt.figure(figsize=[8, 6])
for history_key, line_format in (('accuracy', 'r'), ('val_accuracy', 'b')):
    plt.plot(hist.history[history_key], line_format, linewidth=1.0)
plt.legend(['Training Accuracy', 'Validation Accuracy'], fontsize=12)
plt.xlabel('Epochs ', fontsize=12)
plt.ylabel('Accuracy', fontsize=12)
plt.title('Accuracy Curves', fontsize=12)

## Fetching the pre-processed testing dataset for testing purposes

In [None]:
# Location of the pre-processed testing CSV on the author's local machine
testing_csv_path = r"D:\Project Related(ML)\project dataSet\CreditCardNewDataSet\Credit Card Final V1\Testing Data Set Additional.csv"
df_test = pd.read_csv(testing_csv_path)

In [None]:
# Count how many rows of the testing dataset fall into each class (fraud vs. legitimate)
df_test["Class"].value_counts()

In [None]:
# Split the test data the same way as the training data: 'Class' is the
# label, and 'Time' and 'Class' are removed from the features.
y_test = df_test['Class']
X_test = df_test.drop(columns=['Time', 'Class'])

## Testing the model and visualising the result using a confusion matrix

In [None]:
# Turn the sigmoid outputs into boolean predictions using a 0.5 threshold
predicted_probability = clf.predict(X_test)
y_pred = predicted_probability > 0.5

# Tick labels assume class 0 is legitimate and class 1 is fraudulent - confirm
labels = ['Legitimate', 'Fraudulent']
mat = confusion_matrix(y_test, y_pred)

# Annotated confusion matrix: rows are actual classes, columns are predictions
sns.heatmap(
    mat,
    annot=True,
    fmt='d',
    square=True,
    cbar=False,
    cmap='magma',
    xticklabels=labels,
    yticklabels=labels,
)

plt.xlabel('Predicted label')
plt.ylabel('Actual label')

## Showing the performance using various evaluation metrics

In [None]:
# Report accuracy, precision, recall and F1 on the test set as percentages.
# The rounded predictions are computed once and shared by these four metrics.
rounded_pred = y_pred.round()
for caption, metric in (
    ("Testing Accuracy = ", accuracy_score),
    ("Precision Score", precision_score),
    ("recall value = ", recall_score),
    ("f1 score = ", f1_score),
):
    print(caption, metric(y_test, rounded_pred) * 100)

# Matthews correlation coefficient, scaled by 100 like the other scores
mcc = matthews_corrcoef(y_test, y_pred) * 100
print(" MCC value = ", mcc)

In [None]:
# Print scikit-learn's text classification report for the test-set predictions
print(classification_report(y_test, y_pred))