In [6]:
import numpy as np
import pandas as pd
import tensorflow as tf
import sklearn
from tabulate import tabulate

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import shap

In [7]:
data=pd.read_csv("/Users/harsh/ieeepaper/PS_20174392719_1491204439457_log.csv")
pd.options.display.max_columns = None
pd.options.display.max_rows = None
df_shape = data.shape
# Create a table to display the shape
shape_table = [["Rows", df_shape[0]], ["Columns", df_shape[1]]]

# Print the table
print(tabulate(shape_table, headers=["Dimension", "Count"], tablefmt="grid"))

+-------------+---------+
| Dimension   |   Count |
| Rows        | 6362620 |
+-------------+---------+
| Columns     |      11 |
+-------------+---------+


In [4]:
categorical_cols = ['type', 'nameOrig', 'nameDest']

for col in categorical_cols:
 le = LabelEncoder()
 data[col] = le.fit_transform(data[col])


X = data.drop('isFraud', axis=1)
y = data['isFraud']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [5]:
#STEP 2: BUILDING A DEEP LEARNING MODEL BASED ON CNN
#Sequential model
model = Sequential ()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m159066/159066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 358us/step - accuracy: 0.9991 - loss: 0.0061 - val_accuracy: 0.9994 - val_loss: 0.0027
Epoch 2/10
[1m159066/159066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 353us/step - accuracy: 0.9994 - loss: 0.0029 - val_accuracy: 0.9995 - val_loss: 0.0026
Epoch 3/10
[1m159066/159066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 355us/step - accuracy: 0.9994 - loss: 0.0027 - val_accuracy: 0.9994 - val_loss: 0.0027
Epoch 4/10
[1m159066/159066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 360us/step - accuracy: 0.9994 - loss: 0.0027 - val_accuracy: 0.9995 - val_loss: 0.0023
Epoch 5/10
[1m159066/159066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 359us/step - accuracy: 0.9995 - loss: 0.0026 - val_accuracy: 0.9995 - val_loss: 0.0026
Epoch 6/10
[1m159066/159066[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 376us/step - accuracy: 0.9995 - loss: 0.0024 - val_accuracy: 0.9995 - va

In [6]:
loss, accuracy = model.evaluate(X_test, y_test)

[1m39767/39767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 243us/step - accuracy: 0.9995 - loss: 0.0021


In [7]:
print('Test accuracy:', accuracy)

Test accuracy: 0.999511182308197


In [8]:
from sklearn.metrics import classification_report
y_true = y_test
y_pred = model.predict(X_test)

y_pred_labels = (y_pred > 0.5).astype(int)
report = classification_report(y_true, y_pred_labels)
print(report)

[1m39767/39767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 219us/step
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270904
           1       0.98      0.63      0.77      1620

    accuracy                           1.00   1272524
   macro avg       0.99      0.81      0.88   1272524
weighted avg       1.00      1.00      1.00   1272524



In [9]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
RandomForestClassifier(random_state=42)
report = classification_report(y_test, y_pred)

print(report)

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [None]:
import matplotlib.pyplot as plt

# Plot training & validation accuracy values
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Validation'])

In [None]:
# Plot training & validation loss values
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Validation'])

plt.show()

In [None]:
# STEP 3: SHAP EXPLAINER
explainer = shap.KernelExplainer(model.predict, X_train[:100])
shap_values = explainer.shap_values(X_test[:10])

shap.initjs()
shap.summary_plot(shap_values, X_test[:10], feature_names=[f'Feature {i}' for i in range(X_test.shape[1])])
