## 0. Install and Import

In [1]:
!pip install pandas numpy matplotlib tensorflow scikit-learn



In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
from tensorflow.keras.layers import Dropout
from tensorflow.keras import regularizers
from tensorflow.keras.models import load_model

from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Package versions noted by the author. This is a bare string expression rather
# than a comment, so the cell echoes it back as its output.
# NOTE(review): presumably the versions the notebook was last run with — confirm.
'''
tensorflow = 2.12.0
matplotlib = 3.7.1
numpy = 1.23.5
pandas = 2.0.2
scikit-learn = 1.2.2
'''

'\ntensorflow = 2.12.0\nmatplotlib = 3.7.1\nnumpy = 1.23.5\npandas = 2.0.2\nscikit-learn = 1.2.2\n'

## 1. Data exploration

In [4]:
# Load the transaction data from a CSV file; the relative path is resolved
# against the notebook's current working directory.
df = pd.read_csv('card_transdata.csv')

In [5]:
df.shape  # (number of rows, number of columns)

(1000000, 8)

In [6]:
# Show the column labels of the loaded frame.
df.columns

Index(['distance_from_home', 'distance_from_last_transaction',
       'ratio_to_median_purchase_price', 'repeat_retailer', 'used_chip',
       'used_pin_number', 'online_order', 'fraud'],
      dtype='object')

In [7]:
# Preview the first rows (head() returns five by default).
df.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


In [8]:
df.corr()  # pairwise correlation between every pair of columns (Pearson by default)

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
distance_from_home,1.0,0.000193,-0.001374,0.143124,-0.000697,-0.001622,-0.001301,0.187571
distance_from_last_transaction,0.000193,1.0,0.001013,-0.000928,0.002055,-0.000899,0.000141,0.091917
ratio_to_median_purchase_price,-0.001374,0.001013,1.0,0.001374,0.000587,0.000942,-0.00033,0.462305
repeat_retailer,0.143124,-0.000928,0.001374,1.0,-0.001345,-0.000417,-0.000532,-0.001357
used_chip,-0.000697,0.002055,0.000587,-0.001345,1.0,-0.001393,-0.000219,-0.060975
used_pin_number,-0.001622,-0.000899,0.000942,-0.000417,-0.001393,1.0,-0.000291,-0.100293
online_order,-0.001301,0.000141,-0.00033,-0.000532,-0.000219,-0.000291,1.0,0.191973
fraud,0.187571,0.091917,0.462305,-0.001357,-0.060975,-0.100293,0.191973,1.0


#### Convert column dtypes

In [9]:
# Cast only the indicator columns to integers. The continuous features (the two
# distances and the purchase-price ratio) are left as floats: truncating them to
# whole numbers throws away sub-unit detail, e.g. a ratio of 0.43 would become 0.
# NOTE(review): the preview of the data shows only 0.0/1.0 in these columns; the
# assertion below checks that this holds for the whole frame.
BINARY_COLUMNS = ['repeat_retailer', 'used_chip', 'used_pin_number', 'online_order', 'fraud']

# Fail loudly if a supposedly binary column holds anything other than 0 or 1.
assert df[BINARY_COLUMNS].isin([0, 1]).all().all(), "expected only 0/1 in indicator columns"

df = df.astype({column: int for column in BINARY_COLUMNS})

## 2. Building our Model

In [10]:
# Separate the label column from the remaining feature columns.
y = df['fraud']
x = df.drop(columns=['fraud'])

#### 3. Scale the data

In [11]:
# Learn each feature's min/max, then rescale the features with them.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split, so the held-out rows influence the scaling — confirm this is intended.
scaler = MinMaxScaler()
scaler.fit(x)
X_scaled = scaler.transform(x)

#### 4. Split data for testing and training

In [12]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    random_state=42,
    test_size=0.1,
)

#### 5. Creating encoder and decoder

In [13]:
# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialisation, dropout masks and batch shuffling repeat on a fresh
# Restart & Run All. (Some GPU kernels may still be non-deterministic.)
tf.keras.utils.set_random_seed(42)

# Encoder: three ReLU layers narrowing 128 -> 64 -> 16 units. Every Dense layer
# carries an L2 weight penalty, and 50% dropout follows the first two layers.
encoder = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    Dropout(0.5),
    tf.keras.layers.Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.001))
])

In [14]:
# Decoder: two ReLU layers (64 then 128 units), each followed by 50% dropout,
# ending in a sigmoid layer with one output per column of X_train.
decoder = tf.keras.Sequential(
    layers=[
        tf.keras.layers.Dense(units=64, activation='relu'),
        tf.keras.layers.Dropout(rate=0.5),
        tf.keras.layers.Dense(units=128, activation='relu'),
        tf.keras.layers.Dropout(rate=0.5),
        tf.keras.layers.Dense(units=X_train.shape[1], activation='sigmoid'),
    ]
)

In [15]:
# Symbolic input with one slot per column of the training matrix.
input_layer = tf.keras.Input(shape=X_train.shape[1:])

# Chain the two sub-models: input -> encoder -> decoder.
encoded = encoder(input_layer)
decoded = decoder(encoded)

In [16]:
# Wrap the input and reconstructed tensors in a single model, then configure it
# to minimise mean squared error with the Adam optimizer.
autoencoder = tf.keras.Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(loss='mse', optimizer='adam')

#### 6. Training process

In [None]:
# The training matrix is passed as both input and target, so the network learns
# to reproduce its input. 10% of the training rows are set aside for validation.
history = autoencoder.fit(
    x=X_train,
    y=X_train,
    batch_size=32,
    epochs=2,
    validation_split=0.1,
    shuffle=True,
)

Epoch 1/2
 5125/25313 [=====>........................] - ETA: 24s - loss: 0.0261

#### 7. Testing with custom data

In [None]:
def row_mse(original, reconstructed):
    """Mean squared error between each row of ``original`` and ``reconstructed``.

    Both arguments are 2-D arrays of the same shape; the result is a 1-D array
    holding one error value per row.
    """
    return np.mean(np.square(original - reconstructed), axis=1)


# A single hand-made transaction. NOTE: lowercase `x_test` is unrelated to the
# held-out `X_test` split created earlier.
x_test = [10, 5, 2, 1, 1, 0, 0]
column_names = ['distance_from_home', 'distance_from_last_transaction',
       'ratio_to_median_purchase_price', 'repeat_retailer', 'used_chip',
       'used_pin_number', 'online_order']

# Wrap the sample in a one-row DataFrame with the feature column names
x_test_df = pd.DataFrame([x_test], columns=column_names)

# Rescale the sample with the already-fitted scaler (it is not re-fitted here)
x_test_scaled = scaler.transform(x_test_df)

# Reconstruct the sample with the in-memory autoencoder trained above
predictions = autoencoder.predict(x_test_scaled)
print("Reconstructed output:", predictions)

# Per-row reconstruction error over the training set. A larger prediction batch
# is used because this pass covers every training row.
# NOTE(review): X_train contains every class of transaction, so the threshold
# is derived from fraudulent rows as well — confirm this is intended.
reconstruction_errors = row_mse(X_train, autoencoder.predict(X_train, batch_size=1024))

# Threshold = mean training error plus two standard deviations
mean_error = np.mean(reconstruction_errors)
std_error = np.std(reconstruction_errors)
threshold = mean_error + 2 * std_error

# Reconstruction error of the single custom sample
reconstruction_error = row_mse(x_test_scaled, predictions)[0]
print("Reconstruction error:", reconstruction_error, "| threshold:", threshold)

# Flag the sample when it reconstructs worse than the threshold
if reconstruction_error > threshold:
    print("The transaction is potentially fraudulent.")
else:
    print("The transaction is likely not fraudulent.")

#### 8. Saving the Model for future use

In [None]:
# Write the trained autoencoder to an HDF5 (.h5) file in the working directory.
# NOTE(review): the fitted `scaler` is not saved here; anyone reloading the model
# must apply the same scaling to new inputs — consider persisting it as well.
autoencoder.save('autoencoder_model.h5')