# CSE688: Practical and Application of Machine Learning - Spring 2021
## Assignment 4a
### Authors

- B073040018 朱劭璿
- B072010029 陳居廷

#### (f) Anomaly detection on credit card fraud

In [1]:
import tensorflow as tf
import numpy as np

# Parse the CSV by hand: every column except the last is a float feature,
# the last column is the integer class label.
data = []
labels = []

with open('creditcardfraud_normalised.csv', 'r') as f:
    f.readline() # drop csv field names
    # Stream the file row by row instead of holding every raw line in memory.
    for line in f:
        # Strip only the line terminator. Slicing off the last character
        # unconditionally would corrupt the label of a final row that has
        # no trailing newline.
        line = line.rstrip('\r\n')
        if not line:
            continue  # tolerate blank lines (e.g. an empty line at EOF)
        fields = line.split(',')
        data.append([float(c) for c in fields[:-1]])
        labels.append(int(fields[-1]))

data   = np.array(data)
labels = np.array(labels)

print(f'Data shape: {data.shape}')

Data shape: (284807, 29)


In [2]:
# Boolean masks selecting each class: label 0 -> normal, label 1 -> abnormal.
normal_indices, abnormal_indices = (labels == 0), (labels == 1)

# Apply each mask to the features and to the labels so they stay aligned.
normal_data, normal_labels = data[normal_indices], labels[normal_indices]
abnormal_data, abnormal_labels = data[abnormal_indices], labels[abnormal_indices]

print(f'Number of normal   activities: {len(normal_labels)}')
print(f'Number of abnormal activities: {len(abnormal_labels)}')

Number of normal   activities: 284315
Number of abnormal activities: 492


In [3]:
# --- Split configuration ------------------------------------------------------
SEED = 42            # fixed seed so the split is identical after Restart & Run All
N_VAL_NORMAL = 250   # normal samples held out for validation
N_TEST_NORMAL = 250  # normal samples held out for testing
BATCH_SIZE = 128

rng = np.random.default_rng(SEED)


def _make_dataset(features, targets):
    """Wrap two aligned arrays in a cached, batched, prefetched tf.data pipeline."""
    return (
        tf.data.Dataset.from_tensor_slices((features, targets))
        .cache()
        .batch(BATCH_SIZE)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )


# --- Normal samples: train / validation / test --------------------------------
num_normal = len(normal_labels)
num_holdout = N_VAL_NORMAL + N_TEST_NORMAL
assert num_normal > num_holdout, 'not enough normal samples for the hold-out sets'

normal_shuffler = rng.permutation(num_normal)
# The tail of the shuffled index array is reserved for validation and test;
# everything before it is used for training.
train_splitter  = normal_shuffler[:num_normal - num_holdout]
val_splitter    = normal_shuffler[num_normal - num_holdout:num_normal - N_TEST_NORMAL]
test_splitter   = normal_shuffler[num_normal - N_TEST_NORMAL:]

train_data   = normal_data[train_splitter]
train_labels = normal_labels[train_splitter]
val_data     = normal_data[val_splitter]
val_labels   = normal_labels[val_splitter]
test_data    = normal_data[test_splitter]
test_labels  = normal_labels[test_splitter]

# --- Abnormal samples: half to validation, half to test (none to training) ----
abnormal_shuffler = rng.permutation(len(abnormal_labels))
num_abnormal_val  = len(abnormal_labels) // 2
val_splitter      = abnormal_shuffler[:num_abnormal_val]
test_splitter     = abnormal_shuffler[num_abnormal_val:]

val_data     = np.concatenate((val_data,    abnormal_data[val_splitter]))
val_labels   = np.concatenate((val_labels,  abnormal_labels[val_splitter]))
test_data    = np.concatenate((test_data,   abnormal_data[test_splitter]))
test_labels  = np.concatenate((test_labels, abnormal_labels[test_splitter]))

print(f'Number of training   samples: {len(train_labels)}')
print(f'Number of validation samples: {len(val_labels)}')
print(f'Number of testing    samples: {len(test_labels)}')

# The autoencoder is trained to reproduce its input, so the training targets
# are the training features themselves. Validation and test keep the class
# labels so detection accuracy can be computed.
ds_train = _make_dataset(train_data, train_data)
ds_val   = _make_dataset(val_data, val_labels)
ds_test  = _make_dataset(test_data, test_labels)

Number of training   samples: 283815
Number of validation smaples: 496
Number of testing    smaples: 496


In [4]:
# Symmetric dense autoencoder: 29 -> 16 -> 8 -> 4 -> 8 -> 16 -> 29.
# Every layer uses ReLU except the final reconstruction layer, which is linear.
encoder_decoder_layers = [
    tf.keras.layers.Dense(16, input_shape=(29,), activation='relu'),
]
encoder_decoder_layers += [
    tf.keras.layers.Dense(units, activation='relu') for units in (8, 4, 8, 16)
]
encoder_decoder_layers.append(tf.keras.layers.Dense(29))

autoencoder = tf.keras.models.Sequential(encoder_decoder_layers)
autoencoder.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 16)                480       
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 36        
_________________________________________________________________
dense_3 (Dense)              (None, 8)                 40        
_________________________________________________________________
dense_4 (Dense)              (None, 16)                144       
_________________________________________________________________
dense_5 (Dense)              (None, 29)                493       
Total params: 1,329
Trainable params: 1,329
Non-trainable params: 0
______________________________________________________

## Training

In [11]:
# Learning rate handed to the Adam optimizer when the autoencoder is compiled.
LR = 1e-4
# Anomaly threshold: a sample whose reconstruction MSE exceeds ATH is
# predicted to be abnormal (used during validation and at inference time).
ATH = 0.002

In [6]:
class AnomalyValidation(tf.keras.callbacks.Callback):
    """Score anomaly detection on a labelled validation set after every epoch.

    A sample is predicted abnormal when the MSE between the model's
    reconstruction and the input exceeds ``ATH``. The weights of the epoch
    with the highest validation accuracy are remembered and restored into
    the model when training ends.

    Args:
        ATH: reconstruction-error threshold above which a sample is
            predicted abnormal.
        ds_val: iterable yielding ``(features, label)`` batches, where the
            label is 1 for abnormal samples and 0 for normal ones.
        log_step: print a progress line every ``log_step`` epochs.
    """

    def __init__(self, ATH, ds_val, log_step=1):
        super().__init__()
        self.best_weights = None
        self.best_acc = 0
        self.ATH = ATH # reconstruction-error threshold
        self.ds_val = ds_val
        self.log_step = log_step

    def _validation_accuracy(self):
        """Return the fraction of validation samples classified correctly."""
        num_correct = 0
        num_total = 0
        for data, label in self.ds_val:
            recon_error = tf.keras.losses.MSE(self.model(data), data).numpy()
            num_correct += ((recon_error > self.ATH) == label.numpy()).sum()
            num_total += label.shape[0]
        # An empty validation set yields 0.0 instead of a ZeroDivisionError.
        return num_correct / num_total if num_total else 0.0

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate, keep the best weights so far, and log periodically."""
        logs = logs or {}
        acc = self._validation_accuracy()
        # Always snapshot the first evaluated epoch so that best_weights is
        # never None at the end of training, even if accuracy stays at 0.
        if self.best_weights is None or acc > self.best_acc:
            self.best_acc = acc
            self.best_weights = self.model.get_weights()
        if epoch % self.log_step != 0:
            return
        print(f"Epoch {epoch+1: >2d}", end='')
        print(
            f"\x1b[32m Train \x1b[0m "
            f"MSE: {logs.get('loss', float('nan')): .6f}, ",
            end = '\t'
        )
        print(f'Anomaly detection accuracy:\x1b[31m {acc: .5f}\x1b[0m')

    def on_train_end(self, logs=None):
        """Restore the best-scoring weights, if any epoch was evaluated."""
        if self.best_weights is not None:
            self.model.set_weights(self.best_weights)

In [7]:
# Train the autoencoder to minimise reconstruction MSE with Adam.
adam = tf.keras.optimizers.Adam(learning_rate=LR)
autoencoder.compile(optimizer=adam, loss='mse')

# AnomalyValidation runs anomaly detection on the validation dataset after
# each epoch, prints a progress line every 3rd epoch, remembers the best
# weights and loads them back into the model when training ends.
anomaly_validation = AnomalyValidation(ATH, ds_val, 3)
history = autoencoder.fit(
    ds_train,
    epochs=30,
    verbose=0,
    callbacks=[anomaly_validation],
)

Epoch  1[32m Train [0m MSE:  0.070087, 	Anomaly detection accuracy:[31m  0.83669[0m
Epoch  4[32m Train [0m MSE:  0.001562, 	Anomaly detection accuracy:[31m  0.84879[0m
Epoch  7[32m Train [0m MSE:  0.001278, 	Anomaly detection accuracy:[31m  0.87702[0m
Epoch 10[32m Train [0m MSE:  0.001230, 	Anomaly detection accuracy:[31m  0.88105[0m
Epoch 13[32m Train [0m MSE:  0.001195, 	Anomaly detection accuracy:[31m  0.89516[0m
Epoch 16[32m Train [0m MSE:  0.001152, 	Anomaly detection accuracy:[31m  0.88508[0m
Epoch 19[32m Train [0m MSE:  0.001101, 	Anomaly detection accuracy:[31m  0.89113[0m
Epoch 22[32m Train [0m MSE:  0.001071, 	Anomaly detection accuracy:[31m  0.89718[0m
Epoch 25[32m Train [0m MSE:  0.001052, 	Anomaly detection accuracy:[31m  0.90121[0m
Epoch 28[32m Train [0m MSE:  0.001039, 	Anomaly detection accuracy:[31m  0.89718[0m


## Inference

In [12]:
# Evaluate the trained autoencoder on the held-out test set: a sample is
# predicted abnormal when its reconstruction MSE exceeds ATH.
# The loop variables are deliberately NOT called `data` / `label`: at cell
# level they would overwrite the global `data` array loaded in the first cell.
num_correct = 0
num_total = 0
for batch_data, batch_labels in ds_test:
    recon_error = tf.keras.losses.MSE(autoencoder(batch_data), batch_data).numpy()
    num_correct += ((recon_error > ATH) == batch_labels.numpy()).sum()
    num_total += batch_labels.shape[0]
print(f'Anomaly detection accuracy: {num_correct/num_total: .5f}')

Anomaly detection accuracy:  0.91331
