# Autoencoder Outlier Detection

This is a template notebook for autoencoder outlier detection.

Author: {{ cookiecutter.author_name }}
Created: {{ cookiecutter.timestamp }}


## How to use the notebook

The following cells:
- specify objective, variables, and data types,
- set up the outlier detection models,
- read dataset,
- present results from the models.

By default, the notebook is set up to run with an example (art daily small noise). To see how it works, run the notebook without changing the code.

For your project, adjust the code in the linked cells with your objectives, variables, dataset etc. and then execute all cells in order.

Please refer to autoencoder.board for detailed instructions.

In [0]:
# Link to experiments card (refresh and hit enter on this line to see the link)

# Imports and General Setup

In [0]:
import os
import shutil

import numpy as np
import pandas as pd

from tensorflow import keras
from tensorflow.keras import layers

import seaborn as sns
import matplotlib.pyplot as plt

from distutils.dir_util import copy_tree

from sklearn.preprocessing import StandardScaler

from joblib import dump, load

# Project

In [0]:
# Name of this outlier detection experiment; the value is filled in by the cookiecutter template.
experiment_name = '{{cookiecutter.use_case_name}}'  # please provide a name for the outlier detection experiment

# Dataset

In [0]:
# --- Dataset configuration ---
time_series = True  # whether the data is a time series
path = '{{cookiecutter.data_path}}'  # where the training data lives; it should be 'clean', i.e. without anomalies

# The placeholder value 'default example' selects the hosted demo dataset.
if path == 'default example':
    path = 'https://raw.githubusercontent.com/erium/halerium-example-data/main/outlier_detection/art_daily_small_noise.csv'

# Time series are indexed by their parsed 'date' column; other data keeps the default index.
read_options = {'parse_dates': ['date'], 'index_col': 'date'} if time_series else {}
df = pd.read_csv(path, **read_options)

num_col = len(df.columns)

# --- Output directory ---
# From here on `path` refers to the output directory, which must start out empty:
# create it when missing, otherwise delete everything it currently contains.
path = './../out'
isExist = os.path.exists(path)
if not isExist:
    os.makedirs(path)
else:
    for root, dirs, files in os.walk(path):
        for file_name in files:
            os.unlink(os.path.join(root, file_name))
        for dir_name in dirs:
            shutil.rmtree(os.path.join(root, dir_name))

## Visualising the dataset

In [0]:
# Show the loaded dataset as this cell's output.
df

In [0]:
# Layout settings: number of histogram bins and the size of one panel (width, height).
n_bins = 50
plt_h = 6
plt_v = 3

# One row each for the histogram and the box plot, plus a line plot row for time series.
if time_series:
    suptitle = 'Time Series, Frequency, and Box plots of features'
    plt_row = 3
else:
    suptitle = 'Frequency and Box plots of features'
    plt_row = 2
plt_v *= plt_row  # total figure height grows with the number of rows

if num_col >= 1:
    # squeeze=False always returns a 2-D grid of axes, so a single feature
    # is handled by the same indexing as several features.
    fig, axs = plt.subplots(
        plt_row, num_col, figsize=(plt_h*num_col, plt_v), squeeze=False
    )
    fig.suptitle(suptitle)
    for i, column in enumerate(df.columns):
        feature = df[column]

        hist_ax = axs[0][i]
        hist_ax.hist(feature, bins = n_bins)
        hist_ax.set_ylabel('Frequency')

        box_ax = axs[1][i]
        box_ax.boxplot(feature, vert=False)
        box_ax.set_xlabel(column)

        if time_series:
            line_ax = axs[2][i]
            line_ax.plot(feature)
            line_ax.set_xlabel('time')
            line_ax.set_ylabel(column)

In [0]:
# Pairwise relationships between all columns of the dataset.
sns.pairplot(df)

# Normalisation

In [0]:
# Fit the scaler on the training data and replace `df` with its standardised version,
# keeping the original index and column names. The fitted `scaler` is reused at export time.
scaler = StandardScaler().fit(df)
scaled_values = scaler.transform(df)
df = pd.DataFrame(scaled_values, index=df.index, columns=df.columns)
df

## Sequences
The autoencoder models expect sequences as input. A sequence is a window of consecutive samples; the window length can be chosen to match a natural time period of the data (e.g. the number of samples in a day, week, or month).

In [0]:
# Number of consecutive rows in one training sequence.
# The convolutional autoencoder in this notebook halves the sequence length twice
# (two stride-2 Conv1D layers) and then doubles it twice, so TIME_STEPS must be
# divisible by 4 for the reconstruction to have the same length as the input.
# Any multiple of 32 satisfies this.
# In the example dataset there is one data point every 5 minutes, so 288 would
# correspond to one full day; the default below uses a shorter window of 32.
TIME_STEPS = 32


def create_sequences(values, time_steps=TIME_STEPS):
    """Cut `values` into overlapping windows of `time_steps` consecutive rows.

    Window ``i`` contains rows ``i`` to ``i + time_steps - 1``; consecutive
    windows are shifted by one row.

    Parameters
    ----------
    values : array-like
        Data whose first axis is the one to slide over (e.g. ``df.values``).
    time_steps : int
        Number of rows per window. Must be at least 1 and at most ``len(values)``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(values) - time_steps + 1, time_steps, ...)``.

    Raises
    ------
    ValueError
        If `time_steps` is smaller than 1 or larger than the number of rows.
    """
    values = np.asarray(values)
    if time_steps < 1:
        raise ValueError(f"time_steps must be at least 1, got {time_steps}")
    if len(values) < time_steps:
        # Without this check np.stack would fail on an empty list with a message
        # that does not mention the actual cause.
        raise ValueError(
            f"need at least time_steps={time_steps} rows to build a sequence, "
            f"got {len(values)}"
        )
    n_sequences = len(values) - time_steps + 1
    return np.stack([values[start:start + time_steps] for start in range(n_sequences)])

# Build the training sequences from the normalised data.
X_train = create_sequences(df.values)
# The first dimension is the number of sliding windows, i.e. one more than
# the difference between the number of timestamps and the window length.
print('(Number of timestamps - time steps + 1, time steps, num features)')
X_train.shape

In [0]:
# Convolutional Reconstruction Autoencoder
# Encoder: two stride-2 Conv1D layers, each halving the sequence length.
# Decoder: two stride-2 Conv1DTranspose layers, each doubling it again,
# followed by a stride-1 Conv1DTranspose that maps back to the input features.
model = keras.Sequential(
    [
        layers.Input(shape=(X_train.shape[1], X_train.shape[2])),
        layers.Conv1D(
            filters=32, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        layers.Dropout(rate=0.2),
        layers.Conv1D(
            filters=16, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        layers.Conv1DTranspose(
            filters=16, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        layers.Dropout(rate=0.2),
        layers.Conv1DTranspose(
            filters=32, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        # One output channel per input feature. A fixed single channel would be
        # silently broadcast against multi-feature targets in the loss instead of
        # reconstructing each feature separately.
        layers.Conv1DTranspose(
            filters=X_train.shape[2], kernel_size=7, padding="same"
        ),
    ]
)

model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss="mse")
model.summary()

In [0]:
# LSTM encoder-decoder autoencoder, only built for time series data
if time_series:
    sequence_length, feature_count = X_train.shape[1], X_train.shape[2]

    lstm_model = keras.Sequential(
        [
            # Encoder: summarise each input sequence in a single 64-unit state
            layers.LSTM(units=64, input_shape=(sequence_length, feature_count)),
            layers.Dropout(rate=0.2),
            # Repeat the encoded state once per time step as decoder input
            layers.RepeatVector(n=sequence_length),
            # Decoder: produce one 64-unit output per time step
            layers.LSTM(units=64, return_sequences=True),
            layers.Dropout(rate=0.2),
            # Project every time step back to the number of input features
            layers.TimeDistributed(layers.Dense(units=feature_count)),
        ]
    )

    lstm_optimizer = keras.optimizers.Adam(learning_rate=0.001)
    lstm_model.compile(optimizer=lstm_optimizer, loss="mse")
    lstm_model.summary()

## Train the model

In [0]:
# Stop training once the validation loss has not improved for 5 consecutive epochs.
conv_early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=5, mode="min"
)

# The autoencoder learns to reproduce its own input, so X_train is both input and target.
history = model.fit(
    x=X_train,
    y=X_train,
    batch_size=128,
    epochs=50,
    callbacks=[conv_early_stopping],
    validation_split=0.1,
    shuffle=False,  # No assumption that data is independent
)

In [0]:
# Training vs. validation loss per epoch for the convolutional autoencoder.
for metric, curve_label in (("loss", "Training Loss"), ("val_loss", "Validation Loss")):
    plt.plot(history.history[metric], label=curve_label)
plt.legend()
plt.show()

In [0]:
if time_series:
    # Stop training once the validation loss has not improved for 5 consecutive epochs.
    lstm_early_stopping = keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=5, mode="min"
    )

    # Same training setup as the convolutional model: the input is also the target.
    lstm_history = lstm_model.fit(
        x=X_train,
        y=X_train,
        batch_size=128,
        epochs=50,
        callbacks=[lstm_early_stopping],
        validation_split=0.1,
        shuffle=False,  # No assumption that data is independent
    )

In [0]:
if time_series:
    # Training vs. validation loss per epoch for the LSTM autoencoder.
    for metric, curve_label in (("loss", "Training Loss"), ("val_loss", "Validation Loss")):
        plt.plot(lstm_history.history[metric], label=curve_label)
    plt.legend()
    plt.show()

In [0]:
# Reconstruct the training sequences and average the absolute error over the
# time-step axis, which leaves one MAE value per sequence and feature.
X_train_pred = model.predict(X_train)
abs_error = np.abs(X_train_pred - X_train)
train_mae_loss = abs_error.mean(axis=1)

# Distribution of the training reconstruction error.
plt.hist(train_mae_loss, bins=50)
plt.xlabel("Train MAE loss")
plt.ylabel("No of samples")
plt.legend(labels = df.columns)
plt.show()

# The threshold is the largest reconstruction error seen on the training data.
threshold = train_mae_loss.max(axis=0)
print("Reconstruction error threshold: ", threshold)

In [0]:
if time_series:
    # Reconstruct the training sequences with the LSTM model and average the
    # absolute error over the time-step axis.
    lstm_X_train_pred = lstm_model.predict(X_train)
    lstm_abs_error = np.abs(lstm_X_train_pred - X_train)
    lstm_train_mae_loss = lstm_abs_error.mean(axis=1)

    # Distribution of the training reconstruction error.
    plt.hist(lstm_train_mae_loss, bins=50)
    plt.xlabel("Train MAE loss")
    plt.ylabel("No of samples")
    plt.legend(labels = df.columns)
    plt.show()

    # The threshold is the largest reconstruction error seen on the training data.
    lstm_threshold = lstm_train_mae_loss.max(axis=0)
    print("Reconstruction error threshold: ", lstm_threshold)

In [0]:
# Compare the first training sequence with its reconstruction by the convolutional model
first_sequence = X_train[0]
first_reconstruction = X_train_pred[0]

plt.plot(first_sequence, label=df.columns)
plt.plot(first_reconstruction, color='r', label='learnt')
plt.legend()
plt.show()

In [0]:
if time_series:
    # Compare the first training sequence with its reconstruction by the LSTM model
    first_sequence = X_train[0]
    lstm_first_reconstruction = lstm_X_train_pred[0]
    learnt_labels = ['learnt ' + col for col in df.columns]

    plt.plot(first_sequence, label=df.columns)
    plt.plot(lstm_first_reconstruction, label=learnt_labels)
    plt.legend()
    plt.show()

## Export the data
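The model with the lower total MAE loss on the training data is saved, together with the scaler, the sequence length, and the reconstruction error threshold.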

In [0]:
# Total training reconstruction error of the convolutional model, summed over all
# sequences and features. It is computed unconditionally so that it is also defined
# (and can be printed) when the data is not a time series.
sum_mae_loss = float(np.sum(train_mae_loss))
print(sum_mae_loss)

# The LSTM model only exists for time series data; it is exported only when its
# total training error is strictly lower than that of the convolutional model.
use_lstm = False
if time_series:
    lstm_sum_mae_loss = float(np.sum(lstm_train_mae_loss))
    print(lstm_sum_mae_loss)
    use_lstm = lstm_sum_mae_loss < sum_mae_loss

if use_lstm:
    best_model, best_threshold = lstm_model, lstm_threshold
else:
    best_model, best_threshold = model, threshold

# NOTE(review): newer Keras releases may require a file extension such as
# '.keras' for model.save - confirm against the installed version.
best_model.save('./../out/autoencoder_model')

# Persist what is needed to apply the model later: the fitted scaler, the sequence
# length and the threshold, plus the training data in its original (unscaled) units
# and the time series flag.
dump([[scaler, TIME_STEPS, best_threshold], scaler.inverse_transform(df), time_series], './../out/autoencoder_model_data.joblib')