# Autoencoder Outlier Detection

This is a template notebook for autoencoder outlier detection.

Author: {{ cookiecutter.author_name }}
Created: {{ cookiecutter.timestamp }}


In [0]:
# Link to project experiments folder hypothesis_experiment_learnings.board (refresh and hit enter on this line to see the link)

## How to use the notebook

The following cells:
- specify objective, variables, and data types,
- set up the outlier detection models,
- read dataset,
- present results from the models.

By default, the notebook is set up to run with an example (art daily small noise). To see how it works, run the notebook without changing the code.

For your project, adjust the code in the linked cells with your objectives, variables, dataset etc. and then execute all cells in order.

Please refer to autoencoder.board for detailed instructions.

In [0]:
# <halerium id="b2feb61d-a25b-499e-b3ed-7f5e422f78c8">
# Link to autoencoder.board
# </halerium id="b2feb61d-a25b-499e-b3ed-7f5e422f78c8">


## Imports

In [0]:
import os
import shutil

import numpy as np
import pandas as pd

from tensorflow import keras
from tensorflow.keras import layers

import seaborn as sns
import matplotlib.pyplot as plt

from distutils.dir_util import copy_tree

from sklearn.preprocessing import StandardScaler

from joblib import dump, load

### 2. Import the Dataset

In [0]:
# <halerium id="91b472d3-82da-41e9-96e2-8389b64a2329">
# User configuration for the data source.
# time_series: when True the loader parses a 'date' column as the index and the
#   LSTM autoencoder is built, trained and compared in addition to the convolutional one.
time_series = True # Specify if the data is time series
# path: location of the CSV to load; the literal 'default example' is replaced by the
#   bundled example URL in the loading cell.
path = 'default example' # Specify the path of the data, note that it should be 'clean' without anomalies.
# </halerium id="91b472d3-82da-41e9-96e2-8389b64a2329">


Importing the dataset

In [0]:
# Swap the placeholder for the hosted example dataset.
if path == 'default example':
    path = 'https://raw.githubusercontent.com/erium/halerium-example-data/main/outlier_detection/art_daily_small_noise.csv'

# Time-series data is indexed by its parsed 'date' column; other data is read as-is.
read_options = {'parse_dates': ['date'], 'index_col': 'date'} if time_series else {}
df = pd.read_csv(path, **read_options)

# Number of feature columns, used by the plotting helper.
num_col = df.shape[1]

Visualising the dataset

In [0]:
# Show the loaded DataFrame as the cell output for a quick visual check.
df

Creating the ./out folder for artifacts (it is emptied if it already exists)

In [0]:
# Prepare an empty ./out folder for the artifacts of this run.
# A dedicated variable is used so the dataset location stored in `path` is not
# overwritten; this keeps the data-loading cell re-runnable after this cell.
out_dir = './out'
if os.path.isdir(out_dir):
    # Remove everything inside the folder but keep the folder itself.
    for entry in os.scandir(out_dir):
        if entry.is_dir(follow_symlinks=False):
            shutil.rmtree(entry.path)
        else:
            # Regular files and symlinks (including links to directories).
            os.unlink(entry.path)
else:
    os.makedirs(out_dir)

In [0]:
# plot_features is a project helper from functions/plot.py. It receives the loaded
# (not yet normalised) frame, the time-series flag and the number of columns.
# Presumably it draws one plot per feature column - confirm against the helper.
from functions.plot import plot_features

plot_features(df, time_series, num_col)

Normalisation

In [0]:
# Standardise every column with a StandardScaler. The fitted scaler is kept because
# it is passed to export_model later in the notebook.
# NOTE: `df` is overwritten with the scaled values, so re-running this cell without
# reloading the data would refit the scaler on already-scaled data.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df)
df = pd.DataFrame(scaled_values, index=df.index, columns=df.columns)
df

### 3. Run the Models
The autoencoder models expect sequences as input. A sequence is a window of consecutive samples, which may be grouped by a common time period (e.g. the samples in a day/week/month).

In [0]:
# create_sequences is a project helper from functions/autoencoder.py; it is given the
# raw value array and the window length and returns the training array.
from functions.autoencoder import create_sequences

# Use an even factor/multiple of 32
# The convolutional autoencoder defined below halves the sequence length twice
# (two stride-2 convolutions) and doubles it twice, so TIME_STEPS must be divisible
# by 4 for the reconstruction to have the same length as the input.
# <halerium id="81b8c3ec-0282-4d4d-bd42-87dc83b06dd9">
TIME_STEPS = 32 # In the example dataset, there is one data point every 5 minutes. 288 will be the timestamps in a day.
# </halerium id="81b8c3ec-0282-4d4d-bd42-87dc83b06dd9">

# Generated training sequences for use in the model.
# The array is indexed as (sequence, time step, feature) by the model cells below.
X_train = create_sequences(df.values, TIME_STEPS)
print('(Number of timestamps - time steps, time steps, num features)')
X_train.shape

In [0]:
# Convolutional Reconstruction Autoencoder
# Encoder: two stride-2 convolutions shrink the time axis by a factor of 4.
# Decoder: two stride-2 transposed convolutions restore it, so the sequence length
# (TIME_STEPS) must be divisible by 4 for the output length to match the input.
# <halerium id="81b8c3ec-0282-4d4d-bd42-87dc83b06dd9">
model = keras.Sequential(
    [
        layers.Input(shape=(X_train.shape[1], X_train.shape[2])),
        layers.Conv1D(
            filters=32, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        layers.Dropout(rate=0.2),
        layers.Conv1D(
            filters=16, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        layers.Conv1DTranspose(
            filters=16, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        layers.Dropout(rate=0.2),
        layers.Conv1DTranspose(
            filters=32, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        # One output channel per input feature so the reconstruction has the same
        # shape as the input for multi-column datasets as well (1 for the example data).
        layers.Conv1DTranspose(filters=X_train.shape[2], kernel_size=7, padding="same"),
    ]
)
# </halerium id="81b8c3ec-0282-4d4d-bd42-87dc83b06dd9">

# Reconstruction is trained with mean squared error.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss="mse")
model.summary()

In [0]:
# LSTM for time series
# <halerium id="81b8c3ec-0282-4d4d-bd42-87dc83b06dd9">
if time_series:
    # LSTM encoder-decoder autoencoder, only built for time-series data.
    # The input shape is declared with an explicit Input layer (as in the
    # convolutional model) rather than the `input_shape` layer argument.
    lstm_model = keras.Sequential(
        [
            layers.Input(shape=(X_train.shape[1], X_train.shape[2])),
            # Encoder: compress each sequence into a single 64-unit vector.
            layers.LSTM(units=64),
            layers.Dropout(rate=0.2),
            # Repeat the encoding once per time step to feed the decoder.
            layers.RepeatVector(n=X_train.shape[1]),
            # Decoder: emit one 64-unit vector per time step.
            layers.LSTM(units=64, return_sequences=True),
            layers.Dropout(rate=0.2),
            # Map every time step back to the original number of features.
            layers.TimeDistributed(layers.Dense(units=X_train.shape[2])),
        ]
    )

    lstm_model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss="mse")
    lstm_model.summary()
# </halerium id="81b8c3ec-0282-4d4d-bd42-87dc83b06dd9">


Train the model

In [0]:
# Train the convolutional autoencoder to reproduce its own input.
# Training stops early once the validation loss has not improved for 5 epochs.
early_stopping = keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, mode="min")

history = model.fit(
    x=X_train,
    y=X_train,  # Autoencoder: the target is the input itself
    batch_size=128,
    epochs=50,
    validation_split=0.1,
    shuffle=False,  # Keep sample order; the data is not assumed to be independent
    callbacks=[early_stopping],
)

In [0]:
# The LSTM autoencoder only exists for time-series data; train it with the same
# settings as the convolutional model so the two are comparable.
if time_series:
    lstm_early_stopping = keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=5, mode="min"
    )
    lstm_history = lstm_model.fit(
        x=X_train,
        y=X_train,  # Autoencoder: the target is the input itself
        batch_size=128,
        epochs=50,
        validation_split=0.1,
        shuffle=False,  # Keep sample order; the data is not assumed to be independent
        callbacks=[lstm_early_stopping],
    )

### 4. Get the results

In [0]:
# plot_train_loss is a project helper from functions/plot.py; here it is given the
# object returned by model.fit for the convolutional autoencoder.
from functions.plot import plot_train_loss

plot_train_loss(history)

In [0]:
# Same plot for the LSTM autoencoder, which is only trained for time-series data.
# Relies on plot_train_loss having been imported in the previous cell.
if time_series:
    plot_train_loss(lstm_history)

In [0]:
# plot_mae_loss is a project helper from functions/plot.py. It is called with the
# trained model, the training sequences and the normalised frame, and returns three
# values that are stored as the reconstruction, the per-sample MAE loss and the
# anomaly threshold. How the threshold is derived is defined in the helper - verify there.
from functions.plot import plot_mae_loss

# Get train MAE loss.
# <halerium id="93af5872-6fc8-4da2-8179-97cb52aac109">
X_train_pred, train_mae_loss, threshold = plot_mae_loss(model, X_train, df)
# </halerium id="93af5872-6fc8-4da2-8179-97cb52aac109">


In [0]:
# Only runs for time-series data, where the LSTM autoencoder exists.
# Relies on plot_mae_loss having been imported in the previous cell.
if time_series:
    # Get train MAE loss from LSTM model.
    # The results are kept under lstm_-prefixed names so they can be compared with
    # the convolutional model's values in the export cell.
# <halerium id="93af5872-6fc8-4da2-8179-97cb52aac109">
    lstm_X_train_pred, lstm_train_mae_loss, lstm_threshold = plot_mae_loss(lstm_model, X_train, df)
# </halerium id="93af5872-6fc8-4da2-8179-97cb52aac109">


In [0]:
# plot_first_sequence is a project helper from functions/plot.py; it is given the
# training sequences, the convolutional model's reconstruction and the normalised frame.
from functions.plot import plot_first_sequence

# Check how the first sequence is learnt
plot_first_sequence(X_train, X_train_pred, df)

In [0]:
# Same check for the LSTM autoencoder's reconstruction (time-series data only).
# Relies on plot_first_sequence having been imported in the previous cell.
if time_series:
    plot_first_sequence(X_train, lstm_X_train_pred, df)

### 5. Export and use Streamlit
Picking the model with the lower total MAE loss

In [0]:
# export_model is a project helper from functions/autoencoder.py; it receives the chosen
# model, the fitted scaler, the window length, that model's threshold, the time-series
# flag and the normalised frame.
from functions.autoencoder import export_model

# Total training MAE of the convolutional autoencoder. It is computed for every
# dataset (the model is always trained); np.sum collapses both the single- and the
# multi-feature loss layout into one scalar.
sum_mae_loss = float(np.sum(train_mae_loss))
print(sum_mae_loss)

if time_series:
    # The LSTM autoencoder only exists for time-series data.
    lstm_sum_mae_loss = float(np.sum(lstm_train_mae_loss))
    print(lstm_sum_mae_loss)

# Export whichever model reconstructs the training data with the lower total MAE,
# always paired with its own threshold. `and` short-circuits, so the LSTM loss is
# only read when it has been computed.
if time_series and lstm_sum_mae_loss < sum_mae_loss:
# <halerium id="575b8adf-c236-4cdd-9ba9-de22998864ca">
    export_model(lstm_model, scaler, TIME_STEPS, lstm_threshold, time_series, df)
# </halerium id="575b8adf-c236-4cdd-9ba9-de22998864ca">
else:
# <halerium id="575b8adf-c236-4cdd-9ba9-de22998864ca">
    export_model(model, scaler, TIME_STEPS, threshold, time_series, df)
# </halerium id="575b8adf-c236-4cdd-9ba9-de22998864ca">
