# Unsupervised outlier detection
This is a template notebook for unsupervised outlier detection.

Author: {{ cookiecutter.author_name }}
Created: {{ cookiecutter.timestamp }}

In [0]:
# Link to project experiments folder hypothesis_experiment_learnings.board (refresh and hit enter on this line to see the link)

## How to use the notebook

The following cells:
- specify objective, variables, and data types,
- set up the outlier detection models,
- read dataset,
- present results from the models.

By default, the notebook is set up to run with an example (wine quality). To see how it works, run the notebook without changing the code.

For your project, adjust the code in the linked cells with your objectives, variables, dataset etc. and then execute all cells in order.

Please refer to unsupervised.board for detailed instructions.

In [0]:
# <halerium id="25366418-1570-4dd6-943b-bd96f34bb53c">
# Link to unsupervised.board
# </halerium id="25366418-1570-4dd6-943b-bd96f34bb53c">


### 1. Imports
* Requires seaborn and statsmodels installation

In [0]:
import os
import shutil
from distutils.dir_util import copy_tree

import time
from datetime import datetime

# <halerium id="9e6dc3c9-b29e-482a-af94-93614299120e">
import seaborn as sns
# </halerium id="9e6dc3c9-b29e-482a-af94-93614299120e">
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from scipy import stats

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA

from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM
from sklearn.linear_model import SGDOneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest

from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=UserWarning)

from joblib import dump, load

### 2. Import the Dataset

In [0]:
# <halerium id="0e9343d5-c4f5-428b-bbf8-156102aa5c90">
# When True, the loading cell parses a 'date' column and uses it as the index,
# so the CSV must contain a column with that name.
time_series = False # Specify if the data is time series
# Keep 'default example' to load the hosted wine-quality CSV, or set the path/URL of your own CSV file.
path = 'default example' # Specify the path of the data
# </halerium id="0e9343d5-c4f5-428b-bbf8-156102aa5c90">


Importing the dataset

In [0]:
# The placeholder value selects the hosted wine-quality sample.
if path == 'default example':
    path = 'https://raw.githubusercontent.com/erium/halerium-example-data/main/outlier_detection/WineQT.csv'

# Time series input is indexed by its parsed 'date' column; other input is read with the defaults.
read_csv_options = {'parse_dates': ['date'], 'index_col': 'date'} if time_series else {}
df = pd.read_csv(path, **read_csv_options)

Visualising the dataset

In [0]:
# Bare expression: the notebook renders the loaded DataFrame as this cell's output.
df

Creating the /out folder for artifacts

In [0]:
def clear_directory(path):
    """Remove every entry inside `path` while keeping `path` itself.

    Only the direct children are listed; real sub-directories are removed
    recursively, while files and symbolic links (including links that point
    to directories) are unlinked, so a link's target is never touched.

    Parameters
    ----------
    path : str
        Existing directory to empty.

    Raises
    ------
    OSError
        If `path` is not a directory or an entry cannot be removed.
    """
    # Snapshot the listing first so nothing is deleted while the directory
    # iterator is still open.
    with os.scandir(path) as entries:
        children = list(entries)
    for child in children:
        if child.is_dir(follow_symlinks=False):
            shutil.rmtree(child.path)
        else:
            os.unlink(child.path)


# Artifacts of this run are written to ./out; start every run from an empty folder.
out_path = './out'
isExist = os.path.exists(out_path)
if isExist:
    clear_directory(out_path)
else:
    os.makedirs(out_path)

### 3. Select the Outlier Features

In [0]:
# <halerium id="71d11ec0-857c-4018-aad9-7f3159d630e9">
features_to_consider = 'all' # Specify features in list ['x', 'y'] or 'all' to identify outliers
# </halerium id="71d11ec0-857c-4018-aad9-7f3159d630e9">

if features_to_consider != 'all':
    # Take an explicit copy of the selected columns: the export cell later adds an
    # 'outlier' column to df, which would otherwise be an assignment on a slice of
    # the loaded frame (SettingWithCopyWarning on classic pandas).
    df = df[features_to_consider].copy()
# Number of feature columns that take part in the detection.
num_col = len(df.columns)
# Independent copy handed to the univariate helpers alongside df.
df_uni = df.copy()

In [0]:
from functions.plot import plot_features

# Plot the selected features with the project helper; it receives the time-series
# flag, the (possibly column-filtered) data and the number of feature columns.
plot_features(time_series, df, num_col)

### 4. Select the Models

In [0]:
# <halerium id="0bc5f0ec-5288-4467-8caa-d96a5cc9ffe2">
run_models = ['z_score', 'iqr', 'percentile', 'elliptic', 'svm', 'sgd_svm', 'iso', 'lof'] # Select the outlier detection models to run ('stl' and 'arima' can be added too, but only run when time_series is True); note certain models have restrictions on data
# </halerium id="0bc5f0ec-5288-4467-8caa-d96a5cc9ffe2">
# Results of the executed models, keyed by the model names used in run_models.
run_models_data = {}
num_models = len(run_models)

### 5. Specify the thresholds

Univariate Approaches

In [0]:
# z-score
# <halerium id="9e6dc3c9-b29e-482a-af94-93614299120e">
num_std = 3 # The number of standard deviations from mean to flag as outlier, 99.7% of data occurs within 3 std of mean in normal distribution
# </halerium id="9e6dc3c9-b29e-482a-af94-93614299120e">

# Interquartile range
# <halerium id="9e6dc3c9-b29e-482a-af94-93614299120e">
num_iqr = 1.5 # The number of interquartile ranges beyond the first and third quartile to flag as outlier; for a normal distribution 1.5 IQR puts the fences at roughly 2.7 std from the mean
# </halerium id="9e6dc3c9-b29e-482a-af94-93614299120e">
# Use a num_iqr of about 1.7 to match a 3 std cut-off on normally distributed data

# Percentile
# <halerium id="9e6dc3c9-b29e-482a-af94-93614299120e">
percentile = 0.99 # Threshold passed to the percentile model; NOTE(review): presumably a quantile in (0, 1) - confirm how run_uni_model applies it
# </halerium id="9e6dc3c9-b29e-482a-af94-93614299120e">

# STL (Time series)
# <halerium id="9e6dc3c9-b29e-482a-af94-93614299120e">
stl_num_std = 3 # Using std to determine residual outliers
# </halerium id="9e6dc3c9-b29e-482a-af94-93614299120e">

# ARIMA (Time series)
# <halerium id="9e6dc3c9-b29e-482a-af94-93614299120e">
p = 1 # Lag order: number of lag observations included in the model
d = 1 # Degree of differencing: Number of times the raw observations are differenced
q = 1 # Order of moving average: Size of moving average window
# </halerium id="9e6dc3c9-b29e-482a-af94-93614299120e">
# The (p, d, q) order is packed into one tuple; it is passed to run_uni_model as the 'arima' threshold argument.
arima_threshold = (p, d, q)

Multivariate Approaches

In [0]:
# Shared setting handed to every multivariate model run below.
contamination = 0.1 # The proportion of outliers in the dataset (range from (0,0.5]), 0.1 default

#### Univariate approaches

Test for normality

In [0]:
from functions.unsupervised import test_normality

# Significance level passed to the project's normality test.
alpha = 1e-3 # You may change this alpha to adjust the strictness of the normal test
# NOTE(review): presumably the features judged normally distributed at this alpha;
# the value is later passed to every run_uni_model call - confirm its exact shape in functions.unsupervised.
normal_variables = test_normality(df, df_uni, alpha)

# Show the result as the cell output.
normal_variables

In [0]:
from functions.unsupervised import run_uni_model

Z-score
For normally distributed data (only runs on features determined to be sufficiently normally distributed)

In [0]:
# Run the z-score detector only when it was selected; the helper's output is
# converted to a list and stored as the single element of the model's entry.
if 'z_score' in run_models:
    z_score_output = run_uni_model('z_score', num_std, df, normal_variables, df_uni)
    run_models_data['z_score'] = [list(z_score_output)]

Interquartile Range (IQR)
For skewed data

In [0]:
# Run the interquartile-range detector only when it was selected; the helper's
# output is converted to a list and stored as the single element of the model's entry.
if 'iqr' in run_models:
    iqr_output = run_uni_model('iqr', num_iqr, df, normal_variables, df_uni)
    run_models_data['iqr'] = [list(iqr_output)]

Percentile
For other distributions

In [0]:
# Run the percentile detector only when it was selected; the helper's output is
# converted to a list and stored as the single element of the model's entry.
if 'percentile' in run_models:
    percentile_output = run_uni_model('percentile', percentile, df, normal_variables, df_uni)
    run_models_data['percentile'] = [list(percentile_output)]

STL
For Time Series

In [0]:
# STL needs both the time-series flag and an explicit 'stl' entry in run_models.
if time_series and 'stl' in run_models:
    stl_output = run_uni_model('stl', stl_num_std, df, normal_variables, df_uni)
    run_models_data['stl'] = [list(stl_output)]

ARIMA
For Time Series

In [0]:
# ARIMA needs both the time-series flag and an explicit 'arima' entry in run_models;
# the (p, d, q) tuple is passed as its threshold argument.
if time_series and 'arima' in run_models:
    arima_output = run_uni_model('arima', arima_threshold, df, normal_variables, df_uni)
    run_models_data['arima'] = [list(arima_output)]

#### Multivariate approaches

In [0]:
from functions.unsupervised import run_multi_model

Elliptic Envelope Covariance

In [0]:
# Run the elliptic envelope model only when it was selected; the helper's return
# value is stored as the single element of the model's entry.
if 'elliptic' in run_models:
    elliptic_output = run_multi_model('elliptic', contamination, df)
    run_models_data['elliptic'] = [elliptic_output]

One Class SVM
Using Gaussian kernel by default

In [0]:
# Run the one-class SVM only when it was selected; the helper's return value is
# stored as the single element of the model's entry.
if 'svm' in run_models:
    svm_output = run_multi_model('svm', contamination, df)
    run_models_data['svm'] = [svm_output]

One Class SVM with Stochastic Gradient Descent
With kernel approximation

In [0]:
# Run the SGD one-class SVM only when it was selected; the helper's return value
# is stored as the single element of the model's entry.
if 'sgd_svm' in run_models:
    sgd_svm_output = run_multi_model('sgd_svm', contamination, df)
    run_models_data['sgd_svm'] = [sgd_svm_output]

Isolation Forest

In [0]:
# Run the isolation forest only when it was selected; the helper's return value
# is stored as the single element of the model's entry.
if 'iso' in run_models:
    iso_output = run_multi_model('iso', contamination, df)
    run_models_data['iso'] = [iso_output]

Local Outlier Factor

In [0]:
# Run the local outlier factor model only when it was selected; the helper's
# return value is stored as the single element of the model's entry.
if 'lof' in run_models:
    lof_output = run_multi_model('lof', contamination, df)
    run_models_data['lof'] = [lof_output]

### 6. Get the results

#### Univariate approaches results
Note that a point will be classified as an outlier if ANY specified feature is identified as an outlier

In [0]:
from functions.plot import plot_uni_results

# Hand the collected model results to the project helper; its return value is
# kept because the univariate visualisation cell below consumes it.
uni_outliers = plot_uni_results(run_models_data, time_series, features_to_consider, num_col, df)
# <halerium id="4fdde096-5ef5-4884-ac9c-132796937e91">
# The first element of the returned value replaces run_models_data from here on.
# NOTE(review): presumably the same dict updated with the univariate results - confirm in functions.plot.
run_models_data = uni_outliers[0]
# </halerium id="4fdde096-5ef5-4884-ac9c-132796937e91">


In [0]:
from functions.plot import plot_uni_visual

# Name of the univariate approach to visualise.
# NOTE(review): presumably it has to be one of the models that actually ran - confirm in functions.plot.
uni_approach_visual = 'z_score' # 'z_score', 'iqr', 'percentile', 'stl', or 'arima'
plot_uni_visual(uni_approach_visual, uni_outliers, time_series, df)

#### Multivariate approaches results

In [0]:
from functions.plot import plot_multi_results

# <halerium id="4fdde096-5ef5-4884-ac9c-132796937e91">
# Plot the multivariate results with the project helper.
# NOTE(review): later cells read index 2 of each model entry, so this helper may
# also extend run_models_data in place - confirm in functions.plot.
plot_multi_results(run_models, run_models_data, num_models, num_col, df)
# </halerium id="4fdde096-5ef5-4884-ac9c-132796937e91">


Outlier prediction

In [0]:
show_model = 'elliptic' # ['z_score', 'iqr', 'percentile', 'elliptic', 'svm', 'sgd_svm', 'iso', 'lof']

# Work on a copy so df itself stays untouched; every row starts as an inlier (0).
df_show = df.copy()
df_show['outlier'] = 0

# The first element of the chosen model's entry selects the rows to mark as outliers (1).
outliers_show = run_models_data[show_model][0]
flagged_rows = df_show.index[outliers_show]
df_show.loc[flagged_rows, 'outlier'] = 1
df_show

Decision Boundary Visualisation
Only available for data with 2 features

In [0]:
from functions.plot import plot_decision_boundary

# Draw the decision boundaries with the project helper; it receives the number of
# feature columns, the selected model names, their results and the data.
plot_decision_boundary(num_col, run_models, run_models_data, df)

In [0]:
from functions.plot import plot_multi_visual

# Name of the multivariate approach to visualise.
# NOTE(review): presumably it has to be one of the models that actually ran - confirm in functions.plot.
multi_approach_visual = 'elliptic' # 'elliptic', 'svm', 'sgd_svm', 'iso', or 'lof'
plot_multi_visual(multi_approach_visual, run_models_data, df)

### 7. Interpret the results

## Evaluation Metrics
### Clustering approaches
- Silhouette Coefficient: Higher means better defined clusters.
- Calinski-Harabasz Index: Higher means better defined clusters.
- Davies-Bouldin Index: Lower means better separation between clusters.

Note that clustering-based evaluation of outlier detection may not reflect your use case, e.g. if the abnormal data is hidden inside the clusters.

In [0]:
# Univariate detectors are excluded from the clustering-metric table. The names must
# be separate list items: a single comma-joined string never matches any model key.
# NOTE(review): 'stl' and 'arima' are run as univariate models as well - decide whether
# they should be excluded here too.
univariate_models = ['z_score', 'iqr', 'percentile']

# The element at index 2 of every remaining model entry is used as that model's scores
# (presumably in the order of the index labels below - confirm in functions.plot).
multi_model_scores = {k: v[2] for (k, v) in run_models_data.items() if k not in univariate_models}
multi_model_scores_df = pd.DataFrame(multi_model_scores, index = ['silhouette', 'calinski-harabasz', 'davies-bouldin'])
# <halerium id="fae2ad25-c157-46cf-a434-e2cf1871f76b">
multi_model_scores_df
# </halerium id="fae2ad25-c157-46cf-a434-e2cf1871f76b">


In [0]:
from functions.plot import plot_clustering_metrics

# <halerium id="fae2ad25-c157-46cf-a434-e2cf1871f76b">
# Plot the per-model clustering scores collected in multi_model_scores.
plot_clustering_metrics(multi_model_scores)
# </halerium id="fae2ad25-c157-46cf-a434-e2cf1871f76b">


### 8. Export and use Streamlit

In [0]:
def export_outlier(df, model, export_path):
    """Label the outliers of one model on `df` and write the frame to CSV.

    The frame is modified in place: an 'outlier' column is added (or overwritten)
    with 0 for every row, then set to 1 for the rows selected by the first element
    of ``run_models_data[model]``, which is read from the notebook's globals.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to label; it keeps the 'outlier' column after the call.
    model : str
        Key of the model in ``run_models_data``.
    export_path : str
        Destination passed unchanged to ``DataFrame.to_csv``.

    Raises
    ------
    KeyError
        If ``model`` has no entry in ``run_models_data``.
    """
    df['outlier'] = 0
    flagged_rows = df.index[run_models_data[model][0]]
    df.loc[flagged_rows, 'outlier'] = 1
    df.to_csv(export_path)


# Export the data labelled with the z-score outliers. This adds an 'outlier' column
# to df itself, so the joblib export below stores df including that column.
# NOTE(review): raises KeyError when 'z_score' has no entry in run_models_data, and the
# target file name has no '.csv' extension - confirm both are intended.
export_outlier(df, 'z_score', './out/labeled_data')

In [0]:
# Exports the data
# <halerium id="426210b3-cc96-468b-8438-dfc234e90e71">
# Model results, data and the time-series flag are saved together as one list, in this order.
export_bundle = [run_models_data, df, time_series]
dump(export_bundle, './out/unsupervised_model_data.joblib')
# </halerium id="426210b3-cc96-468b-8438-dfc234e90e71">
