# Supervised outlier detection
This is a template notebook for supervised outlier detection.

Author: {{ cookiecutter.author_name }}
Created: {{ cookiecutter.timestamp }}

## How to use the notebook

The following cells:
- specify objective, variables, and data types,
- set up the outlier detection models,
- read dataset,
- present results from the models.

By default, the notebook is set up to run with an example (cpu4). To see how it works, run the notebook without changing the code.

For your project, adjust the code in the linked cells with your objectives, variables, dataset etc. and then execute all cells in order.

Please refer to supervised.board for detailed instructions.

In [0]:
# Link to project experiments folder hypothesis_experiment_learnings.board (refresh and hit enter on this line to see the link)

# Imports and General Setup
Requires imbalanced-learn

In [0]:
import os
import shutil
from distutils.dir_util import copy_tree

import time
from datetime import datetime

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE 

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.metrics import classification_report, plot_roc_curve, roc_auc_score, confusion_matrix, precision_recall_fscore_support

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=FutureWarning)

from joblib import dump, load

# Project

In [0]:
# Name of this outlier detection experiment; the value is filled in by the cookiecutter template.
# NOTE(review): not referenced by any other cell visible in this notebook - confirm where it is consumed.
experiment_name = '{{cookiecutter.use_case_name}}'  # please provide a name for the outlier detection experiment

# Dataset

In [0]:
time_series = True  # Specify if the data is time series
path = '{{cookiecutter.data_path}}'  # Specify the path of the data
test_size = 0.25  # fraction of the rows held out for testing

# The template placeholder 'default example' selects the bundled cpu4 demo dataset.
if path == 'default example':
    path = 'https://raw.githubusercontent.com/erium/halerium-example-data/main/outlier_detection/cpu4.csv'

# Time series data is expected to carry a 'date' column, which becomes the index.
if time_series:
    df = pd.read_csv(path, parse_dates=['date'], index_col='date')
else:
    df = pd.read_csv(path)

num_col = len(df.columns)

# NOTE: from here on `path` no longer points at the data source but at the
# output directory; the cell that dumps the scaler builds its file name from it.
path = './../out'
isExist = os.path.exists(path)
if isExist:
    # Empty the output directory. Snapshot the top-level entries first and
    # remove each exactly once: walking the tree while deleting it would try
    # to descend into directories that were just removed, and symlinks to
    # directories must be unlinked rather than passed to rmtree.
    with os.scandir(path) as scan:
        stale_entries = list(scan)
    for entry in stale_entries:
        if entry.is_dir(follow_symlinks=False):
            shutil.rmtree(entry.path)
        else:
            os.unlink(entry.path)
else:
    os.makedirs(path)

## Visualising the dataset

In [0]:
# Count the rows labelled as outliers (label value 1), then display the frame.
outlier_mask = df['outlier'] == 1
print('Number of outliers: ', outlier_mask.sum())
df

In [0]:
# Pairwise feature scatter plots, coloured by the outlier label:
# non-outliers (0) use the first colour of the cycle, outliers (1) the fourth.
pairplot_hue = 'outlier'
palette = {
    0: "C0",
    1: "C3",
}
sns.pairplot(data=df, hue=pairplot_hue, palette=palette)

In [0]:
# Separate the target column from the feature columns.
y = df['outlier']
X = df.drop(columns='outlier')

# Feature names and how many of them there are.
labels = [column for column in X.columns]
num_labels = len(labels)
print(labels)

Split train and test data

In [0]:
# Stratify on the label: outliers are rare, and an unstratified shuffle can
# leave the test split without any of them, which makes ROC AUC undefined and
# breaks the 2x2 confusion-matrix unpacking in the model cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify=y)

# Dealing with imbalanced data

In [0]:
# Class balance of the training labels before any resampling.
n_regular = (y_train == 0).sum()
n_outlier = (y_train == 1).sum()
data_skew = pd.Series({'Non-outlier': n_regular, 'Outlier': n_outlier})
print(data_skew)

In [0]:
 # Resampling strategy applied to the training split by the next cell:
 # 'oversampling' -> RandomOverSampler, 'smote' -> SMOTE; any other value leaves the data as is.
 balance_data = 'smote' # 'smote', 'oversampling', or 'none'

In [0]:
# Rebalance the training split only; the test split keeps its natural class ratio.
if balance_data == 'smote':
    sm = SMOTE()
    X_train, y_train = sm.fit_resample(X_train, y_train)
elif balance_data == 'oversampling':
    ros = RandomOverSampler()
    X_train, y_train = ros.fit_resample(X_train, y_train)

# Class balance of the training labels after resampling.
n_regular = (y_train == 0).sum()
n_outlier = (y_train == 1).sum()
data_skew = pd.Series({'Non-outlier': n_regular, 'Outlier': n_outlier})
print(data_skew)

# Normalising the data

In [0]:
# Standardise the features using statistics learned from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)

train_scaled = scaler.transform(X_train)
test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(train_scaled, columns=X_train.columns)
X_test = pd.DataFrame(test_scaled, columns=X_test.columns)

# Persist the fitted scaler next to the other outputs.
scaler_file = path + '/scaler.joblib'
dump(scaler, scaler_file)

# Classification Models

In [0]:
# Models that will actually be trained; edit this list to enable or disable a model.
run_models = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]
run_models_data = dict()  # filled by the model cells: short name -> result record
num_models = len(run_models)

# Every model name this notebook knows about.
models = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",  # May be quite slow
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]


## K Nearest Neighbors

In [0]:
n_neighbors = 5

def run_knn():
    """Train a k-nearest-neighbours classifier and evaluate it on the test split.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test``, ``y_test``
    and ``n_neighbors``. Shows the ROC curve and prints the confusion-matrix
    counts and the classification report.

    Returns:
        list: ``[model, [n_neighbors], [accuracy, roc_auc, precision, recall,
        fscore], report]``.
    """
    print("Running K Nearest Neighbors")
    classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    classifier.fit(X_train, y_train)

    predictions = classifier.predict(X_test)
    accuracy = classifier.score(X_test, y_test)
    # Rank the test rows by the probability of the second class (column 1).
    outlier_probability = classifier.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, outlier_probability)
    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, predictions, average='macro')
    report = classification_report(y_test, predictions, target_names=['Non-outlier', 'Outlier'])

    plot_roc_curve(classifier, X_test, y_test)
    plt.show()

    true_neg, false_pos, false_neg, true_pos = confusion_matrix(y_test, predictions).ravel()
    count_names = [
        'True Negatives (Non-outliers)',
        'False Positives (Non-outliers predicted as outliers)',
        'False Negatives (Outliers predicted as non-outliers',
        'True Positives (Outliers)',
    ]
    print(pd.Series([true_neg, false_pos, false_neg, true_pos], index=count_names))
    print(report)

    metrics = [accuracy, roc_auc, precision, recall, fscore]
    return [classifier, [n_neighbors], metrics, report]  # model, parameters, metrics, report

if "Nearest Neighbors" in run_models:
    run_models_data['knn'] = run_knn()

## Linear SVM

In [0]:
c = 0.025

def run_linear_svm():
    """Train a linear-kernel SVM and evaluate it on the test split.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test``, ``y_test``
    and ``c``. Shows the ROC curve and prints the confusion-matrix counts and
    the classification report.

    Returns:
        list: ``[model, [c], [accuracy, roc_auc, precision, recall, fscore],
        report]``.
    """
    print("Running Linear SVM")
    model = SVC(kernel="linear", C=c)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = model.score(X_test, y_test)
    # ROC AUC needs a continuous ranking score. Hard labels from predict()
    # collapse the curve to a single operating point and disagree with the
    # plotted ROC curve. The SVC is fitted without probability=True, so the
    # signed distance to the separating hyperplane is used as the score.
    roc_auc = roc_auc_score(y_test, model.decision_function(X_test))
    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
    report = classification_report(y_test, y_pred, target_names=['Non-outlier', 'Outlier'])
    plot_roc_curve(model, X_test, y_test)
    plt.show()

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print(pd.Series([tn, fp, fn, tp], index = ['True Negatives (Non-outliers)', 'False Positives (Non-outliers predicted as outliers)', 'False Negatives (Outliers predicted as non-outliers', 'True Positives (Outliers)']))
    print(report)
    return [model, [c], [accuracy, roc_auc, precision, recall, fscore], report] # model, parameters, metrics, report

if "Linear SVM" in run_models:
    run_models_data['linear_svm'] = run_linear_svm()

## RBF SVM

In [0]:
gamma = 2
c = 1

def run_rbf_svm():
    """Train an RBF-kernel SVM and evaluate it on the test split.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test``, ``y_test``,
    ``gamma`` and ``c``. Shows the ROC curve and prints the confusion-matrix
    counts and the classification report.

    Returns:
        list: ``[model, [gamma, c], [accuracy, roc_auc, precision, recall,
        fscore], report]``.
    """
    print("Running RBF SVM")
    model = SVC(gamma=gamma, C=c)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = model.score(X_test, y_test)
    # ROC AUC needs a continuous ranking score. Hard labels from predict()
    # collapse the curve to a single operating point and disagree with the
    # plotted ROC curve. The SVC is fitted without probability=True, so the
    # decision function value is used as the score.
    roc_auc = roc_auc_score(y_test, model.decision_function(X_test))
    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
    report = classification_report(y_test, y_pred, target_names=['Non-outlier', 'Outlier'])
    plot_roc_curve(model, X_test, y_test)
    plt.show()

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print(pd.Series([tn, fp, fn, tp], index = ['True Negatives (Non-outliers)', 'False Positives (Non-outliers predicted as outliers)', 'False Negatives (Outliers predicted as non-outliers', 'True Positives (Outliers)']))
    print(report)
    return [model, [gamma, c], [accuracy, roc_auc, precision, recall, fscore], report] # model, parameters, metrics, report

if "RBF SVM" in run_models:
    run_models_data['rbf_svm'] = run_rbf_svm()

## Gaussian Process

In [0]:
factor = 1
kernel_factor = 1

def run_gaussian():
    """Train a Gaussian process classifier and evaluate it on the test split.

    The kernel is ``factor * RBF(kernel_factor)``. Uses the notebook globals
    ``X_train``, ``y_train``, ``X_test``, ``y_test``, ``factor`` and
    ``kernel_factor``. Shows the ROC curve and prints the confusion-matrix
    counts and the classification report.

    Returns:
        list: ``[model, [factor, kernel_factor], [accuracy, roc_auc, precision,
        recall, fscore], report]``.
    """
    print("Running Gaussian Process")
    model = GaussianProcessClassifier(factor * RBF(kernel_factor))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = model.score(X_test, y_test)
    # ROC AUC needs a continuous ranking score; hard labels from predict()
    # collapse the curve to a single operating point. Column 1 of
    # predict_proba is the probability of the second class in model.classes_
    # (label 1 when the labels are 0/1).
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
    report = classification_report(y_test, y_pred, target_names=['Non-outlier', 'Outlier'])
    plot_roc_curve(model, X_test, y_test)
    plt.show()

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print(pd.Series([tn, fp, fn, tp], index = ['True Negatives (Non-outliers)', 'False Positives (Non-outliers predicted as outliers)', 'False Negatives (Outliers predicted as non-outliers', 'True Positives (Outliers)']))
    print(report)
    return [model, [factor, kernel_factor], [accuracy, roc_auc, precision, recall, fscore], report] # model, parameters, metrics, report

if "Gaussian Process" in run_models:
    run_models_data['gaussian'] = run_gaussian()

## Decision Tree

In [0]:
max_depth = 5

def run_tree():
    """Train a decision tree classifier and evaluate it on the test split.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test``, ``y_test``
    and ``max_depth``. Shows the ROC curve and prints the confusion-matrix
    counts and the classification report.

    Returns:
        list: ``[model, [max_depth], [accuracy, roc_auc, precision, recall,
        fscore], report]``.
    """
    print("Running Decision Tree")
    model = DecisionTreeClassifier(max_depth=max_depth)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = model.score(X_test, y_test)
    # ROC AUC needs a continuous ranking score; hard labels from predict()
    # collapse the curve to a single operating point. Column 1 of
    # predict_proba is the probability of the second class in model.classes_
    # (label 1 when the labels are 0/1).
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
    report = classification_report(y_test, y_pred, target_names=['Non-outlier', 'Outlier'])
    plot_roc_curve(model, X_test, y_test)
    plt.show()

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print(pd.Series([tn, fp, fn, tp], index = ['True Negatives (Non-outliers)', 'False Positives (Non-outliers predicted as outliers)', 'False Negatives (Outliers predicted as non-outliers', 'True Positives (Outliers)']))
    print(report)
    return [model, [max_depth], [accuracy, roc_auc, precision, recall, fscore], report] # model, parameters, metrics, report

if "Decision Tree" in run_models:
    run_models_data['tree'] = run_tree()

## Random Forests

In [0]:
max_depth = 5
n_estimators = 10
max_features = 1

def run_forest():
    """Train a random forest classifier and evaluate it on the test split.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test``, ``y_test``,
    ``max_depth``, ``n_estimators`` and ``max_features``. Shows the ROC curve
    and prints the confusion-matrix counts and the classification report.

    Returns:
        list: ``[model, [max_depth, n_estimators, max_features], [accuracy,
        roc_auc, precision, recall, fscore], report]``.
    """
    print("Running Random Forests")
    model = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators, max_features=max_features)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = model.score(X_test, y_test)
    # ROC AUC needs a continuous ranking score; hard labels from predict()
    # collapse the curve to a single operating point. Column 1 of
    # predict_proba is the probability of the second class in model.classes_
    # (label 1 when the labels are 0/1).
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
    report = classification_report(y_test, y_pred, target_names=['Non-outlier', 'Outlier'])
    plot_roc_curve(model, X_test, y_test)
    plt.show()

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print(pd.Series([tn, fp, fn, tp], index = ['True Negatives (Non-outliers)', 'False Positives (Non-outliers predicted as outliers)', 'False Negatives (Outliers predicted as non-outliers', 'True Positives (Outliers)']))
    print(report)
    return [model, [max_depth, n_estimators, max_features], [accuracy, roc_auc, precision, recall, fscore], report] # model, parameters, metrics, report

if "Random Forest" in run_models:
    run_models_data['forest'] = run_forest()

## Neural Net

In [0]:
alpha = 1
max_iter = 1000

def run_mlp():
    """Train a multi-layer perceptron classifier and evaluate it on the test split.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test``, ``y_test``,
    ``alpha`` and ``max_iter``. Shows the ROC curve and prints the
    confusion-matrix counts and the classification report.

    Returns:
        list: ``[model, [alpha, max_iter], [accuracy, roc_auc, precision,
        recall, fscore], report]``.
    """
    print("Running Neural Net")
    model = MLPClassifier(alpha=alpha, max_iter=max_iter)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = model.score(X_test, y_test)
    # ROC AUC needs a continuous ranking score; hard labels from predict()
    # collapse the curve to a single operating point. Column 1 of
    # predict_proba is the probability of the second class in model.classes_
    # (label 1 when the labels are 0/1).
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
    report = classification_report(y_test, y_pred, target_names=['Non-outlier', 'Outlier'])
    plot_roc_curve(model, X_test, y_test)
    plt.show()

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print(pd.Series([tn, fp, fn, tp], index = ['True Negatives (Non-outliers)', 'False Positives (Non-outliers predicted as outliers)', 'False Negatives (Outliers predicted as non-outliers', 'True Positives (Outliers)']))
    print(report)
    return [model, [alpha, max_iter], [accuracy, roc_auc, precision, recall, fscore], report] # model, parameters, metrics, report

if "Neural Net" in run_models:
    run_models_data['mlp'] = run_mlp()

## AdaBoost

In [0]:
def run_adaboost():
    """Train an AdaBoost classifier with default settings and evaluate it on the test split.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Shows the ROC curve and prints the confusion-matrix counts and
    the classification report.

    Returns:
        list: ``[model, [], [accuracy, roc_auc, precision, recall, fscore],
        report]`` (the parameter list is empty because no parameters are set).
    """
    print("Running AdaBoost")
    model = AdaBoostClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = model.score(X_test, y_test)
    # ROC AUC needs a continuous ranking score; hard labels from predict()
    # collapse the curve to a single operating point. Column 1 of
    # predict_proba is the probability of the second class in model.classes_
    # (label 1 when the labels are 0/1).
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
    report = classification_report(y_test, y_pred, target_names=['Non-outlier', 'Outlier'])
    plot_roc_curve(model, X_test, y_test)
    plt.show()

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print(pd.Series([tn, fp, fn, tp], index = ['True Negatives (Non-outliers)', 'False Positives (Non-outliers predicted as outliers)', 'False Negatives (Outliers predicted as non-outliers', 'True Positives (Outliers)']))
    print(report)
    return [model, [], [accuracy, roc_auc, precision, recall, fscore], report] # model, parameters, metrics, report

if "AdaBoost" in run_models:
    run_models_data['adaboost'] = run_adaboost()

## Naive Bayes

In [0]:
def run_nb():
    """Train a Gaussian naive Bayes classifier and evaluate it on the test split.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Shows the ROC curve and prints the confusion-matrix counts and
    the classification report.

    Returns:
        list: ``[model, [], [accuracy, roc_auc, precision, recall, fscore],
        report]`` (the parameter list is empty because no parameters are set).
    """
    print("Running Naive Bayes")
    model = GaussianNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = model.score(X_test, y_test)
    # ROC AUC needs a continuous ranking score; hard labels from predict()
    # collapse the curve to a single operating point. Column 1 of
    # predict_proba is the probability of the second class in model.classes_
    # (label 1 when the labels are 0/1).
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
    report = classification_report(y_test, y_pred, target_names=['Non-outlier', 'Outlier'])
    plot_roc_curve(model, X_test, y_test)
    plt.show()

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print(pd.Series([tn, fp, fn, tp], index = ['True Negatives (Non-outliers)', 'False Positives (Non-outliers predicted as outliers)', 'False Negatives (Outliers predicted as non-outliers', 'True Positives (Outliers)']))
    print(report)
    return [model, [], [accuracy, roc_auc, precision, recall, fscore], report] # model, parameters, metrics, report

if "Naive Bayes" in run_models:
    run_models_data['nb'] = run_nb()

## Quadratic Discriminant Analysis

In [0]:
def run_qda():
    """Train a quadratic discriminant analysis classifier and evaluate it on the test split.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Shows the ROC curve and prints the confusion-matrix counts and
    the classification report.

    Returns:
        list: ``[model, [], [accuracy, roc_auc, precision, recall, fscore],
        report]`` (the parameter list is empty because no parameters are set).
    """
    print("Running Quadratic Discriminant Analysis")
    model = QuadraticDiscriminantAnalysis()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = model.score(X_test, y_test)
    # ROC AUC needs a continuous ranking score; hard labels from predict()
    # collapse the curve to a single operating point. Column 1 of
    # predict_proba is the probability of the second class in model.classes_
    # (label 1 when the labels are 0/1).
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
    report = classification_report(y_test, y_pred, target_names=['Non-outlier', 'Outlier'])
    plot_roc_curve(model, X_test, y_test)
    plt.show()

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print(pd.Series([tn, fp, fn, tp], index = ['True Negatives (Non-outliers)', 'False Positives (Non-outliers predicted as outliers)', 'False Negatives (Outliers predicted as non-outliers', 'True Positives (Outliers)']))
    print(report)
    return [model, [], [accuracy, roc_auc, precision, recall, fscore], report] # model, parameters, metrics, report

if "QDA" in run_models:
    run_models_data['qda'] = run_qda()

# Classification Results

In [0]:
# Pull the metric list (third entry of each result record) out per model.
run_models_scores = {}
for model_key, model_record in run_models_data.items():
    run_models_scores[model_key] = model_record[2]

# One column per model, one row per metric.
metric_names = ['accuracy', 'roc_auc', 'precision', 'recall', 'fscore']
run_models_scores_df = pd.DataFrame(run_models_scores, index=metric_names)
run_models_scores_df

In [0]:
# Grouped bar chart: one group per metric, one bar per model.
run_models_scores_df.plot.bar(figsize=(12, 10))
plt.show()

## Outlier Prediction

In [0]:
# Short name of the model whose test-set predictions should be shown.
show_model_prediction = "knn" # ["knn", "linear_svm", "rbf_svm", "gaussian", "tree", "forest", "mlp", "adaboost", "nb", "qda"]

# The fitted estimator is the first entry of the model's result record.
show_model = run_models_data[show_model_prediction][0]
y_pred = show_model.predict(X_test)

# Put the predicted label next to the (scaled) test features.
predicted_column = pd.DataFrame(y_pred, columns=['outlier'])
pred = pd.concat([X_test, predicted_column], axis=1)
pred

## Which metric to optimise

In [0]:
# For every metric (row), the short name of the model (column) with the highest score.
df_best = run_models_scores_df.idxmax(axis='columns')
df_best

In [0]:
# Metric used to pick the winning model.
optimise_metric = 'accuracy' # ['accuracy', 'roc_auc', 'precision', 'recall', 'fscore']
best = df_best[optimise_metric]

# The classification report is the fourth entry of the winner's result record.
best_report = run_models_data[best][3]
print(best_report)

Export the data

In [0]:
# Bundle the winning model's result record with the raw data, the
# time-series flag and the fitted scaler, and write it to the output folder.
export_bundle = [run_models_data[best], df, time_series, scaler]
dump(export_bundle, './../out/supervised_model.joblib')