# sklearn novelty detection

Docs: https://scikit-learn.org/stable/modules/outlier_detection.html#novelty-detection

As defined in the above link, the difference between outlier detection and novelty detection is that outlier is unsupervised, while novelty is semi-supervised.

In our case, since the first part of each data file has no abnormal data points, it can be used for novelty detection.

The difference to anomalies is, that novelties are considered normal after being detected once. [1](https://arxiv.org/pdf/2004.00433.pdf)

# Matrix Profile Benchmark

In [24]:
import json

matrixprofile_file = '../matrixprofile.json'
with open(matrixprofile_file) as file:
    matrixprofile = json.load(file)

# only pick out records with merged_discord
mp = {k: v['merged_discord'] for (k,v) in matrixprofile.items() if v['merged_discord'] is not None}
mp

{'002_UCR_Anomaly_35000.txt': 56598,
 '004_UCR_Anomaly_2500.txt': 5484,
 '005_UCR_Anomaly_4000.txt': 5299,
 '006_UCR_Anomaly_4000.txt': 5672,
 '007_UCR_Anomaly_4000.txt': 6447,
 '008_UCR_Anomaly_4000.txt': 7171,
 '010_UCR_Anomaly_4000.txt': 5974,
 '012_UCR_Anomaly_15000.txt': 25849,
 '013_UCR_Anomaly_15000.txt': 16000,
 '014_UCR_Anomaly_8000.txt': 16972,
 '016_UCR_Anomaly_5000.txt': 16949,
 '017_UCR_Anomaly_5000.txt': 16948,
 '018_UCR_Anomaly_8000.txt': 16920,
 '019_UCR_Anomaly_5000.txt': 5340,
 '021_UCR_Anomaly_5000.txt': 11029,
 '022_UCR_Anomaly_4000.txt': 8394,
 '024_UCR_Anomaly_3200.txt': 4476,
 '025_UCR_Anomaly_2800.txt': 5556,
 '026_UCR_Anomaly_1700.txt': 5710,
 '027_UCR_Anomaly_1200.txt': 5670,
 '028_UCR_Anomaly_1600.txt': 3114,
 '031_UCR_Anomaly_2700.txt': 3471,
 '032_UCR_Anomaly_1000.txt': 4699,
 '033_UCR_Anomaly_4000.txt': 6147,
 '034_UCR_Anomaly_1500.txt': 3694,
 '035_UCR_Anomaly_2500.txt': 5818,
 '036_UCR_Anomaly_4200.txt': 5215,
 '037_UCR_Anomaly_5000.txt': 29801,
 '041_UC

# Load Data

In [2]:
BASE_DIR = '../../data-sets/KDD-Cup/data/'

In [60]:
import os
import pandas as pd
import re
import numpy as np

filenames = sorted([i for i in os.listdir(BASE_DIR) if 'txt' in i])

filename = filenames[1]
df = pd.read_csv(BASE_DIR + filename, names=['series'])
regex = re.compile(r'^\d{3}_UCR_Anomaly_(?P<pos>\d+)\.txt$')
result = regex.search(filename)
threshold = int(result.group('pos'))
print(f'threshold: {threshold}')

threshold: 35000


In [61]:
X_train = df.series[df.index <= threshold].to_numpy().reshape(-1, 1)
# no abnormal data points before threshold, so all 0
y_train = np.zeros(X_train.shape[0])
X_inf = df.series[df.index > threshold].to_numpy().reshape(-1, 1)

# One Class SVM

Docs: https://scikit-learn.org/stable/auto_examples/svm/plot_oneclass.html#sphx-glr-auto-examples-svm-plot-oneclass-py

In [25]:
from sklearn.svm import OneClassSVM

clf = OneClassSVM(nu=0.1, gamma=0.1)
clf.fit(X_train)
# print(clf.predict(X_train))
# print(clf.predict(X_inf))

OneClassSVM(gamma=0.1, nu=0.1)

In [37]:
y_hat_train = clf.predict(X_train)
np.unique(y_hat_train, return_counts=True)

(array([-1,  1]), array([11644, 23357]))

In [38]:
y_hat_test = clf.predict(X_inf)
np.unique(y_hat_test, return_counts=True)

(array([-1,  1]), array([14941, 29853]))

# Local Outlier Factor

Docs: https://scikit-learn.org/stable/modules/outlier_detection.html#novelty-detection-with-local-outlier-factor

In [62]:
from sklearn.neighbors import LocalOutlierFactor

lof = LocalOutlierFactor(novelty=True, n_jobs=-1)
lof.fit(X_train)
# Note: do not call predict on training data!

LocalOutlierFactor(n_jobs=-1, novelty=True)

In [63]:
y_hat_test = lof.predict(X_inf)
np.unique(y_hat_test, return_counts=True)

(array([-1,  1]), array([   16, 44984]))

In [64]:
lof_score_df = pd.DataFrame.from_dict({'score': lof.score_samples(X_inf)})

In [65]:
lof_score_df.describe()

Unnamed: 0,score
count,45000.0
mean,-1.026708
std,0.056667
min,-3.25769
25%,-1.046834
50%,-1.014971
75%,-0.991647
max,-0.912334


In [66]:
# score the lower, the more abnormal

inf_id = lof_score_df['score'].idxmin()
id = threshold + inf_id
print(id)

67095
