In [1]:
import numpy as np

# Outlier detection methods

- Z-Score
- Interquartile Range
- Isolation Forest
- Local Outlier Factor
- One Class Support Vector Machine
- DBSCAN

***

## Z-Score

- measures how many standard deviation units away a sample $x$ is from the mean $\mu$

- when the absolute value of a sample's Z-Score is greater than some threshold, the sample is labeled as an outlier

- $ z = \frac{x - \mu}{\sigma} $

In [2]:
from scipy.stats import zscore

# Toy sample: nine small values plus one obviously extreme value (15)
data = np.array([1, 2, 2, 2, 3, 1, 1, 15, 2, 2])

# Absolute z-score: distance from the mean in standard-deviation units, sign dropped
z_scores = np.absolute(zscore(data))
print(z_scores)

# Samples lying more than `threshold` standard deviations from the mean are outliers
threshold = 2
outlier_idxs = np.nonzero(z_scores > threshold)

outliers = data[outlier_idxs]
print(outliers)

[0.52352964 0.27422981 0.27422981 0.27422981 0.02492998 0.52352964
 0.52352964 2.96666795 0.27422981 0.27422981]
[15]


In [3]:
def _zscore(x, axis=None, ddof=0):
    """
    Custom implementation of zscore.

    Standardises `x` as (x - mean) / std, i.e. expresses every sample as a
    number of standard deviations away from the mean.

    Parameters
    ----------
    x : array_like
        Input samples (list, tuple or NumPy array).
    axis : int or None, optional
        Axis along which mean and standard deviation are computed. The default
        (None) uses the mean and standard deviation of all elements.
    ddof : int, optional
        Delta degrees of freedom passed to `np.std`. The default (0) gives the
        population standard deviation.

    Returns
    -------
    numpy.ndarray
        Array of z-scores with the same shape as `x`. If the standard
        deviation is zero (constant sample) the affected entries are NaN.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to standardise; skip the mean/std calls, which would warn on empty input.
        return x.astype(float)

    # keepdims=True keeps the reduced axis so the statistics broadcast against x.
    mean = np.mean(x, axis=axis, keepdims=True)
    std = np.std(x, axis=axis, ddof=ddof, keepdims=True)

    # A zero standard deviation makes the quotient 0/0 -> NaN; silence the
    # floating-point warnings because NaN is the documented result here.
    with np.errstate(divide='ignore', invalid='ignore'):
        return (x - mean) / std

# Repeat the detection using the hand-written z-score instead of SciPy's
z_scores = np.absolute(_zscore(data))
print(z_scores)

# Same cut-off as before: more than 2 standard deviations from the mean
threshold = 2
outlier_idxs = np.nonzero(z_scores > threshold)

outliers = data[outlier_idxs]
print(outliers)

[0.52352964 0.27422981 0.27422981 0.27422981 0.02492998 0.52352964
 0.52352964 2.96666795 0.27422981 0.27422981]
[15]


## Interquartile Range

- range given by the difference between the third quartile and the first quartile, $\text{IQR} = Q3 - Q1$

- a sample is labeled as an outlier if it lies outside the interval $\left[ Q1 - k \cdot \text{IQR}, Q3 + k \cdot \text{IQR} \right]$, where $k$ is a chosen multiplier (most often 1.5 or 3)

In [4]:
# First and third quartile of the sample and the spread between them
Q1, Q3 = np.percentile(data, [25, 75])
IQR = Q3 - Q1

print(f'Q1: {Q1}, Q3: {Q3}, IQR: {IQR}')

# Fences at 1.5 * IQR beyond the quartiles; anything strictly outside is an outlier
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outliers = [x for x in data if x < lower_fence or x > upper_fence]
print(outliers)

Q1: 1.25, Q3: 2.0, IQR: 0.75
[15]


In [5]:
def iqr_outlier(data, k=1.5):
    """
    Flag the samples of `data` that lie outside the IQR fences.

    The fences are Q1 - k * IQR and Q3 + k * IQR, where Q1 and Q3 are the 25th
    and 75th percentiles of `data` and IQR = Q3 - Q1. Returns a boolean mask
    (True = outlier) that can be used to index `data`; values lying exactly on
    a fence are not flagged.
    """
    lower_q, upper_q = np.percentile(data, [25, 75])
    spread = upper_q - lower_q
    below = data < (lower_q - k * spread)
    above = data > (upper_q + k * spread)
    return np.logical_or(below, above)

# Boolean mask of the samples outside the default (k = 1.5) IQR fences
outlier_idxs = iqr_outlier(data)
iqr_outliers = data[outlier_idxs]
print(iqr_outliers)

[15]


## Isolation Forest

- a variation of Random Forest built from random splits; it searches for the samples that are easiest to separate (isolate) from the rest

- the `contamination` parameter is the expected proportion of outliers in the dataset

In [6]:
from sklearn.ensemble import IsolationForest

# Toy sample reshaped to a single-feature column (n_samples, 1) before fitting
data = np.array([1, 2, 2, 2, 3, 1, 1, 15, 2, 2])
data = data.reshape(-1, 1)

# Isolation Forest builds randomised trees; fixing the seed keeps the
# predictions identical across re-runs of the notebook.
clf = IsolationForest(contamination=0.1, random_state=42)
clf.fit(data)

# predict() labels inliers with 1 and outliers with -1
predictions = clf.predict(data)
print(f'IF prediction: {predictions}')

outlier_idxs = np.where(predictions == -1)
outliers = data[outlier_idxs].reshape(-1)
print(outliers)

IF prediction: [ 1  1  1  1  1  1  1 -1  1  1]
[15]


## Local Outlier Factor

- computes local density of a point with respect to its neighbors

- outliers are points whose local density is substantially lower than that of their neighbors

In [7]:
from sklearn.neighbors import LocalOutlierFactor

# Toy sample as a single-feature column (n_samples, 1)
data = np.array([1, 2, 2, 2, 3, 1, 1, 15, 2, 2]).reshape(-1, 1)
clf = LocalOutlierFactor(n_neighbors=8)

# fit_predict labels every sample; everything not labelled 1 is treated as an outlier
predictions = clf.fit_predict(data)
print(predictions)

outlier_idxs = np.nonzero(predictions != 1)
outliers = data[outlier_idxs].ravel()
print(outliers)

[ 1  1  1  1  1  1  1 -1  1  1]
[15]


## One Class SVM

- unsupervised version of SVM adjusted for outlier detection

In [8]:
from sklearn.svm import OneClassSVM

# Toy sample as a single-feature column (n_samples, 1)
data = np.array([1, 2, 2, 2, 3, 1, 1, 15, 2, 2]).reshape(-1, 1)
clf = OneClassSVM(nu=0.1)

# fit_predict labels every sample; everything not labelled 1 is treated as an outlier.
# NOTE(review): the recorded output of this cell flags 3 rather than 15, unlike the
# other methods in this notebook - presumably the default kernel settings do not suit
# this unscaled sample; confirm and tune (e.g. gamma / nu) before relying on it.
predictions = clf.fit_predict(data)
print(predictions)

outlier_idxs = np.nonzero(predictions != 1)
outliers = data[outlier_idxs].ravel()
print(outliers)

[ 1  1  1  1 -1  1  1  1  1  1]
[3]


## DBSCAN

- clustering method which uses density

- outliers are points which do not belong to any cluster (DBSCAN marks them as noise with the cluster label `-1`)

In [10]:
from sklearn.cluster import DBSCAN

# Toy sample reshaped to a single-feature column (n_samples, 1) before clustering
data = np.array([1, 2, 2, 2, 3, 1, 1, 15, 2, 2])
data = data.reshape(-1, 1)

dbs = DBSCAN(eps=3, min_samples=2)
# fit_predict returns one cluster label per sample
predictions = dbs.fit_predict(data)
print(predictions)

# DBSCAN labels noise points -1 and clusters 0, 1, 2, ...; selecting `!= 0`
# would wrongly report every cluster other than the first as outliers.
outlier_idxs = np.where(predictions == -1)
outliers = data[outlier_idxs].reshape(-1)
print(outliers)

[ 0  0  0  0  0  0  0 -1  0  0]
[15]
