<a href="https://colab.research.google.com/github/dingowhiz/Full_Stack/blob/main/anomaly2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 1. anomaly detection using basic statistics
from sklearn.datasets import make_classification
from sklearn.metrics import precision_score, recall_score
import pandas as pd

# Marker line so the cell output shows that execution has begun.
print('start >>>')

# Synthetic two-feature classification data. weights=[0.98] requests a
# heavily imbalanced split, and flip_y=0 disables random label flipping;
# later cells treat class 1 as the "anomaly" label when scoring detectors.
x, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    weights=[0.98],
    class_sep=0.5,
    scale=1.0,
    shuffle=True,
    flip_y=0,
    random_state=0,
)

# Toy one-dimensional series with one obvious spike (500).
hourly_traffic = [120, 123, 124, 119, 196, 121, 118, 117, 500, 132]

# Boolean mask of readings above the 95th percentile. This is not the last
# expression of the cell, so the notebook does not display its value.
pd.Series(hourly_traffic) > pd.Series(hourly_traffic).quantile(0.95)

class PercentileDetection:
    """Flag values of a one-dimensional sample that exceed a percentile.

    Parameters
    ----------
    percentile : float, default=0.9
        Quantile (between 0 and 1) handed to ``pandas.Series.quantile``.
        Values strictly greater than that quantile of the fitted data are
        reported as outliers.

    Attributes
    ----------
    threshold : float
        Cut-off learned by ``fit``. It only exists after ``fit`` has run.
    """

    def __init__(self, percentile=0.9):
        self.percentile = percentile

    def fit(self, x, y=None):
        """Learn the cut-off from ``x`` and return the detector itself.

        ``y`` is accepted for signature compatibility and is not used.
        """
        self.threshold = pd.Series(x).quantile(self.percentile)
        # Return the estimator rather than None so calls can be chained,
        # e.g. ``PercentileDetection().fit(train).predict(new)``.
        return self

    def predict(self, x, y=None):
        """Return a boolean array that is True where ``x`` > ``threshold``.

        ``y`` is accepted for signature compatibility and is not used.
        """
        return (pd.Series(x) > self.threshold).values

    def fit_predict(self, x, y=None):
        """Fit on ``x`` and return the outlier flags for the same ``x``."""
        return self.fit(x).predict(x)

# Flag readings above the 95th percentile and render them in a table whose
# outlier rows are shown in bold. Note that `df` ends up holding the pandas
# Styler returned by `.style.apply`, not the underlying DataFrame.
outlierd = PercentileDetection(percentile=0.95)
df = pd.DataFrame(
    {
        'hourly_traffic': hourly_traffic,
        'is_outlier': outlierd.fit_predict(hourly_traffic),
    }
).style.apply(
    # One CSS declaration per cell of the row, chosen by the row's flag.
    lambda row: [
        'font-weight:bold' if row['is_outlier'] == True else 'font-weight: normal'
    ] * len(row),
    axis=1,
)
df

start >>>


Unnamed: 0,hourly_traffic,is_outlier
0,120,False
1,123,False
2,124,False
3,119,False
4,196,False
5,121,False
6,118,False
7,117,False
8,500,True
9,132,False


In [2]:
# 1b. using percentiles for multi-dimensional data
# Run the detector on each feature column separately. Every pass rebinds
# `outlierd` and `y_pred`, so only the result for the second column (index 1)
# is left behind when the loop ends.
for feature_idx in (0, 1):
    outlierd = PercentileDetection(percentile=0.98)
    y_pred = outlierd.fit_predict(x[:, feature_idx])

class PercentileDetection:
    """Percentile-based outlier detector for one or more features.

    A separate cut-off is learned for every feature (column). A sample is
    flagged when at least one of its features is strictly greater than that
    feature's cut-off.

    Input is coerced with ``pandas.DataFrame``, so NumPy arrays, nested
    lists and DataFrames are all accepted; one-dimensional input is treated
    as a single feature.

    Parameters
    ----------
    percentile : float, default=0.9
        Quantile (between 0 and 1) handed to ``pandas.Series.quantile``.

    Attributes
    ----------
    thresholds : list
        One cut-off per feature, in column order. It only exists after
        ``fit`` has run.
    """

    def __init__(self, percentile=0.9):
        self.percentile = percentile

    def fit(self, x, y=None):
        """Learn one cut-off per column of ``x`` and return the detector.

        ``y`` is accepted for signature compatibility and is not used.
        """
        # DataFrame coercion gives every supported input a (rows, columns)
        # shape, including 1-D sequences, which become a single column.
        features = pd.DataFrame(x)
        self.thresholds = [
            features.iloc[:, i].quantile(self.percentile)
            for i in range(features.shape[1])
        ]
        # Return the estimator rather than None so calls can be chained.
        return self

    def predict(self, x, y=None):
        """Return a boolean array with one flag per sample (row) of ``x``.

        ``y`` is accepted for signature compatibility and is not used.
        """
        values = pd.DataFrame(x).values
        # Element-wise comparison broadcasts the per-column cut-offs across
        # rows; the row-wise max of booleans is True if any column exceeds.
        return (values > self.thresholds).max(axis=1)

    def fit_predict(self, x, y=None):
        """Fit on ``x`` and return the outlier flags for the same ``x``."""
        return self.fit(x).predict(x)
        
# Apply the multi-feature detector to the whole feature matrix, then score
# its flags against the class labels (class 1 is the positive label).
outlierd = PercentileDetection(percentile=0.98)
y_pred = outlierd.fit_predict(x)

print(
    'Precision: {:.02%}, Recall: {:.02%} [Percentile Detection]'.format(
        # Precision first, recall second, matching the placeholders above.
        *(
            metric(y, y_pred, pos_label=1)
            for metric in (precision_score, recall_score)
        )
    )
)


Precision: 4.00%, Recall: 5.00% [Percentile Detection]


In [3]:
# 2. anomaly detection using EllipticEnvelope
from sklearn.covariance import EllipticEnvelope
from sklearn.metrics import precision_score, recall_score

# Fit an EllipticEnvelope on all samples; the samples it labels -1 are
# treated as the predicted anomalies.
ee = EllipticEnvelope(random_state=0)
y_pred = ee.fit_predict(x) == -1

# The label names the model actually evaluated here (it previously read
# "Percentile Detection", copied over from the preceding cell).
print(
    'Precision: {:.02%}, Recall: {:.02%} [Elliptic Envelope]'.format(
        precision_score(y, y_pred, pos_label=1),
        recall_score(y, y_pred, pos_label=1),
    )
)


Precision: 9.00%, Recall: 45.00% [Percentile Detection]


In [4]:
# 3. anomaly detection using Local Outlier Factor (LOF)

from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import precision_score, recall_score
import numpy as np

# Labels from LOF's own default cut-off (-1 marks an outlier).
# fit_predict() also fits the estimator, so negative_outlier_factor_ can be
# read afterwards without constructing and fitting a second identical model.
lof = LocalOutlierFactor(n_neighbors=50)
y_pred = lof.fit_predict(x) == -1

# Replace the default cut-off with explicit ones: flag the samples whose
# negative outlier factor falls below the given quantile of all scores.
# NOTE: the loop variable keeps the name `quantile` because later cells of
# this notebook read it after the loop has finished.
for quantile in [0.01, 0.02, 0.1]:
    y_pred = lof.negative_outlier_factor_ < np.quantile(
        lof.negative_outlier_factor_, quantile
    )
    print(
        'LOF: Precision: {:.02%}, Recall: {:.02%} [Quantile={:.0%}]'.format(
            precision_score(y, y_pred, pos_label=1),
            recall_score(y, y_pred, pos_label=1),
            quantile,
        )
    )




LOF: Precision: 80.00%, Recall: 40.00% [Quantile=1%]
LOF: Precision: 50.00%, Recall: 50.00% [Quantile=2%]
LOF: Precision: 14.00%, Recall: 70.00% [Quantile=10%]


In [5]:
# Novelty detection using LOF

from sklearn.neighbors import LocalOutlierFactor

# Novelty mode: train only on the samples labelled as normal (y == 0), then
# ask the fitted model to label every sample; -1 marks a predicted anomaly.
x_inliers = x[y == 0]

lof = LocalOutlierFactor(n_neighbors=50, novelty=True)
lof.fit(x_inliers)
y_pred = lof.predict(x) == -1

# No quantile is involved in this model. The report used to print the
# leftover loop variable `quantile` from an earlier cell, which mislabelled
# the result and raised NameError when that cell had not been run.
print(
    'Novelty LOF: Precision: {:.02%}, Recall: {:.02%}'.format(
        precision_score(y, y_pred, pos_label=1),
        recall_score(y, y_pred, pos_label=1),
    )
)

Novelty LOF: Precision: 26.53%, Recall: 65.00% [Quantile=10%]


In [6]:
# 4. anomaly detection using isolation forest

from sklearn.ensemble import IsolationForest

# Isolation forest over all samples; -1 marks a predicted anomaly.
# random_state is fixed so repeated runs build the same trees.
iforest = IsolationForest(n_estimators=200, n_jobs=-1, random_state=10)
y_pred = iforest.fit_predict(x) == -1

# No quantile is involved in this model. The report used to print the
# leftover loop variable `quantile` from an earlier cell, which mislabelled
# the result and raised NameError when that cell had not been run.
print(
    'Isolation Forest: Precision: {:.02%}, Recall: {:.02%}'.format(
        precision_score(y, y_pred, pos_label=1),
        recall_score(y, y_pred, pos_label=1),
    )
)


Isolation Forest: Precision: 6.45%, Recall: 60.00% [Quantile=10%]
