In [105]:
%load_ext autoreload
%autoreload

import numpy as np
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# Put the parent directory on the import path (only once) so that the
# `anomalyzer` package imported right after this can be resolved.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
from anomalyzer.config import loader
from anomalyzer.evaluate_performance import evaluate_performance
from anomalyzer.prepare_data import prepare_data

%matplotlib inline
plt.rcParams['figure.figsize'] = [10, 8]

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, roc_auc_score

from sklearn.decomposition import PCA

from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor

### Load the data

In [None]:
# Day of the capture to analyse; the same value is embedded in the names of
# the model / prediction files written further down.
day_of_week = 'Tuesday'

# Ask the project loader for the processed, normalized data file of that day.
data_file = loader(
    day_of_week=day_of_week,
    data_type='processed',
    subtype='Normalized',
)

# NOTE: unpickling can execute arbitrary code — only load files you trust.
df = pd.read_pickle(data_file)

## Preparing the Dataset for Anomaly Detection 

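Here we need to separate our class variable, which in our case is "Label", from the rest of the dataset.

Anomaly detection algorithms output +1 for inliers and -1 for outliers. Therefore:

1. We need to map multiple classes to binary classes.
2. Since we are interested in detecting anomalies, the negative class is 'BENIGN'.

Note that the binary labels must follow the same sign convention as the detectors' predictions; otherwise the performance measures computed below are evaluated against inverted labels.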

In [93]:
# Separate the "Label" column from the remaining columns and binarise it.
# NOTE(review): confirm which sign prepare_data assigns to `neg_class`; the
# detectors used in this notebook emit -1 for outliers and +1 for inliers.
X, y = prepare_data(
    data=df,
    class_column='Label',
    job='anomaly',
    classes='binary',
    neg_class='BENIGN',
)

### Subset: Dimensionality Reduction w/ PCA

We first apply PCA to the dataset to reduce the number of features. The number of components is informed by the feature ranking provided by the XGBoost models trained on the dataset.

In [110]:
# Number of principal components to keep.
new_dimension = 28

# Fit PCA on the feature matrix and project it onto the retained components.
pca_model = PCA(
    n_components=new_dimension,
    random_state=4162,
)
x_red_pca = pca_model.fit_transform(X)

## 1. Training the Isolation Forest Method

We train an Isolation Forest model to detect anomalies in the dataset. Its `predict` method returns -1 for outliers and 1 for inliers.

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html

### Train and save the model

We first train our model and save it with pickle for future retrieval.

The contamination parameter of the model is the expected fraction of outliers in the dataset and should be adjusted if necessary.

In [86]:
# Train the Isolation Forest on the PCA-reduced features.
# `contamination` is the expected fraction of outliers in the data.
# The former `behaviour='new'` argument is intentionally not passed: it was
# deprecated in scikit-learn 0.22 and removed in 0.24, where supplying it
# raises a TypeError. predict() returns the same labels without it.
isf = IsolationForest(contamination=0.05, random_state=42, n_jobs=3)
isf.fit(x_red_pca)

filename = './models/isf_model_' + day_of_week + '_PCA_' + str(new_dimension) + '.pkl'

# Create the output folder if needed so open() does not fail on a fresh checkout.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Persist the fitted model for later retrieval.
with open(filename, 'wb') as file:
    pickle.dump(isf, file)

### Predict anomalies

Returns 1 for inlier and -1 for outlier (aka anomalies)

In [87]:
# Label the same PCA-reduced samples the model was fitted on.
isf_anomalies = isf.predict(x_red_pca)
# Bare expression so the notebook displays the predicted labels.
isf_anomalies

array([1, 1, 1, ..., 1, 1, 1])

### Performance measures

We choose to investigate the performance using 

1. Confusion Matrix (full picture of anomaly classification)
2. Accuracy
3. Precision of the anomaly class
4. Recall on the anomaly class
5. F-1 Measure on the anomaly class
6. AUC score

In [62]:
# Rows are the true labels (y) and columns the predictions, both ordered [-1, 1].
# NOTE(review): in the stored output the true -1 row holds ~97% of the samples
# while only ~5% of the samples are predicted -1, so y may use the opposite
# sign convention to the detector (BENIGN encoded as -1?) — confirm the
# encoding produced by prepare_data before trusting the metrics.
confusion_matrix(y,isf_anomalies,labels=[-1,1])

array([[ 22156, 409657],
       [   121,  13711]], dtype=int64)

In [63]:
# Report the summary metrics of the Isolation Forest predictions against y.
evaluate_performance(y,isf_anomalies)

Accuracy: 0.08
Precision: 0.995
Recall: 0.051
f-1 score: 0.098
AUC score: 0.521


## 2. Training the one-class SVM Method

We train a one-class SVM model to detect anomalies in the dataset. Its `predict` method returns -1 for outliers and 1 for inliers.

See: https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html#sklearn.svm.OneClassSVM

### Train and save the model

We first train our model and save it with pickle for future retrieval. The kernel choice is the **Radial Basis Function** (`'rbf'`), as we would like to have locality in the decision boundaries learned by our SVM.

In [25]:
# Train a one-class SVM with an RBF kernel on the PCA-reduced features.
svm_model = OneClassSVM(kernel='rbf', gamma='auto')
svm_model.fit(x_red_pca)

# Name the file like the other saved artefacts: '_' before the day and a
# '_PCA_<n>' suffix, so models trained with different PCA sizes do not
# overwrite each other. Previously this was './models/svm_model<day>.pkl';
# update any code that loads the old name.
filename = './models/svm_model_' + day_of_week + '_PCA_' + str(new_dimension) + '.pkl'

# Create the output folder if needed so open() does not fail on a fresh checkout.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Persist the fitted model for later retrieval.
with open(filename, 'wb') as file:
    pickle.dump(svm_model, file)



### Predict anomalies

Returns 1 for inlier and -1 for outlier (aka anomalies)

In [None]:
# Label the same PCA-reduced samples the SVM was fitted on.
svm_anomalies = svm_model.predict(x_red_pca)
# Bare expression so the notebook displays the predicted labels.
svm_anomalies

### Performance measures

We choose to investigate the performance using 

1. Confusion Matrix (full picture of anomaly classification)
2. Accuracy
3. Precision of the anomaly class
4. Recall on the anomaly class
5. F-1 Measure on the anomaly class
6. AUC score

In [None]:
# Report the summary metrics of the one-class SVM predictions against y.
# NOTE(review): the third positional argument (3) is not passed in the
# Isolation Forest evaluation; its meaning is not visible here — confirm
# against evaluate_performance's signature and make both calls consistent.
evaluate_performance(y,svm_anomalies,3)

## 3. Training the Elliptic Envelope method.

We train an Elliptic Envelope model, which is designed to detect outliers in a Gaussian-distributed dataset.

Its `predict` method returns -1 for outliers and 1 for inliers.

See: https://scikit-learn.org/stable/modules/generated/sklearn.covariance.EllipticEnvelope.html

### Train and save the model

We first train our model and save it with pickle for future retrieval.

The contamination parameter of the model is the expected fraction of outliers in the dataset and should be adjusted if necessary.

In [91]:
# Fit an Elliptic Envelope on the PCA-reduced features.
# `contamination` is the expected fraction of outliers in the data.
cov = EllipticEnvelope(contamination=0.1, random_state=0)
cov.fit(x_red_pca)

filename = './models/cov_model_' + day_of_week + '_PCA_' + str(new_dimension) + '.pkl'

# Create the output folder if needed so open() does not fail on a fresh checkout.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Persist the fitted model for later retrieval.
with open(filename, 'wb') as file:
    pickle.dump(cov, file)



## 4. Training the Local Outlier Factor (LOF) method.

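We train an LOF model to detect anomalies in the dataset.
`fit_predict` returns -1 for outliers and 1 for inliers.

Here LOF is used in its default outlier-detection mode (`novelty=False`), in which `fit_predict` labels the same samples the model is fitted on. Setting `novelty=True` is only needed to score new, unseen data with `predict`; in that mode `fit_predict` is not available.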

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html

In [112]:
# `novelty` is left at its default (False): the model labels the very samples
# it is fitted on, which is what fit_predict does; fit_predict is not
# available when novelty=True.
lof = LocalOutlierFactor(contamination=0.05, n_jobs=3)

# -1 marks outliers, 1 marks inliers.
lof_anomalies = lof.fit_predict(x_red_pca)

filename = './predictions/lof_anomalies_' + day_of_week + '_PCA_' + str(new_dimension) + '.pkl'

# Create the output folder if needed so open() does not fail on a fresh checkout.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Only the predictions are stored here, not the fitted estimator.
with open(filename, 'wb') as file:
    pickle.dump(lof_anomalies, file)