In [1]:
%load_ext autoreload
%autoreload

import numpy as np
import pandas as pd
import os
import sys
import pickle

import plotly.express as px

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from anomalyzer.config import *
from anomalyzer.evaluate_performance import evaluate_performance
from anomalyzer.prepare_data import prepare_data

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, roc_auc_score

from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor

from sklearn.manifold import TSNE

# Data Preparation

### Load the data and combine all BENIGN traffic into a single data frame

Days to consider: 
- Monday (all benign)
- Tuesday (97% benign)
- Wednesday (64% benign)

Choose method. "Anomaly" leaves only attacks for test data, "Mixed" creates test data with benign and attack type communications.

In [2]:
# Test-set construction: "Anomaly" keeps only attacks, "Mixed" keeps benign and attack traffic.
method = "Mixed"

In [3]:
# Build the training frame plus the Tuesday and Wednesday test frames for the chosen method.
# NOTE(review): novelty_detection_preparation is not imported by name in this notebook;
# presumably it is exposed through `from anomalyzer.config import *` — confirm.
train_data, tue_test_data, wed_test_data = novelty_detection_preparation(method=method)

In [4]:
# Sanity check: (rows, columns) of the training frame.
train_data.shape

(1313826, 68)

In [5]:
# Sanity check: (rows, columns) of the Tuesday test frame.
tue_test_data.shape

(57014, 68)

In [6]:
# Sanity check: (rows, columns) of the Wednesday test frame.
wed_test_data.shape

(295692, 68)

### Preparing the Dataset for Training and Testing

Here we need to separate our class variable, which in our case is "Label", from the rest of the dataset.

Anomaly detection algorithms output +1 for inlier and -1 for outliers. Therefore:

1. We need to map multiple classes to binary classes.
2. Since we are interested in detecting anomalies, the negative class is 'BENIGN'.

In [7]:
# Split each frame into a feature matrix and a binary label vector, with
# 'BENIGN' as the negative class. The frames are processed in the order
# train, Tuesday, Wednesday, and each call's (X, y) pair is unpacked below.
(X_train, y_train), (X_tue_test, y_tue_test), (X_wed_test, y_wed_test) = (
    prepare_data(data=frame, class_column='Label', classes='binary', neg_class='BENIGN')
    for frame in (train_data, tue_test_data, wed_test_data)
)

The class label vector $y$ has the following mapping structure:

- 1: Attack
- 0: Benign

In [8]:
# Index the test features and labels by day name so later cells can select
# them with `day_of_week`.
X_test = dict(Tuesday=X_tue_test, Wednesday=X_wed_test)
y_test = dict(Tuesday=y_tue_test, Wednesday=y_wed_test)

# Training

## 1. Training the Isolation Forest Method

We train an Isolation Forest model to detect anomalies in the dataset. Returns -1 for outlier and 1 for inliers.

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html

In [None]:
# Fit the Isolation Forest on the training features, then pickle the fitted
# estimator so the prediction step can reload it without retraining.
# NOTE(review): the `behaviour` argument was deprecated in scikit-learn 0.22 and
# removed in 0.24 — confirm the pinned scikit-learn version still accepts it.
isf = IsolationForest(behaviour='new', random_state=42, n_jobs=3).fit(X_train)

filename = './models/novelty/isf_model_{}.pkl'.format(method)
with open(filename, mode='wb') as file:
    pickle.dump(isf, file)

## 2. Training the one-class SVM Method

We train a one-class SVM model to detect anomalies in the dataset. Returns -1 for outliers and 1 for inliers.

See: https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html#sklearn.svm.OneClassSVM

In [None]:
# Fit the RBF-kernel one-class SVM on the training features, then pickle the
# fitted estimator under a name that encodes the chosen method.
svm_model = OneClassSVM(kernel='rbf', gamma='auto').fit(X_train)

filename = './models/novelty/svm_model_{}.pkl'.format(method)
with open(filename, mode='wb') as file:
    pickle.dump(svm_model, file)

## 3. Training the Elliptic Envelope method

We train an Elliptic Envelope model, designed to detect outliers in a Gaussian-distributed dataset.

Returns -1 for outlier and 1 for inliers.

See: https://scikit-learn.org/stable/modules/generated/sklearn.covariance.EllipticEnvelope.html

In [None]:
# Fit the Elliptic Envelope on the training features (contamination fixed at
# 0.1, seeded for repeatability), then pickle the fitted estimator.
cov = EllipticEnvelope(contamination=0.1, random_state=0).fit(X_train)

filename = './models/novelty/cov_model_{}.pkl'.format(method)
with open(filename, mode='wb') as file:
    pickle.dump(cov, file)

## 4. Training the Local Outlier Factor (LOF) method

We train an LOF model to detect anomalies in the dataset.
Returns -1 for outlier and 1 for inliers.

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html

In [None]:
# Fit the Local Outlier Factor model in novelty mode (so it can later be asked
# to predict on unseen data), then pickle the fitted estimator.
lof = LocalOutlierFactor(n_neighbors=35, novelty=True, n_jobs=3).fit(X_train)

filename = './models/novelty/lof_model_{}.pkl'.format(method)
with open(filename, mode='wb') as file:
    pickle.dump(lof, file)

# Predict & Evaluate

### Input Parameters

Specify: 

1. Which _Day of Week_ to use for anomaly detection ('Tuesday' or 'Wednesday')
2. _Model_ to make predictions ('isf' for Isolation Forest, 'svm' for One-Class SVM, 'cov' for Elliptic Envelope or 'lof' for Local Outlier Factor).
3. What _fraction_ of the dataset to use in visualizations in this notebook

In [9]:
# Which test day to evaluate: 'Tuesday' or 'Wednesday' (keys of X_test / y_test).
day_of_week = 'Tuesday'

# Which fitted detector to use: 'isf', 'svm', 'cov' or 'lof' (prefix of the pickled model file).
model = 'lof'

# Fraction of the test points to draw for the visualizations below.
fraction = 0.1 

### Predict or Load Anomalies

Predict method returns 1 for inlier and -1 for outlier (aka anomalies)

First the code will try to load existing predictions. 

If such predictions do not exist, the code will try to load the model that can make such predictions and save the predictions for future use.

If neither predictions nor the model exists, it will raise an error.

These are transformed into:
- 1: Anomaly
- 0: Normal

In [10]:
# Cached predictions for the selected day / model / method.
filename = './predictions/novelty/'+day_of_week+'/'+model+'_anomalies_'+method+'.pkl'

try:

    print('Loading predictions...')

    # NOTE: pickle.load can execute arbitrary code; only load files that this
    # notebook (or a trusted collaborator) produced.
    with open(filename,'rb') as file:
        anomalies = pickle.load(file)

    print('Predictions loaded!')

except FileNotFoundError:
    # Only a missing cache triggers the fallback; any other failure (corrupt
    # pickle, interrupt, typo) surfaces instead of being silently swallowed.
    # If the model file is missing too, the FileNotFoundError below propagates.
    model_filename = './models/novelty/'+model+'_model_'+method+'.pkl'

    with open(model_filename,'rb') as file:
        anomaly_model = pickle.load(file)

    print('Making predictions...')

    anomalies = anomaly_model.predict(X_test[day_of_week])

    # Make sure the cache directory exists so the predictions computed above
    # are not lost to a failed write.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    with open(filename,'wb') as file:
        pickle.dump(anomalies,file)

    print('Predictions made! Also saved for future reference.')

# predict() yields +1 for inliers and -1 for outliers; remap to 0 (normal) / 1 (anomaly).
anomalies = (anomalies*-1 +1)*0.5
anomalies = anomalies.astype(int)

Loading predictions...
Predictions loaded!


### Performance measures

If method is 'Anomaly', we investigate the detection performance only by accuracy.

If method is 'Mixed', we investigate the detection performance using:

1. Confusion Matrix (full picture of anomaly classification)
2. Accuracy
3. Precision of the anomaly class
4. Recall on the anomaly class
5. F-1 Measure on the anomaly class
6. AUC score

In [None]:
# With method == 'Anomaly' the test set is described as attacks only, so the
# share of points flagged as anomalous (label 1) is the detection accuracy.
# For any other method this shortcut does not apply and None is reported.
if method == 'Anomaly':
    # Vectorised sum; the original also built a [n_anomalous, n_normal] list
    # here whose value was discarded, so it has been removed.
    accuracy_metric = anomalies.sum()/anomalies.size
else:
    accuracy_metric = None

print(accuracy_metric)

In [None]:
# Confusion matrix with the attack class (1) listed first: rows are true labels,
# columns are predicted labels, both in the order [1, 0].
confusion_matrix(y_test[day_of_week],anomalies,labels=[1,0])

In [None]:
# Summary metrics for the selected day, with label 1 (attack) passed as the positive class.
# NOTE(review): evaluate_performance is project-local; what it returns and the exact
# meaning of `dec_digits` / `just_numbers` are not visible from this notebook.
evaluate_performance(y_test[day_of_week],anomalies,dec_digits=10,pos_label=1,just_numbers=True)

# Visualize

Load the t-SNE transformed data in 2-D for visualization. If no saved transform exists yet for this day, t-SNE is computed and the result is saved for future use.

In [16]:
# Cached 2-D t-SNE embedding of the selected day's test features.
# NOTE(review): the cache name does not include `method`, so an embedding cached
# under one method may not line up with the test set of another — confirm.
filename = './tsne_transforms/novelty/'+day_of_week+'_perplexity_30_anomaly.pkl'

try:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(filename,'rb') as file:
        x_vis = pickle.load(file)
    print('Visualization loaded!')
except FileNotFoundError:
    # No cached embedding yet: compute it once and store it.

    print('Performing t-SNE on the dataset.')
    x_red_tsne = TSNE(n_components=2,perplexity=30).fit_transform(X_test[day_of_week])
    print('Completed.')

    # The plotting cells read `x_vis`; the original left it undefined on this
    # branch, so the first run failed with a NameError further down.
    x_vis = x_red_tsne

    # Make sure the cache directory exists before writing.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename,'wb') as file:
        pickle.dump(x_red_tsne,file)
    print('Saved the t-SNE transformation for future retrieval.')

Visualization loaded!


Plot the detected anomalies in the t-SNE transformed 2-D representation of the data.

In [17]:
# Draw roughly `fraction` of the embedded points for plotting.
# NOTE(review): np.random.randint draws WITH replacement and no seed is set, so
# the subsample may contain duplicate rows and changes from run to run.
sample_size = int(fraction * x_vis.shape[0])

index = np.random.randint(low=0, high=x_vis.shape[0], size=sample_size)

# Rows of the embedding at the sampled positions (all columns kept).
x_red_vis = x_vis[index, :]

### Prepare a data frame for the plotly interactive scatter plot.

In [18]:
# Assemble the plotting frame: 2-D coordinates, the predicted flag for each
# sampled point, and the point's position in the full test set.
x_plotly = pd.DataFrame(x_red_vis,columns=['dim1','dim2'])
x_plotly['anomaly']=anomalies[index]
x_plotly['index']=index
# `anomalies` was remapped earlier so that 1 marks an anomaly and 0 normal
# traffic; the original dictionary had the two labels swapped.
anomaly_dict={1:'Anomaly',0:'Normal'}
x_plotly['Label']=x_plotly['anomaly'].map(anomaly_dict)

In [19]:
# Sanity check: which labels actually occur in the sampled points.
x_plotly['Label'].unique()

array(['Anomaly', 'Normal'], dtype=object)

### Plot.ly Interactive Visual

In [72]:
# Interactive scatter of the sampled t-SNE embedding, coloured by predicted label.
fig = px.scatter(
    x_plotly,
    x='dim1',
    y='dim2',
    color='Label',
    color_discrete_map={'Anomaly': 'limegreen', 'Normal': 'midnightblue'},
    hover_name='Label',
    labels={'dim1': 'Latent Dimension 1', 'dim2': 'Latent Dimension 2'},
    title='Anomaly Detection Results for ' + day_of_week + ' Network Traffic',
    render_mode='webgl',
    width=900,
    height=600,
    template='seaborn'
)

# Centre and enlarge the title; enlarge the legend and axis-title fonts.
fig.update_layout(
    title={'x': 0.5, 'font': {'size': 24}},
    legend={'font': {'size': 18}},
    xaxis={'title': {'font': {'size': 18}}},
    yaxis={'title': {'font': {'size': 18}}}
)

fig.show()