In [1]:
import pyod
import os
import sys
from time import time 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.io import loadmat #matlab file reading

In [2]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging



Import metric packages

In [3]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores

In [4]:
from sklearn.metrics import roc_auc_score

# Assigning all the datasets to a list

In [5]:
mat_file_list = ['arrhythmia.mat','cardio.mat',
                 'glass.mat','ionosphere.mat',
                 'letter.mat','lympho.mat',
                 'mnist.mat','musk.mat',
                 'optdigits.mat','pendigits.mat',
                 'pima.mat','satellite.mat',
                 'satimage-2.mat','shuttle.mat',
                 'vertebral.mat','vowels.mat','wbc.mat']

In [6]:
mat_file_list

['arrhythmia.mat',
 'cardio.mat',
 'glass.mat',
 'ionosphere.mat',
 'letter.mat',
 'lympho.mat',
 'mnist.mat',
 'musk.mat',
 'optdigits.mat',
 'pendigits.mat',
 'pima.mat',
 'satellite.mat',
 'satimage-2.mat',
 'shuttle.mat',
 'vertebral.mat',
 'vowels.mat',
 'wbc.mat']

In [7]:
len(mat_file_list)

17

#### Making a list of the column name to record the result

In [8]:
df_columns=['Data','#Sample','#Dimensions','Outlier Perc','PCA','MCD','OCSVM','LOF','CBLOF','KNN','HBOS','ABOD',
            'IFOREST','FEATUREBAGGING']


####  Creating empty dataframes
#####  ROC Dataframe to record all Roc values performed on each dataset
###### Precison Dataframe to record all Precison values performed on each dataset
##### Execution Time Dataframe to record the time taken to perform algorithm on each dataset, So as to find the algorithm which takes minimum amount of time and gives best accuracy


### 1.) ROC Dataframe

In [9]:
roc_df=pd.DataFrame(columns=df_columns)
roc_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,FEATUREBAGGING


In [10]:
#2.) Precison Dataframe
prn_df=pd.DataFrame(columns=df_columns)
prn_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,FEATUREBAGGING


In [11]:
#3.) Execution Time Dataframe
time_df=pd.DataFrame(columns=df_columns)
time_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,FEATUREBAGGING


In [14]:
#Load one Dataframe to check values of X and y
data_1 = loadmat(r"C:\Users\ADMIN\Desktop\LetsUpgrade\Anomaly Detection_project\Anamoly_detec_data\vowels.mat") 
data_1

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-26 08:42:13 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.58046914, -0.90253404,  0.61789919, ...,  1.60463715,
         -0.6230598 , -0.38312549],
        [ 0.78437493, -1.07736635,  0.6157809 , ...,  1.26023551,
         -0.42333934, -0.2877912 ],
        [ 0.79129238, -1.08624216,  0.66977272, ...,  1.08179729,
         -0.26720104, -0.17220348],
        ...,
        [ 0.9470763 ,  0.35810832,  0.27472497, ..., -1.08832841,
          0.3271257 ,  1.69283401],
        [ 1.58485142,  0.69359118, -0.37568588, ..., -3.07682047,
         -0.24109405,  1.94433536],
        [ 2.32735022,  0.38281412,  0.77590669, ..., -0.48257003,
         -0.59043614, -0.72199018]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

#### Inference:
Mat files are in form of Dictionary
In the file, header , version , globals are predefined clases
X and y are the variables we are going to use, Like in ML we don't need to define the X & y seperately.
X & y are 2D numpy arrays

### Exploring Mat files and finding best Algorithm to detect Anomaly

In [15]:
# Creating random state
random_state = np.random.RandomState(42)

# Processing mat files one by one : 
for mat_file in mat_file_list:
    print("\n... Processing", mat_file, '...')
    mat = loadmat(os.path.join('Anamoly_detec_data', mat_file))

    X = mat['X']
    y = mat['y'].ravel()  #ravel() function converts 2D to 1D
    
    # Counting Outlier :
    
    # Counts the number of non-zero values in the array y and divide by length of y : It gives outlier in fraction
    outliers_fraction = np.count_nonzero(y) / len(y)
    
    # Calculating Outlier percentage
    outliers_percentage = round(outliers_fraction * 100, ndigits=4)

    # Construct containers for saving results
    roc_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    prn_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    time_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]

    # Spliting Data into : 60% data for training and 40% for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=random_state)
    
    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    # Applying all the algorithms and storing thier result in a dictionary format:
    classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction),
                   
                   'Cluster-based Local Outlier Factor': CBLOF(contamination=outliers_fraction, check_estimator=False,
                                                               random_state=random_state),
                   
                   'Feature Bagging': FeatureBagging(contamination=outliers_fraction, random_state=random_state),
                   
                   'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction),
                   
                   'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state),
                   
                   'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
                   
                   'Local Outlier Factor (LOF)': LOF(contamination=outliers_fraction),
                   
                   'Minimum Covariance Determinant (MCD)': MCD(contamination=outliers_fraction, random_state=random_state),
                   
                   'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
                   
                   'Principal Component Analysis (PCA)': PCA(contamination=outliers_fraction, random_state=random_state),
   }


    # Calculating Time taken for each algorithm
    for clf_name, clf in classifiers.items():
        # Initialize the start time 
        t0 = time() 
        
        # Fit( Train )the data
        clf.fit(X_train_norm) 
        
        # Predicting Value on Xtest
        test_scores = clf.decision_function(X_test_norm)  
        
        # Final Time
        t1 = time()   
        
        # Total time duration : t1 - t0
        duration = round(t1 - t0, ndigits=4) 
        
        # Append duration in time list
        time_list.append(duration)

        #Calculating roc and precision value of the algorithm
        roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test, test_scores), ndigits=4)

        # Print the roc , precision and executing time 
        print('{clf_name} ROC:{roc}, precision @ rank n:{prn}, '
              'execution time: {duration}s'.format(clf_name=clf_name, roc=roc, prn=prn, duration=duration))

        # Append roc and precision value to their respective list
        roc_list.append(roc)
        prn_list.append(prn)

    temp_df = pd.DataFrame(time_list).transpose()
    temp_df.columns = df_columns
    time_df = pd.concat([time_df, temp_df], axis=0)

    temp_df = pd.DataFrame(roc_list).transpose()
    temp_df.columns = df_columns
    roc_df = pd.concat([roc_df, temp_df], axis=0)
    
    temp_df = pd.DataFrame(prn_list).transpose()
    temp_df.columns = df_columns
    prn_df = pd.concat([prn_df, temp_df], axis=0)



... Processing arrhythmia.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7687, precision @ rank n:0.3571, execution time: 3.7185s




Cluster-based Local Outlier Factor ROC:0.7684, precision @ rank n:0.4643, execution time: 2.3701s
Feature Bagging ROC:0.7799, precision @ rank n:0.5, execution time: 0.5702s
Histogram-base Outlier Detection (HBOS) ROC:0.8511, precision @ rank n:0.5714, execution time: 1.482s
Isolation Forest ROC:0.8478, precision @ rank n:0.5357, execution time: 0.5277s
K Nearest Neighbors (KNN) ROC:0.782, precision @ rank n:0.5, execution time: 0.09s
Local Outlier Factor (LOF) ROC:0.7787, precision @ rank n:0.4643, execution time: 0.07s




Minimum Covariance Determinant (MCD) ROC:0.8228, precision @ rank n:0.4286, execution time: 1.775s
One-class SVM (OCSVM) ROC:0.7986, precision @ rank n:0.5, execution time: 0.0999s
Principal Component Analysis (PCA) ROC:0.7997, precision @ rank n:0.5, execution time: 0.1704s

... Processing cardio.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.5763, precision @ rank n:0.1875, execution time: 0.3532s
Cluster-based Local Outlier Factor ROC:0.8221, precision @ rank n:0.4844, execution time: 0.1399s




Feature Bagging ROC:0.4879, precision @ rank n:0.1406, execution time: 0.6978s
Histogram-base Outlier Detection (HBOS) ROC:0.8453, precision @ rank n:0.4688, execution time: 0.0s
Isolation Forest ROC:0.9316, precision @ rank n:0.4531, execution time: 0.3703s
K Nearest Neighbors (KNN) ROC:0.6959, precision @ rank n:0.2812, execution time: 0.1299s
Local Outlier Factor (LOF) ROC:0.4715, precision @ rank n:0.125, execution time: 0.0951s




Minimum Covariance Determinant (MCD) ROC:0.8778, precision @ rank n:0.3906, execution time: 0.5915s
One-class SVM (OCSVM) ROC:0.9507, precision @ rank n:0.5938, execution time: 0.0699s
Principal Component Analysis (PCA) ROC:0.9638, precision @ rank n:0.6875, execution time: 0.0999s

... Processing glass.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7104, precision @ rank n:0.25, execution time: 0.0421s
Cluster-based Local Outlier Factor ROC:0.8506, precision @ rank n:0.25, execution time: 0.031s
Feature Bagging ROC:0.7043, precision @ rank n:0.25, execution time: 0.03s
Histogram-base Outlier Detection (HBOS) ROC:0.6524, precision @ rank n:0.0, execution time: 0.0s




Isolation Forest ROC:0.7195, precision @ rank n:0.25, execution time: 0.3435s
K Nearest Neighbors (KNN) ROC:0.7805, precision @ rank n:0.25, execution time: 0.01s
Local Outlier Factor (LOF) ROC:0.7774, precision @ rank n:0.25, execution time: 0.0s
Minimum Covariance Determinant (MCD) ROC:0.7165, precision @ rank n:0.0, execution time: 0.1499s
One-class SVM (OCSVM) ROC:0.6189, precision @ rank n:0.25, execution time: 0.0s
Principal Component Analysis (PCA) ROC:0.622, precision @ rank n:0.25, execution time: 0.0199s

... Processing ionosphere.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.9004, precision @ rank n:0.8214, execution time: 0.064s
Cluster-based Local Outlier Factor ROC:0.8952, precision @ rank n:0.8036, execution time: 0.05s




Feature Bagging ROC:0.8933, precision @ rank n:0.75, execution time: 0.0714s
Histogram-base Outlier Detection (HBOS) ROC:0.5195, precision @ rank n:0.3393, execution time: 0.009s
Isolation Forest ROC:0.8294, precision @ rank n:0.6607, execution time: 0.3101s
K Nearest Neighbors (KNN) ROC:0.9134, precision @ rank n:0.8393, execution time: 0.02s
Local Outlier Factor (LOF) ROC:0.8989, precision @ rank n:0.75, execution time: 0.01s
Minimum Covariance Determinant (MCD) ROC:0.9399, precision @ rank n:0.8571, execution time: 0.0599s
One-class SVM (OCSVM) ROC:0.8372, precision @ rank n:0.7143, execution time: 0.0s
Principal Component Analysis (PCA) ROC:0.7971, precision @ rank n:0.5893, execution time: 0.08s

... Processing letter.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.8465, precision @ rank n:0.275, execution time: 0.3466s
Cluster-based Local Outlier Factor ROC:0.7423, precision @ rank n:0.175, execution time: 0.0999s




Feature Bagging ROC:0.866, precision @ rank n:0.4, execution time: 0.6756s
Histogram-base Outlier Detection (HBOS) ROC:0.5728, precision @ rank n:0.125, execution time: 0.01s
Isolation Forest ROC:0.5836, precision @ rank n:0.05, execution time: 0.3766s
K Nearest Neighbors (KNN) ROC:0.845, precision @ rank n:0.3, execution time: 0.1199s
Local Outlier Factor (LOF) ROC:0.8409, precision @ rank n:0.325, execution time: 0.0871s
Minimum Covariance Determinant (MCD) ROC:0.7499, precision @ rank n:0.075, execution time: 0.952s
One-class SVM (OCSVM) ROC:0.5744, precision @ rank n:0.1, execution time: 0.07s
Principal Component Analysis (PCA) ROC:0.48, precision @ rank n:0.05, execution time: 0.01s

... Processing lympho.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.9382, precision @ rank n:0.4, execution time: 0.03s
Cluster-based Local Outlier Factor ROC:0.9709, precision @ rank n:0.6, execution time: 0.0643s
Feature Bagging ROC:0.9673, precision @ rank n:0.6, execution time: 0.03s
Histogram



Isolation Forest ROC:0.9855, precision @ rank n:0.6, execution time: 0.277s
K Nearest Neighbors (KNN) ROC:0.9636, precision @ rank n:0.6, execution time: 0.01s
Local Outlier Factor (LOF) ROC:0.9636, precision @ rank n:0.6, execution time: 0.01s
Minimum Covariance Determinant (MCD) ROC:0.9164, precision @ rank n:0.6, execution time: 0.03s
One-class SVM (OCSVM) ROC:0.9636, precision @ rank n:0.6, execution time: 0.0s
Principal Component Analysis (PCA) ROC:0.9818, precision @ rank n:0.8, execution time: 0.0s

... Processing mnist.mat ...




Angle-based Outlier Detector (ABOD) ROC:0.7813, precision @ rank n:0.3562, execution time: 6.3057s




Cluster-based Local Outlier Factor ROC:0.8447, precision @ rank n:0.4007, execution time: 0.6192s
Feature Bagging ROC:0.7259, precision @ rank n:0.3664, execution time: 43.9612s
Histogram-base Outlier Detection (HBOS) ROC:0.5675, precision @ rank n:0.1199, execution time: 0.04s
Isolation Forest ROC:0.7813, precision @ rank n:0.3116, execution time: 1.7916s
K Nearest Neighbors (KNN) ROC:0.8409, precision @ rank n:0.4144, execution time: 6.0038s
Local Outlier Factor (LOF) ROC:0.7085, precision @ rank n:0.339, execution time: 5.7245s




Minimum Covariance Determinant (MCD) ROC:0.863, precision @ rank n:0.3973, execution time: 2.7034s
One-class SVM (OCSVM) ROC:0.8417, precision @ rank n:0.3801, execution time: 4.0896s
Principal Component Analysis (PCA) ROC:0.8396, precision @ rank n:0.3767, execution time: 0.1399s

... Processing musk.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.0809, precision @ rank n:0.0333, execution time: 1.892s
Cluster-based Local Outlier Factor ROC:1.0, precision @ rank n:1.0, execution time: 0.1799s




Feature Bagging ROC:0.5228, precision @ rank n:0.1667, execution time: 11.8304s
Histogram-base Outlier Detection (HBOS) ROC:0.9999, precision @ rank n:0.9667, execution time: 0.0529s
Isolation Forest ROC:0.9992, precision @ rank n:0.9, execution time: 1.116s
K Nearest Neighbors (KNN) ROC:0.7348, precision @ rank n:0.2333, execution time: 1.5459s
Local Outlier Factor (LOF) ROC:0.5323, precision @ rank n:0.1333, execution time: 1.4611s
Minimum Covariance Determinant (MCD) ROC:1.0, precision @ rank n:0.9667, execution time: 10.9051s
One-class SVM (OCSVM) ROC:1.0, precision @ rank n:1.0, execution time: 1.0863s
Principal Component Analysis (PCA) ROC:1.0, precision @ rank n:1.0, execution time: 0.1199s

... Processing optdigits.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.4428, precision @ rank n:0.0161, execution time: 2.1612s




Cluster-based Local Outlier Factor ROC:0.7852, precision @ rank n:0.0, execution time: 0.244s
Feature Bagging ROC:0.4641, precision @ rank n:0.0484, execution time: 12.4361s
Histogram-base Outlier Detection (HBOS) ROC:0.8822, precision @ rank n:0.2581, execution time: 0.03s
Isolation Forest ROC:0.5442, precision @ rank n:0.0161, execution time: 0.7995s
K Nearest Neighbors (KNN) ROC:0.3824, precision @ rank n:0.0, execution time: 1.5302s
Local Outlier Factor (LOF) ROC:0.4584, precision @ rank n:0.0484, execution time: 1.4101s




Minimum Covariance Determinant (MCD) ROC:0.3486, precision @ rank n:0.0, execution time: 1.0794s
One-class SVM (OCSVM) ROC:0.4972, precision @ rank n:0.0, execution time: 1.2594s
Principal Component Analysis (PCA) ROC:0.504, precision @ rank n:0.0, execution time: 0.04s

... Processing pendigits.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7008, precision @ rank n:0.0308, execution time: 1.3106s
Cluster-based Local Outlier Factor ROC:0.9609, precision @ rank n:0.3077, execution time: 0.1899s




Feature Bagging ROC:0.4687, precision @ rank n:0.0462, execution time: 4.1249s
Histogram-base Outlier Detection (HBOS) ROC:0.9294, precision @ rank n:0.2615, execution time: 0.0101s
Isolation Forest ROC:0.9482, precision @ rank n:0.2615, execution time: 0.595s
K Nearest Neighbors (KNN) ROC:0.7602, precision @ rank n:0.0462, execution time: 0.5556s
Local Outlier Factor (LOF) ROC:0.481, precision @ rank n:0.0462, execution time: 0.4975s
Minimum Covariance Determinant (MCD) ROC:0.8271, precision @ rank n:0.0615, execution time: 2.0542s
One-class SVM (OCSVM) ROC:0.93, precision @ rank n:0.2923, execution time: 1.1941s
Principal Component Analysis (PCA) ROC:0.9332, precision @ rank n:0.3385, execution time: 0.014s

... Processing pima.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.6757, precision @ rank n:0.5106, execution time: 0.2484s
Cluster-based Local Outlier Factor ROC:0.684, precision @ rank n:0.4681, execution time: 0.1396s




Feature Bagging ROC:0.6446, precision @ rank n:0.4468, execution time: 0.1786s
Histogram-base Outlier Detection (HBOS) ROC:0.7169, precision @ rank n:0.5213, execution time: 0.005s
Isolation Forest ROC:0.6777, precision @ rank n:0.4787, execution time: 0.4311s
K Nearest Neighbors (KNN) ROC:0.7252, precision @ rank n:0.5106, execution time: 0.0309s
Local Outlier Factor (LOF) ROC:0.6604, precision @ rank n:0.4787, execution time: 0.01s
Minimum Covariance Determinant (MCD) ROC:0.7047, precision @ rank n:0.4787, execution time: 0.0438s
One-class SVM (OCSVM) ROC:0.6423, precision @ rank n:0.4574, execution time: 0.014s
Principal Component Analysis (PCA) ROC:0.6639, precision @ rank n:0.5, execution time: 0.003s

... Processing satellite.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.5681, precision @ rank n:0.3918, execution time: 1.5792s




Cluster-based Local Outlier Factor ROC:0.7234, precision @ rank n:0.5574, execution time: 0.2513s
Feature Bagging ROC:0.557, precision @ rank n:0.4051, execution time: 7.5387s
Histogram-base Outlier Detection (HBOS) ROC:0.7393, precision @ rank n:0.5466, execution time: 0.017s
Isolation Forest ROC:0.7094, precision @ rank n:0.578, execution time: 0.8433s
K Nearest Neighbors (KNN) ROC:0.6781, precision @ rank n:0.4994, execution time: 1.0686s
Local Outlier Factor (LOF) ROC:0.5551, precision @ rank n:0.4051, execution time: 0.8546s
Minimum Covariance Determinant (MCD) ROC:0.792, precision @ rank n:0.6747, execution time: 2.4413s
One-class SVM (OCSVM) ROC:0.636, precision @ rank n:0.5224, execution time: 1.1827s
Principal Component Analysis (PCA) ROC:0.5783, precision @ rank n:0.4559, execution time: 0.02s

... Processing satimage-2.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.86, precision @ rank n:0.2593, execution time: 2.3697s




Cluster-based Local Outlier Factor ROC:0.9987, precision @ rank n:0.8889, execution time: 0.251s
Feature Bagging ROC:0.4971, precision @ rank n:0.0741, execution time: 6.4527s
Histogram-base Outlier Detection (HBOS) ROC:0.9837, precision @ rank n:0.5926, execution time: 0.0291s
Isolation Forest ROC:0.9973, precision @ rank n:0.8889, execution time: 0.7785s
K Nearest Neighbors (KNN) ROC:0.9505, precision @ rank n:0.3704, execution time: 0.7552s
Local Outlier Factor (LOF) ROC:0.5006, precision @ rank n:0.0741, execution time: 0.6453s
Minimum Covariance Determinant (MCD) ROC:0.9946, precision @ rank n:0.5185, execution time: 1.782s
One-class SVM (OCSVM) ROC:0.9976, precision @ rank n:0.9259, execution time: 1.0605s
Principal Component Analysis (PCA) ROC:0.9841, precision @ rank n:0.8519, execution time: 0.0289s

... Processing shuttle.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.6186, precision @ rank n:0.1918, execution time: 17.1251s




Cluster-based Local Outlier Factor ROC:0.6286, precision @ rank n:0.2336, execution time: 0.6324s
Feature Bagging ROC:0.5211, precision @ rank n:0.111, execution time: 48.6s
Histogram-base Outlier Detection (HBOS) ROC:0.9851, precision @ rank n:0.9857, execution time: 0.0229s
Isolation Forest ROC:0.9972, precision @ rank n:0.9337, execution time: 4.7039s
K Nearest Neighbors (KNN) ROC:0.645, precision @ rank n:0.2199, execution time: 8.6679s
Local Outlier Factor (LOF) ROC:0.5347, precision @ rank n:0.1406, execution time: 12.0245s






Minimum Covariance Determinant (MCD) ROC:0.9903, precision @ rank n:0.7534, execution time: 13.2101s
One-class SVM (OCSVM) ROC:0.9922, precision @ rank n:0.9553, execution time: 49.6731s
Principal Component Analysis (PCA) ROC:0.9902, precision @ rank n:0.9503, execution time: 0.0738s

... Processing vertebral.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.2797, precision @ rank n:0.0, execution time: 0.0987s
Cluster-based Local Outlier Factor ROC:0.3908, precision @ rank n:0.0, execution time: 0.0876s
Feature Bagging ROC:0.3027, precision @ rank n:0.0, execution time: 0.078s




Histogram-base Outlier Detection (HBOS) ROC:0.2695, precision @ rank n:0.0, execution time: 0.004s
Isolation Forest ROC:0.3576, precision @ rank n:0.0, execution time: 0.6761s
K Nearest Neighbors (KNN) ROC:0.318, precision @ rank n:0.0, execution time: 0.0219s
Local Outlier Factor (LOF) ROC:0.318, precision @ rank n:0.0, execution time: 0.006s
Minimum Covariance Determinant (MCD) ROC:0.3308, precision @ rank n:0.0, execution time: 0.0798s
One-class SVM (OCSVM) ROC:0.4087, precision @ rank n:0.0, execution time: 0.004s
Principal Component Analysis (PCA) ROC:0.3397, precision @ rank n:0.0, execution time: 0.003s

... Processing vowels.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.9521, precision @ rank n:0.4706, execution time: 0.4199s
Cluster-based Local Outlier Factor ROC:0.9278, precision @ rank n:0.4118, execution time: 0.1077s




Feature Bagging ROC:0.9385, precision @ rank n:0.3529, execution time: 0.2872s
Histogram-base Outlier Detection (HBOS) ROC:0.6758, precision @ rank n:0.1765, execution time: 0.0039s
Isolation Forest ROC:0.7469, precision @ rank n:0.1176, execution time: 0.4043s
K Nearest Neighbors (KNN) ROC:0.9568, precision @ rank n:0.5294, execution time: 0.0728s
Local Outlier Factor (LOF) ROC:0.9345, precision @ rank n:0.4118, execution time: 0.032s
Minimum Covariance Determinant (MCD) ROC:0.6779, precision @ rank n:0.0, execution time: 0.9043s
One-class SVM (OCSVM) ROC:0.7415, precision @ rank n:0.2941, execution time: 0.0399s
Principal Component Analysis (PCA) ROC:0.5787, precision @ rank n:0.1176, execution time: 0.003s

... Processing wbc.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.9232, precision @ rank n:0.3, execution time: 0.0638s
Cluster-based Local Outlier Factor ROC:0.9063, precision @ rank n:0.6, execution time: 0.0714s
Feature Bagging ROC:0.9415, precision @ rank n:0.5, execution 



Isolation Forest ROC:0.9451, precision @ rank n:0.5, execution time: 0.3208s
K Nearest Neighbors (KNN) ROC:0.9437, precision @ rank n:0.5, execution time: 0.017s
Local Outlier Factor (LOF) ROC:0.9352, precision @ rank n:0.4, execution time: 0.0069s
Minimum Covariance Determinant (MCD) ROC:0.8986, precision @ rank n:0.4, execution time: 0.0614s
One-class SVM (OCSVM) ROC:0.9408, precision @ rank n:0.5, execution time: 0.004s
Principal Component Analysis (PCA) ROC:0.9324, precision @ rank n:0.6, execution time: 0.003s


In [16]:
#ROC Dataframe
roc_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,FEATUREBAGGING
0,arrhythmia,452,274,14.6018,0.7687,0.7684,0.7799,0.8511,0.8478,0.782,0.7787,0.8228,0.7986,0.7997
0,cardio,1831,21,9.6122,0.5763,0.8221,0.4879,0.8453,0.9316,0.6959,0.4715,0.8778,0.9507,0.9638
0,glass,214,9,4.2056,0.7104,0.8506,0.7043,0.6524,0.7195,0.7805,0.7774,0.7165,0.6189,0.622
0,ionosphere,351,33,35.8974,0.9004,0.8952,0.8933,0.5195,0.8294,0.9134,0.8989,0.9399,0.8372,0.7971
0,letter,1600,32,6.25,0.8465,0.7423,0.866,0.5728,0.5836,0.845,0.8409,0.7499,0.5744,0.48
0,lympho,148,18,4.0541,0.9382,0.9709,0.9673,0.9964,0.9855,0.9636,0.9636,0.9164,0.9636,0.9818
0,mnist,7603,100,9.2069,0.7813,0.8447,0.7259,0.5675,0.7813,0.8409,0.7085,0.863,0.8417,0.8396
0,musk,3062,166,3.1679,0.0809,1.0,0.5228,0.9999,0.9992,0.7348,0.5323,1.0,1.0,1.0
0,optdigits,5216,64,2.8758,0.4428,0.7852,0.4641,0.8822,0.5442,0.3824,0.4584,0.3486,0.4972,0.504
0,pendigits,6870,16,2.2707,0.7008,0.9609,0.4687,0.9294,0.9482,0.7602,0.481,0.8271,0.93,0.9332


In [17]:
#Precision Dataframe
prn_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,FEATUREBAGGING
0,arrhythmia,452,274,14.6018,0.3571,0.4643,0.5,0.5714,0.5357,0.5,0.4643,0.4286,0.5,0.5
0,cardio,1831,21,9.6122,0.1875,0.4844,0.1406,0.4688,0.4531,0.2812,0.125,0.3906,0.5938,0.6875
0,glass,214,9,4.2056,0.25,0.25,0.25,0.0,0.25,0.25,0.25,0.0,0.25,0.25
0,ionosphere,351,33,35.8974,0.8214,0.8036,0.75,0.3393,0.6607,0.8393,0.75,0.8571,0.7143,0.5893
0,letter,1600,32,6.25,0.275,0.175,0.4,0.125,0.05,0.3,0.325,0.075,0.1,0.05
0,lympho,148,18,4.0541,0.4,0.6,0.6,0.8,0.6,0.6,0.6,0.6,0.6,0.8
0,mnist,7603,100,9.2069,0.3562,0.4007,0.3664,0.1199,0.3116,0.4144,0.339,0.3973,0.3801,0.3767
0,musk,3062,166,3.1679,0.0333,1.0,0.1667,0.9667,0.9,0.2333,0.1333,0.9667,1.0,1.0
0,optdigits,5216,64,2.8758,0.0161,0.0,0.0484,0.2581,0.0161,0.0,0.0484,0.0,0.0,0.0
0,pendigits,6870,16,2.2707,0.0308,0.3077,0.0462,0.2615,0.2615,0.0462,0.0462,0.0615,0.2923,0.3385


In [19]:
n_samples = len(y)
clusters_separation = [0]

n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.zeros(n_samples, dtype=int)
ground_truth[-n_outliers:] = 1

In [20]:
# Show the statics of the data
print('Number of inliers: %i' % n_inliers)
print('Number of outliers: %i' % n_outliers)
print('Ground truth shape is {shape}. Outlier are 1 and inliers are 0.\n'.format(shape=ground_truth.shape))
print(ground_truth)

Number of inliers: 357
Number of outliers: 21
Ground truth shape is (378,). Outlier are 1 and inliers are 0.

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1]


In [21]:
# Show all detectors
for i, clf in enumerate(classifiers.keys()):
    print('Model', i + 1, clf)

Model 1 Angle-based Outlier Detector (ABOD)
Model 2 Cluster-based Local Outlier Factor
Model 3 Feature Bagging
Model 4 Histogram-base Outlier Detection (HBOS)
Model 5 Isolation Forest
Model 6 K Nearest Neighbors (KNN)
Model 7 Local Outlier Factor (LOF)
Model 8 Minimum Covariance Determinant (MCD)
Model 9 One-class SVM (OCSVM)
Model 10 Principal Component Analysis (PCA)
