## Import Packages

In [18]:
import os
import sys
import numpy as np
import pandas as pd
from time import time

from sklearn.model_selection import train_test_split
from scipy.io import loadmat



## Import PyOD packages and methods

In [2]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging





## Import Metrics Packages

In [3]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

## Define data file and read X and y

In [5]:
mat_file_list = ['E:/LetsUpgrade/Project Day 1/arrhythmia.mat',
                 'E:/LetsUpgrade/Project Day 1/cardio.mat',
                 'E:/LetsUpgrade/Project Day 1/glass.mat',
                 'E:/LetsUpgrade/Project Day 1/ionosphere.mat',
                 'E:/LetsUpgrade/Project Day 1/letter.mat',
                 'E:/LetsUpgrade/Project Day 1/lympho.mat',
                 'E:/LetsUpgrade/Project Day 1/mnist.mat',
                 'E:/LetsUpgrade/Project Day 1/musk.mat',
                 'E:/LetsUpgrade/Project Day 1/optdigits.mat',
                 'E:/LetsUpgrade/Project Day 1/pendigits.mat',
                 'E:/LetsUpgrade/Project Day 1/pima.mat',
                 'E:/LetsUpgrade/Project Day 1/satellite.mat',
                 'E:/LetsUpgrade/Project Day 1/satimage-2.mat',
                 'E:/LetsUpgrade/Project Day 1/shuttle.mat',
                 'E:/LetsUpgrade/Project Day 1/vertebral.mat',
                 'E:/LetsUpgrade/Project Day 1/vowels.mat',
                 'E:/LetsUpgrade/Project Day 1/wbc.mat']

## Reading a mat file

In [7]:
data=loadmat('E:/LetsUpgrade/Project Day 1/cardio.mat')
data

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

## Define 9 outlier detection tools to be compared

In [14]:
df_columns = ['Data','#Samples','# Dimensions','Outlier Perc','ABOD','CBLOF','FB','HBOS','IForest','KNN','LOF','MCD','OCSVM','PCA']

In [9]:
#Creating empty dataframes
roc_df=pd.DataFrame(columns=df_columns)  #region of characteristics Performance evolution table
prn_df=pd.DataFrame(columns=df_columns)  #precision_n_scores Performance evolution table
time_df=pd.DataFrame(columns=df_columns) 

## Exploring all mat files

In [26]:
for mat_file in mat_file_list:
    print("\n..Processing",mat_file,'...')
    mat=loadmat(os.path.join('E:/LetsUpgrade/Project Day 1',mat_file))
    
    X=mat['X']
    y=mat['y'].ravel()  #converting 2D to 1D so (that space is redcued)
    outliers_fraction=np.count_nonzero(y)/len(y)
    outliers_percentage=round(outliers_fraction*100,ndigits=4)
    
     #Construct containers for saving result
    roc_list=[mat_file[:-4],X.shape[0],X.shape[1],outliers_percentage]
    prn_list=[mat_file[:-4],X.shape[0],X.shape[1],outliers_percentage]
    time_list=[mat_file[:-4],X.shape[0],X.shape[1],outliers_percentage]

    #60% data for trainin g and 40% for testing
    random_state=np.random.RandomState(42)
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.4,random_state=random_state)
    
    #Standardizing data for processing
    X_train_norm,X_test_norm=standardizer(X_train,X_test)
    classifiers={'Angle-based Outlier Detector (ABOD)' : ABOD(contamination=outliers_fraction),
                 'Cluster-based Local Outlier Factor' : CBLOF(contamination=outliers_fraction,check_estimator=False,random_state=random_state),
                 'Feature Bagging' : FeatureBagging(contamination=outliers_fraction,random_state=random_state),
                 'Histogram-base Outlier Detection (HBOS)' : HBOS(contamination=outliers_fraction),
                 'Isolation Forest' : IForest(contamination=outliers_fraction, random_state=random_state),
                 'K Nearest Neignors (KNN)' : KNN(contamination=outliers_fraction),
                 'Local Outlier Factor (LOF)' : LOF(contamination=outliers_fraction),
                 'Minimum Covariance Determinat (MCD)' : MCD(contamination=outliers_fraction, random_state=random_state),
                 'One-class SVM (OCSVM)' : OCSVM(contamination=outliers_fraction),
                 'Principal Component Ananlysis (PCA)' : PCA(contamination=outliers_fraction, random_state=random_state),
                }
    for clf_name, clf in classifiers.items():
        t0=time() #time before training
        clf.fit(X_train_norm)   #for each algorithm, the model is trained
        test_scores=clf.decision_function(X_test_norm) #testing model score
        t1=time() #time for testing or time when testing completed
        duration=round(t1-t0,ndigits=4) #time taken to complete training
        time_list.append(duration)
        roc=round(roc_auc_score(y_test,test_scores),ndigits=4)
        prn=round(precision_n_scores(y_test,test_scores),ndigits=4)
        print('{clf_name} ROC:{roc},precision @ rank n:{prn},' 'execution time : {duration}s'.format(clf_name=clf_name,roc=roc,prn=prn,duration=duration))
        roc_list.append(roc)
        prn_list.append(prn)
    
    temp_df=pd.DataFrame(time_list).transpose()
    temp_df.columns=df_columns
    time_df=pd.concat([time_df,temp_df],axis=0)

    temp_df=pd.DataFrame(roc_list).transpose()
    temp_df.columns=df_columns
    roc_df=pd.concat([roc_df,temp_df],axis=0)

    temp_df=pd.DataFrame(prn_list).transpose()
    temp_df.columns=df_columns
    prn_df=pd.concat([prn_df,temp_df],axis=0)


..Processing E:/LetsUpgrade/Project Day 1/arrhythmia.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7687,precision @ rank n:0.3571,execution time : 0.1811s
Cluster-based Local Outlier Factor ROC:0.7684,precision @ rank n:0.4643,execution time : 0.1318s
Feature Bagging ROC:0.7799,precision @ rank n:0.5,execution time : 0.637s
Histogram-base Outlier Detection (HBOS) ROC:0.8511,precision @ rank n:0.5714,execution time : 0.0855s
Isolation Forest ROC:0.8527,precision @ rank n:0.5714,execution time : 0.4974s
K Nearest Neignors (KNN) ROC:0.782,precision @ rank n:0.5,execution time : 0.0937s
Local Outlier Factor (LOF) ROC:0.7787,precision @ rank n:0.4643,execution time : 0.0781s




Minimum Covariance Determinat (MCD) ROC:0.8228,precision @ rank n:0.4286,execution time : 0.7248s
One-class SVM (OCSVM) ROC:0.7986,precision @ rank n:0.5,execution time : 0.0312s
Principal Component Ananlysis (PCA) ROC:0.7997,precision @ rank n:0.5,execution time : 0.0469s

..Processing E:/LetsUpgrade/Project Day 1/cardio.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.5928,precision @ rank n:0.2838,execution time : 0.5108s
Cluster-based Local Outlier Factor ROC:0.8547,precision @ rank n:0.5541,execution time : 0.1562s
Feature Bagging ROC:0.6565,precision @ rank n:0.2297,execution time : 1.0094s
Histogram-base Outlier Detection (HBOS) ROC:0.87,precision @ rank n:0.5135,execution time : 0.0s
Isolation Forest ROC:0.9304,precision @ rank n:0.5405,execution time : 0.5032s
K Nearest Neignors (KNN) ROC:0.7642,precision @ rank n:0.4054,execution time : 0.1809s
Local Outlier Factor (LOF) ROC:0.6432,precision @ rank n:0.2162,execution time : 0.1139s




Minimum Covariance Determinat (MCD) ROC:0.811,precision @ rank n:0.4865,execution time : 0.7077s
One-class SVM (OCSVM) ROC:0.9462,precision @ rank n:0.527,execution time : 0.1149s
Principal Component Ananlysis (PCA) ROC:0.9581,precision @ rank n:0.6216,execution time : 0.0s

..Processing E:/LetsUpgrade/Project Day 1/glass.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7805,precision @ rank n:0.25,execution time : 0.0469s
Cluster-based Local Outlier Factor ROC:0.9024,precision @ rank n:0.25,execution time : 0.1404s
Feature Bagging ROC:0.8293,precision @ rank n:0.25,execution time : 0.1072s
Histogram-base Outlier Detection (HBOS) ROC:0.7256,precision @ rank n:0.25,execution time : 0.008s
Isolation Forest ROC:0.7287,precision @ rank n:0.25,execution time : 0.6406s
K Nearest Neignors (KNN) ROC:0.8902,precision @ rank n:0.25,execution time : 0.016s
Local Outlier Factor (LOF) ROC:0.7287,precision @ rank n:0.25,execution time : 0.008s
Minimum Covariance Determinat (MCD) ROC:0.7957,precisio



Angle-based Outlier Detector (ABOD) ROC:0.9247,precision @ rank n:0.8868,execution time : 0.1559s
Cluster-based Local Outlier Factor ROC:0.8999,precision @ rank n:0.7925,execution time : 0.0998s
Feature Bagging ROC:0.9286,precision @ rank n:0.7736,execution time : 0.1127s
Histogram-base Outlier Detection (HBOS) ROC:0.5154,precision @ rank n:0.3585,execution time : 0.016s
Isolation Forest ROC:0.8433,precision @ rank n:0.6604,execution time : 0.593s
K Nearest Neignors (KNN) ROC:0.9226,precision @ rank n:0.8868,execution time : 0.0156s
Local Outlier Factor (LOF) ROC:0.9312,precision @ rank n:0.7736,execution time : 0.0s
Minimum Covariance Determinat (MCD) ROC:0.9657,precision @ rank n:0.8868,execution time : 0.125s
One-class SVM (OCSVM) ROC:0.8799,precision @ rank n:0.7736,execution time : 0.0s
Principal Component Ananlysis (PCA) ROC:0.8068,precision @ rank n:0.6226,execution time : 0.0156s

..Processing E:/LetsUpgrade/Project Day 1/letter.mat ...
Angle-based Outlier Detector (ABOD) ROC:0



Angle-based Outlier Detector (ABOD) ROC:0.7682,precision @ rank n:0.3643,execution time : 8.3704s
Cluster-based Local Outlier Factor ROC:0.8339,precision @ rank n:0.3941,execution time : 1.7816s
Feature Bagging ROC:0.6936,precision @ rank n:0.3234,execution time : 53.2317s
Histogram-base Outlier Detection (HBOS) ROC:0.5578,precision @ rank n:0.0967,execution time : 0.0625s
Isolation Forest ROC:0.8045,precision @ rank n:0.2714,execution time : 2.5819s
K Nearest Neignors (KNN) ROC:0.8423,precision @ rank n:0.4349,execution time : 7.6343s
Local Outlier Factor (LOF) ROC:0.7012,precision @ rank n:0.3309,execution time : 7.6104s




Minimum Covariance Determinat (MCD) ROC:0.8553,precision @ rank n:0.2268,execution time : 4.2366s
One-class SVM (OCSVM) ROC:0.8622,precision @ rank n:0.4015,execution time : 4.938s
Principal Component Ananlysis (PCA) ROC:0.8613,precision @ rank n:0.3903,execution time : 0.1875s

..Processing E:/LetsUpgrade/Project Day 1/musk.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.1012,precision @ rank n:0.0294,execution time : 2.6001s
Cluster-based Local Outlier Factor ROC:1.0,precision @ rank n:1.0,execution time : 0.465s
Feature Bagging ROC:0.5705,precision @ rank n:0.1471,execution time : 14.2617s
Histogram-base Outlier Detection (HBOS) ROC:1.0,precision @ rank n:1.0,execution time : 0.0781s
Isolation Forest ROC:1.0,precision @ rank n:0.9706,execution time : 1.6095s
K Nearest Neignors (KNN) ROC:0.7729,precision @ rank n:0.2059,execution time : 1.9476s
Local Outlier Factor (LOF) ROC:0.5567,precision @ rank n:0.1471,execution time : 1.7916s
Minimum Covariance Determinat (MCD) ROC:1.0,precis



Minimum Covariance Determinat (MCD) ROC:0.4226,precision @ rank n:0.0,execution time : 1.713s
One-class SVM (OCSVM) ROC:0.4817,precision @ rank n:0.0,execution time : 1.5114s
Principal Component Ananlysis (PCA) ROC:0.507,precision @ rank n:0.0,execution time : 0.0625s

..Processing E:/LetsUpgrade/Project Day 1/pendigits.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.677,precision @ rank n:0.0725,execution time : 2.147s
Cluster-based Local Outlier Factor ROC:0.8488,precision @ rank n:0.2029,execution time : 0.3474s
Feature Bagging ROC:0.4674,precision @ rank n:0.0725,execution time : 3.8395s
Histogram-base Outlier Detection (HBOS) ROC:0.9261,precision @ rank n:0.2609,execution time : 0.0156s
Isolation Forest ROC:0.9555,precision @ rank n:0.3768,execution time : 0.8169s
K Nearest Neignors (KNN) ROC:0.7603,precision @ rank n:0.1594,execution time : 0.7465s
Local Outlier Factor (LOF) ROC:0.4578,precision @ rank n:0.0725,execution time : 0.7181s
Minimum Covariance Determinat (MCD) ROC:0.



Minimum Covariance Determinat (MCD) ROC:0.9901,precision @ rank n:0.7332,execution time : 12.5548s
One-class SVM (OCSVM) ROC:0.9922,precision @ rank n:0.9578,execution time : 56.4797s
Principal Component Ananlysis (PCA) ROC:0.9904,precision @ rank n:0.9542,execution time : 0.059s

..Processing E:/LetsUpgrade/Project Day 1/vertebral.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.3968,precision @ rank n:0.0909,execution time : 0.072s
Cluster-based Local Outlier Factor ROC:0.3882,precision @ rank n:0.0,execution time : 0.067s
Feature Bagging ROC:0.4289,precision @ rank n:0.0909,execution time : 0.042s
Histogram-base Outlier Detection (HBOS) ROC:0.3166,precision @ rank n:0.0,execution time : 0.002s
Isolation Forest ROC:0.3444,precision @ rank n:0.0,execution time : 0.3918s
K Nearest Neignors (KNN) ROC:0.384,precision @ rank n:0.0,execution time : 0.02s
Local Outlier Factor (LOF) ROC:0.4342,precision @ rank n:0.0909,execution time : 0.006s
Minimum Covariance Determinat (MCD) ROC:0.4214,p



Angle-based Outlier Detector (ABOD) ROC:0.9515,precision @ rank n:0.5,execution time : 0.4148s
Cluster-based Local Outlier Factor ROC:0.9419,precision @ rank n:0.2727,execution time : 0.1119s
Feature Bagging ROC:0.943,precision @ rank n:0.2273,execution time : 0.3128s
Histogram-base Outlier Detection (HBOS) ROC:0.6365,precision @ rank n:0.0,execution time : 0.005s
Isolation Forest ROC:0.7601,precision @ rank n:0.1364,execution time : 0.4987s
K Nearest Neignors (KNN) ROC:0.9763,precision @ rank n:0.4545,execution time : 0.0979s
Local Outlier Factor (LOF) ROC:0.9119,precision @ rank n:0.2727,execution time : 0.047s
Minimum Covariance Determinat (MCD) ROC:0.6877,precision @ rank n:0.0455,execution time : 1.0204s
One-class SVM (OCSVM) ROC:0.7763,precision @ rank n:0.1364,execution time : 0.051s
Principal Component Ananlysis (PCA) ROC:0.5675,precision @ rank n:0.0909,execution time : 0.003s

..Processing E:/LetsUpgrade/Project Day 1/wbc.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.9181

In [27]:
roc_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,E:/LetsUpgrade/Project Day 1/arrhythmia,452,274,14.6018,0.7687,0.7684,0.7799,0.8511,0.8527,0.782,0.7787,0.8228,0.7986,0.7997
0,E:/LetsUpgrade/Project Day 1/cardio,1831,21,9.6122,0.5928,0.8547,0.6565,0.87,0.9304,0.7642,0.6432,0.811,0.9462,0.9581
0,E:/LetsUpgrade/Project Day 1/glass,214,9,4.2056,0.7805,0.9024,0.8293,0.7256,0.7287,0.8902,0.7287,0.7957,0.753,0.747
0,E:/LetsUpgrade/Project Day 1/ionosphere,351,33,35.8974,0.9247,0.8999,0.9286,0.5154,0.8433,0.9226,0.9312,0.9657,0.8799,0.8068
0,E:/LetsUpgrade/Project Day 1/letter,1600,32,6.25,0.8949,0.7612,0.8663,0.6248,0.6403,0.8689,0.8781,0.8119,0.5827,0.5142
0,E:/LetsUpgrade/Project Day 1/lympho,148,18,4.0541,0.9138,1.0,1.0,1.0,1.0,0.9828,1.0,0.8966,1.0,1.0
0,E:/LetsUpgrade/Project Day 1/mnist,7603,100,9.2069,0.7682,0.8339,0.6936,0.5578,0.8045,0.8423,0.7012,0.8553,0.8622,0.8613
0,E:/LetsUpgrade/Project Day 1/musk,3062,166,3.1679,0.1012,1.0,0.5705,1.0,1.0,0.7729,0.5567,1.0,1.0,1.0
0,E:/LetsUpgrade/Project Day 1/optdigits,5216,64,2.8758,0.4601,0.7515,0.4197,0.8962,0.6531,0.3717,0.4191,0.4226,0.4817,0.507
0,E:/LetsUpgrade/Project Day 1/pendigits,6870,16,2.2707,0.677,0.8488,0.4674,0.9261,0.9555,0.7603,0.4578,0.8387,0.946,0.9402


In [28]:
time_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,E:/LetsUpgrade/Project Day 1/wbc,378,30,5.5556,0.5455,0.5455,0.7273,0.6364,0.6364,0.7273,0.6364,0.5455,0.6364,0.6364
0,E:/LetsUpgrade/Project Day 1/arrhythmia,452,274,14.6018,0.1811,0.1318,0.637,0.0855,0.4974,0.0937,0.0781,0.7248,0.0312,0.0469
0,E:/LetsUpgrade/Project Day 1/cardio,1831,21,9.6122,0.5108,0.1562,1.0094,0.0,0.5032,0.1809,0.1139,0.7077,0.1149,0.0
0,E:/LetsUpgrade/Project Day 1/glass,214,9,4.2056,0.0469,0.1404,0.1072,0.008,0.6406,0.016,0.008,0.0728,0.0,0.0
0,E:/LetsUpgrade/Project Day 1/ionosphere,351,33,35.8974,0.1559,0.0998,0.1127,0.016,0.593,0.0156,0.0,0.125,0.0,0.0156
0,E:/LetsUpgrade/Project Day 1/letter,1600,32,6.25,0.5671,0.1719,0.9953,0.0156,0.7178,0.1875,0.1232,1.8417,0.1406,0.0
0,E:/LetsUpgrade/Project Day 1/lympho,148,18,4.0541,0.0859,0.0854,0.0625,0.0,0.4455,0.0,0.0156,0.0469,0.0156,0.0
0,E:/LetsUpgrade/Project Day 1/mnist,7603,100,9.2069,8.3704,1.7816,53.2317,0.0625,2.5819,7.6343,7.6104,4.2366,4.938,0.1875
0,E:/LetsUpgrade/Project Day 1/musk,3062,166,3.1679,2.6001,0.465,14.2617,0.0781,1.6095,1.9476,1.7916,14.6163,1.1688,0.218
0,E:/LetsUpgrade/Project Day 1/optdigits,5216,64,2.8758,3.2994,0.5576,13.8039,0.04,1.241,2.1082,1.762,1.713,1.5114,0.0625


In [29]:
prn_df

Unnamed: 0,Data,#Samples,# Dimensions,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,E:/LetsUpgrade/Project Day 1/arrhythmia,452,274,14.6018,0.3571,0.4643,0.5,0.5714,0.5714,0.5,0.4643,0.4286,0.5,0.5
0,E:/LetsUpgrade/Project Day 1/cardio,1831,21,9.6122,0.2838,0.5541,0.2297,0.5135,0.5405,0.4054,0.2162,0.4865,0.527,0.6216
0,E:/LetsUpgrade/Project Day 1/glass,214,9,4.2056,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.0,0.25,0.25
0,E:/LetsUpgrade/Project Day 1/ionosphere,351,33,35.8974,0.8868,0.7925,0.7736,0.3585,0.6604,0.8868,0.7736,0.8868,0.7736,0.6226
0,E:/LetsUpgrade/Project Day 1/letter,1600,32,6.25,0.3721,0.186,0.3721,0.093,0.093,0.3488,0.3721,0.186,0.1395,0.1163
0,E:/LetsUpgrade/Project Day 1/lympho,148,18,4.0541,0.0,1.0,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0
0,E:/LetsUpgrade/Project Day 1/mnist,7603,100,9.2069,0.3643,0.3941,0.3234,0.0967,0.2714,0.4349,0.3309,0.2268,0.4015,0.3903
0,E:/LetsUpgrade/Project Day 1/musk,3062,166,3.1679,0.0294,1.0,0.1471,1.0,0.9706,0.2059,0.1471,1.0,1.0,1.0
0,E:/LetsUpgrade/Project Day 1/optdigits,5216,64,2.8758,0.0,0.0,0.0164,0.1803,0.0164,0.0,0.0164,0.0,0.0,0.0
0,E:/LetsUpgrade/Project Day 1/pendigits,6870,16,2.2707,0.0725,0.2029,0.0725,0.2609,0.3768,0.1594,0.0725,0.1159,0.3333,0.3478
