# Importing Python Packages

In [1]:
import os
import sys
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from scipy.io import loadmat

# Importing PyOD packages and Methods

In [3]:
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging

# Importing Metrics Packages

In [5]:
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

# Define data file and read X and y

In [6]:
mat_file_list = ['arrhythmia.mat','cardio.mat','glass.mat','ionosphere.mat','letter.mat','lympho.mat','mnist.mat',
                'musk.mat','optdigits.mat','pendigits.mat','pima.mat','satellite.mat','satimage-2.mat','shuttle.mat',
                'vertebral.mat','vowels.mat','wbc.mat']

In [7]:
mat_file_list

['arrhythmia.mat',
 'cardio.mat',
 'glass.mat',
 'ionosphere.mat',
 'letter.mat',
 'lympho.mat',
 'mnist.mat',
 'musk.mat',
 'optdigits.mat',
 'pendigits.mat',
 'pima.mat',
 'satellite.mat',
 'satimage-2.mat',
 'shuttle.mat',
 'vertebral.mat',
 'vowels.mat',
 'wbc.mat']

# Loading Mat File

In [8]:
data = loadmat('C:\Anamoly__detec_data/cardio.mat')

In [9]:
data

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [10]:
len(data)

5

In [11]:
df_columns = ['Data','#Sample','#Dimensions','Outlier Perc','PCA','MCD','OCSVM','LOF','CBLOF','KNN','HBOS','ABOD','IFOREST',
             'FEATUREBAGGING']

# Precision Time and Roc evolution tables creation

In [12]:
roc_df = pd.DataFrame(columns=df_columns)
prn_df = pd.DataFrame(columns=df_columns)
time_df = pd.DataFrame(columns=df_columns)
print (roc_df, prn_df, time_df)

Empty DataFrame
Columns: [Data, #Sample, #Dimensions, Outlier Perc, PCA, MCD, OCSVM, LOF, CBLOF, KNN, HBOS, ABOD, IFOREST, FEATUREBAGGING]
Index: [] Empty DataFrame
Columns: [Data, #Sample, #Dimensions, Outlier Perc, PCA, MCD, OCSVM, LOF, CBLOF, KNN, HBOS, ABOD, IFOREST, FEATUREBAGGING]
Index: [] Empty DataFrame
Columns: [Data, #Sample, #Dimensions, Outlier Perc, PCA, MCD, OCSVM, LOF, CBLOF, KNN, HBOS, ABOD, IFOREST, FEATUREBAGGING]
Index: []


# Exploring Mat Files

In [13]:
from time import time
random_state = np.random.RandomState(42)

for mat_file in mat_file_list:
    print("\n... Processing", mat_file, '...')
    mat = loadmat(os.path.join('C:/\Anamoly__detec_data', mat_file))

    X = mat['X']
    y = mat['y'].ravel()
    outliers_fraction = np.count_nonzero(y) / len(y)
    outliers_percentage = round(outliers_fraction * 100, ndigits=4)

   
    roc_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    prn_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]
    time_list = [mat_file[:-4], X.shape[0], X.shape[1], outliers_percentage]

   
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
                                                       random_state=random_state)

   
    X_train_norm, X_test_norm = standardizer(X_train, X_test)
    
    classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD(
       contamination=outliers_fraction),
       'Cluster-based Local Outlier Factor': CBLOF(
           contamination=outliers_fraction, check_estimator=False,
           random_state=random_state),
       'Feature Bagging': FeatureBagging(contamination=outliers_fraction,
                                         random_state=random_state),
       'Histogram-base Outlier Detection (HBOS)': HBOS(
           contamination=outliers_fraction),
       'Isolation Forest': IForest(contamination=outliers_fraction,
                                   random_state=random_state),
       'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
       'Local Outlier Factor (LOF)': LOF(
           contamination=outliers_fraction),
       'Minimum Covariance Determinant (MCD)': MCD(
           contamination=outliers_fraction, random_state=random_state),
       'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
       'Principal Component Analysis (PCA)': PCA(
           contamination=outliers_fraction, random_state=random_state),
   }

    for clf_name, clf in classifiers.items():
        t0 = time()
        clf.fit(X_train_norm)
        test_scores = clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1 - t0, ndigits=4)
        time_list.append(duration)

        roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test, test_scores), ndigits=4)

        print('{clf_name} ROC:{roc}, precision @ rank n:{prn}, '
             'execution time: {duration}s'.format(
           clf_name=clf_name, roc=roc, prn=prn, duration=duration))

        roc_list.append(roc)
        prn_list.append(prn)

    temp_df = pd.DataFrame(time_list).transpose()
    temp_df.columns = df_columns
    time_df = pd.concat([time_df, temp_df], axis=0)

    temp_df = pd.DataFrame(roc_list).transpose()
    temp_df.columns = df_columns
    roc_df = pd.concat([roc_df, temp_df], axis=0)

    temp_df = pd.DataFrame(prn_list).transpose()
    temp_df.columns = df_columns
    prn_df = pd.concat([prn_df, temp_df], axis=0)


... Processing arrhythmia.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7687, precision @ rank n:0.3571, execution time: 4.6594s
Cluster-based Local Outlier Factor ROC:0.7684, precision @ rank n:0.4643, execution time: 6.808s
Feature Bagging ROC:0.7799, precision @ rank n:0.5, execution time: 1.6311s
Histogram-base Outlier Detection (HBOS) ROC:0.8511, precision @ rank n:0.5714, execution time: 5.888s
Isolation Forest ROC:0.8527, precision @ rank n:0.5714, execution time: 1.5585s
K Nearest Neighbors (KNN) ROC:0.782, precision @ rank n:0.5, execution time: 0.2946s
Local Outlier Factor (LOF) ROC:0.7787, precision @ rank n:0.4643, execution time: 0.2134s




Minimum Covariance Determinant (MCD) ROC:0.8228, precision @ rank n:0.4286, execution time: 2.0219s
One-class SVM (OCSVM) ROC:0.7986, precision @ rank n:0.5, execution time: 0.1219s
Principal Component Analysis (PCA) ROC:0.7997, precision @ rank n:0.5, execution time: 0.1645s

... Processing cardio.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.5763, precision @ rank n:0.1875, execution time: 1.3468s
Cluster-based Local Outlier Factor ROC:0.8221, precision @ rank n:0.4844, execution time: 0.6567s
Feature Bagging ROC:0.4879, precision @ rank n:0.1406, execution time: 2.3665s
Histogram-base Outlier Detection (HBOS) ROC:0.8453, precision @ rank n:0.4688, execution time: 0.0219s
Isolation Forest ROC:0.9414, precision @ rank n:0.5, execution time: 1.4207s
K Nearest Neighbors (KNN) ROC:0.6959, precision @ rank n:0.2812, execution time: 0.5057s
Local Outlier Factor (LOF) ROC:0.4715, precision @ rank n:0.125, execution time: 0.2893s




Minimum Covariance Determinant (MCD) ROC:0.8778, precision @ rank n:0.3906, execution time: 2.4447s
One-class SVM (OCSVM) ROC:0.9507, precision @ rank n:0.5938, execution time: 0.2227s
Principal Component Analysis (PCA) ROC:0.9638, precision @ rank n:0.6875, execution time: 0.0161s

... Processing glass.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7104, precision @ rank n:0.25, execution time: 0.1927s
Cluster-based Local Outlier Factor ROC:0.8506, precision @ rank n:0.25, execution time: 0.1939s
Feature Bagging ROC:0.7043, precision @ rank n:0.25, execution time: 0.1483s
Histogram-base Outlier Detection (HBOS) ROC:0.6524, precision @ rank n:0.0, execution time: 0.0113s
Isolation Forest ROC:0.7195, precision @ rank n:0.25, execution time: 1.1497s
K Nearest Neighbors (KNN) ROC:0.7805, precision @ rank n:0.25, execution time: 0.0311s
Local Outlier Factor (LOF) ROC:0.7774, precision @ rank n:0.25, execution time: 0.01s
Minimum Covariance Determinant (MCD) ROC:0.7165, precision @ rank 




One-class SVM (OCSVM) ROC:0.9636, precision @ rank n:0.6, execution time: 0.007s
Principal Component Analysis (PCA) ROC:0.9818, precision @ rank n:0.8, execution time: 0.0034s

... Processing mnist.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7813, precision @ rank n:0.3562, execution time: 22.0413s
Cluster-based Local Outlier Factor ROC:0.8447, precision @ rank n:0.4007, execution time: 3.3012s
Feature Bagging ROC:0.7259, precision @ rank n:0.3664, execution time: 152.5688s
Histogram-base Outlier Detection (HBOS) ROC:0.5675, precision @ rank n:0.1199, execution time: 0.1979s
Isolation Forest ROC:0.7801, precision @ rank n:0.2979, execution time: 6.531s
K Nearest Neighbors (KNN) ROC:0.8409, precision @ rank n:0.4144, execution time: 19.7327s
Local Outlier Factor (LOF) ROC:0.7085, precision @ rank n:0.339, execution time: 18.8933s




Minimum Covariance Determinant (MCD) ROC:0.863, precision @ rank n:0.3973, execution time: 9.2717s
One-class SVM (OCSVM) ROC:0.8417, precision @ rank n:0.3801, execution time: 12.5478s
Principal Component Analysis (PCA) ROC:0.8396, precision @ rank n:0.3767, execution time: 0.3844s

... Processing musk.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.0809, precision @ rank n:0.0333, execution time: 7.0569s
Cluster-based Local Outlier Factor ROC:1.0, precision @ rank n:1.0, execution time: 1.3633s
Feature Bagging ROC:0.5228, precision @ rank n:0.1667, execution time: 45.1093s
Histogram-base Outlier Detection (HBOS) ROC:0.9999, precision @ rank n:0.9667, execution time: 0.2219s
Isolation Forest ROC:0.9996, precision @ rank n:0.9333, execution time: 4.8328s
K Nearest Neighbors (KNN) ROC:0.7348, precision @ rank n:0.2333, execution time: 5.9107s
Local Outlier Factor (LOF) ROC:0.5323, precision @ rank n:0.1333, execution time: 5.3366s
Minimum Covariance Determinant (MCD) ROC:1.0, precision



Minimum Covariance Determinant (MCD) ROC:0.3486, precision @ rank n:0.0, execution time: 4.8701s
One-class SVM (OCSVM) ROC:0.4972, precision @ rank n:0.0, execution time: 4.0066s
Principal Component Analysis (PCA) ROC:0.504, precision @ rank n:0.0, execution time: 0.1431s

... Processing pendigits.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.7008, precision @ rank n:0.0308, execution time: 5.4205s
Cluster-based Local Outlier Factor ROC:0.9609, precision @ rank n:0.3077, execution time: 0.9799s
Feature Bagging ROC:0.4687, precision @ rank n:0.0462, execution time: 16.2959s
Histogram-base Outlier Detection (HBOS) ROC:0.9294, precision @ rank n:0.2615, execution time: 0.0363s
Isolation Forest ROC:0.9422, precision @ rank n:0.2769, execution time: 2.8508s
K Nearest Neighbors (KNN) ROC:0.7602, precision @ rank n:0.0462, execution time: 2.4292s
Local Outlier Factor (LOF) ROC:0.481, precision @ rank n:0.0462, execution time: 1.8705s
Minimum Covariance Determinant (MCD) ROC:0.8271, precis



Minimum Covariance Determinant (MCD) ROC:0.9903, precision @ rank n:0.7534, execution time: 42.5197s
One-class SVM (OCSVM) ROC:0.9922, precision @ rank n:0.9553, execution time: 153.5508s
Principal Component Analysis (PCA) ROC:0.9902, precision @ rank n:0.9503, execution time: 0.1027s

... Processing vertebral.mat ...
Angle-based Outlier Detector (ABOD) ROC:0.2797, precision @ rank n:0.0, execution time: 0.1835s
Cluster-based Local Outlier Factor ROC:0.3908, precision @ rank n:0.0, execution time: 0.1519s
Feature Bagging ROC:0.3027, precision @ rank n:0.0, execution time: 0.1179s
Histogram-base Outlier Detection (HBOS) ROC:0.2695, precision @ rank n:0.0, execution time: 0.009s
Isolation Forest ROC:0.3576, precision @ rank n:0.0, execution time: 1.0677s
K Nearest Neighbors (KNN) ROC:0.318, precision @ rank n:0.0, execution time: 0.0399s
Local Outlier Factor (LOF) ROC:0.318, precision @ rank n:0.0, execution time: 0.0122s
Minimum Covariance Determinant (MCD) ROC:0.3308, precision @ rank 

In [14]:
roc_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,FEATUREBAGGING
0,arrhythmia,452,274,14.6018,0.7687,0.7684,0.7799,0.8511,0.8527,0.782,0.7787,0.8228,0.7986,0.7997
0,cardio,1831,21,9.6122,0.5763,0.8221,0.4879,0.8453,0.9414,0.6959,0.4715,0.8778,0.9507,0.9638
0,glass,214,9,4.2056,0.7104,0.8506,0.7043,0.6524,0.7195,0.7805,0.7774,0.7165,0.6189,0.622
0,ionosphere,351,33,35.8974,0.9004,0.8952,0.8933,0.5195,0.8309,0.9134,0.8989,0.9399,0.8372,0.7971
0,letter,1600,32,6.25,0.8465,0.7423,0.866,0.5728,0.5778,0.845,0.8409,0.7499,0.5744,0.48
0,lympho,148,18,4.0541,0.9382,0.9709,0.9673,0.9964,0.9855,0.9636,0.9636,0.9164,0.9636,0.9818
0,mnist,7603,100,9.2069,0.7813,0.8447,0.7259,0.5675,0.7801,0.8409,0.7085,0.863,0.8417,0.8396
0,musk,3062,166,3.1679,0.0809,1.0,0.5228,0.9999,0.9996,0.7348,0.5323,1.0,1.0,1.0
0,optdigits,5216,64,2.8758,0.4428,0.7852,0.4641,0.8822,0.5764,0.3824,0.4584,0.3486,0.4972,0.504
0,pendigits,6870,16,2.2707,0.7008,0.9609,0.4687,0.9294,0.9422,0.7602,0.481,0.8271,0.93,0.9332


In [15]:
prn_df

Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,FEATUREBAGGING
0,arrhythmia,452,274,14.6018,0.3571,0.4643,0.5,0.5714,0.5714,0.5,0.4643,0.4286,0.5,0.5
0,cardio,1831,21,9.6122,0.1875,0.4844,0.1406,0.4688,0.5,0.2812,0.125,0.3906,0.5938,0.6875
0,glass,214,9,4.2056,0.25,0.25,0.25,0.0,0.25,0.25,0.25,0.0,0.25,0.25
0,ionosphere,351,33,35.8974,0.8214,0.8036,0.75,0.3393,0.6607,0.8393,0.75,0.8571,0.7143,0.5893
0,letter,1600,32,6.25,0.275,0.175,0.4,0.125,0.05,0.3,0.325,0.075,0.1,0.05
0,lympho,148,18,4.0541,0.4,0.6,0.6,0.8,0.6,0.6,0.6,0.6,0.6,0.8
0,mnist,7603,100,9.2069,0.3562,0.4007,0.3664,0.1199,0.2979,0.4144,0.339,0.3973,0.3801,0.3767
0,musk,3062,166,3.1679,0.0333,1.0,0.1667,0.9667,0.9333,0.2333,0.1333,0.9667,1.0,1.0
0,optdigits,5216,64,2.8758,0.0161,0.0,0.0484,0.2581,0.0161,0.0,0.0484,0.0,0.0,0.0
0,pendigits,6870,16,2.2707,0.0308,0.3077,0.0462,0.2615,0.2769,0.0462,0.0462,0.0615,0.2923,0.3385


In [16]:
time_df






Unnamed: 0,Data,#Sample,#Dimensions,Outlier Perc,PCA,MCD,OCSVM,LOF,CBLOF,KNN,HBOS,ABOD,IFOREST,FEATUREBAGGING
0,arrhythmia,452,274,14.6018,4.6594,6.808,1.6311,5.888,1.5585,0.2946,0.2134,2.0219,0.1219,0.1645
0,cardio,1831,21,9.6122,1.3468,0.6567,2.3665,0.0219,1.4207,0.5057,0.2893,2.4447,0.2227,0.0161
0,glass,214,9,4.2056,0.1927,0.1939,0.1483,0.0113,1.1497,0.0311,0.01,0.1141,0.005,0.009
0,ionosphere,351,33,35.8974,0.2831,0.1634,0.2362,0.023,1.1082,0.0817,0.0232,0.2877,0.0128,0.0102
0,letter,1600,32,6.25,1.5444,0.5076,2.4443,0.0356,1.4684,0.4731,0.2896,4.3769,0.2379,0.0149
0,lympho,148,18,4.0541,0.1194,0.198,0.1227,0.0177,1.2208,0.0289,0.0092,0.1276,0.007,0.0034
0,mnist,7603,100,9.2069,22.0413,3.3012,152.569,0.1979,6.531,19.7327,18.8933,9.2717,12.5478,0.3844
0,musk,3062,166,3.1679,7.0569,1.3633,45.1093,0.2219,4.8328,5.9107,5.3366,49.3834,3.7448,0.4357
0,optdigits,5216,64,2.8758,11.192,1.7423,46.8778,0.1498,3.5445,5.8616,5.075,4.8701,4.0066,0.1431
0,pendigits,6870,16,2.2707,5.4205,0.9799,16.2959,0.0363,2.8508,2.4292,1.8705,7.0905,2.9453,0.0449
