In [2]:
import numpy as np
import pandas as pd 
import os
import sys

from sklearn.model_selection import train_test_split
from scipy.io import loadmat


In [3]:
#import Pyod
from pyod.models.pca import PCA
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.lof import LOF
from pyod.models.cblof import CBLOF
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.feature_bagging import FeatureBagging



In [4]:
#Import metrics Packages
from pyod.utils.utility import standardizer
from pyod.utils.utility import precision_n_scores
from sklearn.metrics import roc_auc_score

In [5]:
#load the file
mat_list = ["arrhythmia.mat",
           "cardio.mat",
           "glass.mat",
           "ionosphere.mat",
           "letter.mat",
           "lympho.mat",
           "mnist.mat","musk.mat","optdigits.mat","pendigits.mat","pima.mat","satellite.mat","satimage-2.mat",
            "shuttle.mat","vertebral.mat","vowels.mat","wbc.mat"]

In [6]:
Data = loadmat("cardio.mat")
Data

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [7]:
len(Data)

5

In [8]:
Data.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [9]:
Data["X"].shape

(1831, 21)

In [10]:
df_columns = ["Data","#samples","#Dimension","Outlier Perc", "ABOD","CBLOF","FB","HBOS", "IForest", "KNN", "LOF", "MCD",
              "OCSVM", " PCA"]

In [11]:
roc_df = pd.DataFrame(columns=df_columns)
pre_df = pd.DataFrame(columns=df_columns)
time_df = pd.DataFrame(columns=df_columns)

In [12]:
roc_df

Unnamed: 0,Data,#samples,#Dimension,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA


In [46]:
from time import time

for mat_f in mat_list:
    print("\n ..... Processing...", mat_f, "....")
    mat = loadmat(os.path.join(mat_f))
    #print(mat)
    X = mat["X"]
    Y = mat["y"].ravel()
    outlires_fraction = np.count_nonzero(Y) / len(Y)
    outliers_percentage = round(outlires_fraction * 100,ndigits=4)
    
    #construct a containers
    #print(X.shape)
    #print(Y)
    
    roc_list = [mat_f.split(".")[0], X.shape[0], X.shape[1],outliers_percentage] #mat_f[:-4]
    pre_list = [mat_f.split(".")[0], X.shape[0], X.shape[1],outliers_percentage]
    time_list = [mat_f.split(".")[0], X.shape[0], X.shape[1],outliers_percentage]
    
    X_train,X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.4, random_state=42)
    
    #standarsing data
    X_train_norm, X_test_norm = standardizer(X_train,X_test)
    
    
    Classifer = {"Angle-based outlier Detector ABOD" : ABOD(contamination= outlires_fraction),
                "Cluster based local outlier factor CBlOF" : CBLOF(contamination= outlires_fraction,check_estimator=False,random_state=42),
                "Feature Bagging" : FeatureBagging(contamination= outlires_fraction,random_state=42),
                "Histogram-based Outlier Detection HBOS" : HBOS(contamination= outlires_fraction),
                 "Isolation Forest" : IForest(contamination= outlires_fraction,random_state=42),
                 "K nearest Neighbors KNN" : KNN(contamination= outlires_fraction),
                 "Local Outlier Factor LOF" : LOF(contamination= outlires_fraction),
                 "Minimun Covariance Determinat MCD " : MCD(contamination= outlires_fraction,random_state=42),
                 "One-class SVM OCSVM" : OCSVM(contamination= outlires_fraction),
                 "Principal Component Analysis PCS" : PCA(contamination= outlires_fraction,random_state=42)
                                }
    for clf_name, clf in Classifer.items():
        t0 = time()
        clf.fit(X_train_norm)
        test_score = clf.decision_function(X_test_norm)
        #print(test_score)
        t1 = time()
        duration = round(t1-t0, ndigits=4)
        time_list.append(duration)
    
        roc = round(roc_auc_score(Y_test, test_score), ndigits=4)
        prn = round(precision_n_scores(Y_test, test_score), ndigits=4)
    
        roc_list.append(roc)
        pre_list.append(prn)

    temp_df = pd.DataFrame(time_list).transpose()
    temp_df.columns = df_columns
    time_df = pd.concat([time_df,temp_df],axis=0)
    
    temp_df = pd.DataFrame(roc_list).transpose()
    temp_df.columns = df_columns
    roc_df = pd.concat([roc_df,temp_df],axis=0)
    
    temp_df = pd.DataFrame(pre_list).transpose()
    temp_df.columns = df_columns
    pre_df = pd.concat([pre_df,temp_df],axis=0)
    


 ..... Processing... arrhythmia.mat ....





 ..... Processing... cardio.mat ....





 ..... Processing... glass.mat ....





 ..... Processing... ionosphere.mat ....

 ..... Processing... letter.mat ....





 ..... Processing... lympho.mat ....





 ..... Processing... mnist.mat ....





 ..... Processing... musk.mat ....





 ..... Processing... optdigits.mat ....





 ..... Processing... pendigits.mat ....





 ..... Processing... pima.mat ....





 ..... Processing... satellite.mat ....





 ..... Processing... satimage-2.mat ....





 ..... Processing... shuttle.mat ....









 ..... Processing... vertebral.mat ....





 ..... Processing... vowels.mat ....





 ..... Processing... wbc.mat ....




In [44]:
time_df

Unnamed: 0,Data,#samples,#Dimension,Outlier Perc,ABOD,CBLOF,FB,HBOS,IForest,KNN,LOF,MCD,OCSVM,PCA
0,wbc,378,30,5.5556,0.0565,0.047,0.0889,0.008,0.2254,0.0156,0.0,0.0624,0.0156,0.0
0,arrhythmia,452,274,14.6018,0.2599,0.1135,0.6131,0.0524,0.3423,0.0794,0.0723,0.7415,0.0481,0.0639
0,cardio,1831,21,9.6122,0.5026,0.1046,0.7868,0.0042,0.3417,0.1599,0.1039,0.6719,0.08,0.0079
0,glass,214,9,4.2056,0.048,0.0483,0.032,0.0,0.3836,0.0081,0.0,0.0401,0.0,0.0
0,ionosphere,351,33,35.8974,0.0628,0.048,0.0823,0.0078,0.2882,0.016,0.008,0.064,0.0079,0.0
0,letter,1600,32,6.25,0.352,0.0937,0.6526,0.008,0.3231,0.1261,0.1039,1.1425,0.064,0.0081
0,lympho,148,18,4.0541,0.024,0.04,0.0313,0.008,0.3397,0.009,0.0,0.0321,0.0,0.0
0,mnist,7603,100,9.2069,7.2134,0.7083,43.2956,0.048,1.6758,5.779,5.4907,2.8247,3.6264,0.1219
0,musk,3062,166,3.1679,2.0221,0.2217,10.7559,0.04,1.083,1.6249,1.4929,11.6152,1.1165,0.1311
0,optdigits,5216,64,2.8758,2.2693,0.2561,12.0859,0.024,0.8329,1.5397,1.4786,1.3552,1.1523,0.0399


Project Day 1 Agenda :
Anamoly Detection In Machine Learning
Types of Anomaly Detection
PyOD
Benchmark of Various outlier detection models
Model Building Using
Anomaly Detection In Machine Learning :
So far we have learnt EDA,supervised and unsupervised learning .
WE will get into a real business problem .
We will learn to build real time project .
We will do the first project on Anomaly detection or something Abnormal .
Anomaly is referred to as uncertain behavior .
Objective is identify the anomaly .
In Medical industry Anomaly is a miracle .
In IT industry if there is any attack on the system we will get an abnormal response from our system 
Novelty/Outlier/Forgery and out of distribution detection are all same.
Hawkins defined anomaly as â€œan abbreviation which deviates sp much from the other observations as to arouse suspicious that it was generated by a different mechanism .
Anomaly detection has received considerable attention in the field of data mining due to the valuable insight that the detection of unusual events can provide in a variety of application 
WE can use this to detect faulty sensor.
In anomaly detection Domain Knowledge is very important .
An example is the breed of dogs if a new species is seen it is novrl class and if any other breed comes in it is outlier .
Types of Anomaly Detection:
Time-series anomaly is like attack on a system .
Video-Level Detection:In Banks,ATM and other important places in cctv recording we can set certain limits and category as to if someone does any unwanted behvior it will set an alarm .
Image-level detection:Can be used in cases where human cannot check the similarity n 2 two images we can find the percentage of similarity in 2 pictures .There are 3 types of categories .
Anomaly Classification target.
Out-of-Distribution Detection.
Anomaly Segmentation Target .
PyOD:
PyOD is a comprehensive and scaleable python toolkit for detecting outlying objects in multivariate data .
It was developed back in 2017 and has been used in many academic research and commercial products .
PyOD Uses:
It is featured for Unified APIs ,detailed documentation and interactive examples across Various Algorithms.
In Advanced models,includinf Neural Networks and outlier Ensembles
Optimized performance with JIT and parallelization when possible,using namba and joblib.
Benchmark of Various outlier detection models:
Linear Models for Outlier Detection :Wehn one increases or decreases with respect to other it is linear.
Principal Component Analysis :Based on the contribution can we remove any of the feature and choose the most important ones is PCA.
Minimum Covariance Determinant :Covariance is the difference bet std deviation and other variance values .So using a limit from the midpoint we can detect outliers after a range.
One-Class Support Vector Machine:WE can take all the inliner and remove the uncertain and out of the line problems.
Proximity Based Outlier Detection Models :Using the proximity to detect the outliers
Local Outlier Factor 
Clustering Based LOF
KNN
Histogram Based Outlier Score
Probability Model for outlier Detection:
Angle-Based Outlier Detection
Ensemble and combination Framework
Isolation Forest
Feature bagging 