In [1]:
import numpy as np
import pandas as pd
from numpy import percentile
import matplotlib.pyplot as plt
import matplotlib.font_manager
from pyod.utils.utility import standardizer
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
import multiprocessing as mp
import time


# Import all models
from sklearn.model_selection import train_test_split
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.lscp import LSCP
from pyod.models.lmdd import LMDD
from pyod.models.cof import COF
from pyod.models.loci import LOCI
from pyod.models.sod import SOD
from pyod.models.rod import ROD
from pyod.models.ecod import ECOD
from pyod.models.copod import COPOD
from pyod.models.loda import LODA
from pyod.models.mad import MAD
from pyod.models.auto_encoder import AutoEncoder
from pyod.models.deep_svdd import DeepSVDD
from pyod.models.mo_gaal import MO_GAAL
from pyod.models.so_gaal import SO_GAAL
from pyod.models.vae import VAE
from pyod.models.sos import SOS

In [None]:
training_set = pd.read_csv('yahoo_sub_5_0.05.csv')

In [None]:
!pip install ipython-autotime
%load_ext autotime

In [3]:
random_state = np.random.RandomState(42)
outliers_fraction = 0.05
#detector_list = [LOF(n_neighbors=5), LOF(n_neighbors=10), LOF(n_neighbors=15),
#                 LOF(n_neighbors=20), LOF(n_neighbors=25), LOF(n_neighbors=30),
#                 LOF(n_neighbors=35), LOF(n_neighbors=40), LOF(n_neighbors=45),
#                 LOF(n_neighbors=50)]

detector_list = [LOF(n_neighbors=5), LOF(n_neighbors=10), LOF(n_neighbors=15)]

# Define nine outlier detection tools to be compared
classifiers = {
    ## TRADITIONAL ##
    ## Linear Models for Outlier Detection: ##
    'Principal Component Analysis (PCA)': PCA(
        contamination=outliers_fraction, random_state=random_state),
    'Minimum Covariance Determinant (MCD)': MCD(
        contamination=outliers_fraction, random_state=random_state),
    'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
    ## Proximity-Based Outlier Detection Models: ##
    'Local Outlier Factor (LOF)':
        LOF(n_neighbors=10, contamination=outliers_fraction),
    'K Nearest Neighbors (KNN)': KNN(
        contamination=outliers_fraction),
    'Average KNN': KNN(method='mean',
                       contamination=outliers_fraction),
    'Histogram-base Outlier Detection (HBOS)': HBOS(
        contamination=outliers_fraction),
    'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False, random_state=random_state),
    ## Probabilistic Models for Outlier Detection: ##
    'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
    ## Outlier Ensembles and Combination Frameworks: ##
    'Isolation Forest': IForest(contamination=outliers_fraction,
                                random_state=random_state),
    'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=10),
                       contamination=outliers_fraction,
                       random_state=random_state),
    'Locally Selective Combination (LSCP)': LSCP(
        detector_list, contamination=outliers_fraction,
        random_state=random_state),
    
    ## STATE OF THE ART ##
    ## Linear Models for Outlier Detection: ##
    #'Deviation-based Outlier Detection (LMDD)' : LMDD(
    #    contamination=outliers_fraction, random_state=random_state),
    ## Proximity-Based Outlier Detection Models: ##
    'Connectivity-Based Outlier Factor (COF)' : 
        COF(n_neighbors=10, contamination=outliers_fraction),
    'Median kNN': KNN(method='median',
                       contamination=outliers_fraction),
    #'SOD': SOD(contamination=outliers_fraction, n_neighbors=35),
    ## Probabilistic Models for Outlier Detection: ##
    #'ecod': ECOD(contamination=outliers_fraction),
    'COPOD: Copula-Based Outlier Detection': COPOD(contamination=outliers_fraction),
    'SOS':  SOS(contamination=outliers_fraction, perplexity=10),
    ## Outlier Ensembles and Combination Frameworks: ##
    'loda': LODA(contamination=outliers_fraction),
}

In [4]:
list1 = ['PCA', 'MCD', 'OCSVM', 'LOF', 'KNN', 'AvgKNN', 'HBOS','CBLOF', 'ABOD', 'IForest', 'FB', 'LSCP', 'COF', 'MedKNN', 'COPOD', 'SOS', 'LODA']

In [45]:
## SEQUENTIAL ##

start = time.time()

results = []

for ii in training_set:
    
    m = len(ii.columns) - 1
    
    X = ii.iloc[:,0:m]
    y = ii.iloc[:,m]
    
    
    X_norm = pd.DataFrame(standardizer(X))
    #X_norm = X
    
    results = pd.DataFrame(list1, columns = ['Method'] )
    results['Recall'] = ''
    results['Precision'] = ''
    results['F1_Macro'] = ''
    results['AUC'] = ''
                  
    for i, (clf_name, clf) in enumerate(classifiers.items()):
        # fit the data and tag outliers
        clf.fit(X_norm)
    
        # get the prediction on the test data
        y_pred = clf.predict(X_norm)  # outlier labels (0 or 1)
    
        results.at[i, 'Recall'] = recall_score(y, y_pred)  
        results.at[i, 'Precision'] = precision_score(y, y_pred)
        results.at[i, 'F1_Macro'] = f1_score(y, y_pred, average='macro') 
        results.at[i, 'AUC'] = roc_auc_score(y, y_pred)
        #print(results)
    
    # changes based on metric
    best_method_results = results[results.F1_Macro == results.F1_Macro.max()]

    results.append(best_method_results)
    
end = time.time()

  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(resu

In [46]:
end-start

594.2008852958679

In [10]:
import workers

In [47]:
# check using parallel processing

all_results = []

start = time.time()

if __name__ == '__main__':
    with mp.Pool() as pool:
        for result in pool.map (workers.versus, training_set):
            print(result)
            all_results.append(result)
            
end = time.time()

   Method Recall Precision  F1_Macro      AUC
5  AvgKNN    1.0  0.719298  0.913097  0.98957
   Method Recall Precision  F1_Macro       AUC
5  AvgKNN    1.0  0.759259  0.927305  0.991525
   Method Recall Precision  F1_Macro       AUC
5  AvgKNN    1.0  0.630769  0.878845  0.984355
   Method Recall Precision  F1_Macro       AUC
5  AvgKNN    1.0  0.650794  0.886956  0.985658
   Method Recall Precision  F1_Macro       AUC
5  AvgKNN    1.0  0.706897  0.908538  0.988918
   Method Recall Precision  F1_Macro       AUC
5  AvgKNN    1.0  0.773585  0.932228  0.992177
   Method Recall Precision  F1_Macro       AUC
5  AvgKNN    1.0  0.732143  0.917743  0.990222
   Method Recall Precision  F1_Macro       AUC
5  AvgKNN    1.0  0.706897  0.908538  0.988918
   Method Recall Precision F1_Macro       AUC
5  AvgKNN    1.0  0.683333  0.89967  0.987614
   Method Recall Precision  F1_Macro       AUC
5  AvgKNN    1.0  0.745455  0.922478  0.990874
   Method Recall Precision F1_Macro       AUC
5  AvgKNN    1.0  

In [48]:
end-start

308.53568983078003

In [53]:
df = pd.read_csv('skab2_0.10.csv')

568

In [54]:
#Search Strategy

import workers


frequency = 10
min_chunk = 30
chunk_list = []
checkpt = frequency

one = 0
two = min_chunk
three = min_chunk
four = len(df)

while checkpt < (len(df)-min_chunk):

    group1 = df.iloc[one:two,:]
    group2 = df.iloc[three:four,:]
    args = [group1, group2]

    if group1['anomaly'].nunique() == 1:
        two = two + frequency
        three = two
        checkpt = three

    elif group2['anomaly'].nunique() == 1:
        checkpt = len(df)-min_chunk

    else:

        all_results = []

        if __name__ == '__main__':
            with mp.Pool() as pool:
                for result in pool.map (workers.versus, args):
                    all_results.append(result)

        group1_results = all_results[0]
        group2_results = all_results[1]

        group1_best_methods = group1_results.Method.values.tolist()
        group2_best_methods = group2_results.Method.values.tolist()

        check =  any(item in group1_best_methods for item in group2_best_methods)

        if check is True:
            two = two + frequency
            three = two

        else:
            if chunk_list:
                last_results = (chunk_list or [None])[-1]
                last_results_methods = last_results.Method.values.tolist()

                check_last = any(item in group1_best_methods for item in last_results_methods)

                if check_last is True:
                    del chunk_list[-1]
                    first = last_results.iloc[0]['One']
                    group = df.iloc[first:two,:]
                    args = [group]

                    if __name__ == '__main__':
                        with mp.Pool() as pool:
                            for result in pool.map (workers.versus, args):
                                result['One'] = first
                                result['Two'] = two
                                chunk_list.append(result)

                else:
                    group1_results['One'] = one
                    group1_results['Two'] = two
                    chunk_list.append(group1_results)

            if not chunk_list:
                group1_results['One'] = one
                group1_results['Two'] = two
                chunk_list.append(group1_results)

            one = two
            two = two + min_chunk
            three = two
        checkpt = three
    print(check, one, two, chunk_list)


if checkpt >= (len(df)-min_chunk):
    group1 = df.iloc[one:four,:]
    args = [group1]

    if group1['anomaly'].nunique() == 1:
        last_results = (chunk_list or [None])[-1]
        one = last_results['One']
        del chunk_list[-1]

        group1 = df.iloc[one:four,:]
        args = [group1]

        if __name__ == '__main__':
            with mp.Pool() as pool:
                for result in pool.map (workers.versus, args):
                    result['One'] = one
                    result['Two'] = four
                    chunk_list.append(result)

    else:
        if __name__ == '__main__':
            with mp.Pool() as pool:
                for result in pool.map (workers.versus, args):
                    result['One'] = one
                    result['Two'] = four
                    chunk_list.append(result)


False 30 60 [   Method Recall Precision F1_Macro  AUC  One  Two
5  AvgKNN    1.0       1.0      1.0  1.0    0   30]
True 30 70 [   Method Recall Precision F1_Macro  AUC  One  Two
5  AvgKNN    1.0       1.0      1.0  1.0    0   30]
False 70 100 [   Method Recall Precision F1_Macro  AUC  One  Two
5  AvgKNN    1.0       1.0      1.0  1.0    0   30,     Method Recall Precision F1_Macro  AUC  One  Two
12  MedKNN    1.0       1.0      1.0  1.0   30   70]
True 70 110 [   Method Recall Precision F1_Macro  AUC  One  Two
5  AvgKNN    1.0       1.0      1.0  1.0    0   30,     Method Recall Precision F1_Macro  AUC  One  Two
12  MedKNN    1.0       1.0      1.0  1.0   30   70]
True 70 120 [   Method Recall Precision F1_Macro  AUC  One  Two
5  AvgKNN    1.0       1.0      1.0  1.0    0   30,     Method Recall Precision F1_Macro  AUC  One  Two
12  MedKNN    1.0       1.0      1.0  1.0   30   70]
True 70 130 [   Method Recall Precision F1_Macro  AUC  One  Two
5  AvgKNN    1.0       1.0      1.0  1.0 

In [55]:
len(chunk_list)

8

In [56]:
chunk_list

[   Method Recall Precision F1_Macro  AUC  One  Two
 5  AvgKNN    1.0       1.0      1.0  1.0    0   30,
     Method Recall Precision F1_Macro  AUC  One  Two
 12  MedKNN    1.0       1.0      1.0  1.0   30   70,
   Method Recall Precision  F1_Macro       AUC  One  Two
 3    LOF    0.8       0.8  0.890909  0.890909   70  130,
      Method Recall Precision F1_Macro  AUC  One  Two
 8   IForest    1.0       1.0      1.0  1.0  130  170
 13    COPOD    1.0       1.0      1.0  1.0  130  170,
   Method    Recall Precision  F1_Macro       AUC  One  Two
 6   HBOS  0.818182  0.782609  0.889157  0.897072  170  400,
   Method Recall Precision F1_Macro  AUC  One  Two
 9     FB    1.0       1.0      1.0  1.0  400  430,
    Method    Recall Precision  F1_Macro       AUC  One  Two
 1     MCD  0.583333  0.777778  0.811321  0.778846  430  520
 6    HBOS  0.583333  0.777778  0.811321  0.778846  430  520
 13  COPOD  0.583333  0.777778  0.811321  0.778846  430  520,
   Method Recall Precision  F1_Macro     

In [14]:
## Weighted average

recall = []
precision = []
f1_macro = []
auc = []

for i in range (0,len(chunk_list)):
    current = chunk_list[i]
    current.reset_index(drop=True, inplace=True)
    recall.append(current.at[0, 'Recall'])
    precision.append(current.at[0, 'Precision'])
    f1_macro.append(current.at[0, 'F1_Macro'])
    auc.append(current.at[0, 'AUC'])
    
recall_avg = sum(recall)/len(recall)
precision_avg = sum(precision)/len(precision)
f1_macro_avg = sum(f1_macro)/len(f1_macro)
auc_avg = sum(auc)/len(auc)

NameError: name 'chunk_list' is not defined

In [30]:
print(recall_avg, precision_avg, f1_macro_avg, auc_avg)

0.8533143939393939 0.9200483091787439 0.9322411959488569 0.9224158653846154
