In [1]:
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
#         Albert Thomas <albert.thomas@telecom-paristech.fr>
# License: BSD 3 clause

import time
from sklearn import svm
from sklearn.datasets import make_moons, make_blobs
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

print(__doc__)

Automatically created module for IPython interactive environment


In [170]:
from sklearn.metrics import confusion_matrix

In [2]:
import numpy as np
import data_manipulation as dm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, normalize
from tsfresh.utilities.dataframe_functions import impute
import ADP

In [3]:
def ADP_Offline_Granularity_Iteration_5th(static, streaming, gra, b_test, n_i):
    begin = dm.datetime.now()

    L1, W = static.shape
    L2, _ = streaming.shape

    ##################################
    ##### ----- STATIC ADP ----- #####
    ##### ---------------------- #####

    Input1 = {'data': np.vstack((static,streaming)),
             'granularity': gra,
             'distancetype': 'euclidean'}
    
    Input2 = {'data': static,
             'granularity': gra,
             'distancetype': 'euclidean'}
    
    Input3 = {'data': streaming,
             'granularity': gra,
             'distancetype': 'euclidean'}
            
    ADP_output = ADP.ADP(Input1, 'Offline')
    
    # Computing the number of clouds
    ADP_output['n_data_clouds'] = max(ADP_output['IDX']) + 1

    ADP_static_output = ADP.ADP(Input2, 'Offline')
    
    # Computing the number of clouds
    ADP_static_output['n_data_clouds'] = max(ADP_static_output['IDX']) + 1

    ADP_streaming_output = ADP.ADP(Input3, 'Offline')

    # Computing the number of clouds
    ADP_streaming_output['n_data_clouds'] = max(ADP_streaming_output['IDX']) + 1
    
    Output = {'ADP_output': ADP_output,
             'ADP_static_output': ADP_static_output,
             'ADP_streaming_output': ADP_streaming_output}
    
    return Output

In [4]:
if __name__ == '__main__':
    ##########################################################
    # ------------------------------------------------------ #
    # --------------------- INITIATION --------------------- #
    # ------------------------------------------------------ #
    ##########################################################
    ### Define User Variables ###

    # List of Granularities
    gra_list = [i for i in range(1,11)]

    # Number of Iterations
    iterations = 1

    # Number of events
    total = 10000

    # Number of Data-set divisions
    windows = 100

    # Percentage of background samples on the testing phase
    background_percent = 0.99

    # Percentage of samples on the training phase
    test_size = 0.3
    

In [5]:
if __name__ == '__main__':
    ##########################################################
    # ------------------------------------------------------ #
    # ----------------------- LOADING ---------------------- #
    # ------------------------------------------------------ #
    ##########################################################
    # Firstly the model loads the background and signal data, 
    # then it removes the attributes first string line, which 
    # are the column names, in order to avoid NaN values in 
    # the array.

    print('         ==== Commencing Initiation ====\n')

    ### Background    
    b_name='Input_Background_1.csv'
    background = np.genfromtxt(b_name, delimiter=',')
    background = background[1:,:]
    print("     .Background Loaded..." )
    print("     .Background shape: {}".format(background.shape))

    ### Signal
    s_name='Input_Signal_1.csv'
    signal = np.genfromtxt(s_name, delimiter=',')
    signal = signal[1:,:]
    print("     .Signal Loaded...")
    print("     .Signal shape: {}\n".format(signal.shape))

    print('\n          ==== Initiation Complete ====\n')
    print('=*='*17 )
    print('      ==== Commencing Data Processing ====')

         ==== Commencing Initiation ====

     .Background Loaded...
     .Background shape: (543500, 21)
     .Signal Loaded...
     .Signal shape: (522467, 21)


          ==== Initiation Complete ====

=*==*==*==*==*==*==*==*==*==*==*==*==*==*==*==*==*=
      ==== Commencing Data Processing ====


In [6]:
if __name__ == '__main__':
    for n_i in range(iterations):
        print('\n     => Iteration Number', (n_i+1) )

        # Divide data-set into training and testing sub-sets
        print('         .Dividing training and testing sub-sets')
        divided_background, _ = dm.divide(background, windows, total)

        test = int(total*test_size)
        b_test = int(test*background_percent)
        static_data_raw, background_test = train_test_split(divided_background, test_size=test_size, random_state=42)
        background_test, _ = dm.divide(background_test, windows, b_test)

        # Defining number of events Signal events on online phase.
        signal_online_samples = int(test - b_test)
        reduced_signal, _ = dm.divide(signal, windows, signal_online_samples)

        print('         .Selecting Signal on the following porpotion:')
        print('             .{}% Background samples'.format(int(background_percent*100)))
        print('             .{}% Signal samples'.format(int((1-background_percent)*100)))
        print('             .{:9d} of Background samples (Offline)'.format(int(total*(1-test_size))))
        print('             .{:9d} of Background samples (Online)'.format(int(b_test)) )
        print('             .{:9d} of Signal samples (Online)'.format(int(signal_online_samples)))

        # Concatenating Signal and the Test Background sub-set
        streaming_data_raw = np.concatenate((background_test,reduced_signal), axis=0)
        print("             .Offline shape: {}".format(static_data_raw.shape))
        print("             .Online shape: {}\n".format(streaming_data_raw.shape))

        # Normalize Data
        print('         .Normalizing Data')
        static_data = normalize(static_data_raw,norm='max',axis=0)
        streaming_data = normalize(streaming_data_raw,norm='max',axis=0)
        
        ADP_outputs = {}
        
        print('             .Executing for granularities', gra_list)
        for gra in gra_list:
            print('\n\n             .Iter: {} - Granularity: {}'.format(n_i, gra))
            print('                 .ADP (5th Method)')
            output = ADP_Offline_Granularity_Iteration_5th(static_data, streaming_data, gra, b_test, n_i)
            ADP_outputs ['granularity_'+str(gra)] = output

    print('\n        ====Data Processing Complete====\n' )
    print('=*='*17 ) 


     => Iteration Number 1
         .Dividing training and testing sub-sets
         .Selecting Signal on the following porpotion:
             .99% Background samples
             .1% Signal samples
             .     7000 of Background samples (Offline)
             .     2970 of Background samples (Online)
             .       30 of Signal samples (Online)
             .Offline shape: (7000, 21)
             .Online shape: (3000, 21)

         .Normalizing Data
             .Executing for granularities [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


             .Iter: 0 - Granularity: 1
                 .ADP (5th Method)


             .Iter: 0 - Granularity: 2
                 .ADP (5th Method)


             .Iter: 0 - Granularity: 3
                 .ADP (5th Method)


             .Iter: 0 - Granularity: 4
                 .ADP (5th Method)


             .Iter: 0 - Granularity: 5
                 .ADP (5th Method)


             .Iter: 0 - Granularity: 6
                 .ADP (5th Method)


In [177]:
def probability(data):
    L = len(data)
    mu = np.mean(data, axis=0)
    variance = np.mean((data - mu)**2, axis=0)/L
    var_dia = np.diag(variance)
    k = len(mu)
    X = data - mu

    if np.prod(variance) == 0:
        p = np.array(L*[0])
    else:
        p = 1/((2*np.pi)**(k/2)*(np.linalg.det(var_dia)**0.5))* np.exp(-0.5* np.sum(X @ np.linalg.pinv(var_dia) * 
                                                                                    X,axis=1))
    
    #print('-----')
    #print(mu.shape)
    #print(variance.shape)
    #print(var_dia.shape)
    #print('var - ',variance)
    #print('det - ', np.linalg.det(var_dia)**0.5)
    #print('fat - ', 1/((2*np.pi)**(k/2)*(np.linalg.det(var_dia)**0.5)))
    #print('sum - ',X @ np.linalg.pinv(var_dia) * X )
    #print('pot - ', np.sum(X @ np.linalg.pinv(var_dia) * X,axis=1))
    #print('exp - ', np.exp(-0.5* np.sum(X @ np.linalg.pinv(var_dia) * X,axis=1)) )
    #print('p - ', p)
    return p

In [180]:
#### divide data clouds data anda compute probability

data_clouds_data = {}

for gra in ADP_outputs.keys():
    idx = ADP_outputs[gra]['ADP_streaming_output']['IDX']
    u = np.unique(idx)
    dc_dic = {}
    for i in u:
        data = []
    
        for j in range(len(idx)):
            if idx[j] == i:
                data.append(list(streaming_data[j]))
        
        dc_dic[i] = np.nan_to_num(probability(data))
    data_clouds_data[gra] = dc_dic
    

In [183]:
L2, _ = streaming_data.shape
target = np.zeros(L2)
target[b_test:] = 1

### Make detections with the scikit methods

# Example settings
signal_fraction = 0.01

anomaly_algorithms = [
    ("Robust covariance", EllipticEnvelope(contamination=signal_fraction)),
    ("One-Class SVM", svm.OneClassSVM(nu=signal_fraction, kernel="rbf",
                                      gamma=0.1)),
    ("Isolation Forest", IsolationForest(contamination=signal_fraction,
                                         random_state=42)),
    ("Local Outlier Factor", LocalOutlierFactor(
        n_neighbors=3, contamination=signal_fraction))]

anomaly_algorithms = anomaly_algorithms[1:] # retirei o Robust Covariance pq tava dando um erro pela pouca variação dos
                                            # eventos dentro de um grupo, depois podemos voltar com ele
accuracy_dict = {}

for gra in data_clouds_data:
    acc_dict = {"True_Positive": 0,
                "True_Negative": 0,
                "False_Positive": 0,
                "False_Negative": 0,
                "Time": 0}
    models_dict = {"Robust covariance":acc_dict.copy(),
                   "One-Class SVM": acc_dict.copy(),
                   "Isolation Forest": acc_dict.copy(),
                   "Local Outlier Factor": acc_dict.copy()}

    for i, dc in enumerate(data_clouds_data[gra]):
        X = data_clouds_data[gra][dc].reshape(-1, 1)
        if len(np.unique(X)) != 1:
            for name, algorithm in anomaly_algorithms:
                t0 = time.time()
                #algorithm.fit(X)
                #y_pred = algorithm.predict(X)
                y_pred = algorithm.fit_predict(X)
                
                y_pred[y_pred == 1] = 0
                y_pred[y_pred == -1] = 1
                
                y_target = target[ADP_outputs[gra]['ADP_streaming_output']['IDX'] == i]
                
                tn, fp, fn, tp = confusion_matrix(y_target, y_pred, labels=[0,1]).ravel()
                
                models_dict[name]["True_Positive"] += tp
                models_dict[name]["True_Negative"] += tn
                models_dict[name]["False_Positive"] += fp
                models_dict[name]["False_Negative"] += fn
                
                
                t1 = time.time()
                models_dict[name]["Time"] += t1-t0
                
    accuracy_dict[gra] = models_dict.copy()
    print(models_dict)

{'Robust covariance': {'True_Positive': 0, 'True_Negative': 0, 'False_Positive': 0, 'False_Negative': 0, 'Time': 0}, 'One-Class SVM': {'True_Positive': 30, 'True_Negative': 58, 'False_Positive': 2910, 'False_Negative': 0, 'Time': 0.029787540435791016}, 'Isolation Forest': {'True_Positive': 1, 'True_Negative': 2744, 'False_Positive': 224, 'False_Negative': 29, 'Time': 5.546315431594849}, 'Local Outlier Factor': {'True_Positive': 0, 'True_Negative': 2964, 'False_Positive': 4, 'False_Negative': 30, 'Time': 0.0399327278137207}}




{'Robust covariance': {'True_Positive': 0, 'True_Negative': 0, 'False_Positive': 0, 'False_Negative': 0, 'Time': 0}, 'One-Class SVM': {'True_Positive': 27, 'True_Negative': 263, 'False_Positive': 2688, 'False_Negative': 3, 'Time': 0.05987191200256348}, 'Isolation Forest': {'True_Positive': 4, 'True_Negative': 2508, 'False_Positive': 443, 'False_Negative': 26, 'Time': 16.011354684829712}, 'Local Outlier Factor': {'True_Positive': 1, 'True_Negative': 2899, 'False_Positive': 52, 'False_Negative': 29, 'Time': 0.11822152137756348}}


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [182]:
accuracy_dict

{'granularity_1': {'One-Class SVM': {'True_Positive': 30,
   'True_Negative': 58,
   'False_Positive': 2910,
   'False_Negative': 0,
   'Time': 0.029009103775024414},
  'Isolation Forest': {'True_Positive': 1,
   'True_Negative': 2744,
   'False_Positive': 224,
   'False_Negative': 29,
   'Time': 5.822308778762817},
  'Local Outlier Factor': {'True_Positive': 0,
   'True_Negative': 2964,
   'False_Positive': 4,
   'False_Negative': 30,
   'Time': 0.04284811019897461}},
 'granularity_2': {'One-Class SVM': {'True_Positive': 27,
   'True_Negative': 263,
   'False_Positive': 2688,
   'False_Negative': 3,
   'Time': 0.06191539764404297},
  'Isolation Forest': {'True_Positive': 4,
   'True_Negative': 2508,
   'False_Positive': 443,
   'False_Negative': 26,
   'Time': 17.870941162109375},
  'Local Outlier Factor': {'True_Positive': 1,
   'True_Negative': 2899,
   'False_Positive': 52,
   'False_Negative': 29,
   'Time': 0.13837218284606934}}}