In [122]:
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
#         Albert Thomas <albert.thomas@telecom-paristech.fr>
# License: BSD 3 clause

import time
from sklearn import svm
from sklearn.datasets import make_moons, make_blobs
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

print(__doc__)

Automatically created module for IPython interactive environment


In [109]:
import numpy as np
import data_manipulation as dm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, normalize
from tsfresh.utilities.dataframe_functions import impute
import ADP

In [44]:
def ADP_Offline_Granularity_Iteration_5th(static, streaming, gra, b_test, n_i):
    begin = dm.datetime.now()

    L1, W = static.shape
    L2, _ = streaming.shape

    ##################################
    ##### ----- STATIC ADP ----- #####
    ##### ---------------------- #####

    Input1 = {'data': np.vstack((static,streaming)),
             'granularity': gra,
             'distancetype': 'euclidean'}
    
    Input2 = {'data': static,
             'granularity': gra,
             'distancetype': 'euclidean'}
    
    Input3 = {'data': streaming,
             'granularity': gra,
             'distancetype': 'euclidean'}
            
    ADP_output = ADP.ADP(Input1, 'Offline')
    
    # Computing the number of clouds
    ADP_output['n_data_clouds'] = max(ADP_output['IDX']) + 1

    ADP_static_output = ADP.ADP(Input2, 'Offline')
    
    # Computing the number of clouds
    ADP_static_output['n_data_clouds'] = max(ADP_static_output['IDX']) + 1

    ADP_streaming_output = ADP.ADP(Input3, 'Offline')

    # Computing the number of clouds
    ADP_streaming_output['n_data_clouds'] = max(ADP_streaming_output['IDX']) + 1
    
    Output = {'ADP_output': ADP_output,
             'ADP_static_output': ADP_static_output,
             'ADP_streaming_output': ADP_streaming_output}
    
    return Output

In [45]:
if __name__ == '__main__':
    ##########################################################
    # ------------------------------------------------------ #
    # --------------------- INITIATION --------------------- #
    # ------------------------------------------------------ #
    ##########################################################
    ### Define User Variables ###

    # List of Granularities
    gra_list = [i for i in range(1,11)]

    # Number of Iterations
    iterations = 1

    # Number of events
    total = 10000

    # Number of Data-set divisions
    windows = 100

    # Percentage of background samples on the testing phase
    background_percent = 0.99

    # Percentage of samples on the training phase
    test_size = 0.3
    

In [46]:
if __name__ == '__main__':
    ##########################################################
    # ------------------------------------------------------ #
    # ----------------------- LOADING ---------------------- #
    # ------------------------------------------------------ #
    ##########################################################
    # Firstly the model loads the background and signal data, 
    # then it removes the attributes first string line, which 
    # are the column names, in order to avoid NaN values in 
    # the array.

    print('         ==== Commencing Initiation ====\n')

    ### Background    
    b_name='Input_Background_1.csv'
    background = np.genfromtxt(b_name, delimiter=',')
    background = background[1:,:]
    print("     .Background Loaded..." )
    print("     .Background shape: {}".format(background.shape))

    ### Signal
    s_name='Input_Signal_1.csv'
    signal = np.genfromtxt(s_name, delimiter=',')
    signal = signal[1:,:]
    print("     .Signal Loaded...")
    print("     .Signal shape: {}\n".format(signal.shape))

    print('\n          ==== Initiation Complete ====\n')
    print('=*='*17 )
    print('      ==== Commencing Data Processing ====')

         ==== Commencing Initiation ====

     .Background Loaded...
     .Background shape: (543500, 21)
     .Signal Loaded...
     .Signal shape: (522467, 21)


          ==== Initiation Complete ====

=*==*==*==*==*==*==*==*==*==*==*==*==*==*==*==*==*=
      ==== Commencing Data Processing ====


In [47]:
if __name__ == '__main__':
    for n_i in range(iterations):
        print('\n     => Iteration Number', (n_i+1) )

        # Divide data-set into training and testing sub-sets
        print('         .Dividing training and testing sub-sets')
        divided_background, _ = dm.divide(background, windows, total)

        test = int(total*test_size)
        b_test = int(test*background_percent)
        static_data_raw, background_test = train_test_split(divided_background, test_size=test_size, random_state=42)
        background_test, _ = dm.divide(background_test, windows, b_test)

        # Defining number of events Signal events on online phase.
        signal_online_samples = int(test - b_test)
        reduced_signal, _ = dm.divide(signal, windows, signal_online_samples)

        print('         .Selecting Signal on the following porpotion:')
        print('             .{}% Background samples'.format(int(background_percent*100)))
        print('             .{}% Signal samples'.format(int((1-background_percent)*100)))
        print('             .{:9d} of Background samples (Offline)'.format(int(total*(1-test_size))))
        print('             .{:9d} of Background samples (Online)'.format(int(b_test)) )
        print('             .{:9d} of Signal samples (Online)'.format(int(signal_online_samples)))

        # Concatenating Signal and the Test Background sub-set
        streaming_data_raw = np.concatenate((background_test,reduced_signal), axis=0)
        print("             .Offline shape: {}".format(static_data_raw.shape))
        print("             .Online shape: {}\n".format(streaming_data_raw.shape))

        # Normalize Data
        print('         .Normalizing Data')
        static_data = normalize(static_data_raw,norm='max',axis=0)
        streaming_data = normalize(streaming_data_raw,norm='max',axis=0)
        
        ADP_outputs = {}
        
        print('             .Executing for granularities', gra_list)
        for gra in gra_list:
            print('\n\n             .Iter: {} - Granularity: {}'.format(n_i, gra))
            print('                 .ADP (5th Method)')
            output = ADP_Offline_Granularity_Iteration_5th(static_data, streaming_data, gra, b_test, n_i)
            ADP_outputs ['granularity_'+str(gra)] = output

    print('\n        ====Data Processing Complete====\n' )
    print('=*='*17 ) 


     => Iteration Number 1
         .Dividing training and testing sub-sets
         .Selecting Signal on the following porpotion:
             .99% Background samples
             .1% Signal samples
             .     7000 of Background samples (Offline)
             .     2970 of Background samples (Online)
             .       30 of Signal samples (Online)
             .Offline shape: (7000, 21)
             .Online shape: (3000, 21)

         .Normalizing Data
             .Executing for granularities [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


             .Iter: 0 - Granularity: 1
                 .ADP (5th Method)


             .Iter: 0 - Granularity: 2
                 .ADP (5th Method)


             .Iter: 0 - Granularity: 3
                 .ADP (5th Method)


             .Iter: 0 - Granularity: 4
                 .ADP (5th Method)


             .Iter: 0 - Granularity: 5
                 .ADP (5th Method)


             .Iter: 0 - Granularity: 6
                 .ADP (5th Method)


In [117]:
def probability(data):
    mu = np.mean(data, axis=0)
    variance = np.mean((data - mu)**2, axis=0)
    var_dia = np.diag(variance)
    k = len(mu)
    X = data - mu
    p = 1/((2*np.pi)**(k/2)*(np.linalg.det(var_dia)**0.5))* np.exp(-0.5* np.sum(X @ np.linalg.pinv(var_dia) * 
                                                                                    X,axis=1))
    return p

In [134]:
#### divide data clouds data anda compute probability

data_clouds_data = {}

for gra in ADP_outputs.keys():
    idx = ADP_outputs[gra]['ADP_streaming_output']['IDX']
    u = np.unique(idx)
    dc_dic = {}
    for i in u:
        data = []
    
        for j in range(len(idx)):
            if idx[j] == i:
                data.append(list(streaming_data[j]))
        
        dc_dic[i] = np.nan_to_num(probability(data))
    data_clouds_data[gra] = dc_dic
    

  import sys


In [136]:
### Make detections with the scikit methods

# Example settings
signal_fraction = 0.01

anomaly_algorithms = [
    ("Robust covariance", EllipticEnvelope(contamination=signal_fraction)),
    ("One-Class SVM", svm.OneClassSVM(nu=signal_fraction, kernel="rbf",
                                      gamma=0.1)),
    ("Isolation Forest", IsolationForest(contamination=signal_fraction,
                                         random_state=42)),
    ("Local Outlier Factor", LocalOutlierFactor(
        n_neighbors=3, contamination=signal_fraction))]

for gra in data_clouds_data:
    for dc in data_clouds_data[gra]:
        X = data_clouds_data[gra][dc].reshape(-1, 1)

        for name, algorithm in anomaly_algorithms:
            t0 = time.time()
            algorithm.fit(X)
            t1 = time.time()

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


ValueError: array must not contain infs or NaNs