In [None]:
import pandas as pd
import numpy as np
import ADP
import data_manipulation as dm
from datetime import datetime
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from scipy.spatial.distance import cdist

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.utils import plot_model
import tensorflow as tf

# Initialization Variables

In [None]:
if __name__ == '__main__':
    # ======================================================== #
    #                  USER-TUNABLE SETTINGS                   #
    # ======================================================== #
    # Everything a reader may want to change lives in this cell.

    # Granularity values 1..10 (inclusive).
    gra_list = list(range(1, 11))

    # How many times the sampling / extraction loop is repeated.
    iterations = 33

    # Total number of events (background + signal) drawn per iteration.
    total = 10000

    # Number of divisions of the data set.
    windows = 100

    # Fraction (0-1) of the drawn events that are background.
    background_percent = 0.99

    # Split fraction.
    # NOTE(review): the earlier comment described this as the share of
    # samples used for training, while the name says "test" - confirm.
    test_size = 0.3

# Load the Data Set

In [None]:
if __name__ == '__main__':
    # ======================================================== #
    #                        LOADING                           #
    # ======================================================== #
    # Both CSV files are parsed with np.genfromtxt. The first parsed
    # row corresponds to the header line holding the column names; it
    # cannot be converted to numbers (it would show up as NaN), so it
    # is sliced away right after parsing.

    print('         ==== Commencing Initiation ====\n')

    # --- Background events ---
    b_name = 'Input_Background_1.csv'
    background = np.genfromtxt(b_name, delimiter=',')[1:, :]
    Lb, W = background.shape  # rows, columns of the background table
    print("     .Background Loaded..." )
    print("     .Background shape: {}".format(background.shape))

    # --- Signal events ---
    s_name = 'Input_Signal_1.csv'
    signal = np.genfromtxt(s_name, delimiter=',')[1:, :]
    Ls, _ = signal.shape  # only the row count is kept
    print("     .Signal Loaded...")
    print("     .Signal shape: {}\n".format(signal.shape))

    print('\n          ==== Initiation Complete ====\n')
    print('=*='*17 )
    print('      ==== Commencing Data Processing ====')

# Extraction Functions

### Probability

In [None]:
def probability(df):
    """Gaussian density of every row under a diagonal-covariance fit of ``df``.

    The per-column mean and (population, ``1/m``) variance are estimated from
    ``df`` itself, and each row is then scored with the multivariate normal
    density whose covariance is the diagonal matrix of those variances.

    Columns whose variance is (numerically) zero are treated as a degenerate
    direction: they are left out of both the normalisation constant and the
    exponent, i.e. the pseudo-determinant / pseudo-inverse form of the density
    is used. Previously such a column made the determinant 0 and every
    returned value ``inf``.

    Parameters
    ----------
    df : array-like of shape (m, k)
        One event per row, one feature per column.

    Returns
    -------
    numpy.ndarray of shape (m,)
        Density value for each row.

    Raises
    ------
    ValueError
        If ``df`` is not two-dimensional.
    """
    samples = np.asarray(df, dtype=float)
    if samples.ndim != 2:
        raise ValueError(
            "probability() expects a 2-D array (events x features), "
            "got {} dimension(s)".format(samples.ndim)
        )

    m = samples.shape[0]
    if m == 0:
        # Nothing to score; avoid dividing by a zero sample count.
        return np.empty(0)

    mu = samples.mean(axis=0)
    X = samples - mu
    variance = np.mean(X ** 2, axis=0)

    # Same relative cutoff np.linalg.pinv applies by default (1e-15 * largest
    # singular value), so the exponent matches the pseudo-inverse form.
    cutoff = 1e-15 * variance.max() if variance.size else 0.0
    # Written as a negation so that NaN variances are kept and propagate to
    # the output instead of being silently dropped.
    keep = ~(variance <= cutoff)
    rank = np.count_nonzero(keep)

    kept_var = variance[keep]
    mahalanobis_sq = np.sum(X[:, keep] ** 2 / kept_var, axis=1)

    # Work in log space: a product of many small variances can underflow
    # long before the final density does.
    log_p = -0.5 * (rank * np.log(2 * np.pi) + np.sum(np.log(kept_var)) + mahalanobis_sq)
    return np.exp(log_p)

### Autoencoder

In [None]:
def autoencoder(data, reduction=11, epochs=200, batch_size=1024, layer1=42, layer2=21, activation='exponential'):
    """Fit a dense autoencoder on ``data`` and return each row's reconstruction RMSE.

    The network is ``input -> layer1 -> layer2 -> bottleneck -> layer2 ->
    layer1 -> output``, where every hidden stage is Dense followed by
    BatchNormalization and LeakyReLU, and the bottleneck is a plain Dense
    layer of ``round(reduction)`` units. The model is trained to reproduce
    its own input (Adam optimiser, MSE loss) and is then evaluated on the
    very same ``data``.

    Parameters
    ----------
    data : 2-D array
        Training matrix; its second dimension fixes the input/output width.
    reduction : number
        Bottleneck width (rounded to the nearest integer).
    epochs, batch_size : int
        Forwarded to ``model.fit``.
    layer1, layer2 : int
        Widths of the outer and inner hidden layers.
    activation : str
        Activation of the output layer.

    Returns
    -------
    1-D array with the root-mean-square reconstruction error of each row.

    Side effects
    ------------
    The checkpoint callback writes ``autoencoder_fraud.h5`` whenever the
    training loss improves.
    """
    _, n_features = data.shape

    def _hidden(units, tensor):
        # Dense -> BatchNormalization -> LeakyReLU, the stage repeated throughout.
        tensor = Dense(units)(tensor)
        tensor = BatchNormalization()(tensor)
        return LeakyReLU()(tensor)

    # Encoder, bottleneck, decoder - layers are created in network order.
    inputs = Input(shape=(n_features,))
    encoded = _hidden(layer2, _hidden(layer1, inputs))
    code = Dense(round(reduction))(encoded)
    decoded = _hidden(layer1, _hidden(layer2, code))
    reconstruction = Dense(n_features, activation=activation)(decoded)

    # Both callbacks watch the training loss (no validation split is used).
    training_callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            filepath="autoencoder_fraud.h5",
            mode='min',
            monitor='loss',
            verbose=0,
            save_best_only=True,
        ),
        tf.keras.callbacks.EarlyStopping(
            monitor='loss',
            min_delta=0.0001,
            patience=10,
            verbose=0,
            mode='min',
            restore_best_weights=True,
        ),
    ]

    model = Model(inputs=inputs, outputs=reconstruction)
    model.compile(optimizer='adam', loss='mse')
    model.fit(
        data,
        data,
        epochs=epochs,
        batch_size=batch_size,
        verbose=0,
        callbacks=training_callbacks,
    )

    # Row-wise root of the mean squared difference between input and output.
    residual = data - model.predict(data)
    return np.sqrt(np.mean(np.power(residual, 2), axis=1))

### EDA

In [None]:
def EDA_calc(data):
    """Compute six pairwise-distance features for every row of ``data``.

    For each of the metrics euclidean, cosine and mahalanobis (in that
    order) the full row-to-row distance matrix is built and two columns are
    filled:

    * even column (0, 2, 4): reciprocal of the row's summed distances;
    * odd column (1, 3, 5): ``n * sum(c) / (2 * c)`` where ``c`` is the
      row's summed squared distances and ``n`` the number of rows.

    Parameters
    ----------
    data : 2-D array (rows = events, columns = features)

    Returns
    -------
    numpy.ndarray of shape (n_rows, 6)
    """
    n_rows, _ = data.shape
    features = np.zeros((n_rows, 6))

    metric_columns = zip((0, 2, 4), ('euclidean', 'cosine', 'mahalanobis'))
    for first_col, metric in metric_columns:
        pairwise = cdist(data, data, metric=metric)

        # Reciprocal of the total distance from each point to all others.
        features[:, first_col] = 1 / pairwise.sum(axis=0)

        # Summed squared distances per point, then the scaled ratio.
        cumulative = np.square(pairwise).sum(axis=0)
        features[:, first_col + 1] = n_rows * cumulative.sum() / (2 * cumulative)

    return features

# Defining Threshold

In [None]:
def _signal_stats(prefix, signal_values):
    """Return min / max / mean / median of ``signal_values``.

    Keys are ``<prefix>_Min``, ``<prefix>_Max``, ``<prefix>_Mean`` and
    ``<prefix>_Median``, inserted in that order (the order later becomes the
    column order of the threshold table).
    """
    return {
        '{}_Min'.format(prefix): np.min(signal_values),
        '{}_Max'.format(prefix): np.max(signal_values),
        '{}_Mean'.format(prefix): np.mean(signal_values),
        '{}_Median'.format(prefix): np.median(signal_values),
    }


def _plot_feature(axis, values, n_background, stats, prefix, title, ylabel):
    """Scatter one feature (background blue, signal red) plus its four threshold lines."""
    n_events = len(values)
    event_index = np.arange(1, n_events + 1)

    axis.set_title(title, fontsize=20)
    axis.scatter(event_index[:n_background], values[:n_background],
                 c='b', label='Background', edgecolors='k')
    axis.scatter(event_index[n_background:], values[n_background:],
                 c='r', label='Signal', edgecolors='k')

    # Horizontal dashed line per statistic of the signal values.
    line_styles = (('Mean', 'tab:pink'), ('Min', 'tab:red'),
                   ('Max', 'tab:orange'), ('Median', 'tab:green'))
    for stat, colour in line_styles:
        level = stats['{}_{}'.format(prefix, stat)]
        axis.plot(event_index, np.full(n_events, level),
                  color=colour, linewidth=3, linestyle='--', label=stat)

    axis.set_xlabel('Event Index', fontsize=15)
    axis.set_ylabel(ylabel, fontsize=15)
    axis.legend()


if __name__ == '__main__':
    # One dict of signal statistics per iteration; turned into a table later.
    threshold_list = []

    EDA_labels = ['EuclideanCentrality', 'EuclideanDensity',
                  'CosineCentrality', 'CosineDensity',
                  'MahalanobisCentrality', 'MahalanobisDensity']

    # Loop-invariant sample counts. round() instead of int() so that float
    # products such as 0.57*100 (= 56.999...) are not truncated downwards.
    b_samples = int(round(total*background_percent))
    s_samples = total - b_samples

    # Rows are always stacked background first, signal second.
    label = np.repeat([0, 1], [b_samples, s_samples])

    for n_i in range(iterations):
        t_dict = {}

        ##########################################################
        # ------------------------------------------------------ #
        # ------------------- Preparing Data ------------------- #
        # ------------------------------------------------------ #
        ##########################################################
        print('\n     => Iteration Number', (n_i+1) )

        print('         .Dividing background and signal sub-sets')
        # Pass absolute counts: a float ratio is turned into ceil(ratio * n)
        # by scikit-learn, and round-off in b_samples/Lb could yield one row
        # too many, shifting every [b_samples:] slice below.
        _, divided_background = train_test_split(background, test_size=b_samples)
        _, divided_signal = train_test_split(signal, test_size=s_samples)

        print('         .Selecting Signal on the following proportion:')
        print('             .{}% Background samples'.format(round(background_percent*100)))
        print('             .{}% Signal samples'.format(round((1-background_percent)*100)))
        print('             .{:9d} of Background samples'.format(b_samples))
        print('             .{:9d} of Signal samples'.format(s_samples))

        # Concatenating Signal and the Test Background sub-set
        streaming_data_raw = np.concatenate((divided_background, divided_signal), axis=0)
        print("             .FullData shape: {}\n".format(streaming_data_raw.shape))

        # Scale every column by its maximum absolute value.
        print('         .Normalizing Data')
        streaming = normalize(streaming_data_raw, norm='max', axis=0)

        # One panel per feature: probability, autoencoder error, six EDA columns.
        f, ax = plt.subplots(8, 1, figsize=(16, 8*7))

        ##########################################################
        # ------------------------------------------------------ #
        # -------------------- Probability --------------------- #
        # ------------------------------------------------------ #
        ##########################################################
        print('             .Extracting Probability')
        p = probability(streaming)

        t_dict.update(_signal_stats('Probability', p[b_samples:]))
        _plot_feature(ax[0], p, b_samples, t_dict, 'Probability',
                      'Event Probability - Iteration {}'.format(n_i+1),
                      'Probability')

        ##########################################################
        # ------------------------------------------------------ #
        # -------------------- Autoencoder --------------------- #
        # ------------------------------------------------------ #
        ##########################################################
        # autoencoder() returns the per-row RMSE, so it is labelled as such.
        print('             .Extracting RMSE from autoencoder')
        rmse = autoencoder(streaming)

        t_dict.update(_signal_stats('Autoencoder', rmse[b_samples:]))
        _plot_feature(ax[1], rmse, b_samples, t_dict, 'Autoencoder',
                      'Reconstruction Error - Iteration {}'.format(n_i+1),
                      'RMSE')

        ##########################################################
        # ------------------------------------------------------ #
        # ------------------------ EDAs ------------------------ #
        # ------------------------------------------------------ #
        ##########################################################
        print('             .Extracting EDAs')
        EDA = EDA_calc(streaming)

        for i, eda_name in enumerate(EDA_labels):
            t_dict.update(_signal_stats(eda_name, EDA[b_samples:, i]))
            _plot_feature(ax[i+2], EDA[:, i], b_samples, t_dict, eda_name,
                          '{} - Iteration {}'.format(eda_name, n_i+1),
                          eda_name)

        plt.show()
        threshold_list.append(t_dict)

        # Pair plot of all extracted features, coloured by the true label.
        data = np.concatenate((p.reshape(-1,1), rmse.reshape(-1,1), EDA, label.reshape(-1,1)), axis=1)
        columns = ['Probability', 'RMSE'] + EDA_labels + ['label']
        aux = pd.DataFrame(data, columns=columns)

        sns.set(font_scale=1)
        sns.pairplot(aux, hue='label', diag_kind="hist", corner=True)
        plt.show()

        # Release this iteration's figures so memory does not grow per loop.
        plt.close('all')

In [None]:
# Collect the per-iteration statistic dictionaries into one table
# (one row per iteration, one column per dictionary key).
df = pd.DataFrame(threshold_list)

In [None]:
# Display the table built from threshold_list.
df

In [None]:
# Summary statistics of every column of df across its rows.
df.describe()