In [2]:
import pandas as pd
import numpy as np
import ADP
import data_manipulation as dm
from datetime import datetime
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn import svm
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope

In [3]:
# Experiment configuration stored next to the ADP iteration files.
# NOTE(security): pickle.load can execute arbitrary code while loading; only
# open files that were produced by this project.
with open("ADP_Iterations/var.pkl", "rb") as var_file:
    var = pickle.load(var_file)

gra_list = var['gra_list']                 # granularity values iterated by the main loop
iterations = var['iter']
total = var['total']                       # not used in the visible cells
background_percent = var['back_percent']   # not used in the visible cells
test_size = var['test_size']               # not used in the visible cells
b_test = var['b_test']                     # streaming samples from this index onward are labelled 1

# Overrides the stored iteration count so that the main loop
# (`for n_i in range(iterations)`) only processes iteration 0.
iterations = 1

# Probabilidade

In [4]:
def probability(df):
    """Evaluate a diagonal-covariance Gaussian density at every row of ``df``.

    The per-column mean and population variance (sum of squares divided by the
    number of rows) are estimated from ``df`` itself, so every sample is scored
    against the distribution of the batch it belongs to.

    Parameters
    ----------
    df : array-like of shape (n_samples, n_features)
        Numeric samples, one per row.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        Density value of each row; smaller values mean less typical samples.

    Notes
    -----
    * The covariance is diagonal, so the determinant and the inverse are
      computed element-wise instead of building a k x k matrix and calling
      ``det`` / ``pinv`` on it.
    * The density is assembled in log space and exponentiated once, which
      avoids the product of many variances under- or overflowing on its own.
    * Columns with zero variance carry no information. They are left out of
      both the normalisation constant and the exponent (pseudo-determinant /
      pseudo-inverse form of a degenerate Gaussian). Dividing by a zero
      determinant would otherwise turn every density into inf/nan.
    """
    samples = np.asarray(df, dtype=float)
    mu = samples.mean(axis=0)
    centered = samples - mu
    variance = np.mean(centered ** 2, axis=0)

    # Only columns that actually vary contribute to the density.
    informative = variance > 0
    rank = np.count_nonzero(informative)
    kept_variance = variance[informative]

    # Squared Mahalanobis distance under a diagonal covariance.
    mahalanobis = np.sum(centered[:, informative] ** 2 / kept_variance, axis=1)
    # log of 1 / sqrt((2*pi)^rank * prod(variance))
    log_norm = -0.5 * (rank * np.log(2 * np.pi) + np.sum(np.log(kept_variance)))
    return np.exp(log_norm - 0.5 * mahalanobis)

In [5]:
def tpfpfn(ep, p, y):
    """Count true positives, false positives and false negatives at threshold ``ep``.

    A sample is predicted positive (anomalous) when its score is at or below
    the threshold, i.e. ``p[i] <= ep``.

    Parameters
    ----------
    ep : float
        Decision threshold applied to the scores.
    p : array-like
        Scores, one per sample. Only the first ``len(y)`` entries are used.
    y : array-like
        Reference labels; ``1`` marks a positive and ``0`` a negative. Any
        other value is ignored.

    Returns
    -------
    tuple of int
        ``(tp, fp, fn)`` as plain Python integers.

    Raises
    ------
    IndexError
        If ``p`` has fewer entries than ``y``.
    """
    labels = np.asarray(y).ravel()
    scores = np.asarray(p).ravel()[:len(labels)]
    if len(scores) != len(labels):
        raise IndexError("p has fewer entries than y")

    # Vectorised on purpose: this function is evaluated once per candidate
    # threshold, so a per-element Python loop dominates the run time.
    flagged = scores <= ep
    # Kept as an explicit '>' (not ~flagged) so NaN scores count in neither group.
    passed = scores > ep
    is_positive = labels == 1

    tp = int(np.count_nonzero(flagged & is_positive))
    fp = int(np.count_nonzero(flagged & (labels == 0)))
    fn = int(np.count_nonzero(passed & is_positive))
    return tp, fp, fn

In [6]:
def f1(ep, p, y):
    """Return the F1 score obtained by thresholding the scores ``p`` at ``ep``.

    Parameters
    ----------
    ep : float
        Decision threshold; samples with ``p[i] <= ep`` are predicted positive.
    p : array-like
        Scores, one per sample.
    y : array-like
        Reference labels (``1`` positive, ``0`` negative).

    Returns
    -------
    float
        ``2 * precision * recall / (precision + recall)``, or ``0.0`` when
        there are no true positives.
    """
    tp, fp, fn = tpfpfn(ep, p, y)

    # Without true positives, precision and recall are zero or undefined.
    # The score is defined as 0 in that case. This explicit guard replaces a
    # bare `except:` that also hid unrelated errors.
    if tp == 0:
        return 0.0

    prec = tp / (tp + fp)
    rec = tp / (tp + fn)
    return 2 * prec * rec / (prec + rec)

In [7]:
def prob_func(p, decision, L2):
    """Pick the score threshold that best reproduces ``decision`` and label samples with it.

    Every value of ``p`` that is at or below the mean of ``p`` is tried as a
    threshold. The one with the highest F1 score against ``decision`` wins,
    and the first such value wins on ties.

    Parameters
    ----------
    p : numpy.ndarray
        Scores, one per sample; low values are treated as anomalous.
    decision : array-like
        Reference 0/1 labels used to score each candidate threshold.
    L2 : int
        Number of leading entries of ``p`` to label.

    Returns
    -------
    prob_label : list of int
        ``1`` where ``p[i] <= e`` and ``0`` otherwise, for ``i < L2``.
    e : float
        The selected threshold.

    Raises
    ------
    ValueError
        If no candidate threshold exists, e.g. ``p`` is empty or contains NaN.
    """
    # Hoisted: the mean does not change while filtering the candidates.
    cutoff = p.mean()
    # dict.fromkeys drops repeated values but keeps first-occurrence order, so
    # the selected threshold is unchanged while each value is scored only once.
    eps = list(dict.fromkeys(value for value in p if value <= cutoff))
    if not eps:
        raise ValueError(
            "no candidate threshold at or below the mean of p "
            "(is p empty or does it contain NaN?)"
        )

    scores = [f1(candidate, p, decision) for candidate in eps]
    e = eps[int(np.argmax(scores))]

    prob_label = [1 if p[i] <= e else 0 for i in range(L2)]
    return prob_label, e

# Autoencoder

In [8]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.utils import plot_model
import tensorflow as tf

In [9]:
def autoencoder(static, streaming, reduction=11, epochs=200, batch_size=1024, layer1=42, layer2=21, activation='exponential'):
    """Fit a dense autoencoder on ``static`` and score ``streaming`` by reconstruction error.

    The network is ``layer1 -> layer2 -> bottleneck -> layer2 -> layer1 -> output``.
    Every hidden Dense layer except the bottleneck is followed by batch
    normalisation and LeakyReLU. Training minimises the MSE of reconstructing
    ``static``, with ``streaming`` as validation data.

    Parameters
    ----------
    static : 2-D array
        Training samples, shape (n_samples, n_features).
    streaming : 2-D array
        Samples used for validation during training and scored afterwards.
    reduction : number
        Bottleneck width; rounded to an integer.
    epochs, batch_size : int
        Passed to ``Model.fit``.
    layer1, layer2 : int
        Widths of the outer and inner hidden layers.
    activation : str
        Activation of the output layer.

    Returns
    -------
    Per-row mean squared error between ``streaming`` and its reconstruction.

    Side effects
    ------------
    The checkpoint callback writes the best model (lowest ``val_loss``) to
    ``autoencoder_fraud.h5`` in the working directory.
    """
    def hidden_block(units, tensor):
        # Dense -> BatchNormalization -> LeakyReLU, created in that order.
        tensor = Dense(units)(tensor)
        tensor = BatchNormalization()(tensor)
        return LeakyReLU()(tensor)

    _, n_features = static.shape

    visible = Input(shape=(n_features,))

    # Encoder
    encoded = hidden_block(layer1, visible)
    encoded = hidden_block(layer2, encoded)

    bottleneck = Dense(round(reduction))(encoded)

    # Decoder mirrors the encoder
    decoded = hidden_block(layer2, bottleneck)
    decoded = hidden_block(layer1, decoded)

    output = Dense(n_features, activation=activation)(decoded)

    model = Model(inputs=visible, outputs=output)
    model.compile(optimizer='adam', loss='mse')

    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            filepath="autoencoder_fraud.h5",
            mode='min',
            monitor='val_loss',
            verbose=0,
            save_best_only=True,
        ),
        # Stop once val_loss has not improved by 1e-4 for 10 epochs and roll
        # back to the best weights seen.
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            min_delta=0.0001,
            patience=10,
            verbose=0,
            mode='min',
            restore_best_weights=True,
        ),
    ]

    model.fit(
        static,
        static,
        epochs=epochs,
        batch_size=batch_size,
        verbose=0,
        validation_data=(streaming, streaming),
        callbacks=callbacks,
    )

    reconstruction = model.predict(streaming)
    return np.mean(np.power(streaming - reconstruction, 2), axis=1)

In [None]:
def _load_pickle(path):
    """Unpickle one object from ``path`` and close the file handle afterwards."""
    # NOTE(security): pickle can execute arbitrary code while loading; only
    # open files that were produced by this project.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _new_detection_table(granularities):
    """Build a zero-filled confusion-count table with one row per granularity."""
    columns = ['Granularity',
               'True_Positive', 'True_Negative',
               'False_Positive', 'False_Negative',
               'N_Groups']
    table = pd.DataFrame(np.zeros((len(granularities), len(columns))), columns=columns)
    table['Granularity'] = granularities
    return table


def _tally_confusion(table, row, truth, predicted):
    """Add the confusion-matrix counts of ``predicted`` against ``truth`` to row ``row``."""
    truth = np.asarray(truth)
    agree = np.asarray(predicted) == truth
    positive = truth == 1
    table.loc[row, 'True_Positive'] += np.count_nonzero(positive & agree)
    table.loc[row, 'False_Negative'] += np.count_nonzero(positive & ~agree)
    table.loc[row, 'True_Negative'] += np.count_nonzero(~positive & agree)
    table.loc[row, 'False_Positive'] += np.count_nonzero(~positive & ~agree)


# For every iteration and granularity: evaluate three detectors on the
# streaming samples (anomaly clouds, probability threshold, and the cascade of
# both), plot the samples, and write the confusion counts to CSV.
for n_i in range(iterations):
    ##################################################################
    # ---------------- READ THE DATA OF THE ITERATION -------------- #
    ##################################################################
    static = _load_pickle("ADP_Iterations/Static_Iter{}.pkl".format(n_i))
    streaming = _load_pickle("ADP_Iterations/Streaming_Iter{}.pkl".format(n_i))
    data = np.concatenate((static, streaming), axis=0)
    L, W = data.shape
    L1 = static.shape[0]
    L2 = streaming.shape[0]

    # Ground truth: streaming samples from index b_test onward are signal (1).
    label = np.zeros(L2)
    label[b_test:] = 1

    # Neither score depends on the granularity, so both are computed once per
    # iteration. This avoids retraining the autoencoder for every granularity
    # and makes the figures of all granularities share the same scores.
    p = probability(streaming)
    mse = autoencoder(static, streaming)
    p_flat = np.ravel(p)
    mse_flat = np.ravel(mse)
    mse_range = [mse_flat.min(), mse_flat.max()]

    ##################################################################
    # ------- RESULT TABLES: ANOMALY CLOUDS / PROBABILITY / -------- #
    # ---------------------- CASCADE OF BOTH ----------------------- #
    ##################################################################
    detection_info = _new_detection_table(gra_list)
    prob_detection_info = _new_detection_table(gra_list)
    cascata_detection_info = _new_detection_table(gra_list)

    # Rows are addressed by position in gra_list rather than by `gra - 1`, so
    # the tables stay correct when the granularities are not exactly 1..n.
    for row, gra in enumerate(gra_list):
        ##################################################################
        # -------------------- READ THE ADP OUTPUT --------------------- #
        ##################################################################
        output = _load_pickle("ADP_Iterations/Output_Iter{}_Gra{}.pkl".format(n_i, gra))

        ##################################################################
        # ------------------- ANOMALY CLOUDS METHOD -------------------- #
        ##################################################################
        on_center = output['centre']
        on_IDX = output['IDX']
        # Cloud id of every sample; the first L1 entries belong to `static`.
        cloud_ids = np.asarray(on_IDX).astype(int).ravel()
        online_labels = cloud_ids[L1:]

        total_samples = np.bincount(cloud_ids, minlength=len(on_center))
        old_samples = np.bincount(cloud_ids[:L1], minlength=len(total_samples))
        cloud_info = pd.DataFrame({
            'Total_Samples': total_samples.astype(float),
            'Old_Samples': old_samples.astype(float),
        })
        cloud_info['Percentage_Old_Samples'] = (
            cloud_info['Old_Samples'] * 100 / cloud_info['Total_Samples'])
        cloud_info['Percentage_of_Samples'] = (
            cloud_info['Total_Samples'] * 100 / cloud_info['Total_Samples'].sum())

        # A cloud is anomalous when it holds samples but none of them is a
        # static sample. Empty clouds give NaN here and are not selected.
        anomaly_clouds = np.flatnonzero(
            (cloud_info['Percentage_Old_Samples'] == 0).to_numpy()).tolist()

        decision = np.zeros(L2)
        decision[np.isin(online_labels, anomaly_clouds)] = 1
        flagged = decision == 1

        _tally_confusion(detection_info, row, label, decision)

        n_groups = cloud_ids.max() + 1
        for table in (detection_info, prob_detection_info, cascata_detection_info):
            table.loc[row, 'N_Groups'] = n_groups

        ##################################################################
        # ------------ PROBABILITY THRESHOLD VS ANOMALY CLOUDS --------- #
        ##################################################################
        prob_label, e = prob_func(p, decision, L2)
        _tally_confusion(prob_detection_info, row, label, prob_label)

        # The y axis shows the autoencoder reconstruction error, so it is
        # labelled 'MSE' (it used to read 'GlobalDensity').
        fig = plt.figure(figsize=[16, 7])
        plt.suptitle('GRA - {}'.format(gra), fontsize=25)
        plt.title('Analise das amostras dentro dos Data Clouds')
        plt.scatter(p_flat, mse_flat, label='Amostras', edgecolors='k')
        plt.scatter(p_flat[flagged], mse_flat[flagged], c='r',
                    label='Amostras Anomalas', edgecolors='k')
        plt.xlabel('Probabilidade')
        plt.ylabel('MSE')
        plt.legend()
        # Vertical line at the selected probability threshold.
        plt.plot([e, e], mse_range, 'g', linewidth=3)
        # Forward slash: 'Figures\A...' is an invalid escape sequence and only
        # resolves to a sub-directory on Windows.
        plt.savefig('Figures/Anomaly_Clouds_{}_{}.png'.format(gra, n_i))
        plt.show()
        plt.close(fig)  # figures created in a loop are otherwise kept in memory

        ##################################################################
        # ------------- PROBABILITY VS MSE: BACKGROUND / SIGNAL -------- #
        ##################################################################
        fig = plt.figure(figsize=[16, 7])
        plt.suptitle('GRA - {}'.format(gra))
        plt.title('Probabilidade vs MSE - Background e Signal')
        plt.scatter(p_flat[:b_test], mse_flat[:b_test], label='Background', edgecolors='k')
        plt.scatter(p_flat[b_test:], mse_flat[b_test:], c='r', label='Signal', edgecolors='k')
        plt.xlabel('Probabilidade')
        plt.ylabel('MSE')
        plt.legend()
        plt.plot([e, e], mse_range, 'g', linewidth=3)
        plt.savefig('Figures/Background_Signal_{}_{}.png'.format(gra, n_i))
        plt.show()
        plt.close(fig)

        ##################################################################
        # ---------------------- CASCADE METHOD ------------------------ #
        ##################################################################
        # A sample is flagged only when both detectors flag it.
        cascata_label = np.asarray(prob_label) * decision
        _tally_confusion(cascata_detection_info, row, label, cascata_label)

        ##################################################################
        # ---------------- 3D PLOT: BACKGROUND / SIGNAL ---------------- #
        ##################################################################
        GD = output['SamplesGlobalDensity'][L1:]
        gd_flat = np.squeeze(np.array(GD))

        fig = plt.figure(figsize=[16, 7])
        ax = fig.add_subplot(projection='3d')
        ax.scatter(p_flat[:b_test], mse_flat[:b_test], gd_flat[:b_test],
                   label='Background', edgecolors='k')
        ax.scatter(p_flat[b_test:], mse_flat[b_test:], gd_flat[b_test:],
                   c='r', label='Signal', edgecolors='k')
        ax.set_xlabel('Probabilidade')
        ax.set_ylabel('MSE')
        ax.set_zlabel('GlobalDensity')
        ax.legend()
        plt.show()
        plt.close(fig)

        # The tables are rewritten after every granularity, so each file holds
        # the rows filled in so far.
        detection_info.to_csv('results/Anomaly_Clouds_detection_info_{}_{}.csv'.format(gra, n_i), index=False)
        prob_detection_info.to_csv('results/Prob_detection_info_{}_{}.csv'.format(gra, n_i), index=False)
        cascata_detection_info.to_csv('results/Cascata_detection_info_{}_{}.csv'.format(gra, n_i), index=False)


In [None]:
%matplotlib
fig = plt.figure(figsize = [16,7])
ax = fig.add_subplot(projection='3d')
ax.scatter(p.reshape(-1,1)[:b_test], mse.reshape(-1,1)[:b_test], np.squeeze(np.array(GD))[:b_test], label='Background', edgecolors='k')
ax.scatter(p.reshape(-1,1)[b_test:], mse.reshape(-1,1)[b_test:], np.squeeze(np.array(GD))[b_test:], c='r', label='Signal', edgecolors='k')
ax.set_xlabel('Probabilidade')
ax.set_ylabel('MSE')
ax.set_zlabel('GlobalDensity')
plt.show()

In [1]:
# Render figures inline in the notebook output again (undoes the bare
# `%matplotlib` used by the interactive 3D cell).
%matplotlib inline