In [1]:
import pickle

# libraries importing
import numpy as np
import pandas as pd
from tqdm import tqdm
import csv
import matplotlib.pyplot as plt
%matplotlib inline

#from sklearn.preprocessing import MinMaxScaler

from tensorflow import keras
from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, RepeatVector, Lambda
from keras.callbacks import EarlyStopping
from keras import backend as K
from tensorflow.keras import losses
import tensorflow as tf
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()

import warnings
warnings.filterwarnings("ignore")

c:\Users\gioel\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
c:\Users\gioel\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll


SEQUENCE

In [2]:
WINDOW_SIZE=40
def create_sequences(values, time_steps=WINDOW_SIZE):
    output = []
    for i in range(len(values) - time_steps + 1):
        output.append(values[i : (i + time_steps)])
    return np.stack(output)

In [3]:

latent_dim=64

def create_lstm_vae():
    
    x = Input(shape=(WINDOW_SIZE, 19))

    # encoding
    h = LSTM(128,return_sequences=True)(x)
    h = LSTM(64)(h)

    # VAE Z layer
    z_mean = Dense(latent_dim, name='mean')(h)
    z_log_sigma = Dense(latent_dim, name='var')(h)
    
    def sampling(args):
        z_mean, z_log_sigma = args
        epsilon = K.random_normal(shape=(1, latent_dim),
                                  mean=0., stddev=1.)
        return z_mean + z_log_sigma * epsilon

    z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_sigma])

    #decoding
    decoder= RepeatVector(WINDOW_SIZE)(z)
    decoder= LSTM(64, return_sequences=True)(decoder)
    decoder= LSTM(128, return_sequences=True)(decoder)
    
    x_decoded_mean= Dense(19)(decoder)
    vae = Model(x, x_decoded_mean)

    # encoder, from inputs to latent space
    encoder = Model(x, [z_mean, z_log_sigma, z], name='encoder')
    #encoder.summary()

    #decoder, from latent space to output
    decoder_input = Input(shape=(latent_dim,))
    _h_decoded = RepeatVector(WINDOW_SIZE)(decoder_input)
    _h_decoded = LSTM(64, return_sequences=True)(_h_decoded)
    _h_decoded= LSTM(128, return_sequences=True)(_h_decoded)
    x_decoded= Dense(19)(_h_decoded)

    dencoder = Model(decoder_input, x_decoded, name='dencoder')
    #dencoder.summary()

    
    def vae_loss(x, x_decoded_mean):
        mse = losses.MeanSquaredError()
        xent_loss = mse(x, x_decoded_mean)
        kl_loss = - 0.5 * K.mean(1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma))
        loss = xent_loss + kl_loss

        return loss

    vae.compile(optimizer='rmsprop', loss=vae_loss)

    return vae,encoder

In [4]:
def arch(data,val):

    BATCH_SIZE = 32
    
    model,encoder= create_lstm_vae()
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=30)
    
    history = model.fit(
        data,
        data,
        epochs=1000,
        batch_size=BATCH_SIZE,
        verbose=1,
        validation_data=(val,val),
        callbacks=[callback]
    )

    return history, model, encoder

APP_TOT

In [6]:


with open(f'./DATA_SPLITTED/app_tot.pkl', 'rb') as f:
    DATA = pickle.load(f)

train=DATA['X_train']
val=DATA['X_val']


history,model,encoder=arch(train,val)

model.save_weights(f"MODEL_VAE/model_app_Win_{WINDOW_SIZE}_early.h5")
encoder.save(f"MODEL_VAE/encoder_app_Win_{WINDOW_SIZE}_early.h5")
with open(f'MODEL_VAE/history_app_Win_{WINDOW_SIZE}_early', 'wb') as file_pi:
    pickle.dump(history.history, file_pi)


Train on 62666 samples, validate on 13423 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 6

isolation forest

In [7]:
#CALCULATE THRESHOLD
from sklearn.ensemble import IsolationForest

with open(f'./DATA_SPLITTED/app_tot.pkl', 'rb') as f:
    DATA = pickle.load(f)
    
model,encoder=create_lstm_vae()


model.load_weights(filepath=f'MODEL_VAE/model_app_Win_{WINDOW_SIZE}_.h5')
encoder.load_weights(filepath=f'MODEL_VAE/encoder_app_Win_{WINDOW_SIZE}_.h5')


z_mean, z_log_sigma, z_train= encoder.predict(DATA['X_test'])


clf = IsolationForest(n_jobs=-1,contamination=0.15)

clf.fit(z_train)

In [9]:
TP=0
TN=0
FN=0
FP=0


model,encoder= create_lstm_vae()
model.load_weights(filepath=f'MODEL_VAE/model_app_Win_{WINDOW_SIZE}_.h5')
encoder.load_weights(filepath=f'MODEL_VAE/encoder_app_Win_{WINDOW_SIZE}_.h5')

with open(f'./DATA_SPLITTED/app_tot.pkl', 'rb') as f:
    DATA = pickle.load(f)
train=DATA['X_test']

for i in tqdm(range(2,11)):
    if(i!=7 and i!=8 ):


        TEST=np.load(f'./OUTPUTS_ROOT/data/processed/spark_0_15s/spark_0_trace-scl_std/test{i}.npy',allow_pickle=True)

        ANOMALY=np.load(f'./OUTPUTS_ROOT/data/processed/spark_0_15s/spark_0_trace-scl_std/y_test{i}.npy',allow_pickle=True)

        with open(f'./OUTPUTS_ROOT/data/interim/spark_0_15s/test_info{i}.pkl', 'rb') as f:
            TEST_info= pickle.load(f)
        
        for x in range(len(TEST_info)):


                X_test=create_sequences(TEST[x])

                _,_,z_test=encoder.predict(X_test)

                pred=clf.predict(z_test)
                outlier=pred
                for k in range(len(pred)):
                    outlier[k]=pred[k]*(-1)
                    if(outlier[k]<0):
                        outlier[k]=0
                error=clf.score_samples(z_test)


                le=len(outlier)


                true_= ANOMALY[x][20:le+20]>=1
                
                prediction_ = outlier[:].astype(int)==1
                
                TP = TP+(true_ & prediction_).sum()   
                TN = TN+(~true_ & ~prediction_).sum()  
                FP = FP+(~true_ & prediction_).sum()    
                FN = FN+(true_ & ~prediction_).sum()    

PREC=TP / (TP + FP)
REC = TP/ (TP+FN)
f1=2 * PREC * REC/(PREC + REC)

print(f'F1:{f1}')
print(f'PREC:{PREC}')
print(f'REC:{REC}')

print(f'TP:{TP}')
print(f'TN:{TN}')
print(f'FP:{FP}')
print(f'FN:{FN}')

100%|██████████| 9/9 [00:31<00:00,  3.49s/it]

F1:0.6280923810306567
PREC:0.6186852331606217
REC:0.6377900183608747
TP:3821
TN:40083
FP:2355
FN:2170





KNN

In [10]:
#CALCULATE THRESHOLD
from pyod.models.knn import KNN

with open(f'./DATA_SPLITTED/app_tot.pkl', 'rb') as f:
    DATA = pickle.load(f)
    
model,encoder=create_lstm_vae()


model.load_weights(filepath=f'MODEL_VAE/model_app_Win_{WINDOW_SIZE}_.h5')
encoder.load_weights(filepath=f'MODEL_VAE/encoder_app_Win_{WINDOW_SIZE}_.h5')


z_mean, z_log_sigma, z_train= encoder.predict(DATA['X_test'])


clf_n = KNN(n_neighbors=400, method='mean', metric='euclidean', contamination=0.15)

clf_n.fit(z_train)


KNN(algorithm='auto', contamination=0.15, leaf_size=30, method='mean',
  metric='euclidean', metric_params=None, n_jobs=1, n_neighbors=400, p=2,
  radius=1.0)

In [14]:
TP = 0
TN = 0
FP = 0  
FN = 0

model,encoder= create_lstm_vae()
model.load_weights(filepath=f'MODEL_VAE/model_app_Win_{WINDOW_SIZE}_.h5')
encoder.load_weights(filepath=f'MODEL_VAE/encoder_app_Win_{WINDOW_SIZE}_.h5')

with open(f'./DATA_SPLITTED/app_tot.pkl', 'rb') as f:
    DATA = pickle.load(f)
train=DATA['X_test']

for i in tqdm(range(2,11)):
    if(i!=7):


        TEST=np.load(f'./OUTPUTS_ROOT/data/processed/spark_0_15s/spark_0_trace-scl_std/test{i}.npy',allow_pickle=True)

        ANOMALY=np.load(f'./OUTPUTS_ROOT/data/processed/spark_0_15s/spark_0_trace-scl_std/y_test{i}.npy',allow_pickle=True)

        with open(f'./OUTPUTS_ROOT/data/interim/spark_0_15s/test_info{i}.pkl', 'rb') as f:
            TEST_info= pickle.load(f)
        
        for x in range(len(TEST_info)):


                X_test=create_sequences(TEST[x])

                _,_,z_test=encoder.predict(X_test)

                outlier=clf_n.predict(z_test)
                error=clf_n.predict_proba(z_test)
                

                le=len(outlier)


                true_= ANOMALY[x][20:le+20]>=1
                
                prediction_ = outlier[:].astype(int)==1
                
                TP = TP+(true_ & prediction_).sum()   
                TN = TN+(~true_ & ~prediction_).sum()  
                FP = FP+(~true_ & prediction_).sum()    
                FN = FN+(true_ & ~prediction_).sum()    

PREC=TP / (TP + FP)
REC = TP/ (TP+FN)
f1=2 * PREC * REC/(PREC + REC)

print(f'F1:{f1}')
print(f'PREC:{PREC}')
print(f'REC:{REC}')

print(f'TP:{TP}')
print(f'TN:{TN}')
print(f'FP:{FP}')
print(f'FN:{FN}')
                
       

100%|██████████| 9/9 [03:58<00:00, 26.51s/it]

F1:0.6392681210415201
PREC:0.599366587490103
REC:0.6848612786489746
TP:4542
TN:45044
FP:3036
FN:2090



