In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense, LayerNormalization,BatchNormalization, Dropout, MultiHeadAttention, TimeDistributed, GlobalAveragePooling1D
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from mutation_multivariate import *
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

2024-08-01 16:25:11.459353: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-01 16:25:11.473651: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-01 16:25:11.478312: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-01 16:25:11.503444: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
train_x = np.load('spark_0_trace-scl_std/train.npy', allow_pickle=True)
test_x = np.load('spark_0_trace-scl_std/test.npy', allow_pickle=True)
train_y = np.load('spark_0_trace-scl_std/y_train.npy', allow_pickle=True)
test_y = np.load('spark_0_trace-scl_std/y_test.npy', allow_pickle=True)
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)

(52,)
(52,)
(31,)
(31,)


In [4]:
for j, _ in enumerate(test_y):
    test_y[j] = (test_y[j] != 0).astype(int)

# Time Series Modeling - Statistical Modeling

**Vector Autoregression (VAR)**

In [4]:
auc_scores = []
lag = 10

scaler = StandardScaler()
train_data_x = np.concatenate(train_x, axis=0)
train_data_x = scaler.fit_transform(train_data_x)
model = sm.tsa.VAR(train_data_x).fit(lag)

for test_record, true_values in zip(test_x, test_y):
    test_record = scaler.transform(test_record)
    predictions = model.forecast(train_data_x[-lag:], steps=len(test_record))
    predictions = predictions.mean(axis=1)

    auc_pr = get_auc_pr(predictions, true_values)
        
    auc_scores.append(auc_pr)

print(f'Average AUC-PR: {np.mean(auc_scores)}')


Average AUC-PR: 0.2460473964669215


# Semi Supervised Modelling

**LSTM autoencoder**

In [8]:
def create_lstm_autoencoder(input_shape):
    inputs = Input(shape=(input_shape[1], input_shape[2]))
    
    # Encoder
    encoded = LSTM(32, activation='relu', return_sequences=True)(inputs)
    encoded = Dropout(0.1)(encoded)
    encoded = LSTM(16, activation='relu')(encoded)
    encoded = BatchNormalization()(encoded)
    encoded = RepeatVector(input_shape[1])(encoded)
    
    # Decoder
    decoded = LSTM(16, activation='relu', return_sequences=True)(encoded)
    decoded = Dropout(0.1)(decoded)
    decoded = LSTM(32, activation='relu', return_sequences=True)(decoded)
    decoded = BatchNormalization()(decoded)
    
    outputs = TimeDistributed(Dense(input_shape[2]))(decoded)
    
    autoencoder = Model(inputs, outputs)
    autoencoder.compile(optimizer='adam', loss='mse')
    
    return autoencoder

# Preprocess data
scaler = StandardScaler()
train_data_x = np.concatenate(train_x, axis=0)
train_data_x = scaler.fit_transform(train_data_x.reshape(-1, train_data_x.shape[-1]))

# Reshape data to have shape (samples, time steps, features)
time_steps = 30  # choose an appropriate time step length
train_data_x = train_data_x.reshape(-1, time_steps, train_data_x.shape[-1])
input_shape = train_data_x.shape


# Create and train the model with EarlyStopping
autoencoder = create_lstm_autoencoder(input_shape)
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
autoencoder.fit(train_data_x, train_data_x, epochs=50, batch_size=32, validation_split=0.2, callbacks=[early_stopping], shuffle = False)

auc_scores = []


for test_record, true_values in zip(test_x, test_y):
    test_record = scaler.transform(test_record.reshape(-1, test_record.shape[-1]))

    # Pad the test_record to be a multiple of time_steps
    if len(test_record) % time_steps != 0:
        pad_length = time_steps - (len(test_record) % time_steps)
        test_record = np.pad(test_record, ((0, pad_length), (0, 0)), 'constant')

    test_record = test_record.reshape(-1, time_steps, test_record.shape[-1])
    
    predictions = autoencoder.predict(test_record)
    
    predictions = predictions.reshape(-1, predictions.shape[-1])[:len(true_values)]
    predictions = predictions.mean(axis=1)
    # true_values = true_values.reshape(-1, true_values.shape[-1])[:len(true_values)]

    auc_pr = get_auc_pr(predictions, true_values)
        
    auc_scores.append(auc_pr)
    
    
print(f"AUC-PR: {np.mean(auc_scores)}")


Epoch 1/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 27ms/step - loss: 1.5072 - val_loss: 1.3336
Epoch 2/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - loss: 1.0719 - val_loss: 1.2977
Epoch 3/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - loss: 1.0266 - val_loss: 1.0099
Epoch 4/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - loss: 1.0161 - val_loss: 1.0149
Epoch 5/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - loss: 1.0085 - val_loss: 0.9938
Epoch 6/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - loss: 1.0081 - val_loss: 0.9934
Epoch 7/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - loss: 1.0055 - val_loss: 1.1559
Epoch 8/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - loss: 1.0015 - val_loss: 2.8924
Epoch 9/50
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━

**Transformer**

In [6]:
class TransformerEncoderLayer(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim)]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

def create_transformer_autoencoder(input_shape, embed_dim=30, num_heads=4, ff_dim=128, num_layers=4, rate=0.1):
    time_steps, num_features = input_shape[1], input_shape[2]

    inputs = Input(shape=(time_steps, num_features))
    x = Dense(embed_dim)(inputs)  # Ensure input dimension matches embed_dim
    for _ in range(num_layers):
        x = TransformerEncoderLayer(embed_dim, num_heads, ff_dim, rate)(x, training=True)
    
    encoded = GlobalAveragePooling1D()(x)
    encoded = Dense(embed_dim, activation='relu')(encoded)

    # Decoding part
    decoded = RepeatVector(time_steps)(encoded)
    for _ in range(num_layers):
        decoded = TransformerEncoderLayer(embed_dim, num_heads, ff_dim, rate)(decoded, training=True)
    
    decoded = TimeDistributed(Dense(num_features))(decoded)

    autoencoder = Model(inputs, decoded)
    autoencoder.compile(optimizer='adam', loss='mse')
    return autoencoder

# Example reshaping of train_data_x
time_steps = 30

scaler = StandardScaler()
train_data_x = np.concatenate(train_x, axis=0)
train_data_x = scaler.fit_transform(train_data_x.reshape(-1, train_data_x.shape[-1]))

# Ensure there is enough data to reshape
if len(train_data_x) >= time_steps:
    num_samples = (len(train_data_x) // time_steps) * time_steps
    train_data_x = train_data_x[:num_samples]  # Truncate to a multiple of time_steps
    num_features = train_data_x.shape[1]
    train_data_x = train_data_x.reshape(-1, time_steps, num_features)
    input_shape = train_data_x.shape


    autoencoder = create_transformer_autoencoder(input_shape)
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    autoencoder.fit(train_data_x, train_data_x, epochs=100, batch_size=32, validation_split=0.2, shuffle=False, 
                    callbacks=[early_stopping])

    auc_scores = []

    for test_record, true_values in zip(test_x, test_y):
        padded_length = ((len(test_record) // time_steps) + 1) * time_steps
        padded_test_record = np.pad(test_record, ((0, padded_length - len(test_record)), (0, 0)), mode='constant')
        padded_test_record = padded_test_record.reshape(-1, time_steps, num_features)
        predictions = autoencoder.predict(padded_test_record)
        
        predictions = predictions.reshape(-1, predictions.shape[-1])[:len(true_values)]
        true_values = true_values.reshape(-1, true_values.shape[-1])[:len(true_values)]
        predictions = predictions.mean(axis=1)
        
        auc_pr = get_auc_pr(predictions, true_values)
        
        auc_scores.append(auc_pr)
    
    print(f'Average AUC-PR: {np.mean(auc_scores)}')
else:
    print("Not enough data to reshape for the specified time_steps.")


Epoch 1/100
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 66ms/step - loss: 1.2281 - val_loss: 0.9740
Epoch 2/100
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 52ms/step - loss: 1.0138 - val_loss: 0.9772
Epoch 3/100
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 49ms/step - loss: 1.0081 - val_loss: 0.9809
Epoch 4/100
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 52ms/step - loss: 1.0019 - val_loss: 0.9830
Epoch 5/100
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - loss: 0.9846 - val_loss: 0.9517
Epoch 6/100
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 52ms/step - loss: 0.9733 - val_loss: 0.9788
Epoch 7/100
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 48ms/step - loss: 0.9730 - val_loss: 0.9759
Epoch 8/100
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 70ms/step - loss: 0.9749 - val_loss: 0.9855
Epoch 9/100
[1m74/74[0m [32m━━━━━━━━