In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense

2024-07-31 01:41:44.200788: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-31 01:41:44.219151: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-31 01:41:44.223759: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-31 01:41:44.236604: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
train_x = np.load('spark_0_trace-scl_std/train.npy', allow_pickle=True)
test_x = np.load('spark_0_trace-scl_std/test.npy', allow_pickle=True)
train_y = np.load('spark_0_trace-scl_std/y_train.npy', allow_pickle=True)
test_y = np.load('spark_0_trace-scl_std/y_test.npy', allow_pickle=True)
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)

(52,)
(52,)
(31,)
(31,)


In [4]:
for j, _ in enumerate(test_y):
    test_y[j] = (test_y[j] != 0).astype(int)

# Time Series Modeling - Statistical Modeling

In [5]:
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import auc
from prts import ts_precision, ts_recall
from sklearn.preprocessing import StandardScaler

In [6]:
def get_precision_recall(real, pred):
    try:
        precision = ts_precision(real, pred, alpha=0.0, cardinality="reciprocal", bias="flat")
        recall = ts_recall(real, pred, alpha=0.0, cardinality="reciprocal", bias="flat")
    except AssertError:
        precision = 0.0
        recall = 0.0
    return precision, recall

# Function to calculate AUC - Precision Recall
def auc_pr_range_based(precisions, recalls):
    # Sort recall values in ascending order and adjust precisions accordingly
    sorted_indices = np.argsort(recalls)
    sorted_recalls = np.array(recalls)[sorted_indices]
    sorted_precisions = np.array(precisions)[sorted_indices]
    
    return auc(sorted_recalls, sorted_precisions)


**Vector Autoregression (VAR)**

In [14]:
all_precisions = []
all_recalls = []
lag = 10

scaler = StandardScaler()
train_data_x = np.concatenate(train_x, axis=0)
train_data_x = scaler.fit_transform(train_data_x)
model = sm.tsa.VAR(train_data_x).fit(lag)

for test_record, true_values in zip(test_x, test_y):
    test_record = scaler.transform(test_record)
    predictions = model.forecast(train_data_x[-lag:], steps=len(test_record))
    predictions = predictions.mean(axis=1)

    residuals = np.abs(true_values - predictions)
    
    lower = np.quantile(residuals, 0.3)
    upper = np.quantile(residuals, 0.7)
    bin_pred = ((predictions > upper) | (predictions < lower)).astype(int)
    
    precision, recall = get_precision_recall(true_values, bin_pred)
    all_precisions.append(precision)
    all_recalls.append(recall)

# Optionally, you can compute average precision and recall
average_precision = np.mean(all_precisions)
average_recall = np.mean(all_recalls)
auc_score = auc_pr_range_based(all_precisions, all_recalls)
f1_score = 2 * average_precision * average_recall / (average_precision + average_recall)
print(f"Average Precision: {average_precision}")
print(f"Average Recall: {average_recall}")
print(f"Average F1 score: {f1_score}")
print(f'Average AUC-PR: {auc_score}')


Average Precision: 0.11327868635519465
Average Recall: 0.7844774427578162
Average F1 score: 0.19797040935534826
Average AUC-PR: 0.24583324912294638


# Semi Supervised Modelling

**LSTM autoencoder**

In [7]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense, Dropout, BatchNormalization
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler

def create_lstm_autoencoder(input_shape):
    inputs = Input(shape=(input_shape[1], input_shape[2]))
    
    # Encoder
    encoded = LSTM(128, activation='relu', return_sequences=True)(inputs)
    encoded = Dropout(0.2)(encoded)
    encoded = LSTM(64, activation='relu')(encoded)
    encoded = BatchNormalization()(encoded)
    encoded = RepeatVector(input_shape[1])(encoded)
    
    # Decoder
    decoded = LSTM(64, activation='relu', return_sequences=True)(encoded)
    decoded = Dropout(0.2)(decoded)
    decoded = LSTM(128, activation='relu', return_sequences=True)(decoded)
    decoded = BatchNormalization()(decoded)
    
    outputs = TimeDistributed(Dense(input_shape[2]))(decoded)
    
    autoencoder = Model(inputs, outputs)
    autoencoder.compile(optimizer='adam', loss='mse')
    
    return autoencoder

# Preprocess data
scaler = StandardScaler()
train_data_x = np.concatenate(train_x, axis=0)
train_data_x = scaler.fit_transform(train_data_x.reshape(-1, train_data_x.shape[-1]))

# Reshape data to have shape (samples, time steps, features)
time_steps = 10  # choose an appropriate time step length
train_data_x = train_data_x.reshape(-1, time_steps, train_data_x.shape[-1])
input_shape = train_data_x.shape

# Create and train the model with EarlyStopping
autoencoder = create_lstm_autoencoder(input_shape)
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
autoencoder.fit(train_data_x, train_data_x, epochs=150, batch_size=32, validation_split=0.2, callbacks=[early_stopping], shuffle = False)

# Evaluate model on test data
all_precisions = []
all_recalls = []

for test_record, true_values in zip(test_x, test_y):
    test_record = scaler.transform(test_record.reshape(-1, test_record.shape[-1]))

    # Pad the test_record to be a multiple of time_steps
    if len(test_record) % time_steps != 0:
        pad_length = time_steps - (len(test_record) % time_steps)
        test_record = np.pad(test_record, ((0, pad_length), (0, 0)), 'constant')

    test_record = test_record.reshape(-1, time_steps, test_record.shape[-1])
    
    predictions = autoencoder.predict(test_record)
    
    predictions = predictions.reshape(-1, predictions.shape[-1])[:len(true_values)]
    test_record = test_record.reshape(-1, test_record.shape[-1])[:len(true_values)]
    residuals = np.abs(test_record - predictions).mean(axis=1)
    
    mean_residual = np.mean(residuals)
    std_residual = np.std(residuals)
    threshold_upper = mean_residual + 2 * std_residual
    threshold_lower = mean_residual - 2 * std_residual
    
    bin_pred = ((residuals > threshold_upper) | (residuals < threshold_lower)).astype(int)
    bin_true = (true_values != 0).astype(int)  # assuming true_values contains binary labels
    
    precision, recall = get_precision_recall(bin_true, bin_pred)
    all_precisions.append(precision)
    all_recalls.append(recall)

average_precision = np.mean(all_precisions)
average_recall = np.mean(all_recalls)
auc_score = auc_pr_range_based(all_precisions, all_recalls)
f1_score = 2 * average_precision * average_recall / (average_precision + average_recall)

print(f"Average Precision: {average_precision}")
print(f"Average Recall: {average_recall}")
print(f"Average F1 score: {f1_score}")
print(f"AUC-PR: {auc_score}")


Epoch 1/150


I0000 00:00:1722411150.735916   26963 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-31 01:32:30.898166: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2343] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 15ms/step - loss: 1.0732 - val_loss: 0.9936
Epoch 2/150
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - loss: 0.9547 - val_loss: 0.9833
Epoch 3/150
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - loss: 0.9425 - val_loss: 2.8506
Epoch 4/150
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - loss: 0.9326 - val_loss: 1.0804
Epoch 5/150
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - loss: 0.9255 - val_loss: 0.9977
Epoch 6/150
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - loss: 0.9193 - val_loss: 0.9982
Epoch 7/150
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - loss: 0.9271 - val_loss: 1.4750
Epoch 8/150
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - loss: 0.9179 - val_loss: 0.9636
Epoch 9/150
[1m221/221[0m [32m━━━

**Transformer**

In [19]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LayerNormalization, Dropout, MultiHeadAttention, RepeatVector, TimeDistributed, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler

class TransformerEncoderLayer(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim)]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

def create_transformer_autoencoder(input_shape, embed_dim=30, num_heads=4, ff_dim=128, num_layers=4, rate=0.1):
    time_steps, num_features = input_shape[1], input_shape[2]

    inputs = Input(shape=(time_steps, num_features))
    x = Dense(embed_dim)(inputs)  # Ensure input dimension matches embed_dim
    for _ in range(num_layers):
        x = TransformerEncoderLayer(embed_dim, num_heads, ff_dim, rate)(x, training=True)
    
    encoded = GlobalAveragePooling1D()(x)
    encoded = Dense(embed_dim, activation='relu')(encoded)

    # Decoding part
    decoded = RepeatVector(time_steps)(encoded)
    for _ in range(num_layers):
        decoded = TransformerEncoderLayer(embed_dim, num_heads, ff_dim, rate)(decoded, training=True)
    
    decoded = TimeDistributed(Dense(num_features))(decoded)

    autoencoder = Model(inputs, decoded)
    autoencoder.compile(optimizer='adam', loss='mse')
    return autoencoder

# Example reshaping of train_data_x
time_steps = 30

scaler = StandardScaler()
train_data_x = np.concatenate(train_x, axis=0)
train_data_x = scaler.fit_transform(train_data_x.reshape(-1, train_data_x.shape[-1]))

# Ensure there is enough data to reshape
if len(train_data_x) >= time_steps:
    num_samples = (len(train_data_x) // time_steps) * time_steps
    train_data_x = train_data_x[:num_samples]  # Truncate to a multiple of time_steps
    num_features = train_data_x.shape[1]
    train_data_x = train_data_x.reshape(-1, time_steps, num_features)
    input_shape = train_data_x.shape

    autoencoder = create_transformer_autoencoder(input_shape)
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    autoencoder.fit(train_data_x, train_data_x, epochs=100, batch_size=32, validation_split=0.1, callbacks=[early_stopping])

    all_precisions = []
    all_recalls = []

    for test_record, true_values in zip(test_x, test_y):
        padded_length = ((len(test_record) // time_steps) + 1) * time_steps
        padded_test_record = np.pad(test_record, ((0, padded_length - len(test_record)), (0, 0)), mode='constant')
        padded_test_record = padded_test_record.reshape(-1, time_steps, num_features)
        predictions = autoencoder.predict(padded_test_record)
        
        predictions = predictions.reshape(-1, predictions.shape[-1])[:len(true_values)]
        predictions = predictions.mean(axis=1)
    
        # Calculate residuals
        residuals = np.abs(true_values - predictions)
    
        lower = np.quantile(residuals, 0.18)
        upper = np.quantile(residuals, 0.85)

        bin_pred = ((residuals > upper) | (residuals < lower)).astype(int)
        
        precision, recall = get_precision_recall(true_values, bin_pred)
        all_precisions.append(precision)
        all_recalls.append(recall)

    average_precision = np.mean(all_precisions)
    average_recall = np.mean(all_recalls)
    auc_score = auc_pr_range_based(all_precisions, all_recalls)
    f1_score = 2 * average_precision * average_recall / (average_precision + average_recall)
    print(f"Average Precision: {average_precision}")
    print(f"Average Recall: {average_recall}")
    print(f"Average F1 score: {f1_score}")
    print(f'Average AUC-PR: {auc_score}')
else:
    print("Not enough data to reshape for the specified time_steps.")


Epoch 1/100
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 61ms/step - loss: 1.2466 - val_loss: 0.9856
Epoch 2/100
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 51ms/step - loss: 0.9707 - val_loss: 0.9539
Epoch 3/100
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 51ms/step - loss: 0.9042 - val_loss: 0.9371
Epoch 4/100
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 50ms/step - loss: 0.9412 - val_loss: 0.9295
Epoch 5/100
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 51ms/step - loss: 0.9537 - val_loss: 0.9203
Epoch 6/100
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 50ms/step - loss: 0.9304 - val_loss: 0.9167
Epoch 7/100
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 50ms/step - loss: 0.8832 - val_loss: 0.9206
Epoch 8/100
[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 50ms/step - loss: 0.8976 - val_loss: 0.9129
Epoch 9/100
[1m83/83[0m [32m━━━━━━━━