In [7]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import precision_score, recall_score, f1_score
from mutation_multivariate import *
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Masking, GRU, Flatten
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [8]:
train_x = np.load('spark_0_trace-scl_std/train.npy', allow_pickle=True)
test_x = np.load('spark_0_trace-scl_std/test.npy', allow_pickle=True)
train_y = np.load('spark_0_trace-scl_std/y_train.npy', allow_pickle=True)
test_y = np.load('spark_0_trace-scl_std/y_test.npy', allow_pickle=True)
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)

(52,)
(52,)
(31,)
(31,)


In [9]:
for j, _ in enumerate(test_y):
    test_y[j] = (test_y[j] != 0).astype(int)

In [10]:
# Injecting anomaly using mutation
train_x, train_y = load_multivariate_mutated(train_x, train_y, record = False)

**MLP2**

In [18]:
# Function to create input sequences
def create_input_sequences_f(data, labels, sequence_length):
    sequences = []
    targets = []
    for i in range(len(data) - sequence_length):
        sequence = np.array(data[i:i + sequence_length]).flatten()  # Flatten the sequence
        sequences.append(sequence)
        targets.append(labels[i + sequence_length])
    return np.array(sequences), np.array(targets)

auc_prs = []

# Define the sequence length
sequence_length = 5

# Stack and preprocess all training samples
stacked_train_x = np.vstack(train_x)
stacked_train_y = np.concatenate(train_y)

# Create sequences from stacked training samples
s_train_x, s_train_y = create_input_sequences_f(stacked_train_x, stacked_train_y, sequence_length)

model_save_path = 'mlp_best.weights.h5'
auc_pr_callback = AUC_PR_Callback(validation_data=(s_train_x, s_train_y), model_save_path=model_save_path)

# Ensure the input shape is correct
input_shape = (sequence_length * 19,)

# Define the MLP model
model = Sequential([
    Dense(128, activation='relu', input_shape=input_shape),  # Adjust input shape for flattened sequence
    Dense(64, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
class_weights = {0: 1.0, 1: 5.0}
model.fit(s_train_x, s_train_y, epochs=30, batch_size=32, validation_split=0.2, callbacks=[auc_pr_callback], 
          shuffle=False, class_weight=class_weights)
model.load_weights(model_save_path)

# Iterate over test_x to evaluate and calculate metrics
auc_prs = []  # Reinitialize to avoid appending to previous results
for i in range(len(test_x)):
    if len(test_x[i]) > sequence_length:
        # Create sequences for the current test sample
        s_test_x, s_test_y = create_input_sequences(test_x[i], test_y[i], sequence_length)
        
        # Ensure the test input shape is correct
        s_test_x = s_test_x.reshape((-1, input_shape[0]))

        # Predict on the test data
        test_predictions = model.predict(s_test_x).reshape(-1)
        
        # Assuming `get_auc_pr` function is defined to calculate AUC-PR
        auc_pr = get_auc_pr(test_predictions, s_test_y)
        
        auc_prs.append(auc_pr)

avg_auc_pr = np.mean(auc_prs)

print(f'Average AUC-PR: {avg_auc_pr}')


Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 604us/step

Epoch 1: AUC-PR improved to 0.1248. Model weights saved.
[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 14ms/step - accuracy: 0.9685 - loss: 2.1528 - val_accuracy: 0.9816 - val_loss: 0.1399
Epoch 2/30
[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 447us/step

Epoch 2: AUC-PR improved to 0.3190. Model weights saved.
[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 13ms/step - accuracy: 0.9016 - loss: 0.9332 - val_accuracy: 0.9727 - val_loss: 0.1694
Epoch 3/30
[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 452us/step
[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 13ms/step - accuracy: 0.9739 - loss: 0.5144 - val_accuracy: 0.9814 - val_loss: 0.0959
Epoch 4/30
[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 454us/step
[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 1

In [6]:
# Function to create input sequences
def create_input_sequences(data, labels, sequence_length):
    sequences = []
    targets = []
    for i in range(len(data) - sequence_length):
        sequence = np.array(data[i:i + sequence_length])
        sequences.append(sequence)
        targets.append(labels[i + sequence_length])
    return np.array(sequences), np.array(targets)

**LSTM**

In [None]:
auc_prs = []

# Define the sequence length
sequence_length = 5

# Stack and preprocess all training samples
stacked_train_x = np.vstack(train_x)
stacked_train_y = np.concatenate(train_y)

# Create sequences from stacked training samples
s_train_x, s_train_y = create_input_sequences(stacked_train_x, stacked_train_y, sequence_length)

model_save_path = 'lstm_best.weights.h5'
auc_pr_callback = AUC_PR_Callback(validation_data=(s_train_x, s_train_y), model_save_path=model_save_path)

# Define the LSTM model
model = Sequential([
    Masking(mask_value=0., input_shape=(sequence_length, 19)),
    LSTM(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


# Train the model
class_weights = {0: 1.0, 1: 5.0}
model.fit(s_train_x, s_train_y, epochs=30, batch_size=32, validation_split=0.2, callbacks=[auc_pr_callback], 
          shuffle=False, class_weight=class_weights)
model.load_weights(model_save_path)

# Iterate over test_x to evaluate and calculate metrics
for i in range(len(test_x)):
    if len(test_x[i]) > sequence_length:
        # Create sequences for the current test sample
        s_test_x, s_test_y = create_input_sequences(test_x[i], test_y[i], sequence_length)
        
        # Predict on the test data
        test_predictions = model.predict(s_test_x).reshape(-1)
        
        auc_pr = get_auc_pr(test_predictions, s_test_y)
        
        auc_prs.append(auc_pr)

avg_auc_pr = np.mean(auc_prs)

print(f'Average AUC-PR: {avg_auc_pr}')

Epoch 1/3


  super().__init__(**kwargs)


[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 739us/step

Epoch 1: AUC-PR improved to 0.2836. Model weights saved.
[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 12ms/step - accuracy: 0.9660 - loss: 2.0806 - val_accuracy: 0.9833 - val_loss: 0.1653
Epoch 2/3
[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 763us/step
[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 13ms/step - accuracy: 0.9822 - loss: 0.3502 - val_accuracy: 0.9832 - val_loss: 0.1516
Epoch 3/3
[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 671us/step
[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 14ms/step - accuracy: 0.9856 - loss: 0.2524 - val_accuracy: 0.9833 - val_loss: 0.1614
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 928us/step
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 730us/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 964us/step

**BI-LSTM**

In [7]:
auc_prs = []

# Define the sequence length
sequence_length = 5

# Stack and preprocess all training samples
stacked_train_x = np.vstack(train_x)
stacked_train_y = np.concatenate(train_y)

# Create sequences from stacked training samples
s_train_x, s_train_y = create_input_sequences(stacked_train_x, stacked_train_y, sequence_length)

model_save_path = 'bi_lstm_best.weights.h5'
auc_pr_callback = AUC_PR_Callback(validation_data=(s_train_x, s_train_y), model_save_path=model_save_path)

# Define the Bidirectional LSTM model
model = Sequential([
    Masking(mask_value=0., input_shape=(sequence_length, 19)),
    Bidirectional(LSTM(32, activation='relu')),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
class_weights = {0: 1.0, 1: 2.0}
model.fit(s_train_x, s_train_y, epochs=30, batch_size=32, validation_split=0.2, callbacks=[auc_pr_callback], 
          shuffle=False, class_weight=class_weights)
model.load_weights(model_save_path)

# Iterate over test_x to evaluate and calculate metrics
auc_prs = []  # Reinitialize to avoid appending to previous results
for i in range(len(test_x)):
    if len(test_x[i]) > sequence_length:
        # Create sequences for the current test sample
        s_test_x, s_test_y = create_input_sequences(test_x[i], test_y[i], sequence_length)
        
        # Predict on the test data
        test_predictions = model.predict(s_test_x).reshape(-1)
        
        # Assuming `get_auc_pr` function is defined to calculate AUC-PR
        auc_pr = get_auc_pr(test_predictions, s_test_y)
        
        auc_prs.append(auc_pr)

avg_auc_pr = np.mean(auc_prs)

print(f'Average AUC-PR: {avg_auc_pr}')


Epoch 1/30


  super().__init__(**kwargs)
I0000 00:00:1722533774.855207   56946 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-08-01 11:36:14.892245: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2343] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2024-08-01 11:36:15.062131: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 26820400 exceeds 10% of free system memory.


[1m   1/2758[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m8:15[0m 180ms/step/step - accuracy: 0.9646 - loss: 1.2234

2024-08-01 11:36:23.334303: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 33525500 exceeds 10% of free system memory.


[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 934us/step

Epoch 1: AUC-PR improved to 0.1493. Model weights saved.
[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 13ms/step - accuracy: 0.9646 - loss: 1.2192 - val_accuracy: 0.9798 - val_loss: 0.1128
Epoch 2/30
[1m 186/2758[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 819us/step accuracy: 0.9850 - loss: 

2024-08-01 11:36:52.585561: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 33525500 exceeds 10% of free system memory.


[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 861us/step

Epoch 2: AUC-PR improved to 0.2548. Model weights saved.
[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 12ms/step - accuracy: 0.9850 - loss: 0.1757 - val_accuracy: 0.9784 - val_loss: 0.1280
Epoch 3/30
[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 780us/step
[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 11ms/step - accuracy: 0.9860 - loss: 0.1604 - val_accuracy: 0.9804 - val_loss: 0.1095
Epoch 4/30
[1m 188/2758[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 810us/step accuracy: 0.9856 - loss: 

2024-08-01 11:37:44.279238: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 33525500 exceeds 10% of free system memory.


[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 876us/step
[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 11ms/step - accuracy: 0.9856 - loss: 0.1456 - val_accuracy: 0.9769 - val_loss: 0.1219
Epoch 5/30
[1m 182/2758[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 838us/step accuracy: 0.9873 - loss: 

2024-08-01 11:38:09.587209: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 33525500 exceeds 10% of free system memory.


[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 872us/step
[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 12ms/step - accuracy: 0.9873 - loss: 0.1207 - val_accuracy: 0.9767 - val_loss: 0.1147
Epoch 6/30
[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 811us/step
[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 13ms/step - accuracy: 0.9877 - loss: 0.1085 - val_accuracy: 0.9611 - val_loss: 0.2912
Epoch 7/30
[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 810us/step
[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 12ms/step - accuracy: 0.9876 - loss: 0.1028 - val_accuracy: 0.9646 - val_loss: 0.1432
Epoch 8/30
[1m2758/2758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 802us/step
[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 12ms/step - accuracy: 0.9875 - loss: 0.1037 - val_accuracy: 0.9538 - val_loss: 0.2683
Epoch 9/30
[1m2758/2758[0m [

**GRU**

In [9]:
auc_prs = []

# Define the sequence length
sequence_length = 10

# Stack and preprocess all training samples
stacked_train_x = np.vstack(train_x)
stacked_train_y = np.concatenate(train_y)

# Create sequences from stacked training samples
s_train_x, s_train_y = create_input_sequences(stacked_train_x, stacked_train_y, sequence_length)

model_save_path = 'gru_best.weights.h5'
auc_pr_callback = AUC_PR_Callback(validation_data=(s_train_x, s_train_y), model_save_path=model_save_path)

# Define the GRU model
model = Sequential([
    Masking(mask_value=0., input_shape=(sequence_length, 19)),
    GRU(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
class_weights = {0: 1.0, 1: 5.0}
model.fit(s_train_x, s_train_y, epochs=30, batch_size=32, validation_split=0.2, callbacks=[auc_pr_callback], 
          shuffle=False, class_weight=class_weights)
model.load_weights(model_save_path)

# Iterate over test_x to evaluate and calculate metrics
auc_prs = []  # Reinitialize to avoid appending to previous results
for i in range(len(test_x)):
    if len(test_x[i]) > sequence_length:
        # Create sequences for the current test sample
        s_test_x, s_test_y = create_input_sequences(test_x[i], test_y[i], sequence_length)
        
        # Predict on the test data
        test_predictions = model.predict(s_test_x).reshape(-1)
        
        # Assuming `get_auc_pr` function is defined to calculate AUC-PR
        auc_pr = get_auc_pr(test_predictions, s_test_y)
        
        auc_prs.append(auc_pr)

avg_auc_pr = np.mean(auc_prs)

print(f'Average AUC-PR: {avg_auc_pr}')


Epoch 1/30


  super().__init__(**kwargs)


[1m2757/2757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 881us/step

Epoch 1: AUC-PR improved to 0.1742. Model weights saved.
[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 13ms/step - accuracy: 0.9421 - loss: 0.9535 - val_accuracy: 0.9793 - val_loss: 0.1718
Epoch 2/30
[1m2757/2757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step

Epoch 2: AUC-PR improved to 0.2089. Model weights saved.
[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 13ms/step - accuracy: 0.9821 - loss: 0.3109 - val_accuracy: 0.9796 - val_loss: 0.1679
Epoch 3/30
[1m2757/2757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 845us/step
[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 11ms/step - accuracy: 0.9857 - loss: 0.2420 - val_accuracy: 0.9802 - val_loss: 0.1562
Epoch 4/30
[1m2757/2757[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 850us/step

Epoch 4: AUC-PR improved to 0.2212. Model weights saved.
[1m2206/2206

**SVM**

In [12]:
from sklearn.svm import SVC

# Function to create input sequences
def create_input_sequences_ft(data, labels, sequence_length):
    sequences = []
    targets = []
    for i in range(len(data) - sequence_length):
        sequence = np.array(data[i:i + sequence_length]).flatten()  # Flatten the sequence
        sequences.append(sequence)
        targets.append(labels[i + sequence_length])
    return np.array(sequences), np.array(targets)

# Define the sequence length
sequence_length = 10

# Stack and preprocess all training samples
stacked_train_x = np.vstack(train_x)
stacked_train_y = np.concatenate(train_y)

# Create sequences from stacked training samples
s_train_x, s_train_y = create_input_sequences_ft(stacked_train_x, stacked_train_y, sequence_length)

# Scale the data
scaler = StandardScaler()
s_train_x = scaler.fit_transform(s_train_x)

# Define the SVM model with 'rbf' kernel
svm_model = SVC(kernel='rbf', class_weight='balanced', probability=True)

# Train the model
svm_model.fit(s_train_x, s_train_y)

# Iterate over test_x to evaluate and calculate metrics
auc_prs = []
for i in range(len(test_x)):
    if len(test_x[i]) > sequence_length:
        # Create sequences for the current test sample
        s_test_x, s_test_y = create_input_sequences_ft(test_x[i], test_y[i], sequence_length)
        
        # Scale the test data
        s_test_x = scaler.transform(s_test_x)
        
        # Predict on the test data
        test_predictions = svm_model.predict(s_test_x)
        
        # Calculate AUC-PR
        auc_pr = get_auc_pr(test_predictions, s_test_y)
        
        auc_prs.append(auc_pr)

avg_auc_pr = np.mean(auc_prs)

print(f'Average AUC-PR: {avg_auc_pr}')


Average AUC-PR: 0.2971407897705757
