Bi-directional LSTM using Keras

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from sklearn.metrics import precision_score, recall_score, f1_score

def load_glove_embeddings(embeddings_path):
    """Load word vectors from a GloVe-style text file.

    Each usable line holds a token followed by its vector components,
    separated by whitespace. Lines with fewer than two fields (blank
    lines, or a token with no components) are skipped instead of raising
    ``IndexError`` or storing an empty vector.

    Args:
        embeddings_path: Path of the embeddings file; it is read as UTF-8.

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings_index = {}
    with open(embeddings_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if len(values) < 2:
                # Nothing to parse: blank line or a token without a vector.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

def create_embedding_matrix(word_index, embeddings_index, embedding_dim):
    """Build an embedding matrix for the words that have a pretrained vector.

    Args:
        word_index: Mapping of word -> integer row index.
        embeddings_index: Mapping of word -> pretrained vector.
        embedding_dim: Number of columns in the resulting matrix.

    Returns:
        A tuple ``(embedding_matrix, new_word_index, vocab_size)`` where
        ``embedding_matrix`` is a ``(vocab_size, embedding_dim)`` array whose
        rows are zero for words without a pretrained vector,
        ``new_word_index`` keeps only the words that do have a vector (with
        their original indices), and ``vocab_size`` is
        ``len(word_index) + 1``.
    """
    # One extra row because index 0 is reserved and never assigned to a word.
    vocab_size = 1 + len(word_index)

    # Restrict the vocabulary to the words that have a pretrained vector.
    new_word_index = {
        token: row
        for token, row in word_index.items()
        if embeddings_index.get(token) is not None
    }

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for token, row in new_word_index.items():
        embedding_matrix[row] = embeddings_index[token]

    return embedding_matrix, new_word_index, vocab_size



def read_data(normal_logs, abnormal_logs):
    """Read log lines from files and label them normal (0) or abnormal (1).

    Every line of every file becomes one sample, with surrounding
    whitespace stripped; empty lines are kept as empty strings. Samples
    from ``normal_logs`` come first, followed by those from
    ``abnormal_logs``.

    Args:
        normal_logs: Iterable of file paths whose lines are labelled 0.
        abnormal_logs: Iterable of file paths whose lines are labelled 1.

    Returns:
        A tuple ``(data, labels)``: ``data`` is a list of str and ``labels``
        is a NumPy array with one entry per element of ``data``.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    data, labels = [], []
    for label, paths in ((0, normal_logs), (1, abnormal_logs)):
        for filepath in paths:
            # Explicit UTF-8 keeps decoding independent of the platform
            # locale and consistent with load_glove_embeddings.
            with open(filepath, encoding='utf-8') as fp:
                for line in fp:
                    data.append(line.strip())
                    labels.append(label)
    return data, np.array(labels)

# Input locations.
# Training logs: lines from the "normal" files are labelled 0 and lines from
# the "abnormal" files are labelled 1 by read_data.
normal_train_paths = [
    "/home/aks/AOS-Project/filter/ceph-dout-filter.txt",
    "/home/aks/AOS-Project/filter/glustrefs_info-filter.txt",
    "/home/aks/AOS-Project/filter/daos-debug-filter.txt",
    "/home/aks/AOS-Project/filter/orangefs-debug-filter.txt",
    "/home/aks/AOS-Project/filter/beegfs-debug-filter.txt",
    "/home/aks/AOS-Project/filter/hive-debug-filter.txt",
]
abnormal_train_paths = [
    "/home/aks/AOS-Project/filter/ceph-derr-filter.txt",
    "/home/aks/AOS-Project/filter/glustrefs_error-filter.txt",
    "/home/aks/AOS-Project/filter/daos-error-filter.txt",
    "/home/aks/AOS-Project/filter/orangefs-error-filter.txt",
    "/home/aks/AOS-Project/filter/beegfs-error-filter.txt",
    "/home/aks/AOS-Project/filter/hive-error-filter.txt",
]

# Held-out logs from systems that do not appear in the training lists.
normal_test_paths = [
    "/home/aks/AOS-Project/filter/lustre-debug-filter.txt",
    "/home/aks/AOS-Project/filter/hbase-debug-filter.txt",
    "/home/aks/AOS-Project/filter/hdfs-debug-filter.txt",
]
abnormal_test_paths = [
    "/home/aks/AOS-Project/filter/lustre-error-filter.txt",
    "/home/aks/AOS-Project/filter/hbase-error-filter.txt",
    "/home/aks/AOS-Project/filter/hdfs-error-filter.txt",
]

# Pretrained GloVe vectors (100-dimensional file, per its name).
glove_path = '/home/aks/DRILL/glove-embeddings/glove.6B.100d.txt'


# Load the raw log lines; read_data labels "normal" lines 0 and "abnormal" lines 1.
train_data, train_labels = read_data(normal_train_paths, abnormal_train_paths)
test_data, test_labels = read_data(normal_test_paths, abnormal_test_paths)

# Tokenization
# NOTE(review): the vocabulary is fitted on train AND test text together, so
# test-only words also receive an index (and a GloVe row below).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data + test_data)

# Load GloVe embeddings
embeddings_index = load_glove_embeddings(glove_path)
embedding_dim = 100  # must equal the vector length stored in glove_path ("100d" file)
# embedding_matrix has vocab_size rows; rows of words without a GloVe vector stay zero.
embedding_matrix, new_word_index, vocab_size = create_embedding_matrix(tokenizer.word_index, embeddings_index, embedding_dim)

# Adjust tokenizer's word index to match the new vocabulary
# new_word_index keeps only words that have a GloVe vector (original indices kept).
# Presumably words missing from word_index are then dropped by
# texts_to_sequences, since no oov_token was configured — confirm against the
# Keras Tokenizer documentation.
tokenizer.word_index = new_word_index
tokenizer.num_words = vocab_size

# Prepare data
# Convert each log line to a list of word indices, then force every sequence
# to length max_len (pad_sequences is used with its default padding and
# truncating modes).
train_sequences = tokenizer.texts_to_sequences(train_data)
test_sequences = tokenizer.texts_to_sequences(test_data)
max_len = 100
train_padded = pad_sequences(train_sequences, maxlen=max_len)
test_padded = pad_sequences(test_sequences, maxlen=max_len)

# Define the model
model = Sequential([
    # Embedding initialised from the GloVe matrix and frozen (trainable=False).
    # The [:vocab_size] slice is a no-op: the matrix already has vocab_size rows.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len, weights=[embedding_matrix[:vocab_size]], trainable=False),
    # First recurrent layer returns the full sequence so it can feed the second one.
    Bidirectional(LSTM(100, return_sequences=True)),
    Bidirectional(LSTM(100)),
    Dense(400, activation='relu'),
    # Two output units, one per label value (0 and 1).
    Dense(2, activation='softmax')
])

# Compile the model
# sparse_categorical_crossentropy is paired with the integer 0/1 labels from read_data.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Train the model
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation metrics are computed on the same data that is
# evaluated below.
history = model.fit(train_padded, train_labels, epochs=5, validation_data=(test_padded, test_labels))

# Evaluate the model
# evaluate returns the loss followed by the compiled metric (accuracy).
test_loss, test_acc = model.evaluate(test_padded, test_labels)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_acc)

# Function to evaluate the model using precision, recall, and F1 score
def evaluate_model(model, test_data, test_labels):
    """Print precision, recall and F1 score of ``model`` on a labelled set.

    The predicted label of each sample is the index of the largest value in
    the corresponding row of ``model.predict(test_data)``. Scores come from
    scikit-learn's ``precision_score``, ``recall_score`` and ``f1_score``
    called with their default arguments.

    Args:
        model: Object exposing ``predict(test_data)`` that returns one row
            of scores per sample.
        test_data: Inputs passed unchanged to ``model.predict``.
        test_labels: Ground-truth labels, one per sample.

    Returns:
        None. The three scores are printed.
    """
    predicted_labels = np.argmax(model.predict(test_data), axis=1)

    # Compute every score before printing anything.
    scorers = (precision_score, recall_score, f1_score)
    scores = [scorer(test_labels, predicted_labels) for scorer in scorers]

    for caption, score in zip(("Precision:", "Recall:", "F1 Score:"), scores):
        print(caption, score)

# Report precision, recall and F1 score on the combined test set
# (the padded sequences and labels built from the test path lists).
evaluate_model(model, test_padded, test_labels)


In [None]:
# Persist the trained model to the working directory. The file name must stay
# in sync with the load_model('my_model_bi_RNN.h5') call in the next cell.
model.save('my_model_bi_RNN.h5')

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import load_model
from sklearn.metrics import accuracy_score
import seaborn as sns

# Use seaborn's "whitegrid" style for the plot drawn at the end of this cell.
sns.set(style="whitegrid")

# Load the saved model
# This rebinds the global `model` to the copy read back from the file written
# by the earlier model.save call.
model = load_model('my_model_bi_RNN.h5')

# Per-system test files: for each system (Lustre, HDFS, HBase) one list of
# "normal" (debug) log files and one list of "abnormal" (error) log files.
# Each system's abnormal list must point at that system's own error log so
# the per-system accuracies below are attributed correctly.
lustre_normal_test_paths = ["/home/aks/DRILL/sentilog/filter/lustre-debug-filter.txt"]
hdfs_normal_test_paths = ["/home/aks/DRILL/sentilog/filter/hdfs-debug-filter.txt"]
hbase_normal_test_paths = ["/home/aks/DRILL/sentilog/filter/hbase-debug-filter.txt"]

lustre_abnormal_test_paths = ["/home/aks/DRILL/sentilog/filter/lustre-error-filter.txt"]
hdfs_abnormal_test_paths = ["/home/aks/DRILL/sentilog/filter/hdfs-error-filter.txt"]
hbase_abnormal_test_paths = ["/home/aks/DRILL/sentilog/filter/hbase-error-filter.txt"]


# Per-system evaluation. This cell relies on names it neither defines nor
# imports (read_data, tokenizer, pad_sequences), so the training cell must
# have been run first in the same session.

# Read each system's lines; normal files are labelled 0 and abnormal files 1.
lustre_test_data, lustre_test_labels = read_data(lustre_normal_test_paths, lustre_abnormal_test_paths)
hdfs_test_data, hdfs_test_labels = read_data(hdfs_normal_test_paths, hdfs_abnormal_test_paths)
hbase_test_data, hbase_test_labels = read_data(hbase_normal_test_paths, hbase_abnormal_test_paths)

# Reuse the tokenizer from the training cell so word indices match the
# embedding rows the model was built with.
lustre_test_sequences = tokenizer.texts_to_sequences(lustre_test_data)
hdfs_test_sequences = tokenizer.texts_to_sequences(hdfs_test_data)
hbase_test_sequences = tokenizer.texts_to_sequences(hbase_test_data)

# Same sequence length as used for training.
max_len = 100

lustre_test_padded = pad_sequences(lustre_test_sequences, maxlen=max_len)
hdfs_test_padded = pad_sequences(hdfs_test_sequences, maxlen=max_len)
hbase_test_padded = pad_sequences(hbase_test_sequences, maxlen=max_len)

# Evaluate the reloaded model on each system separately (loss, accuracy).
lustre_test_loss, lustre_test_acc = model.evaluate(lustre_test_padded, lustre_test_labels)
print("Lustre Test Loss:", lustre_test_loss)
print("Lustre Test Accuracy:", lustre_test_acc)

hdfs_test_loss, hdfs_test_acc = model.evaluate(hdfs_test_padded, hdfs_test_labels)
print("Hdfs Test Loss:", hdfs_test_loss)
print("Hdfs Test Accuracy:", hdfs_test_acc)

hbase_test_loss, hbase_test_acc = model.evaluate(hbase_test_padded, hbase_test_labels)
print("Hbase Test Loss:", hbase_test_loss)
print("Hbase Test Accuracy:", hbase_test_acc)

# Collect accuracy per system; insertion order sets the bar order in the chart.
dataset_accuracies = {}

dataset_accuracies["Lustre"] = lustre_test_acc
dataset_accuracies["Hdfs"] = hdfs_test_acc
dataset_accuracies["Hbase"] = hbase_test_acc

# Plotting
# Bar chart with one bar per system showing its test accuracy.
datasets = list(dataset_accuracies.keys())
accuracies = list(dataset_accuracies.values())
plt.figure(figsize=(12, 6)) 
plt.bar(datasets, accuracies, color='skyblue', width=0.4)  
plt.xlabel('Dataset')
plt.ylabel('Accuracy')
plt.title('Accuracy vs Dataset using Bi_Directional Deep Learning Model')
plt.xticks(rotation=45, ha="right") 
plt.tight_layout()  
plt.show()