In [None]:
import os

# Select JAX as the Keras backend. This assignment is deliberately placed
# before the keras imports below so the variable is set when Keras is loaded.
os.environ["KERAS_BACKEND"] = "jax"

import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from jax import default_backend as jax_backend
from keras.api.config import backend as keras_backend
from keras.api.layers import GRU, Bidirectional, Dense, Dropout, Embedding
from keras.api.models import Sequential
from keras.api.optimizers import Adam
from keras.api.preprocessing.sequence import pad_sequences
from keras.api.utils import to_categorical
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle

# Sanity check: show the active Keras backend and the JAX default backend,
# one per line.
print(keras_backend(), jax_backend(), sep="\n")

In [2]:
# Load the syscall log; later cells read its "query" and "label" columns.
df = pd.read_csv(filepath_or_buffer="../data/syscall/log.csv")

In [3]:
# Class name -> integer class id, assigned in listing order (0..4). The order
# matters: later cells use the keys as confusion-matrix tick labels.
label_map = {
    name: class_id
    for class_id, name in enumerate(
        (
            "normal",
            "boolean-based",
            "error-based",
            "time-based",
            "union-based",
        )
    )
}

In [None]:
# Class balance of the raw labels (displayed as the cell output).
label_counts = df["label"].value_counts()
label_counts

In [5]:
# Dense one-hot encoder over a fixed category set of the integers 0..547.
encoder = OneHotEncoder(
    sparse_output=False,
    categories=[range(548)],
)

In [6]:
def one_hot_encode(sequence_str):
    """One-hot encode a sequence of integer ids with the notebook-level ``encoder``.

    Args:
        sequence_str: Either the string form of a Python list of ids
            (e.g. ``"[3, 17, 4]"``) or an already-parsed sequence of ids.
            Accepting both matters because a later cell replaces the string
            column ``df["query"]`` with parsed lists, and ``ast.literal_eval``
            raises ``ValueError`` when given a list.

    Returns:
        The result of ``encoder.fit_transform`` applied to the ids arranged
        as a single column of shape ``(len(sequence), 1)``.

    Raises:
        ValueError, SyntaxError: If a string argument is not a valid Python
            literal (propagated from ``ast.literal_eval``).
    """
    if isinstance(sequence_str, str):
        sequence = ast.literal_eval(sequence_str)
    else:
        # Already parsed (list / tuple / array): use as-is.
        sequence = sequence_str
    # The encoder expects a 2-D array: one row per id, one feature column.
    reshaped = np.asarray(sequence).reshape(-1, 1)
    return encoder.fit_transform(reshaped)

In [7]:
# Decode each stringified list in "query" into a real Python list. Values that
# are no longer strings (the cell has already been run) pass through untouched,
# so re-running this cell does not raise.
df["query"] = df["query"].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

# Translate label names to integer class ids. Values that are not keys of
# label_map (e.g. ids produced by an earlier run of this cell) pass through
# unchanged instead of silently becoming NaN.
df["label"] = df["label"].map(lambda name: label_map.get(name, name))

# Fail loudly on anything that is not a known class id, rather than letting
# NaN / unknown labels flow into to_categorical.
unmapped_labels = df.loc[~df["label"].isin(list(label_map.values())), "label"]
if not unmapped_labels.empty:
    raise ValueError(
        f"Labels not present in label_map: {unmapped_labels.unique().tolist()}"
    )

In [8]:
# Shuffle the rows with a fixed seed so a fresh Restart & Run All yields the
# same row order (and therefore the same downstream splits).
df = shuffle(df, random_state=42)

In [9]:
# Features: one Python list of ids per row. Targets: integer class ids as a
# NumPy array.
X = df["query"].to_list()
Y = df["label"].to_numpy()

In [10]:
# One-hot encode the class ids. The number of classes is derived from
# label_map (5 entries) so it cannot drift from the label definition.
y = to_categorical(Y, num_classes=len(label_map))

In [11]:
# Bring every sequence to exactly 1500 steps: shorter ones are padded at the
# end, longer ones are cut at the front (Keras' default truncation, spelled
# out here for clarity).
# NOTE(review): the pad value defaults to 0; if 0 is also a real id in the
# data, padding is indistinguishable from that id — confirm this is intended.
X_padded = pad_sequences(
    sequences=X,
    maxlen=1500,
    padding="post",
    truncating="pre",
)

In [12]:
# Hold out 30% of the samples for testing. The seed is fixed so the split (and
# every metric computed from it) is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y,
    test_size=0.3,
    random_state=42,
)

In [13]:
# Bidirectional-GRU classifier over padded sequences of integer ids.
model = Sequential(
    layers=[
        # Learn a 64-dimensional vector for each of the 548 possible ids.
        Embedding(input_dim=548, output_dim=64),
        # Read the sequence in both directions with 128 GRU units each.
        Bidirectional(GRU(128)),
        # Fully connected head. Every hidden layer uses ReLU: a Dense layer
        # without an activation is purely linear, so stacking such layers adds
        # no representational power.
        Dense(128, activation="relu"),
        Dropout(0.2),
        Dense(64, activation="relu"),
        Dropout(0.2),
        Dense(32, activation="relu"),
        Dropout(0.2),
        # One probability per class (5 classes, matching the one-hot targets).
        Dense(5, activation="softmax"),
    ]
)

# Categorical cross-entropy matches the one-hot encoded targets.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for 50 epochs in mini-batches of 32, holding out 5% of the training
# data for per-epoch validation.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=50,
    shuffle=True,
    validation_split=0.05,
)

In [None]:
# Score the trained model on the held-out test split; the compiled metrics
# yield the loss followed by the accuracy.
loss, accuracy = model.evaluate(x=X_test, y=y_test)

In [None]:
# Per-class softmax outputs for every test sample.
y_pred = model.predict(x=X_test)

In [17]:
# Collapse per-class scores / one-hot rows back to integer class ids by taking
# the index of the largest entry in each row.
y_pred_int = y_pred.argmax(axis=1)
y_test_int = y_test.argmax(axis=1)

In [18]:
# Pin the label set so the matrix always has one row/column per class in
# label_map order. Without `labels`, a class missing from both the test labels
# and the predictions would shrink the matrix and misalign the heatmap ticks.
conf_matrix = confusion_matrix(
    y_test_int,
    y_pred_int,
    labels=list(label_map.values()),
)

In [None]:
# Draw the confusion matrix as an annotated heatmap. Tick labels are the class
# names in label_map order, which matches the class ids 0..4.
attack_types = list(label_map)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    ax=ax,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=attack_types,
    yticklabels=attack_types,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix for BiGRU Model")
plt.show()