In [1]:
import ast
import os

# The Keras backend has to be selected before Keras itself is imported.
os.environ["KERAS_BACKEND"] = "jax"

import numpy as np
import pandas as pd
from jax import default_backend as jax_backend
from keras.api.config import backend as keras_backend
from keras.api.layers import LSTM, Dense, Dropout, Input, Masking
from keras.api.models import Sequential
from keras.api.optimizers import Adam
from keras.api.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle

# Sanity check: report the active Keras backend and the JAX default device
# backend, so it is visible whether training will run on an accelerator.
print(keras_backend())
print(jax_backend())

jax
gpu


In [2]:
# Load the dataset; later cells read its "query" and "label" columns.
df = pd.read_csv("../data/syscall/syscall.csv")

In [3]:
# Label strings in class-id order: the position of a name in this list is the
# integer id that the "label" column is mapped to.
label_map = {
    label_name: class_id
    for class_id, label_name in enumerate(
        [
            "normal",
            "boolean-based",
            "error-based",
            "inline",
            "stacked",
            "time-based",
            "union-based",
        ]
    )
}

In [4]:
# Class distribution of the raw string labels (the recorded output shows a
# strong imbalance towards "normal").
df["label"].value_counts()

label
normal           439
union-based      115
boolean-based     53
time-based        43
error-based       40
stacked           40
inline            11
Name: count, dtype: int64

In [5]:
# Dense one-hot encoder over a fixed vocabulary of 548 integer ids (0..547).
# float32 halves the memory of the dense output compared with the float64
# default, which matters once every sequence is padded to the longest one.
encoder = OneHotEncoder(
    categories=[range(548)], sparse_output=False, dtype=np.float32
)

In [6]:
def one_hot_encode(sequence_str):
    """Turn a stringified sequence of ids into a one-hot matrix.

    Args:
        sequence_str: Text of a Python literal (parsed with
            ``ast.literal_eval``) holding the sequence of ids.

    Returns:
        The result of ``encoder.fit_transform`` on the ids arranged as a
        single column, i.e. one output row per element of the sequence.
    """
    ids = np.array(ast.literal_eval(sequence_str))
    # The encoder expects a 2-D input: one sample per row, one feature column.
    id_column = ids.reshape(-1, 1)
    return encoder.fit_transform(id_column)

In [7]:
# Fail fast on labels that label_map does not cover: Series.map would
# otherwise turn them into NaN silently. This also trips, before the costly
# encoding step, if the cell is re-run after the labels were already mapped.
unmapped_labels = set(df["label"].unique()) - set(label_map)
if unmapped_labels:
    raise ValueError(
        f"labels missing from label_map: {sorted(map(str, unmapped_labels))}"
    )

# Replace each stringified sequence with its one-hot matrix and each label
# string with its integer class id.
df["query"] = df["query"].apply(one_hot_encode)
df["label"] = df["label"].map(label_map)

In [8]:
# Seed the shuffle: the train/test split below is seeded too, but its result
# depends on row order, so an unseeded shuffle here makes every run differ.
df = shuffle(df, random_state=42)

In [9]:
# Labels as a NumPy array (integer class ids once the label column has been
# mapped through label_map).
Y = df["label"].values

In [10]:
# One-hot encode the class ids. The class count is taken from label_map so
# that it cannot drift out of sync when a label is added or removed.
y = to_categorical(Y, num_classes=len(label_map))

In [11]:
# Length (number of timesteps) of every encoded sequence, and the longest one.
sequence_lengths = [encoded.shape[0] for encoded in df["query"]]
max_len = max(sequence_lengths)

# Append all-zero rows to each sequence so that all of them have max_len
# timesteps; the feature axis is left untouched.
padded_queries = []
for encoded, n_steps in zip(df["query"], sequence_lengths):
    rows_to_add = max_len - n_steps
    padded_queries.append(
        np.pad(encoded, ((0, rows_to_add), (0, 0)), mode="constant")
    )
max_len

1499

In [12]:
# Stack the equally sized padded sequences into a single array with the
# sample index as the leading axis.
X_padded = np.asarray(padded_queries)

In [13]:
# Hold out 20% for testing. The classes are heavily imbalanced, so stratify on
# the class id (recovered from the one-hot rows) to keep every class
# represented in both splits with the same proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y.argmax(axis=1),
)

In [None]:
# Sequence classifier: a masked LSTM encoder followed by a small dense head
# that ends in a 7-way softmax.
model = Sequential(
    layers=[
        Input(shape=(max_len, 548)),
        # Padded timesteps are all-zero rows, while real timesteps carry a
        # single 1. Masking them stops the LSTM from updating its state on
        # padding after the real sequence has ended.
        Masking(mask_value=0.0),
        LSTM(128),
        Dense(128, activation="relu"),
        Dropout(0.2),
        # Hidden layers need a non-linearity; without one, consecutive Dense
        # layers are just a single linear map at inference time.
        Dense(64, activation="relu"),
        Dropout(0.2),
        Dense(32, activation="relu"),
        Dropout(0.2),
        Dense(7, activation="softmax"),
    ]
)

# Targets are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train on the training split, with 5% of it set aside for validation.
model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.05,
    shuffle=True,
)

In [None]:
# Score the trained model on the held-out test split; the two values follow
# the compile() configuration (loss first, then the accuracy metric).
loss, accuracy = model.evaluate(X_test, y_test)