In [1]:
import os

os.environ["KERAS_BACKEND"] = "jax"

from jax import default_backend as jax_backend
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder
from keras.api.layers import Dense, Embedding, LSTM, Dropout
from keras.api.models import Sequential
from keras.api.preprocessing.sequence import pad_sequences
from keras.api.config import backend as keras_backend
from keras.api.optimizers import Adam
from keras.api.utils import to_categorical

print(keras_backend())
print(jax_backend())

jax
gpu


In [2]:
df = pd.read_csv("../data/syscall/syscall.csv")

In [3]:
label_map = {
    "normal": 0,
    "boolean-based": 1,
    "error-based": 2,
    "inline": 3,
    "stacked": 4,
    "time-based": 5,
    "union-based": 6,
}

In [4]:
df["label"].value_counts()

label
normal           439
union-based      115
boolean-based     53
time-based        43
error-based       40
stacked           40
inline            11
Name: count, dtype: int64

In [5]:
encoder = OneHotEncoder(categories=[range(548)], sparse_output=False)

In [6]:
def one_hot_encode(sequence_str):
  sequence = ast.literal_eval(sequence_str)
  reshaped = np.array(sequence).reshape(-1, 1)
  return encoder.fit_transform(reshaped)

In [7]:
df["query"] = df["query"].apply(lambda x: ast.literal_eval(x))
df["label"] = df["label"].map(label_map)

In [8]:
df = shuffle(df)

In [9]:
X = df["query"].tolist()
Y = df["label"].values

In [10]:
y = to_categorical(Y, num_classes=7)

In [11]:
X_padded = pad_sequences(X, maxlen=1500, padding="post")

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X_padded, y, test_size=0.2, random_state=42
)

In [13]:
model = Sequential(
    layers=[
        Embedding(input_dim=548, output_dim=64),
        # Input(shape=(max_len, 548)),
        LSTM(128),
        Dense(128, activation="relu"),
        Dropout(0.2),
        Dense(64),
        Dropout(0.2),
        Dense(32),
        Dropout(0.2),
        Dense(7, activation="softmax"),
    ]
)

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [14]:
model.fit(
    X_train, y_train, epochs=50, batch_size=32, validation_split=0.05, shuffle=True
)

Epoch 1/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 489ms/step - accuracy: 0.5002 - loss: 1.7360 - val_accuracy: 0.4333 - val_loss: 1.6722
Epoch 2/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 97ms/step - accuracy: 0.6056 - loss: 1.3820 - val_accuracy: 0.4333 - val_loss: 1.6895
Epoch 3/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 83ms/step - accuracy: 0.6042 - loss: 1.3607 - val_accuracy: 0.4333 - val_loss: 1.7693
Epoch 4/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 84ms/step - accuracy: 0.6274 - loss: 1.3092 - val_accuracy: 0.4333 - val_loss: 1.6961
Epoch 5/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 95ms/step - accuracy: 0.5997 - loss: 1.3619 - val_accuracy: 0.4333 - val_loss: 1.7383
Epoch 6/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 86ms/step - accuracy: 0.6057 - loss: 1.3455 - val_accuracy: 0.4333 - val_loss: 1.7559
Epoch 7/50
[1m18/18[0m [32m━━

<keras.src.callbacks.history.History at 0x7f7fec7774a0>

In [15]:
loss, accuracy = model.evaluate(X_test, y_test)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 539ms/step - accuracy: 0.5568 - loss: 1.3780
