# How to run the Code

1. Open the notebook in Google Colab.
2. Set the runtime to GPU for faster execution.
3. Run all cells in order, from top to bottom.

# Code starts!

In [None]:
%pip install kagglehub kaggle torch numpy scikit-learn --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h

# Imports

In [None]:
from sklearn.model_selection import train_test_split
from collections import Counter
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from pathlib import Path
import kagglehub
import torch.nn as nn
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

# Download data

In [None]:

# Fetch the ADFA-LD dataset through kagglehub; the returned path is where it was unpacked
root = Path(kagglehub.dataset_download("yianqaq/adfa-ld"))

# The "master" folder of the original release and its three sub-folders
base = root.joinpath("ADFA-LD")
train_dir, val_dir, attack_dir = (
    base / folder
    for folder in ("Training_Data_Master", "Validation_Data_Master", "Attack_Data_Master")
)
print("the data set is under:" , base)


Downloading from https://www.kaggle.com/api/v1/datasets/download/yianqaq/adfa-ld?dataset_version_number=1...


100%|██████████| 3.48M/3.48M [00:01<00:00, 2.92MB/s]

Extracting files...





the data set is under: /root/.cache/kagglehub/datasets/yianqaq/adfa-ld/versions/1/ADFA-LD


# Get data statistics

In [None]:
print("Sequence length statistics:")
# Gather every trace file: the two normal folders are searched directly,
# the attack folder is searched recursively (rglob).
normals = [*train_dir.glob("*.txt"), *val_dir.glob("*.txt")]
attacks = [*attack_dir.rglob("*.txt")]
print(f"number of normal sequances: {len(normals)}")
print(f"number of attacks sequances: {len(attacks)}")
all_paths  = normals + attacks
all_labels = [0 for _ in normals] + [1 for _ in attacks]

# Length of a trace = number of whitespace-separated tokens in its file
lengths = [len(path.read_text().strip().split()) for path in all_paths]

# For every threshold, count the traces that are no longer than it
thresholds = [200, 300, 400, 500, 600, 700, 800, 900]
length_counts = {}
for t in thresholds:
    length_counts[t] = sum(1 for l in lengths if l <= t)

total = len(lengths)

# Report each cumulative count together with its share of all traces
for t in thresholds:
    percent = 100 * length_counts[t] / total
    print(f"≤ {t:4}: {length_counts[t]:5} sequences ({percent:.2f}%)")


Sequence length statistics:
number of normal sequances: 5205
number of attacks sequances: 746
≤  200:  2072 sequences (34.82%)
≤  300:  2742 sequences (46.08%)
≤  400:  4291 sequences (72.11%)
≤  500:  4568 sequences (76.76%)
≤  600:  4748 sequences (79.78%)
≤  700:  4928 sequences (82.81%)
≤  800:  5090 sequences (85.53%)
≤  900:  5190 sequences (87.21%)


# Preprocess Data

In [None]:
# Collect all filenames and labels (0 = normal, 1 = attack).
# glob()/rglob() yield files in whatever order the filesystem returns them, so
# the lists are sorted here: without a stable input order, the fixed
# random_state used below would not select the same train/val/test files on
# every machine.
normals = sorted(train_dir.glob("*.txt")) + sorted(val_dir.glob("*.txt"))
attacks = sorted(attack_dir.rglob("*.txt"))

all_paths  = normals + attacks
all_labels = [0]*len(normals) + [1]*len(attacks)

# First split: carve off TEST (20%), keeping the normal/attack ratio (stratify)
trainval_paths, test_paths, trainval_labels, test_labels = train_test_split(
    all_paths, all_labels,
    test_size=0.20,
    stratify=all_labels,
    random_state=42
)

# Second split: from remaining 80%, carve off VAL (20% of that → 16% of total)
train_paths, val_paths, train_labels, val_labels = train_test_split(
    trainval_paths, trainval_labels,
    test_size=0.20,
    stratify=trainval_labels,
    random_state=42
)

# Build the syscall→index vocabulary from the TRAIN set only; the same mapping
# is then used for the validation and test sets.
# Note: index 0 is reserved for padding. A syscall that was never seen in
# training has no entry in this mapping, and a lookup with default 0 therefore
# treats it exactly like padding.
cnt = Counter()
for path in train_paths:
    tokens = path.read_text().split()   # splits on any whitespace
    cnt.update(map(int, tokens))        # count how often each syscall number occurs

# The most frequent syscall gets index 1, the next one index 2, and so on
syscall2idx = {sys: i+1 for i, (sys, _) in enumerate(cnt.most_common())}
vocab_size = len(syscall2idx) + 1  # +1 for padding (idx 0)

# Dataset wrapper to handle the data to torch transition
class SyscallDataset(Dataset):
    """Map-style dataset that turns one syscall trace file into index tensors.

    Args:
        paths: sequence of path objects (anything with ``read_text()``), one per trace.
        labels: sequence of integer class labels, aligned with ``paths``.
        max_len: traces longer than this many tokens are truncated to it.
        vocab: optional mapping from syscall number (int) to index. When left
            as ``None`` the module-level ``syscall2idx`` is looked up at access
            time, which is the original behaviour.
    """

    def __init__(self, paths, labels, max_len=400, vocab=None):
        self.paths, self.labels, self.max_len = paths, labels, max_len
        self.vocab = vocab

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, i):
        """Return ``(sequence, label)`` as two ``torch.long`` tensors."""
        # Resolve the mapping lazily so a later rebuild of syscall2idx is picked up
        vocab = syscall2idx if self.vocab is None else self.vocab
        # Truncate BEFORE converting, so tokens past max_len are never processed
        tokens = self.paths[i].read_text().split()[:self.max_len]
        # Syscalls missing from the vocabulary fall back to 0 (the padding index)
        seq = [vocab.get(int(syscall), 0) for syscall in tokens]
        return torch.tensor(seq, dtype=torch.long), torch.tensor(self.labels[i], dtype=torch.long)

def collate_fn(batch):
    """Merge a list of ``(sequence, label)`` pairs into one batch.

    Returns two tensors: the sequences stacked as ``[batch_size, max_seq_len]``
    (shorter sequences are right-padded with 0) and the labels as ``[batch_size]``.
    """
    sequences = [pair[0] for pair in batch]
    targets = [pair[1] for pair in batch]
    padded = pad_sequence(sequences, batch_first=True)
    return padded, torch.stack(targets)

# Wrap each split in a Dataset, then in a DataLoader
batch_size = 64
train_ds   = SyscallDataset(paths=train_paths, labels=train_labels)
val_ds     = SyscallDataset(paths=val_paths,   labels=val_labels)
test_ds    = SyscallDataset(paths=test_paths,  labels=test_labels)

# Only the training loader shuffles; validation and test keep a fixed order
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,  collate_fn=collate_fn)
val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

print(f"▸ #train {len(train_ds)}, #val {len(val_ds)}, #test {len(test_ds)}")
print(f"▸ vocab size = {vocab_size}")


▸ #train 3808, #val 952, #test 1191
▸ vocab size = 173


# MODEL DEFINITION

In [None]:


class LSTMClassifier(nn.Module):
    """Embedding → LSTM → small MLP head that classifies a syscall sequence.

    Args:
        vocab_size: number of embedding rows (index 0 is the padding index).
        emb_dim: size of each syscall embedding vector.
        hidden_dim: LSTM hidden size, also the width of the hidden FC layer.
        num_layers: number of stacked LSTM layers.
        bidirectional: if True the LSTM also reads the sequence backwards and
            the final forward and backward states are concatenated.

    ``forward`` takes a ``[B, T]`` tensor of token indices (0 = padding) and
    returns ``[B, 2]`` logits (normal vs attack).
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, bidirectional=False):
        super().__init__()
        # Embedding: maps each syscall index to a vector of size emb_dim
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        # LSTM encoder over the embedded sequence
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            # when enabled, a second set of hidden states reads the sequence in
            # reverse, so the encoder output has 2 * hidden_dim features
            bidirectional=bidirectional
        )
        # The encoder feature size doubles for a bidirectional LSTM; sizing fc1
        # accordingly is what makes bidirectional=True usable at all.
        num_directions = 2 if bidirectional else 1
        # Hidden fully-connected layer with sigmoid activation
        self.fc1 = nn.Linear(hidden_dim * num_directions, hidden_dim)
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(0.3)
        # Output layer projecting to two classes (normal vs attack)
        self.fc2 = nn.Linear(hidden_dim, 2)

    def forward(self, x):
        emb = self.embedding(x)
        # True length of each row = position of its last non-padding token.
        # Zeros in the middle of a row are kept; only trailing zeros are dropped.
        # Rows that are entirely zero are clamped to length 1 because packing
        # rejects zero-length sequences.
        steps = torch.arange(1, x.size(1) + 1, device=x.device)
        lengths = ((x != 0) * steps).max(dim=1).values.clamp(min=1)
        # Pack so the LSTM stops at each row's real end instead of running over
        # the padding; otherwise the result for a sample would depend on how
        # much padding its batch happened to add. Lengths must live on the CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            emb, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)
        # h_n is [num_layers * num_directions, B, hidden_dim]; the last entries
        # belong to the top layer (forward, then backward when bidirectional).
        if self.lstm.bidirectional:
            last_hidden_state = torch.cat([h_n[-2], h_n[-1]], dim=1)
        else:
            last_hidden_state = h_n[-1]
        hidden = self.sigmoid(self.fc1(last_hidden_state))
        # dropout on the hidden representation to reduce overfitting
        hidden = self.dropout(hidden)
        logits = self.fc2(hidden)
        return logits


# model, loss and optimizer

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed torch's global RNG before the model is created so that weight
# initialisation (and everything else that draws from the global RNG afterwards)
# starts from the same state on every run. 42 matches the random_state used for
# the data split. GPU kernels may still introduce small run-to-run differences.
torch.manual_seed(42)

# defining the model here!
model = LSTMClassifier(
    vocab_size=vocab_size,
    emb_dim=256,
    hidden_dim=512,
    num_layers=2,
    bidirectional=False
).to(device)

# loss: cross-entropy over the two output logits (normal vs attack);
# it expects raw logits and integer class labels
loss_fn  = nn.CrossEntropyLoss()
# define adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# halve the learning rate when the monitored value (passed to scheduler.step)
# has not decreased for more than `patience` epochs
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=2
)
class EarlyStopping:
    """Signal that training should stop once the validation loss stops improving.

    Call the instance once per epoch with the validation loss. An epoch counts
    as an improvement only if the loss drops below the best seen so far by more
    than ``min_delta``. After ``patience`` consecutive epochs without such an
    improvement, ``should_stop`` becomes True.
    """

    def __init__(self, patience=3, min_delta=0.01):
        self.patience, self.min_delta = patience, min_delta
        self.best_loss = float('inf')   # lowest validation loss accepted so far
        self.counter = 0                # epochs in a row without improvement
        self.should_stop = False

    def __call__(self, val_loss):
        improved = val_loss < self.best_loss - self.min_delta
        if improved:
            # new best: remember it and restart the patience countdown
            self.best_loss, self.counter = val_loss, 0
            return
        self.counter += 1
        if self.counter >= self.patience:
            self.should_stop = True

# TRAINING & VALIDATION

In [None]:
def train_epoch(loader):
    """Run one optimisation pass over ``loader``; return the mean loss per sample.

    Uses the module-level ``model``, ``loss_fn``, ``optimizer`` and ``device``.
    """
    # training mode (dropout is active)
    model.train()
    loss_sum = 0
    # each batch is a pair: sequences [B, seq_len] and labels [B]
    for batch_seqs, batch_labels in loader:
        batch_seqs = batch_seqs.to(device)
        batch_labels = batch_labels.to(device)
        # clear gradients left over from the previous step
        optimizer.zero_grad()
        # forward pass and loss
        batch_loss = loss_fn(model(batch_seqs), batch_labels)
        # backward pass, then parameter update
        batch_loss.backward()
        optimizer.step()
        # .item() is the batch mean, so weight it by the batch size
        loss_sum += batch_loss.item() * batch_seqs.size(0)
    # mean training loss over the whole dataset
    return loss_sum / len(loader.dataset)

def eval_epoch(loader, isTest=False):
    """Evaluate the module-level ``model`` on ``loader`` without updating it.

    Args:
        loader: DataLoader yielding ``(sequences, labels)`` batches.
        isTest: when True, additionally print the confusion-matrix counts.

    Returns:
        ``(avg_loss, accuracy, precision, recall, f1)``; precision, recall and
        F1 refer to the positive (attack, label 1) class.
    """
    # evaluation mode (dropout is disabled)
    model.eval()
    total_loss = 0
    preds_list = []
    labels_list = []
    with torch.no_grad():
        for seqs, labels in loader:
            seqs, labels = seqs.to(device), labels.to(device)
            logits = model(seqs)
            loss   = loss_fn(logits, labels)
            # loss.item() is the average loss over the batch; multiplying by B gives the total for that batch.
            total_loss += loss.item() * seqs.size(0)
            # predicted class = index of the larger logit (0 = normal, 1 = attack)
            preds = torch.argmax(logits, dim=1)
            # Move predictions and labels to CPU and add to lists
            preds_list.extend(preds.cpu().tolist())
            labels_list.extend(labels.cpu().tolist())

    # After all batches, compute overall metrics:
    avg_loss = total_loss / len(loader.dataset)
    # Accuracy: fraction of correct predictions
    accuracy = (torch.tensor(preds_list) == torch.tensor(labels_list)).float().mean().item()
    # Precision, recall, F1 for the positive (attack) class
    precision, recall, f1, _ = precision_recall_fscore_support(labels_list, preds_list, average='binary', zero_division=0)
    if isTest:
        # labels=[0, 1] forces a 2x2 matrix even if only one class shows up in
        # the labels/predictions; without it ravel() would not yield 4 values.
        tn, fp, fn, tp = confusion_matrix(labels_list, preds_list, labels=[0, 1]).ravel()
        print(f"TP: {tp}, FP: {fp}, TN: {tn}, FN: {fn}")
    return avg_loss, accuracy, precision, recall, f1

num_epochs = 30
early_stopping = EarlyStopping(patience=5, min_delta=0.001)
for epoch in range(1, num_epochs+1):
    train_loss = train_epoch(train_loader)
    val_loss, val_acc, val_prec, val_rec, val_f1 = eval_epoch(val_loader)
    # the scheduler and the early-stopping check both monitor the validation loss
    scheduler.step(val_loss)
    early_stopping(val_loss)
    # Log first, then decide whether to stop: this way the metrics of the epoch
    # that triggers early stopping are reported too instead of being dropped.
    print(
        f"Epoch {epoch:2d} |"
        f" train loss: {train_loss:.4f} |"
        f" val loss:   {val_loss:.4f} |"
        f" val acc:    {val_acc:.4f} |"
        f" prec:       {val_prec:.4f} |"
        f" rec:        {val_rec:.4f} |"
        f" f1:         {val_f1:.4f} |"
    )
    if early_stopping.should_stop:
        print("Early stopping triggered.")
        break


Epoch  1 | train loss: 0.3650 | val loss:   0.3064 | val acc:    0.8855 | prec:       0.5521 | rec:        0.4454 | f1:         0.4930 |
Epoch  2 | train loss: 0.3217 | val loss:   0.2976 | val acc:    0.9034 | prec:       0.7755 | rec:        0.3193 | f1:         0.4524 |
Epoch  3 | train loss: 0.3232 | val loss:   0.3467 | val acc:    0.8834 | prec:       0.6538 | rec:        0.1429 | f1:         0.2345 |
Epoch  4 | train loss: 0.3363 | val loss:   0.2768 | val acc:    0.8887 | prec:       0.5586 | rec:        0.5210 | f1:         0.5391 |
Epoch  5 | train loss: 0.2767 | val loss:   0.2387 | val acc:    0.8992 | prec:       0.6292 | rec:        0.4706 | f1:         0.5385 |
Epoch  6 | train loss: 0.2243 | val loss:   0.1975 | val acc:    0.9139 | prec:       0.6259 | rec:        0.7731 | f1:         0.6917 |
Epoch  7 | train loss: 0.1734 | val loss:   0.1834 | val acc:    0.9254 | prec:       0.6558 | rec:        0.8487 | f1:         0.7399 |
Epoch  8 | train loss: 0.1548 | val loss:

# Test

In [None]:
# Evaluate on the held-out test set; isTest=True also prints the confusion-matrix counts
test_loss, test_acc, test_prec, test_rec, test_f1 = eval_epoch(test_loader, True)
# Report the TEST metrics returned above (not the val_* values left over from
# the last training epoch).
print(
    f" loss: {test_loss:.4f} |"
    f" acc:    {test_acc:.4f} |"
    f" prec:       {test_prec:.4f} |"
    f" rec:        {test_rec:.4f} |"
    f" f1:         {test_f1:.4f} |"
)


TP: 131, FP: 19, TN: 1023, FN: 18
 loss: 0.1154 | acc:    0.9653 | prec:       0.8772 | rec:        0.8403 | f1:         0.8584 |
