In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [40]:
# Read legitimate commands
with open('legit.cm', 'r') as file:
    legitimate_commands = file.read().splitlines()

# Read malicious commands
with open('malicious.cm', 'r') as file:
    malicious_commands = file.read().splitlines()

print(f"Number of malicious commands read: {len(malicious_commands)}")

Number of malicious commands read: 123


In [11]:
# Create DataFrames for each
legitimate_df = pd.DataFrame({'command': legitimate_commands})
malicious_df = pd.DataFrame({'command': malicious_commands})

# Assign labels
legitimate_df['label'] = 0
malicious_df['label'] = 1

# Combine the DataFrames
data_df = pd.concat([legitimate_df, malicious_df], ignore_index=True)

In [15]:
# Splitting the data accordingly for testing and training 80/20 split
X = data_df['command']
y = data_df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [17]:
# BERT tokeniser and embedder
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')



In [19]:
def get_bert_embedding(text):
    # Tokenise input text
    inputs = tokenizer(
        text,
        return_tensors='pt',
        max_length=64,
        padding='max_length',
        truncation=True
    )
    
    # Get embeddings from BERT
    with torch.no_grad():
        outputs = bert_model(**inputs)
        last_hidden_state = outputs.last_hidden_state  # Shape: [batch_size, seq_length, hidden_size]
    
    # Don't really understand what the dofference between the [CLS] token is?
    cls_embedding = last_hidden_state[:, 0, :]
    
    return cls_embedding.squeeze().numpy()
    

In [21]:
def generate_embeddings(commands):
    embeddings = []
    for command in tqdm(commands, desc="Generating"):
        embedding = get_bert_embedding(command)
        embeddings.append(embedding)
    return np.array(embeddings)
# Embedding generation with a nice progress bar so I don't have to guess whilst my fans go mental
X_train_embeddings = generate_embeddings(X_train)
X_test_embeddings = generate_embeddings(X_test)

Generating: 100%|████████████████████████████████████████████████████████| 10184/10184 [02:50<00:00, 59.62it/s]
Generating: 100%|██████████████████████████████████████████████████████████| 2546/2546 [01:13<00:00, 34.82it/s]


In [23]:
# I've read about scaling features to have 'no mean and unit variance' but also unsure atm as to what that means exactly

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_embeddings)
X_test_scaled = scaler.transform(X_test_embeddings)

In [25]:
# SVM
svm_classifier = SVC(kernel='linear', probability=True, random_state=42)
svm_classifier.fit(X_train_scaled, y_train)

In [32]:
# Test how well it does
y_pred = svm_classifier.predict(X_test_scaled)

# accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}\n")

# classification report
report = classification_report(y_test, y_pred, target_names=['Legitimate', 'Malicious'])
print("Classification Report:")
print(report)

# confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.99

Classification Report:
              precision    recall  f1-score   support

  Legitimate       1.00      1.00      1.00      2521
   Malicious       0.64      0.56      0.60        25

    accuracy                           0.99      2546
   macro avg       0.82      0.78      0.80      2546
weighted avg       0.99      0.99      0.99      2546

Confusion Matrix:
[[2513    8]
 [  11   14]]


In [36]:
from sklearn.model_selection import GridSearchCV

# Find optimal parameters for SVM - might improve accuracy.

param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}
grid = GridSearchCV(SVC(probability=True, random_state=42), param_grid, refit=True, verbose=2)
grid.fit(X_train_scaled, y_train)
svm_classifier = grid.best_estimator_


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   3.3s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   3.4s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   3.6s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   3.6s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=   3.3s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   7.9s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   7.8s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   7.8s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   8.1s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   7.8s
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time=   3.3s
[CV] END ...................C=0.1, gamma=auto, k