In [2]:
import os
import sys
sys.path.append(os.path.abspath(".."))

from common.utils import get_cleaned_syscall_files
import pandas as pd

def get_data():
    dfs = []
    files = get_cleaned_syscall_files()
    for f in files:
        df = pd.read_csv(f)
        dfs.append(df)
    
    df = pd.concat(dfs, ignore_index=True)

    return df

In [None]:
from sklearn.model_selection import train_test_split
from DataProcesser import DataProcesser

dp = DataProcesser(get_data())

X = dp.get_encoded_features()
y = dp.get_encoded_labels()

X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2,
    random_state=42, 
    stratify=y
)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

Training set size: 2132894 samples
Testing set size: 533224 samples


In [4]:
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation

train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

params = {
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbose': -1
}

model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, test_data],
    num_boost_round=1000,
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=50)
    ]
)

Training until validation scores don't improve for 50 rounds
[50]	training's multi_logloss: 0.102949	valid_1's multi_logloss: 0.103401
[100]	training's multi_logloss: 0.0969455	valid_1's multi_logloss: 0.0974051
[150]	training's multi_logloss: 0.0957243	valid_1's multi_logloss: 0.0962184
[200]	training's multi_logloss: 0.0951775	valid_1's multi_logloss: 0.0957039
[250]	training's multi_logloss: 0.0948565	valid_1's multi_logloss: 0.0954046
[300]	training's multi_logloss: 0.0946179	valid_1's multi_logloss: 0.0951904
[350]	training's multi_logloss: 0.0944606	valid_1's multi_logloss: 0.0950556
[400]	training's multi_logloss: 0.0943509	valid_1's multi_logloss: 0.0949705
[450]	training's multi_logloss: 0.094271	valid_1's multi_logloss: 0.0949114
[500]	training's multi_logloss: 0.0941967	valid_1's multi_logloss: 0.0948652
[550]	training's multi_logloss: 0.0941425	valid_1's multi_logloss: 0.0948466
[600]	training's multi_logloss: 0.0941015	valid_1's multi_logloss: 0.0948403
[650]	training's mu

In [5]:
y_pred = model.predict(X_test)

y_pred_classes = y_pred.argmax(axis=1)

y_pred_labels = dp.decode_labels(y_pred_classes)
y_test_labels = dp.decode_labels(y_test)

from sklearn.metrics import classification_report

print(classification_report(y_test_labels, y_pred_labels))


              precision    recall  f1-score   support

      benign       0.95      1.00      0.97    476767
        evil       1.00      1.00      1.00      2001
  suspicious       0.98      0.56      0.71     54456

    accuracy                           0.95    533224
   macro avg       0.98      0.85      0.90    533224
weighted avg       0.95      0.95      0.95    533224



In [8]:
def predict(encoded_input):
    prediction = model.predict(encoded_input).argmax(axis=1)
    return dp.decode_labels(prediction)

In [9]:
from DemoConstants import evil_input, suspicious_input, benign_input

encoded_evil_input = dp.encode_features(evil_input)
encoded_suspicious_input = dp.encode_features(suspicious_input)
encoded_benign_input = dp.encode_features(benign_input)

print("Evil input prediction:", predict(encoded_evil_input))
print("Suspicious input prediction:", predict(encoded_suspicious_input))
print("Benign input prediction:", predict(encoded_benign_input))



Evil input prediction: ['evil']
Suspicious input prediction: ['suspicious']
Benign input prediction: ['benign']
