In [11]:
import os
import sys
sys.path.append(os.path.abspath(".."))

from common.utils import get_cleaned_syscall_files
import pandas as pd

def get_data():
    dfs = []
    files = get_cleaned_syscall_files()
    for f in files:
        df = pd.read_csv(f)
        dfs.append(df)
    
    df = pd.concat(dfs, ignore_index=True)

    return df

In [13]:
from sklearn.model_selection import train_test_split
from DataProcesser import DataProcesser

dp = DataProcesser(get_data())

X = dp.get_encoded_features()
y = dp.get_encoded_labels()

X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2,  # 80% training, 20% testing
    random_state=42, 
    stratify=y  # Important to keep class distribution
)

# y_train_encoded = dp.encode_labels(y_train)
# y_test_encoded = dp.encode_labels(y_test)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

Training set size: 2132894 samples
Testing set size: 533224 samples


In [14]:
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation

# Create datasets
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# LightGBM parameters
params = {
    'objective': 'multiclass',
    'num_class': 3,  # because you have 3 classes
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbose': -1
}

# Train with early stopping and logging
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, test_data],
    num_boost_round=1000,
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=50)
    ]
)

Training until validation scores don't improve for 50 rounds
[50]	training's multi_logloss: 0.102949	valid_1's multi_logloss: 0.103401
[100]	training's multi_logloss: 0.0969455	valid_1's multi_logloss: 0.0974051
[150]	training's multi_logloss: 0.0957243	valid_1's multi_logloss: 0.0962184
[200]	training's multi_logloss: 0.0951775	valid_1's multi_logloss: 0.0957039
[250]	training's multi_logloss: 0.0948565	valid_1's multi_logloss: 0.0954046
[300]	training's multi_logloss: 0.0946179	valid_1's multi_logloss: 0.0951904
[350]	training's multi_logloss: 0.0944606	valid_1's multi_logloss: 0.0950556
[400]	training's multi_logloss: 0.0943509	valid_1's multi_logloss: 0.0949705
[450]	training's multi_logloss: 0.094271	valid_1's multi_logloss: 0.0949114
[500]	training's multi_logloss: 0.0941967	valid_1's multi_logloss: 0.0948652
[550]	training's multi_logloss: 0.0941425	valid_1's multi_logloss: 0.0948466
[600]	training's multi_logloss: 0.0941015	valid_1's multi_logloss: 0.0948403
[650]	training's mu

In [15]:
# Predict probabilities
y_pred = model.predict(X_test)

# Get the predicted classes
y_pred_classes = y_pred.argmax(axis=1)

# Decode back to original labels (benign/suspicious/evil)
y_pred_labels = dp.decode_labels(y_pred_classes)
y_test_labels = dp.decode_labels(y_test)

# Classification report
from sklearn.metrics import classification_report

print(classification_report(y_test_labels, y_pred_labels))


              precision    recall  f1-score   support

      benign       0.95      1.00      0.97    476767
        evil       1.00      1.00      1.00      2001
  suspicious       0.98      0.56      0.71     54456

    accuracy                           0.95    533224
   macro avg       0.98      0.85      0.90    533224
weighted avg       0.95      0.95      0.95    533224



In [18]:
import pandas as pd

# Example input sample
sample_data = {
    'userId': [1001],
    'processName': ['(sd-pam)'],
    'eventName': ['prctl'],
    'argsNum': [5],
    'returnValue': [0],
    'args': ["[{'name': 'option', 'type': 'int', 'value': 'PR_SET_PDEATHSIG'}, {'name': 'arg2', 'type': 'unsigned long', 'value': 15}, {'name': 'arg3', 'type': 'unsigned long', 'value': 140730094619080}, {'name': 'arg4', 'type': 'unsigned long', 'value': 139950094474910}, {'name': 'arg5', 'type': 'unsigned long', 'value': 0}]"]
}

sample_df = pd.DataFrame(sample_data)

In [19]:
encoded_smaple = dp.encode_features(sample_df)

prediction = model.predict(encoded_smaple)

result = dp.decode_labels(prediction.argmax(axis=1))

print(result)


['evil']
