In [13]:
import os
import sys
sys.path.append(os.path.abspath(".."))
from common.utils import get_cleaned_syscall_files
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from scipy.sparse import csr_matrix
import pandas as pd

def get_data():
    """Read every cleaned syscall CSV and return them as one DataFrame.

    The list of inputs comes from ``get_cleaned_syscall_files()`` (imported
    from ``common.utils``); each entry is handed to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: all per-file frames concatenated row-wise, with a
        fresh 0..n-1 index (``ignore_index=True``).
    """
    frames = [pd.read_csv(path) for path in get_cleaned_syscall_files()]
    return pd.concat(frames, ignore_index=True)

def encode_process_and_event_name(df):
    """One-hot encode the ``processName`` and ``eventName`` columns.

    Args:
        df: DataFrame containing ``processName`` and ``eventName`` columns.

    Returns:
        Sparse matrix produced by fitting a ``OneHotEncoder`` on those two
        columns and transforming them in the same call.
    """
    categorical_columns = ['processName', 'eventName']
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
    return encoder.fit_transform(df[categorical_columns])

def encode_user_id(df):
    """Label-encode the ``userId`` column into a single sparse column.

    Args:
        df: DataFrame containing a ``userId`` column.

    Returns:
        scipy.sparse.csr_matrix: one column holding the integer code that
        ``LabelEncoder`` assigned to each row's ``userId``.
    """
    codes = LabelEncoder().fit_transform(df['userId'])
    # reshape(-1, 1) turns the flat code array into a column so it can sit
    # next to the other feature blocks.
    return csr_matrix(codes.reshape(-1, 1))

def encode_args(df):
    """Vectorize the ``args`` column with TF-IDF.

    Args:
        df: DataFrame containing an ``args`` column; values are cast to
            ``str`` before vectorizing.

    Returns:
        Sparse TF-IDF matrix with the vocabulary capped at 500 features
        (``max_features=500``).
    """
    args_as_text = df['args'].astype(str)
    vectorizer = TfidfVectorizer(max_features=500)
    return vectorizer.fit_transform(args_as_text)

# Load the full dataset once; every feature block below is derived from it.
data = get_data()

# Build the three feature blocks. Each encoder is fitted on the complete
# dataset here, i.e. before the rows are split into train and test.
process_and_event_name_encoded = encode_process_and_event_name(data)
user_id_encoded = encode_user_id(data)
args_encoded = encode_args(data)

# Stack the blocks side by side into one sparse feature matrix.
X = hstack([
    process_and_event_name_encoded,
    user_id_encoded,
    args_encoded,
])

# Integer-encode the target column; label_encoder keeps the mapping so
# predictions can be translated back later.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['label'].values)

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions of the full label array in both partitions; random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,  # 80% training, 20% testing
    random_state=42,
    stratify=y,
)

# y was already integer-encoded by label_encoder before the split, so the
# split labels are reused as they are. Re-fitting label_encoder on them would
# replace its classes_ with the integer codes and stop inverse_transform from
# returning the original label values.
y_train_encoded = y_train
y_test_encoded = y_test

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

Training set size: 2132894 samples
Testing set size: 533224 samples


In [11]:
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation

# Wrap both partitions as LightGBM datasets. The test dataset references the
# training dataset so it is constructed consistently with it.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Multiclass gradient-boosted trees, evaluated with multiclass log loss.
params = dict(
    objective='multiclass',
    num_class=3,  # number of target classes
    metric='multi_logloss',
    boosting_type='gbdt',
    learning_rate=0.05,
    num_leaves=31,
    verbose=-1,
)

# Train for at most 1000 rounds, logging the metric every 50 rounds and
# stopping once the validation score has not improved for 50 rounds.
# NOTE(review): the validation set here is built from X_test, which is also
# the data the final report is computed on, so those metrics may be
# optimistic. Consider a separate validation split.
model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[train_data, test_data],
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=50),
    ],
)

Training until validation scores don't improve for 50 rounds
[50]	training's multi_logloss: 0.102949	valid_1's multi_logloss: 0.103401
[100]	training's multi_logloss: 0.0969455	valid_1's multi_logloss: 0.0974051
[150]	training's multi_logloss: 0.0957243	valid_1's multi_logloss: 0.0962184
[200]	training's multi_logloss: 0.0951775	valid_1's multi_logloss: 0.0957039
[250]	training's multi_logloss: 0.0948565	valid_1's multi_logloss: 0.0954046
[300]	training's multi_logloss: 0.0946179	valid_1's multi_logloss: 0.0951904
[350]	training's multi_logloss: 0.0944606	valid_1's multi_logloss: 0.0950556
[400]	training's multi_logloss: 0.0943509	valid_1's multi_logloss: 0.0949705
[450]	training's multi_logloss: 0.094271	valid_1's multi_logloss: 0.0949114
[500]	training's multi_logloss: 0.0941967	valid_1's multi_logloss: 0.0948652
[550]	training's multi_logloss: 0.0941425	valid_1's multi_logloss: 0.0948466
[600]	training's multi_logloss: 0.0941015	valid_1's multi_logloss: 0.0948403
[650]	training's mu

In [16]:
from sklearn.metrics import classification_report

# Per-class scores for every test row; the highest-scoring column is taken
# as the predicted class index.
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)

# Map both the true and the predicted class indices back through
# label_encoder so the report compares values in the same label space.
y_test_labels = label_encoder.inverse_transform(y_test_encoded)
y_pred_labels = label_encoder.inverse_transform(y_pred_classes)

# Per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_test_labels, y_pred_labels))


              precision    recall  f1-score   support

           0       0.95      1.00      0.97    476767
           1       1.00      1.00      1.00      2001
           2       0.98      0.56      0.71     54456

    accuracy                           0.95    533224
   macro avg       0.98      0.85      0.90    533224
weighted avg       0.95      0.95      0.95    533224

