In [2]:
import utils.utils as utils
import pandas as pd

def get_data():
    dfs = []
    files = utils.get_cleaned_datafiles()
    for f in files:
        df = pd.read_csv(f)
        dfs.append(df)
    
    df = pd.concat(dfs, ignore_index=True)

    return df

In [3]:
from sklearn.model_selection import train_test_split
from utils.DataProcesser import DataProcesser

dp = DataProcesser(get_data())

X = dp.get_encoded_features()
y = dp.get_encoded_labels()

X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

Training set size: 1973516 samples
Testing set size: 493380 samples


In [4]:
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation

train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

params = {
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbose': -1
}

model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, test_data],
    num_boost_round=1000,
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(period=50)
    ]
)

Training until validation scores don't improve for 50 rounds
[50]	training's multi_logloss: 0.111162	valid_1's multi_logloss: 0.111334
[100]	training's multi_logloss: 0.104741	valid_1's multi_logloss: 0.104951
[150]	training's multi_logloss: 0.103472	valid_1's multi_logloss: 0.103709
[200]	training's multi_logloss: 0.102931	valid_1's multi_logloss: 0.103201
[250]	training's multi_logloss: 0.102581	valid_1's multi_logloss: 0.102882
[300]	training's multi_logloss: 0.102296	valid_1's multi_logloss: 0.102622
[350]	training's multi_logloss: 0.102139	valid_1's multi_logloss: 0.102494
[400]	training's multi_logloss: 0.102021	valid_1's multi_logloss: 0.102403
[450]	training's multi_logloss: 0.101929	valid_1's multi_logloss: 0.102337
[500]	training's multi_logloss: 0.101832	valid_1's multi_logloss: 0.102274
[550]	training's multi_logloss: 0.10177	valid_1's multi_logloss: 0.102252
[600]	training's multi_logloss: 0.101728	valid_1's multi_logloss: 0.102244
[650]	training's multi_logloss: 0.101695	

In [5]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)

y_pred_classes = y_pred.argmax(axis=1)

y_pred_labels = dp.decode_labels(y_pred_classes)
y_test_labels = dp.decode_labels(y_test)

print(classification_report(y_test_labels, y_pred_labels))


              precision    recall  f1-score   support

      benign       0.95      1.00      0.97    436923
        evil       1.00      1.00      1.00      2001
  suspicious       0.97      0.56      0.71     54456

    accuracy                           0.95    493380
   macro avg       0.97      0.85      0.90    493380
weighted avg       0.95      0.95      0.94    493380



In [7]:
from utils.constants import EVIL_INPUT, SUSPICIOUS_INPUT, BENIGN_INPUT

def predict(encoded_input):
    prediction = model.predict(encoded_input).argmax(axis=1)
    return dp.decode_labels(prediction)

encoded_evil_input = dp.encode_features(EVIL_INPUT)
encoded_suspicious_input = dp.encode_features(SUSPICIOUS_INPUT)
encoded_benign_input = dp.encode_features(BENIGN_INPUT)

print("Evil input prediction:", predict(encoded_evil_input))
print("Suspicious input prediction:", predict(encoded_suspicious_input))
print("Benign input prediction:", predict(encoded_benign_input))



Evil input prediction: ['evil']
Suspicious input prediction: ['suspicious']
Benign input prediction: ['benign']
