In [25]:
import random

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, classification_report

In [2]:
RANDOM_SEED = 42
DATA_PATH = "data/"
DATA_FILE = "processed_traffic.parquet"

In [3]:
data = pd.read_parquet(DATA_PATH + DATA_FILE)
data_train, data_test = train_test_split(data, test_size=0.1, random_state=RANDOM_SEED)
data_train.shape, data_test.shape

((55960, 49), (6218, 49))

In [5]:
X_train, y_train = data_train.drop(columns=["Attack Name", "Label"]), data_train["Label"]
X_test, y_test = data_test.drop(columns=["Attack Name", "Label"]), data_test["Label"]
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [10]:
param = {
    'objective': 'binary:logistic',
    'eval_metric': ['auc', 'error', 'logloss'],
    'device': 'cuda',
    'verbosity': 1,
}
evallist = [(dtrain, 'train'), (dtest, 'eval')]
model = xgb.train(param, dtrain, num_boost_round=200, evals=evallist, early_stopping_rounds=10)

[0]	train-auc:0.91241	train-error:0.11951	train-logloss:0.53299	eval-auc:0.90914	eval-error:0.12496	eval-logloss:0.53532
[1]	train-auc:0.92778	train-error:0.11825	train-logloss:0.44553	eval-auc:0.92447	eval-error:0.12432	eval-logloss:0.44947
[2]	train-auc:0.94719	train-error:0.10602	train-logloss:0.38447	eval-auc:0.94572	eval-error:0.10856	eval-logloss:0.38825
[3]	train-auc:0.94968	train-error:0.10593	train-logloss:0.34632	eval-auc:0.94760	eval-error:0.10839	eval-logloss:0.35063
[4]	train-auc:0.95274	train-error:0.10724	train-logloss:0.31868	eval-auc:0.95006	eval-error:0.11129	eval-logloss:0.32444
[5]	train-auc:0.96224	train-error:0.10340	train-logloss:0.28885	eval-auc:0.95970	eval-error:0.10984	eval-logloss:0.29590
[6]	train-auc:0.96541	train-error:0.09971	train-logloss:0.27143	eval-auc:0.96150	eval-error:0.10486	eval-logloss:0.27922
[7]	train-auc:0.97269	train-error:0.07766	train-logloss:0.24622	eval-auc:0.97033	eval-error:0.08154	eval-logloss:0.25317
[8]	train-auc:0.97941	train-erro

In [14]:
THRESHOLD = 0.5
preds = model.predict(dtest)
accuracy = accuracy_score(y_test, preds > THRESHOLD)
report = classification_report(y_test, preds > THRESHOLD, digits=4)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

Accuracy: 0.9928
Classification Report:
              precision    recall  f1-score   support

           0     0.9927    0.9936    0.9931      3264
           1     0.9929    0.9919    0.9924      2954

    accuracy                         0.9928      6218
   macro avg     0.9928    0.9927    0.9927      6218
weighted avg     0.9928    0.9928    0.9928      6218



In [20]:
comparison = pd.DataFrame(preds, columns=["Predicted Probability"])
comparison["Predicted Label"] = (comparison["Predicted Probability"] > THRESHOLD).astype(int)
comparison["True Label"] = y_test.values
comparison["Attack Name"] = data_test["Attack Name"].values
comparison

Unnamed: 0,Predicted Probability,Predicted Label,True Label,Attack Name
0,0.000478,0,0,Benign Traffic
1,0.001047,0,0,Benign Traffic
2,0.998435,1,1,MQTT Malformed
3,0.001985,0,0,Benign Traffic
4,0.000728,0,0,Benign Traffic
...,...,...,...,...
6213,0.999448,1,1,DDoS ICMP Flood
6214,0.000053,0,0,Benign Traffic
6215,0.999865,1,1,Recon Vulnerability Scan
6216,0.000850,0,0,Benign Traffic


In [33]:
incorrect = comparison[comparison["Predicted Label"] != comparison["True Label"]]
incorrect["Attack Name"].value_counts()

Attack Name
Benign Traffic       21
DoS UDP Flood         7
DDoS UDP Flood        6
DoS ICMP Flood        5
DDoS ICMP Flood       3
MQTT Malformed        2
MITM ARP Spoofing     1
Name: count, dtype: int64

# Hyperparameter Tuning

Random search CV was essentially used due to the large number of hyperparameters that could be tuned. Only the main ones after searching online as the most influential were tuned. The below did not use sklearn modules and interface in order to utilize gpu.

In [None]:
base_param = {
    'objective': 'binary:logistic',
    'eval_metric': ['auc', 'error', 'logloss'],
    'device': 'cuda',
    'verbosity': 1,
}
evallist = [(dtrain, 'train'), (dtest, 'eval')]

hyperparameter_grid = {
    "max_depth": [3, 5, 7, 9, 11, 13],
    "min_child_weight":[1, 2, 3, 4, 5, 6],
    "subsample":[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "eta": [0.01, 0.03, 0.05, 0.1],
}

tested_params = set()
def sample_hyperparameters(grid: dict) -> dict[str, float | int]:
    params = {}
    for key, values in grid.items():
        params[key] = random.choice(values)
    return params

n_iters = 30

best_auc = 0.0
best_iter = -1
best_params = None

for i in range(n_iters):
    sampled_params = sample_hyperparameters(hyperparameter_grid)
    while frozenset(sampled_params.items()) in tested_params:
        sampled_params = sample_hyperparameters(hyperparameter_grid)

    tested_params.add(frozenset(sampled_params.items()))
    param = base_param.copy()
    param.update(sampled_params)
    model = xgb.train(param, dtrain, num_boost_round=2000, evals=evallist, early_stopping_rounds=10, verbose_eval=False)
    preds = model.predict(dtest)
    accuracy = accuracy_score(y_test, preds > THRESHOLD)
    auc = roc_auc_score(y_test, preds)
    precision = precision_score(y_test, preds > THRESHOLD)
    recall = recall_score(y_test, preds > THRESHOLD)
    f1 = f1_score(y_test, preds > THRESHOLD)

    print(f"Iteration {i+1}/{n_iters} - Params: {sampled_params} | Accuracy: {accuracy:.4f} | AUC: {auc:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1-Score: {f1:.4f}")

    if auc > best_auc:
        best_auc = auc
        best_iter = i + 1
        best_params = sampled_params
        model.save_model(f"XGBModels/model_iter_{i+1}_acc_{accuracy:.4f}_auc_{auc:.4f}.json")

print(f"Best AUC: {best_auc:.4f} with params: {best_params} at iteration {best_iter}")

Iteration 1/30 - Params: {'max_depth': 9, 'min_child_weight': 3, 'subsample': 0.6, 'colsample_bytree': 1.0, 'eta': 0.05} | Accuracy: 0.9923 | AUC: 0.9998 | Precision: 0.9929 | Recall: 0.9909 | F1-Score: 0.9919
Iteration 2/30 - Params: {'max_depth': 9, 'min_child_weight': 1, 'subsample': 1.0, 'colsample_bytree': 1.0, 'eta': 0.01} | Accuracy: 0.9937 | AUC: 0.9998 | Precision: 0.9946 | Recall: 0.9922 | F1-Score: 0.9934
Iteration 3/30 - Params: {'max_depth': 5, 'min_child_weight': 3, 'subsample': 0.8, 'colsample_bytree': 0.8, 'eta': 0.05} | Accuracy: 0.9916 | AUC: 0.9997 | Precision: 0.9912 | Recall: 0.9912 | F1-Score: 0.9912
Iteration 4/30 - Params: {'max_depth': 5, 'min_child_weight': 6, 'subsample': 0.8, 'colsample_bytree': 1.0, 'eta': 0.03} | Accuracy: 0.9907 | AUC: 0.9997 | Precision: 0.9918 | Recall: 0.9885 | F1-Score: 0.9902
Iteration 5/30 - Params: {'max_depth': 7, 'min_child_weight': 4, 'subsample': 0.5, 'colsample_bytree': 1.0, 'eta': 0.03} | Accuracy: 0.9913 | AUC: 0.9998 | Prec

In [34]:
best_model = xgb.Booster(model_file="XGBModels/model_iter_12_acc_0.9937_auc_0.9998.json")

In [30]:
preds = best_model.predict(dtest)
print(classification_report(y_test, preds > THRESHOLD, digits=4))

              precision    recall  f1-score   support

           0     0.9933    0.9948    0.9940      3264
           1     0.9942    0.9926    0.9934      2954

    accuracy                         0.9937      6218
   macro avg     0.9938    0.9937    0.9937      6218
weighted avg     0.9937    0.9937    0.9937      6218



In [None]:
# See performance at different thresholds
for threshold in [0.2, 0.4, 0.5, 0.6, 0.8]:
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, preds > threshold, digits=4))

Threshold: 0.2
              precision    recall  f1-score   support

           0     0.9972    0.9862    0.9917      3264
           1     0.9849    0.9970    0.9909      2954

    accuracy                         0.9913      6218
   macro avg     0.9911    0.9916    0.9913      6218
weighted avg     0.9914    0.9913    0.9913      6218

Threshold: 0.4
              precision    recall  f1-score   support

           0     0.9948    0.9930    0.9939      3264
           1     0.9922    0.9942    0.9932      2954

    accuracy                         0.9936      6218
   macro avg     0.9935    0.9936    0.9936      6218
weighted avg     0.9936    0.9936    0.9936      6218

Threshold: 0.5
              precision    recall  f1-score   support

           0     0.9933    0.9948    0.9940      3264
           1     0.9942    0.9926    0.9934      2954

    accuracy                         0.9937      6218
   macro avg     0.9938    0.9937    0.9937      6218
weighted avg     0.9937    0.

Doesn't seem like there's a big difference for using different thresholds. 0.5 seems to give the best balance between accuracy, precision, recall, and f1-score.