In [2]:
import random

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, classification_report
from sklearn.inspection import permutation_importance

In [3]:
RANDOM_SEED = 42
DATA_PATH = "data/"
DATA_FILE = "processed_traffic.parquet"

In [4]:
data = pd.read_parquet(DATA_PATH + DATA_FILE)
data_train, data_test = train_test_split(data, test_size=0.2, random_state=RANDOM_SEED)
data_train.shape, data_test.shape

((49742, 49), (12436, 49))

In [5]:
X_train, y_train = data_train.drop(columns=["Attack Name", "Label"]), data_train["Label"]
X_test, y_test = data_test.drop(columns=["Attack Name", "Label"]), data_test["Label"]
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Initial Train

In [6]:
param = {
    'objective': 'binary:logistic',
    'eval_metric': ['auc', 'error', 'logloss'],
    'device': 'cuda',
    'verbosity': 1,
}
evallist = [(dtrain, 'train'), (dtest, 'eval')]
model = xgb.train(param, dtrain, num_boost_round=200, evals=evallist, early_stopping_rounds=10)

[0]	train-auc:0.91179	train-error:0.11879	train-logloss:0.53257	eval-auc:0.91143	eval-error:0.12005	eval-logloss:0.53272
[1]	train-auc:0.92624	train-error:0.11686	train-logloss:0.44519	eval-auc:0.92671	eval-error:0.11740	eval-logloss:0.44535
[2]	train-auc:0.94804	train-error:0.10792	train-logloss:0.38432	eval-auc:0.94814	eval-error:0.10743	eval-logloss:0.38486
[3]	train-auc:0.94863	train-error:0.10758	train-logloss:0.34486	eval-auc:0.94819	eval-error:0.10815	eval-logloss:0.34540
[4]	train-auc:0.95436	train-error:0.10390	train-logloss:0.31308	eval-auc:0.95407	eval-error:0.10550	eval-logloss:0.31437
[5]	train-auc:0.96189	train-error:0.09778	train-logloss:0.28515	eval-auc:0.96221	eval-error:0.10108	eval-logloss:0.28607
[6]	train-auc:0.97414	train-error:0.08303	train-logloss:0.25250	eval-auc:0.97358	eval-error:0.08709	eval-logloss:0.25455
[7]	train-auc:0.97664	train-error:0.07724	train-logloss:0.23812	eval-auc:0.97620	eval-error:0.08057	eval-logloss:0.23999
[8]	train-auc:0.97980	train-erro

In [7]:
THRESHOLD = 0.5
preds = model.predict(dtest)
accuracy = accuracy_score(y_test, preds > THRESHOLD)
report = classification_report(y_test, preds > THRESHOLD, digits=4)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

Accuracy: 0.9903
Classification Report:
              precision    recall  f1-score   support

           0     0.9904    0.9909    0.9907      6476
           1     0.9901    0.9896    0.9898      5960

    accuracy                         0.9903     12436
   macro avg     0.9903    0.9902    0.9903     12436
weighted avg     0.9903    0.9903    0.9903     12436



In [8]:
comparison = pd.DataFrame(preds, columns=["Predicted Probability"])
comparison["Predicted Label"] = (comparison["Predicted Probability"] > THRESHOLD).astype(int)
comparison["True Label"] = y_test.values
comparison["Attack Name"] = data_test["Attack Name"].values
comparison

Unnamed: 0,Predicted Probability,Predicted Label,True Label,Attack Name
0,0.000729,0,0,Benign Traffic
1,0.001192,0,0,Benign Traffic
2,0.991024,1,1,MQTT Malformed
3,0.001424,0,0,Benign Traffic
4,0.000632,0,0,Benign Traffic
...,...,...,...,...
12431,0.971185,1,1,DDoS UDP Flood
12432,0.000308,0,0,Benign Traffic
12433,0.995690,1,1,DoS ICMP Flood
12434,0.000754,0,0,Benign Traffic


In [9]:
incorrect = comparison[comparison["Predicted Label"] != comparison["True Label"]]
incorrect["Attack Name"].value_counts()

Attack Name
Benign Traffic              59
DoS UDP Flood               23
DoS ICMP Flood              12
DDoS UDP Flood              11
DDoS ICMP Flood             10
MQTT Malformed               2
Recon Vulnerability Scan     1
MITM ARP Spoofing            1
Recon Port Scan              1
MQTT DoS Publish Flood       1
Name: count, dtype: int64

# Hyperparameter Tuning

Random search CV was essentially used due to the large number of hyperparameters that could be tuned. Only the main ones after searching online as the most influential were tuned. The below did not use sklearn modules and interface in order to utilize gpu.

In [10]:
base_param = {
    'objective': 'binary:logistic',
    'eval_metric': ['auc', 'error', 'logloss'],
    'device': 'cuda',
    'verbosity': 1,
}
evallist = [(dtrain, 'train'), (dtest, 'eval')]

hyperparameter_grid = {
    "max_depth": [3, 5, 7, 9, 11, 13],
    "min_child_weight":[1, 2, 3, 4, 5, 6],
    "subsample":[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "eta": [0.01, 0.03, 0.05, 0.1],
}

tested_params = set()
def sample_hyperparameters(grid: dict) -> dict[str, float | int]:
    params = {}
    for key, values in grid.items():
        params[key] = random.choice(values)
    return params

n_iters = 30

best_auc = 0.0
best_iter = -1
best_params = None

for i in range(n_iters):
    sampled_params = sample_hyperparameters(hyperparameter_grid)
    while frozenset(sampled_params.items()) in tested_params:
        sampled_params = sample_hyperparameters(hyperparameter_grid)

    tested_params.add(frozenset(sampled_params.items()))
    param = base_param.copy()
    param.update(sampled_params)
    model = xgb.train(param, dtrain, num_boost_round=2000, evals=evallist, early_stopping_rounds=10, verbose_eval=False)
    preds = model.predict(dtest)
    accuracy = accuracy_score(y_test, preds > THRESHOLD)
    auc = roc_auc_score(y_test, preds)
    precision = precision_score(y_test, preds > THRESHOLD)
    recall = recall_score(y_test, preds > THRESHOLD)
    f1 = f1_score(y_test, preds > THRESHOLD)

    print(f"Iteration {i+1}/{n_iters} - Params: {sampled_params} | Accuracy: {accuracy:.4f} | AUC: {auc:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1-Score: {f1:.4f}")

    if auc > best_auc:
        best_auc = auc
        best_iter = i + 1
        best_params = sampled_params
        model.save_model(f"XGBModels/model_iter_{i+1}_acc_{accuracy:.4f}_auc_{auc:.4f}.json")

print(f"Best AUC: {best_auc:.4f} with params: {best_params} at iteration {best_iter}")

Iteration 1/30 - Params: {'max_depth': 9, 'min_child_weight': 3, 'subsample': 1.0, 'colsample_bytree': 1.0, 'eta': 0.05} | Accuracy: 0.9916 | AUC: 0.9996 | Precision: 0.9918 | Recall: 0.9908 | F1-Score: 0.9913
Iteration 2/30 - Params: {'max_depth': 11, 'min_child_weight': 4, 'subsample': 0.5, 'colsample_bytree': 0.5, 'eta': 0.1} | Accuracy: 0.9893 | AUC: 0.9994 | Precision: 0.9896 | Recall: 0.9881 | F1-Score: 0.9888
Iteration 3/30 - Params: {'max_depth': 7, 'min_child_weight': 2, 'subsample': 0.6, 'colsample_bytree': 0.8, 'eta': 0.03} | Accuracy: 0.9912 | AUC: 0.9996 | Precision: 0.9918 | Recall: 0.9898 | F1-Score: 0.9908
Iteration 4/30 - Params: {'max_depth': 7, 'min_child_weight': 4, 'subsample': 0.8, 'colsample_bytree': 0.6, 'eta': 0.03} | Accuracy: 0.9907 | AUC: 0.9996 | Precision: 0.9916 | Recall: 0.9889 | F1-Score: 0.9903
Iteration 5/30 - Params: {'max_depth': 3, 'min_child_weight': 3, 'subsample': 0.5, 'colsample_bytree': 0.5, 'eta': 0.01} | Accuracy: 0.9620 | AUC: 0.9934 | Prec

In [16]:
best_model = xgb.XGBClassifier(device='cuda')
best_model.load_model("XGBModels/model_iter_1_acc_0.9916_auc_0.9996.json")

In [17]:
def calculate_metrics(y_true, preds, threshold):
    preds_label = (preds > threshold).astype(int)
    accuracy = accuracy_score(y_true, preds_label)
    precision = precision_score(y_true, preds_label)
    recall = recall_score(y_true, preds_label)
    f1 = f1_score(y_true, preds_label)
    auc = roc_auc_score(y_true, preds)
    print(f"Best Model Evaluation using threshold {threshold}:")
    print(f"Best Model Accuracy: {accuracy:.4f}")
    print(f"Best Model AUC: {auc:.4f}")
    print(f"Best Model F1 Score: {f1:.4f}")
    print(f"Best Model Recall: {recall:.4f}")
    print(f"Best Model Precision: {precision:.4f}")


In [18]:
THRESHOLD = 0.5
preds = best_model.predict(X_test)
calculate_metrics(y_test, preds, THRESHOLD)

Best Model Evaluation using threshold 0.5:
Best Model Accuracy: 0.9916
Best Model AUC: 0.9916
Best Model F1 Score: 0.9913
Best Model Recall: 0.9908
Best Model Precision: 0.9918


In [19]:
# See performance at different thresholds
for threshold in [0.2, 0.4, 0.5, 0.6, 0.8]:
    calculate_metrics(y_test, preds, threshold)
    print("\n")

Best Model Evaluation using threshold 0.2:
Best Model Accuracy: 0.9916
Best Model AUC: 0.9916
Best Model F1 Score: 0.9913
Best Model Recall: 0.9908
Best Model Precision: 0.9918


Best Model Evaluation using threshold 0.4:
Best Model Accuracy: 0.9916
Best Model AUC: 0.9916
Best Model F1 Score: 0.9913
Best Model Recall: 0.9908
Best Model Precision: 0.9918


Best Model Evaluation using threshold 0.5:
Best Model Accuracy: 0.9916
Best Model AUC: 0.9916
Best Model F1 Score: 0.9913
Best Model Recall: 0.9908
Best Model Precision: 0.9918


Best Model Evaluation using threshold 0.6:
Best Model Accuracy: 0.9916
Best Model AUC: 0.9916
Best Model F1 Score: 0.9913
Best Model Recall: 0.9908
Best Model Precision: 0.9918


Best Model Evaluation using threshold 0.8:
Best Model Accuracy: 0.9916
Best Model AUC: 0.9916
Best Model F1 Score: 0.9913
Best Model Recall: 0.9908
Best Model Precision: 0.9918




Doesn't seem like there's a big difference for using different thresholds. 0.5 seems to give the best balance between accuracy, precision, recall, and f1-score.

# Feature Importances

In [20]:
perm_importance = permutation_importance(best_model, X_test, y_test, n_repeats=10, random_state=RANDOM_SEED, scoring=['roc_auc', 'accuracy'])

In [21]:
auc_importances = pd.DataFrame(perm_importance['roc_auc']['importances_mean'], index=X_test.columns, columns=['Importance']).sort_values(by='Importance', ascending=False)
accuracy_importances = pd.DataFrame(perm_importance['accuracy']['importances_mean'], index=X_test.columns, columns=['Importance']).sort_values(by='Importance', ascending=False)

In [22]:
pd.set_option('display.float_format', lambda x: '%.8f' % x)
auc_importances.head(10)

Unnamed: 0,Importance
Src Port,0.11045975
Bwd Init Win Bytes,0.00541093
ACK Flag Count,0.00534028
Dst Port,0.00304108
Flow Duration,0.00062137
Bwd Packets/s,0.00058703
FWD Init Win Bytes,0.0005273
Active Mean,0.00029287
Fwd IAT Std,0.00011969
Flow IAT Mean,0.00011389


In [23]:
pd.set_option('display.float_format', lambda x: '%.8f' % x)
accuracy_importances.head(10)

Unnamed: 0,Importance
Src Port,0.24816661
Bwd Init Win Bytes,0.07504021
ACK Flag Count,0.04840785
Dst Port,0.01955613
Bwd Packets/s,0.01225474
FWD Init Win Bytes,0.00708427
Flow Duration,0.00467996
Flow IAT Std,0.00161628
Fwd Packet Length Std,0.0014072
Active Mean,0.00116597
