In [1]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

import pandas as pd
import pickle
import os

### Data Preparation

In [2]:
# Columns excluded from the model inputs; 'Label' is the target and is
# encoded separately below.
black_list = [
    'DNS_QUERY_ID',
    'Dataset',
    'IPV4_DST_ADDR',
    'IPV4_DST_ADDR_INT',
    'IPV4_SRC_ADDR',
    'IPV4_SRC_ADDR_INT',
    'Label',
]

df = pd.read_parquet('../data/netflow_sample.parquet')

# Every column that is not black-listed is used as a feature, in its original order.
selected_features = [column for column in df.columns if column not in black_list]
data = df[selected_features]

# XGBoost requires numerical labels (column: Label).
# `le` is kept around so predictions can be mapped back to class names later.
le = LabelEncoder()
labels = le.fit_transform(df['Label'])
unique_labels = sorted(set(df['Label']))

# Shuffled 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    train_size=0.8,
    shuffle=True,
    random_state=35,
)

### Hyperparameter Search

In [3]:
# Randomized hyperparameter search for XGBoost, cached on disk because the
# search is expensive. If the cache file exists, the fitted search object is
# loaded instead of being recomputed.
file_name = '../search/params_xgboost.pkl'

if not os.path.exists(file_name):
    # Create the cache directory *before* the long-running search so that a
    # missing directory cannot make the final `open(...)` fail and discard
    # hours of computation.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    # Candidate values sampled by the randomized search.
    parameters = {
        'n_estimators':[50, 75, 100],
        'eta': [0.5, 0.7, 0.9],
        'min_child_weight':[1, 2, 3],
        'gamma':[0, 0.5, 1],
        'max_depth':[4, 5, 6]
    }

    model = XGBClassifier(random_state=35, booster="gbtree", tree_method="exact")

    # 10-fold stratified cross-validation with a fixed seed.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=35)

    # 10 sampled parameter combinations, scored by accuracy.
    search = RandomizedSearchCV(model, parameters, scoring="accuracy", n_iter=10, n_jobs=6, cv=skf, verbose=10,
                                random_state=35)
    search.fit(X_train, y_train) #computation time: 3h

    with open(file_name, 'wb') as file:
        pickle.dump(search, file)
else:
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # were produced by this notebook.
    with open(file_name, 'rb') as file:
        search = pickle.load(file)

print(f"Best parameters for xgboost: {search.best_params_}")

Best parameters for xgboost: {'n_estimators': 75, 'min_child_weight': 1, 'max_depth': 6, 'gamma': 0.5, 'eta': 0.7}


### Model Training

In [4]:
# Baseline: XGBoost with default hyperparameters, cached on disk.
file_name = '../model/xgboost_default.pkl'

if not os.path.exists(file_name):
    # Make sure the target directory exists before training, so the fitted
    # model is not lost to a FileNotFoundError when it is written.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    model = XGBClassifier(random_state=35, booster="gbtree", tree_method="exact")
    model = model.fit(X_train, y_train)
    with open(file_name, 'wb') as file:
        pickle.dump(model, file)
else:
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # were produced by this notebook.
    with open(file_name, 'rb') as file:
        model = pickle.load(file)

print(f"Mean accuracy of default xgboost classifier: {model.score(X_test, y_test):.3f}.")

Mean accuracy of default xgboost classifier: 0.952.


In [5]:
# XGBoost trained with the best hyperparameters found by the search, cached
# on disk. After this cell, `model` refers to the optimized classifier.
file_name = '../model/xgboost.pkl'

if not os.path.exists(file_name):
    # Make sure the target directory exists before training, so the fitted
    # model is not lost to a FileNotFoundError when it is written.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    hyperparameters = search.best_params_
    model = XGBClassifier(random_state=35, booster="gbtree", tree_method="exact", **hyperparameters)
    model = model.fit(X_train, y_train)
    with open(file_name, 'wb') as file:
        pickle.dump(model, file)
else:
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # were produced by this notebook.
    with open(file_name, 'rb') as file:
        model = pickle.load(file)

print(f"Mean accuracy of optimized xgboost classifier: {model.score(X_test, y_test):.3f}.")

Mean accuracy of optimized xgboost classifier: 0.956.


### Validation

In [6]:
# Confusion matrix and per-class report of the current `model` on the
# held-out test split, cached on disk.
file_name = '../scores/xgboost_test.pkl'

if not os.path.exists(file_name):
    # Select the columns the model was fitted on, then map the integer
    # predictions and targets back to class names.
    y_pred = model.predict(X_test[model.feature_names_in_])
    y_pred = le.inverse_transform(y_pred)
    y_true = le.inverse_transform(y_test)

    # Compute the scores *before* opening the cache file: if either metric
    # raises, no empty/truncated pickle is left behind that would break the
    # `else` branch on the next run.
    score_dict = {
        'matrix': confusion_matrix(y_true, y_pred, labels=unique_labels, normalize=None),
        'report': classification_report(y_true, y_pred, labels=unique_labels, output_dict=True, zero_division=0),
    }

    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name, 'wb') as file:
        pickle.dump(score_dict, file)
else:
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # were produced by this notebook.
    with open(file_name, 'rb') as file:
        score_dict = pickle.load(file)

print(f"Accuracy of optimized xgboost on test data: {score_dict['report']['accuracy']:.3f}")
# Per-class F1 table; left as the last expression so the notebook renders it.
pd.DataFrame({
    'Attack':unique_labels,
    'F1-Score':[score_dict['report'][label]['f1-score'] for label in unique_labels],
})

Accuracy of optimized xgboost on test data: 0.956


Unnamed: 0,Attack,F1-Score
0,Benign,0.918725
1,Bot,1.0
2,DDoS,0.9833
3,DoS,0.96263
4,Infiltration,0.93929
5,Injection,0.926847
6,Password,0.966572
7,Scanning,0.976376
8,XSS,0.933366


In [7]:
# Confusion matrix and per-class report of the current `model` on the
# separately labeled data set in fhswfcnl_labeled.csv, cached on disk.
file_name = '../scores/xgboost_fhswfcnl.pkl'

if not os.path.exists(file_name):
    # The CSV holds both the feature columns and the ground-truth 'Label'.
    X_true = pd.read_csv('../data/fhswfcnl_labeled.csv')
    # NOTE(review): assumes 'Label' uses exactly the same class names as the
    # training data; rows with any other label would not be counted in the
    # matrix/report below because `labels=unique_labels` is passed. Confirm.
    y_true = X_true['Label']

    # Select the columns the model was fitted on, then map the integer
    # predictions back to class names.
    y_pred = model.predict(X_true[model.feature_names_in_])
    y_pred = le.inverse_transform(y_pred)

    # Compute the scores *before* opening the cache file: if either metric
    # raises, no empty/truncated pickle is left behind that would break the
    # `else` branch on the next run.
    score_dict = {
        'matrix': confusion_matrix(y_true, y_pred, labels=unique_labels, normalize=None),
        'report': classification_report(y_true, y_pred, labels=unique_labels, output_dict=True, zero_division=0),
    }

    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name, 'wb') as file:
        pickle.dump(score_dict, file)
else:
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # were produced by this notebook.
    with open(file_name, 'rb') as file:
        score_dict = pickle.load(file)

# NOTE(review): depending on the scikit-learn version, the report may expose
# 'micro avg' instead of 'accuracy' when `labels` differs from the classes
# actually observed; verify if this lookup ever raises KeyError.
print(f"Accuracy of optimized xgboost on validation data: {score_dict['report']['accuracy']:.3f}")
# Per-class F1 table; left as the last expression so the notebook renders it.
pd.DataFrame({
    'Attack':unique_labels,
    'F1-Score':[score_dict['report'][label]['f1-score'] for label in unique_labels],
})

Accuracy of optimized xgboost on validation data: 0.281


Unnamed: 0,Attack,F1-Score
0,Benign,0.123937
1,Bot,0.0
2,DDoS,0.0
3,DoS,0.0
4,Infiltration,0.026739
5,Injection,0.0
6,Password,0.384425
7,Scanning,0.178289
8,XSS,0.0
