In [15]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

import pandas as pd
import pickle
import os

### Data Preparation

In [5]:
# Load the sample parquet file; the feature list below is derived from its columns.
df = pd.read_parquet('../data/netflow_sample.parquet')

# Columns kept out of the feature matrix. 'Attack' is the column used as the
# classification target further down in this notebook; the remaining entries
# are presumably labels/identifiers/metadata -- TODO confirm against the
# dataset documentation.
black_list = [
    'Attack',
    'DNS_QUERY_ID',
    'Dataset',
    'IPV4_DST_ADDR', 'IPV4_DST_ADDR_INT',
    'IPV4_SRC_ADDR', 'IPV4_SRC_ADDR_INT',
    'Label',
]

# Every column that is not black-listed is a feature, in the frame's column order.
selected_features = [column for column in df.columns if column not in black_list]

In [None]:
# `df` and `selected_features` are defined by the previous cell, so the
# parquet file is not read a second time here.
data = df[selected_features]

# XGBoost requires numerical labels (column: Attack). The fitted encoder is
# kept (instead of being discarded) so that integer predictions can be mapped
# back to attack names via `label_encoder.inverse_transform(...)`.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df["Attack"])

# Shuffled 80/20 train/test split with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified on `labels`; consider
# `stratify=labels` if the attack classes are imbalanced -- doing so changes
# the split and therefore invalidates any cached search/model pickles.
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, train_size=0.8, shuffle=True, random_state=35
)

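### Label Encoding

The `Attack` column is converted to integer class labels in the data preparation cell above, because XGBoost requires numerical labels.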

### Hyperparameter Search

In [10]:
# Randomized hyperparameter search for XGBoost, cached on disk: the search is
# only run when no pickled result exists yet, otherwise the pickle is loaded.
file_name = '../search/params_xgboost.pkl'

if not os.path.isfile(file_name):
    # Create the cache directory *before* the long-running search so the cell
    # cannot fail with FileNotFoundError when the result is finally written.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    # Candidate values per hyperparameter (3 values each, 3**5 = 243 combinations).
    parameters = {
        'n_estimators':[50, 75, 100],
        'eta': [0.5, 0.7, 0.9],
        'min_child_weight':[1, 2, 3],
        'gamma':[0, 0.5, 1],
        'max_depth':[4, 5, 6]
    }

    model = XGBClassifier(random_state=35, booster="gbtree", tree_method="exact")

    # Stratified 10-fold CV keeps the class proportions in every fold.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=35)

    # n_iter=10 sampled candidates x 10 folds = 100 fits, scored by accuracy.
    search = RandomizedSearchCV(model, parameters, scoring="accuracy", n_iter=10, n_jobs=6, cv=skf, verbose=10,
                                random_state=35)
    search.fit(X_train, y_train) #computation time: 3h

    with open(file_name, 'wb') as file:
        pickle.dump(search, file)
else:
    # NOTE: pickle.load can execute arbitrary code -- only load cache files
    # that were produced by this notebook, never files from untrusted sources.
    with open(file_name, 'rb') as file:
        search = pickle.load(file)

print(f"Best parameters for xgboost: {search.best_params_}")

Best parameters for xgboost: {'n_estimators': 75, 'min_child_weight': 1, 'max_depth': 6, 'gamma': 0.5, 'eta': 0.7}


### Training

In [20]:
# Baseline: XGBoost with default hyperparameters, cached on disk. The model is
# trained only when no pickle exists yet, otherwise the pickle is loaded.
file_name = '../model/xgboost_default.pkl'

if not os.path.isfile(file_name):
    # Create the output directory before training so that writing the pickle
    # afterwards cannot fail with FileNotFoundError.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    model = XGBClassifier(random_state=35, booster="gbtree", tree_method="exact")
    model = model.fit(X_train, y_train)
    with open(file_name, 'wb') as file:
        pickle.dump(model, file)
else:
    # NOTE: pickle.load can execute arbitrary code -- only load model files
    # that were produced by this notebook, never files from untrusted sources.
    with open(file_name, 'rb') as file:
        model = pickle.load(file)

# Evaluate on the held-out test split.
print(f"Mean accuracy of default xgboost classifier: {model.score(X_test, y_test):.3f}.")

Mean accuracy of default xgboost classifier: 0.952.


In [22]:
# XGBoost trained with the best hyperparameters found by the randomized
# search (`search.best_params_`), cached on disk. The model is trained only
# when no pickle exists yet, otherwise the pickle is loaded.
file_name = '../model/xgboost.pkl'

if not os.path.isfile(file_name):
    # Create the output directory before training so that writing the pickle
    # afterwards cannot fail with FileNotFoundError.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    hyperparameters = search.best_params_
    model = XGBClassifier(random_state=35, booster="gbtree", tree_method="exact", **hyperparameters)
    model = model.fit(X_train, y_train)
    with open(file_name, 'wb') as file:
        pickle.dump(model, file)
else:
    # NOTE: pickle.load can execute arbitrary code -- only load model files
    # that were produced by this notebook, never files from untrusted sources.
    with open(file_name, 'rb') as file:
        model = pickle.load(file)

# Evaluate on the held-out test split.
print(f"Mean accuracy of optimized xgboost classifier: {model.score(X_test, y_test):.3f}.")

Mean accuracy of optimized xgboost classifier: 0.956.
