In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.experimental import enable_halving_search_cv
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, HalvingGridSearchCV
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn import set_config
set_config(enable_metadata_routing=True)
RANDOM_STATE = 124

from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier

from time import monotonic

from utils.data_processing import load_data, raw_columns, full_dtypes

In [2]:
# --- Load the raw dataset and split it into features / target ---------------
data_path = Path("./data")
input_data_path = data_path / "cybersecurity_attacks.csv"

# Keep only the dtype declarations that belong to the raw columns.
dtypes = {
    name: declared
    for name, declared in full_dtypes.items()
    if name in raw_columns
}

raw_data = load_data(input_data_path)

# Categorical columns are converted to plain strings in place.
cat_cols = raw_data.select_dtypes(include=["category"]).columns
raw_data[cat_cols] = raw_data[cat_cols].astype("str")

# Target is the "Attack Type" column; everything else is a feature.
Y = raw_data["Attack Type"]
X_dataset = raw_data.drop(columns=["Attack Type"]).fillna(np.nan)

In [9]:
# Split the feature columns by dtype: numeric, string-like, and the rest.
num_cols = list(X_dataset.select_dtypes(include=["number"]).columns)
cat_cols = list(X_dataset.select_dtypes(include=["object", "str"]).columns)
# NOTE(review): pass_cols is not referenced by the ColumnTransformer built in
# this notebook, so these columns are not forwarded to the model — confirm
# that dropping them is intended.
pass_cols = [
    name
    for name in X_dataset.columns
    if not (name in cat_cols or name in num_cols)
]

# Learn the full category vocabulary once, on the complete feature frame, so
# that the encoder used inside the pipeline can be given fixed `categories`.
_tmp = SimpleImputer(
    strategy="constant",
    fill_value="unknown",
).fit_transform(X_dataset[cat_cols])
_tmp_ohe = OneHotEncoder(
    drop="first",
    handle_unknown="ignore",
).fit(_tmp)
all_categories = _tmp_ohe.categories_

# Hyper-parameter grid for the search; keys follow the `<step>__<param>`
# naming of the pipeline steps ("xgbclassifier", "pca", "classifier").
param_grid = dict(
    xgbclassifier__n_features=[500, 800, 1000],   # columns kept by the selector
    pca__n_components=[0.95, 0.98, 0.99],         # PCA n_components candidates
    classifier__n_estimators=[100, 200, 300],     # trees in the final forest
)

# Numeric branch: mean-impute missing values, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical branch: replace missing values with the literal "unknown", then
# one-hot encode against the precomputed category vocabulary (sparse output).
cat_transformer = Pipeline(steps=[
    (
        "imputer",
        SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="unknown"),
    ),
    (
        "encoder",
        OneHotEncoder(
            drop="first",
            handle_unknown="ignore",
            sparse_output=True,
            categories=all_categories,
        ),
    ),
])

# Route each column group to its branch; categorical block first, numeric second.
preprocessor = ColumnTransformer(transformers=[
    ("cat", cat_transformer, cat_cols),
    ("num", numeric_transformer, num_cols),
])

class XGBFeatureSelector(BaseEstimator, TransformerMixin):
    """Keep the columns an XGBoost classifier ranks as most important.

    ``fit`` trains an ``XGBClassifier`` with the configured hyper-parameters
    and remembers the indices of the ``n_features`` columns with the highest
    ``feature_importances_`` (or all columns when fewer are available).
    ``transform`` slices those columns out of ``X``.

    Parameters
    ----------
    n_features : int
        Maximum number of columns to keep.
    n_estimators, learning_rate, max_depth, tree_method, random_state
        Forwarded unchanged to ``XGBClassifier``.

    Attributes
    ----------
    selected_idx_ : ndarray
        Column indices kept, ordered from most to least important.
    """

    def __init__(self, n_features=800, n_estimators=100, learning_rate=0.05, max_depth=10, tree_method="hist", random_state=RANDOM_STATE):
        # Booster settings, stored verbatim under the constructor argument names.
        self.random_state = random_state
        self.tree_method = tree_method
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        # Selection budget.
        self.n_features = n_features

    def fit(self, X, y):
        """Fit the ranking model on ``(X, y)`` and record the top column indices."""
        ranker = XGBClassifier(
            n_estimators=self.n_estimators,
            learning_rate=self.learning_rate,
            max_depth=self.max_depth,
            tree_method=self.tree_method,
            random_state=self.random_state,
        )
        ranker.fit(X, y)

        # Never ask for more columns than X actually has.
        keep = min(self.n_features, X.shape[1])
        by_importance_desc = np.argsort(ranker.feature_importances_)[::-1]
        self.selected_idx_ = by_importance_desc[:keep]
        return self

    def transform(self, X):
        """Return the selected columns; sparse results become dense float32."""
        subset = X[:, self.selected_idx_]
        if not hasattr(subset, "toarray"):
            return subset
        return subset.toarray().astype(np.float32)

def to_dense(X):
    """Return ``X`` as float32, densifying first when it exposes ``toarray``."""
    dense = X.toarray() if hasattr(X, "toarray") else X
    return dense.astype(np.float32)

class DebugTransformer(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that prints a one-line summary of its input.

    Parameters
    ----------
    name : str
        Tag printed in square brackets at the start of the summary line.
    """

    def __init__(self, name=""):
        self.name = name

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Print shape, sparsity and dtype of ``X``, then return it unchanged."""
        # Anything exposing ``toarray`` is reported as sparse.
        is_sparse = hasattr(X, "toarray")
        summary = f"[{self.name}] shape={X.shape}, sparse={is_sparse}, dtype={X.dtype}"
        print(summary)
        return X


class TimeTransformer(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that prints the elapsed time on each transform.

    The elapsed time is measured against the module-level ``start_time``
    variable (a ``time.monotonic()`` reading), which must be assigned before
    the first ``transform`` call; otherwise ``transform`` raises ``NameError``.

    Parameters
    ----------
    name : str
        Optional tag identifying where in the pipeline this step sits. When
        non-empty it is included in the printed line as ``[name]``, using the
        same bracket convention as ``DebugTransformer``.
    """

    def __init__(self, name=""):
        self.name = name

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Print seconds elapsed since ``start_time`` and return ``X`` unchanged."""
        # ``start_time`` is deliberately read as a global at call time so the
        # step reports time relative to whenever the caller started the clock.
        elapsed = monotonic() - start_time
        # Previously ``name`` was stored but never shown, so lines from
        # different steps could not be told apart. An empty name keeps the
        # original message format byte-for-byte.
        tag = f"[{self.name}] " if self.name else ""
        print(f"{elapsed} {tag}Starting transformation")
        return X


# End-to-end modelling pipeline. Step names are load-bearing: the search's
# parameter grid and its `resource` setting address steps by these names.
# The commented-out entries are optional debugging hooks.
pipeline = ImbPipeline(steps=[
    # Prints elapsed time whenever data enters the pipeline.
    ("debug0", TimeTransformer(name="Before Preprocessing")),
    # Impute / scale numeric columns, impute / one-hot encode categorical ones.
    ("preprocessor", preprocessor),
    # ("debug1", DebugTransformer(name="After Preprocessing")),
    # Importance-based column selection (not a classifier, despite the step name).
    ("xgbclassifier", XGBFeatureSelector()),
    # ("debug2", DebugTransformer(name="After XGB Classifier")),
    # ("densifier", FunctionTransformer(to_dense)),
    # ("debug3", DebugTransformer(name="After Densifier")),
    # Class re-balancing.
    ("syntheticdata", SMOTETomek(random_state=RANDOM_STATE)),
    # ("debug4", DebugTransformer(name="After SMOTETomek")),
    # Dimensionality reduction; n_components is supplied by the parameter grid.
    ("pca", PCA(svd_solver="full")),
    # Final estimator.
    (
        "classifier",
        RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE),
    ),
])

# Encode the target as integers; Yencoder.classes_ keeps the original names,
# indexed by their encoded value.
Yencoder = LabelEncoder()
Yencoded = Yencoder.fit_transform(Y)

# Stratified 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X_dataset,
    Yencoded,
    test_size=0.2,
    stratify=Yencoded,
    random_state=RANDOM_STATE,
)

# Successive-halving grid search. The budget that grows between rounds is the
# feature selector's `n_estimators`, from 100 up to at most 5000.
search = HalvingGridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    factor=3,
    n_jobs=6,
    resource="xgbclassifier__n_estimators",
    min_resources=100,
    max_resources=5000,
    random_state=RANDOM_STATE,
)

# `start_time` is also read as a global by TimeTransformer.transform, so it
# must be assigned before fitting starts.
start_time = monotonic()
search.fit(X_train, y_train)
print("Time taken to fit the model: %.2f seconds" % (monotonic() - start_time))
print("Model score: %.3f" % search.score(X_test, y_test))

# Take the class names from the fitted encoder instead of hard-coding them, so
# each name is guaranteed to line up with its integer code.
labels = [str(class_name) for class_name in Yencoder.classes_]
y_pred = search.predict(X_test)
# classification_report expects (y_true, y_pred). The arguments used to be
# passed the other way round, which swapped precision with recall and
# computed support from the predictions.
print(classification_report(y_test, y_pred, target_names=labels))
print("Best parameters: ", search.best_params_)

2.519614552999883 Starting transformation
2.8568527709999216 Starting transformation
3.1367116219998934 Starting transformation
3.3412819999998646 Starting transformation
3.648516806999851 Starting transformation
3.942180014999849 Starting transformation
129.0911236490001 Starting transformation
131.6338810430002 Starting transformation
133.79839264600014 Starting transformation
141.8090180600002 Starting transformation
144.80258211399996 Starting transformation
147.982774737 Starting transformation
151.04264689699994 Starting transformation
153.93981917400015 Starting transformation
156.05417521200025 Starting transformation
174.5648307800002 Starting transformation
177.41285894300017 Starting transformation
179.7568572780001 Starting transformation
208.7886488600002 Starting transformation
211.21189041800017 Starting transformation
213.8293256920001 Starting transformation
215.73937395200028 Starting transformation
218.20684885799983 Starting transformation
220.49343848099988 Startin



716.3555096330001 Starting transformation
763.3076561490002 Starting transformation
765.669410861 Starting transformation
767.696000591 Starting transformation
767.892301934 Starting transformation
770.1479768889999 Starting transformation
774.6248297360003 Starting transformation
827.7601284530001 Starting transformation
831.2919494890002 Starting transformation
838.3414600260003 Starting transformation
859.6016946610002 Starting transformation
862.2200434780002 Starting transformation
868.8951738989999 Starting transformation
873.7164243299999 Starting transformation
876.5045370170001 Starting transformation
878.3813250630001 Starting transformation
908.4539127110002 Starting transformation
912.465156966 Starting transformation
915.4428068699999 Starting transformation
954.123139537 Starting transformation
956.871364049 Starting transformation
964.962792029 Starting transformation
1127.5952869540001 Starting transformation
1130.9611977970003 Starting transformation
1134.6731241490002



2157.6963533209996 Starting transformation
2161.55918737 Starting transformation
2185.9218017850003 Starting transformation
2189.7539994960002 Starting transformation
2193.555756507 Starting transformation
2524.3477924310005 Starting transformation
2526.737784001 Starting transformation
2528.935501306 Starting transformation
2549.7904752500003 Starting transformation
2551.825405735 Starting transformation
2553.2884281600004 Starting transformation
2553.849695841 Starting transformation
2555.3409656480003 Starting transformation
2556.5239320530004 Starting transformation
2557.4536344040002 Starting transformation
2558.8579936359997 Starting transformation
2592.315212881 Starting transformation
2593.6321974039997 Starting transformation
2595.8897550449997 Starting transformation
2597.0555065330004 Starting transformation
2729.577069015 Starting transformation
2730.8003402850004 Starting transformation
2747.106359764 Starting transformation
2748.264666335 Starting transformation
2754.7730