In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.experimental import enable_halving_search_cv
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, HalvingGridSearchCV
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn import set_config
# Switch on scikit-learn's metadata-routing mode through its global configuration.
set_config(enable_metadata_routing=True)
# Shared seed: passed as random_state to the feature selector, the resampler,
# the random forest, the train/test split and the halving search further down.
RANDOM_STATE = 124

from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier

from time import monotonic

from utils.data_processing import load_data, raw_columns, full_dtypes

In [2]:
# Locations of the data directory and of the raw CSV file.
data_path = Path("./data")
input_data_path = Path("./data/cybersecurity_attacks.csv")

# Restrict the declared dtypes to the columns listed in raw_columns.
# NOTE(review): neither `dtypes` nor `data_path` is used again in the visible cells.
dtypes = {
    name: declared
    for name, declared in full_dtypes.items()
    if name in raw_columns
}

raw_data = load_data(input_data_path)

# Re-type every pandas `category` column as str, in place on raw_data.
cat_cols = raw_data.select_dtypes(include=["category"]).columns
raw_data[cat_cols] = raw_data[cat_cols].astype("str")

# Target is the "Attack Type" column; every other column is a feature.
Y = raw_data["Attack Type"]
# fillna(np.nan) rewrites missing entries as np.nan — presumably so the
# imputers downstream see a single missing-value marker (confirm).
X_dataset = raw_data.drop(columns=["Attack Type"]).fillna(np.nan)

In [3]:
# Partition the feature columns by dtype: numeric, string-like, and the rest.
num_cols = X_dataset.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X_dataset.select_dtypes(include=["object","str"]).columns.tolist()
pass_cols = [
    col
    for col in X_dataset.columns
    if not (col in cat_cols or col in num_cols)
]
# NOTE(review): pass_cols is not referenced again in the visible cells; the
# ColumnTransformer below only lists cat_cols and num_cols.

# Learn the one-hot category vocabulary once, on the full feature table, after
# the same constant ("unknown") imputation the categorical branch applies.
# NOTE(review): this runs before the train/test split, so the vocabulary also
# covers rows that later land in the test set — confirm this is intended.
_tmp = SimpleImputer(strategy="constant", fill_value="unknown").fit_transform(
    X_dataset[cat_cols]
)
_tmp_ohe = OneHotEncoder(drop="first", handle_unknown="ignore")
_tmp_ohe.fit(_tmp)
all_categories = _tmp_ohe.categories_

# Numeric branch: mean imputation followed by standardisation.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical branch: constant imputation, then sparse one-hot encoding against
# the fixed vocabulary computed above.
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="unknown")),
    ("encoder", OneHotEncoder(categories=all_categories, drop="first", handle_unknown="ignore", sparse_output=True)),
])

# Apply each branch to its own column group.
preprocessor = ColumnTransformer([
    ("cat", cat_transformer, cat_cols),
    ("num", numeric_transformer, num_cols),
])

# Hyper-parameter candidates, keyed by "<pipeline step>__<parameter>".
param_grid = {
    "pca__n_components": [0.95, 0.98, 0.99],
    "classifier__n_estimators": [100, 200, 300],
    "classifier__max_depth": [None, 10, 30],
}

class XGBFeatureSelector(BaseEstimator, TransformerMixin):
    """Keep the columns an XGBoost classifier ranks as most important.

    ``fit`` trains an ``XGBClassifier`` on ``(X, y)`` and stores the indices of
    the top ``n_features`` columns by ``feature_importances_``; ``transform``
    returns only those columns, ordered from most to least important.

    Parameters
    ----------
    n_features : int, default=800
        Maximum number of columns to keep (capped at ``X.shape[1]`` in ``fit``).
    n_estimators, learning_rate, max_depth, tree_method, random_state
        Passed through unchanged to ``XGBClassifier``.

    Attributes
    ----------
    selected_idx_ : ndarray
        Column indices chosen by ``fit``, highest importance first.
    """

    def __init__(self, n_features=800, n_estimators=100, learning_rate=0.05, max_depth=10, tree_method="hist", random_state=RANDOM_STATE):
        self.n_features = n_features
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.tree_method = tree_method
        self.random_state = random_state

    def fit(self, X, y):
        """Train the ranking model on ``(X, y)`` and record the columns to keep."""
        booster_params = dict(
            n_estimators=self.n_estimators,
            learning_rate=self.learning_rate,
            max_depth=self.max_depth,
            tree_method=self.tree_method,
            random_state=self.random_state,
        )
        ranker = XGBClassifier(**booster_params)
        ranker.fit(X, y)

        # argsort is ascending, so reverse it to put the largest importance first.
        ranked = np.argsort(ranker.feature_importances_)[::-1]
        keep = min(self.n_features, X.shape[1])
        self.selected_idx_ = ranked[:keep]
        return self

    def transform(self, X):
        """Return the selected columns of ``X``; sparse results become dense float32."""
        picked = X[:, self.selected_idx_]
        if not hasattr(picked, "toarray"):
            return picked
        return picked.toarray().astype(np.float32)

def to_dense(X):
    """Return ``X`` as float32, calling ``toarray()`` first when ``X`` provides it."""
    dense = X.toarray() if hasattr(X, "toarray") else X
    return dense.astype(np.float32)

class DebugTransformer(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that prints a one-line summary of its input.

    ``transform`` prints the step label, the input's shape, whether it exposes
    ``toarray`` (reported as ``sparse``), and its dtype, then returns the input
    unchanged. ``fit`` learns nothing.

    Parameters
    ----------
    name : str, default=""
        Label printed in square brackets at the start of each line.
    """

    def __init__(self, name=""):
        self.name = name

    def fit(self, X, y=None):
        """No-op fit; returns ``self``."""
        return self

    def transform(self, X):
        """Print a summary of ``X`` and return ``X`` untouched."""
        sparse = hasattr(X, "toarray")
        # A debugging aid must never break the pipeline: read the attributes
        # defensively instead of assuming an ndarray / sparse-matrix input.
        shape = getattr(X, "shape", None)
        dtype = getattr(X, "dtype", None)
        if dtype is None and hasattr(X, "dtypes"):
            # pandas DataFrames expose per-column `dtypes` rather than a single
            # `dtype`; report the distinct column dtypes instead of raising.
            dtype = ",".join(sorted({str(t) for t in X.dtypes}))
        print(f"[{self.name}] shape={shape}, sparse={sparse}, dtype={dtype}")
        return X

# End-to-end pipeline: preprocessing -> XGBoost-based column selection ->
# densify -> SMOTETomek resampling -> PCA -> random forest, with a
# DebugTransformer printing the data summary after each of the early stages.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("debug1", DebugTransformer(name="After Preprocessing")),
    ("xgbclassifier", XGBFeatureSelector()),
    ("debug2", DebugTransformer(name="After XGB Classifier")),
    ("densifier", FunctionTransformer(to_dense)),
    ("debug3", DebugTransformer(name="After Densifier")),
    ("syntheticdata", SMOTETomek(random_state=RANDOM_STATE)),
    ("debug4", DebugTransformer(name="After SMOTETomek")),
    ("pca", PCA(n_components=0.98, svd_solver="full")),
    ("classifier", RandomForestClassifier(random_state=RANDOM_STATE)),
]
pipeline = ImbPipeline(steps=pipeline_steps)

# Encode the target labels as integers.
Yencorder = LabelEncoder()
Yencoded = Yencorder.fit_transform(Y)

# Stratified 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X_dataset,
    Yencoded,
    test_size=0.2,
    stratify=Yencoded,
    random_state=RANDOM_STATE,
)

# Successive-halving grid search over param_grid. The budgeted resource is the
# n_estimators of the "xgbclassifier" step, i.e. the XGBFeatureSelector.
search = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    resource="xgbclassifier__n_estimators",
    min_resources=100,
    max_resources=5000,
    factor=3,
    cv=5,
    n_jobs=6,
    random_state=RANDOM_STATE,
)

# Time the search, then report the hold-out score and the winning parameters.
start_time = monotonic()
search.fit(X_train, y_train)
elapsed = monotonic() - start_time
print("Time taken to fit the model: %.2f seconds" % elapsed)
print("Model score: %.3f" % search.score(X_test, y_test))
print("Best parameters: ", search.best_params_)

[After Preprocessing] shape=(25600, 253375), sparse=True, dtype=float64
[After Preprocessing] shape=(25600, 253375), sparse=True, dtype=float64
[After Preprocessing] shape=(25600, 253375), sparse=True, dtype=float64
[After Preprocessing] shape=(25600, 253375), sparse=True, dtype=float64
[After Preprocessing] shape=(25600, 253375), sparse=True, dtype=float64
[After Preprocessing] shape=(25600, 253375), sparse=True, dtype=float64
[After XGB Classifier] shape=(25600, 800), sparse=False, dtype=float32
[After Densifier] shape=(25600, 800), sparse=False, dtype=float32
[After XGB Classifier] shape=(25600, 800), sparse=False, dtype=float32
[After Densifier] shape=(25600, 800), sparse=False, dtype=float32
[After XGB Classifier] shape=(25600, 800), sparse=False, dtype=float32
[After Densifier] shape=(25600, 800), sparse=False, dtype=float32
[After XGB Classifier] shape=(25600, 800), sparse=False, dtype=float32
[After Densifier] shape=(25600, 800), sparse=False, dtype=float32
[After SMOTETomek] s



[After Preprocessing] shape=(6400, 253375), sparse=True, dtype=float64
[After XGB Classifier] shape=(6400, 800), sparse=False, dtype=float32
[After Densifier] shape=(6400, 800), sparse=False, dtype=float32
[After SMOTETomek] shape=(6400, 800), sparse=False, dtype=float32
[After Preprocessing] shape=(25600, 253375), sparse=True, dtype=float64
[After Preprocessing] shape=(25600, 253375), sparse=True, dtype=float64
[After XGB Classifier] shape=(25600, 800), sparse=False, dtype=float32
[After Densifier] shape=(25600, 800), sparse=False, dtype=float32
[After SMOTETomek] shape=(25600, 800), sparse=False, dtype=float32
[After XGB Classifier] shape=(25600, 800), sparse=False, dtype=float32
[After Densifier] shape=(25600, 800), sparse=False, dtype=float32
[After Preprocessing] shape=(25600, 253375), sparse=True, dtype=float64
[After SMOTETomek] shape=(18146, 800), sparse=False, dtype=float32
[After XGB Classifier] shape=(25600, 800), sparse=False, dtype=float32
[After Densifier] shape=(25600, 8



[After Preprocessing] shape=(25600, 253375), sparse=True, dtype=float64
[After XGB Classifier] shape=(25600, 800), sparse=False, dtype=float32
[After Densifier] shape=(25600, 800), sparse=False, dtype=float32
[After XGB Classifier] shape=(25600, 800), sparse=False, dtype=float32
[After Densifier] shape=(25600, 800), sparse=False, dtype=float32
[After XGB Classifier] shape=(25600, 800), sparse=False, dtype=float32
[After Densifier] shape=(25600, 800), sparse=False, dtype=float32
[After SMOTETomek] shape=(18202, 800), sparse=False, dtype=float32
[After SMOTETomek] shape=(18251, 800), sparse=False, dtype=float32
[After SMOTETomek] shape=(18305, 800), sparse=False, dtype=float32
[After XGB Classifier] shape=(25600, 800), sparse=False, dtype=float32
[After Densifier] shape=(25600, 800), sparse=False, dtype=float32
[After Preprocessing] shape=(6400, 253375), sparse=True, dtype=float64
[After XGB Classifier] shape=(6400, 800), sparse=False, dtype=float32
[After Densifier] shape=(6400, 800), s



[After Preprocessing] shape=(25600, 253375), sparse=True, dtype=float64
[After Preprocessing] shape=(6400, 253375), sparse=True, dtype=float64
[After XGB Classifier] shape=(6400, 800), sparse=False, dtype=float32
[After Densifier] shape=(6400, 800), sparse=False, dtype=float32
[After SMOTETomek] shape=(6400, 800), sparse=False, dtype=float32
[After Preprocessing] shape=(25600, 253375), sparse=True, dtype=float64
[After XGB Classifier] shape=(25600, 800), sparse=False, dtype=float32
[After Densifier] shape=(25600, 800), sparse=False, dtype=float32
[After SMOTETomek] shape=(25600, 800), sparse=False, dtype=float32
[After SMOTETomek] shape=(18260, 800), sparse=False, dtype=float32
[After SMOTETomek] shape=(18546, 800), sparse=False, dtype=float32
[After Preprocessing] shape=(6400, 253375), sparse=True, dtype=float64
[After XGB Classifier] shape=(6400, 800), sparse=False, dtype=float32
[After Densifier] shape=(6400, 800), sparse=False, dtype=float32
[After SMOTETomek] shape=(6400, 800), sp