In [1]:
from pathlib import Path
from time import monotonic

import numpy as np
import pandas as pd

# The experimental flag must be imported before HalvingGridSearchCV.
from sklearn.experimental import enable_halving_search_cv
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, HalvingGridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.utils.validation import check_is_fitted
set_config(enable_metadata_routing=True)

from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier

from utils.data_processing import load_data, transform_ipinfo, transform_datetime, transform_proxyinfo, df_ua_parser, transform_packetinfo, full_dtypes, raw_columns

In [2]:
# Global seed shared by every stochastic step of the notebook.
RANDOM_STATE = 124

# Input / cache locations, all relative to the local data directory.
data_path = Path("./data")
processed_csv = data_path / "first_ml_processing.csv"
raw_csv = data_path / "cybersecurity_attacks.csv"

# Names of the columns produced by transform_ipinfo (used by both branches).
ip_cols = ["Int Source IP", "Int Destination IP", "Global Source IP", "Global Destination IP"]

if not processed_csv.exists():
    # No cache yet: read the raw CSV through the project loader, restricted to
    # the dtypes of the columns listed in raw_columns.
    dtypes = {col: col_type for col, col_type in full_dtypes.items() if col in raw_columns}
    raw_data = load_data(raw_csv, dtype=dtypes)

    # Each helper returns the engineered columns named in the matching list.
    datetime_cols = ["Year", "Month", "Day", "Hour", "Minute", "Second", "DayOfWeek", "IsWeekend"]
    raw_data[datetime_cols] = transform_datetime(raw_data["Timestamp"])

    device_cols = [
        "String", "Browser Name", "Browser Version", "Browser Minor", "Browser Patch",
        "Browser Patch Minor", "OS Name", "OS Version", "OS Version Minor",
        "OS Version Patch", "OS Version Patch Minor", "Device Brand", "Device Model",
        "Device Type",
    ]
    raw_data[device_cols] = df_ua_parser(raw_data["Device Information"])

    proxy_cols = ["Is Proxy"]
    raw_data[proxy_cols] = transform_proxyinfo(raw_data["Proxy Information"])

    raw_data[ip_cols] = transform_ipinfo(raw_data[["Source IP Address", "Destination IP Address"]])

    packet_cols = ["Packet Bin"]
    raw_data[packet_cols] = transform_packetinfo(raw_data["Packet Length"], scale=False)

    # Persist the engineered frame without the source columns it was derived from.
    source_columns = [
        "Payload Data", "Timestamp", "String", "Device Information",
        "Proxy Information", "Source IP Address", "Destination IP Address",
    ]
    processed_data = raw_data.drop(columns=source_columns)
    processed_data.to_csv(processed_csv, index=False)
else:
    # Cache hit: reuse the processed CSV and re-read the raw CSV with pandas defaults.
    # NOTE(review): in this branch raw_data only gains the IP columns, whereas the
    # branch above also adds datetime/device/proxy/packet columns. The next cell
    # builds X_orig from raw_data, so the feature set depends on whether the cache
    # file exists — confirm which variant is intended.
    processed_data = pd.read_csv(processed_csv)
    raw_data = pd.read_csv(raw_csv)
    raw_data[ip_cols] = transform_ipinfo(raw_data[["Source IP Address", "Destination IP Address"]])


In [3]:
# Encode the target once as a categorical so that the integer codes and the
# human-readable label names are guaranteed to come from the same ordering.
attack_type = raw_data["Attack Type"].astype("category")

# Feature matrix: every column except the target.
X_orig = raw_data.drop(columns=["Attack Type"])
# Integer class codes (position of each value in attack_type.cat.categories).
y_orig = attack_type.cat.codes
# Raw string target; later cells overwrite y_test with a proper hold-out split.
y_test = raw_data["Attack Type"]
# Class names in code order, used as target_names in classification reports.
# Derived rather than hard-coded so they cannot drift from the codes above.
labels = attack_type.cat.categories.astype(str).tolist()

# Model found by Merouane

## Pipeline Definition

In [None]:
# Partition the feature columns by dtype: numeric ones are scaled,
# object/string ones are one-hot encoded.
num_cols = X_orig.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X_orig.select_dtypes(include=["object", "str"]).columns.tolist()

# NOTE(review): the preprocessing, the feature ranking and the resampling below
# are all fitted on the full dataset, before any train/test split is made in the
# following cell — the held-out scores are therefore likely optimistic/biased.
preproc_orig = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])
X_proc = preproc_orig.fit_transform(X_orig)

# Rank the encoded features with a gradient-boosted model.
xgb_fs = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=10,
    tree_method="hist",
    random_state=RANDOM_STATE,
)
xgb_fs.fit(X_proc, y_orig)
importances_orig = xgb_fs.feature_importances_

# Keep the (at most) 800 highest-importance columns, most important first.
top_k = min(800, X_proc.shape[1])
idx_orig = np.argsort(importances_orig)[::-1][:top_k]

X_sel = X_proc[:, idx_orig]
# Densify when the selection is a sparse matrix.
X_sel = X_sel.toarray() if hasattr(X_sel, "toarray") else X_sel
y_sel = y_orig.copy()

# Rebalance the classes with SMOTE over-sampling followed by Tomek-link cleaning.
sm = SMOTETomek(random_state=RANDOM_STATE)
X_sel, y_sel = sm.fit_resample(X_sel, y_sel)
y_sel.shape


After the SMOTETomek processing, around 28000 observations are created. This must be compared to the initial 400000. Keep this in mind when we give SMOTETomek a specific split.

In [5]:
# Reduce dimensionality, keeping as many components as needed to explain 98%
# of the variance. The result is stored under a new name (X_pca) instead of
# overwriting X_sel, so re-running this cell does not apply PCA a second time
# to already-projected data.
X_pca = PCA(n_components=0.98, svd_solver="full").fit_transform(X_sel)

# Stratified 80/20 hold-out split of the resampled, projected data.
# NOTE(review): the split happens after resampling and PCA were fitted on all
# rows, so the test fold is not independent of the fitted transformers.
X_tr, X_te, y_tr, y_te = train_test_split(
    X_pca, y_sel, test_size=0.2, random_state=RANDOM_STATE, stratify=y_sel
)

## Model Fitting & Result

In [6]:
# Train a 300-tree random forest on the training fold (fit returns the estimator).
model = RandomForestClassifier(
    n_estimators=300,
    random_state=RANDOM_STATE,
).fit(X_tr, y_tr)

# Score the hold-out fold: overall accuracy, then the per-class breakdown.
pred = model.predict(X_te)
accuracy = accuracy_score(y_true=y_te, y_pred=pred)
print(f"Final Accuracy: {accuracy:.4f}")
print(classification_report(y_true=y_te, y_pred=pred, target_names=labels))

Final Accuracy: 0.4497
              precision    recall  f1-score   support

        DDoS       0.44      0.43      0.44      1813
   Intrusion       0.46      0.46      0.46      1827
     Malware       0.45      0.46      0.45      1835

    accuracy                           0.45      5475
   macro avg       0.45      0.45      0.45      5475
weighted avg       0.45      0.45      0.45      5475



# Model using all features

There is no feature reduction using the XGBClassifier before the SMOTETomek step.

## Pipeline Definition

In [7]:
# Same preprocessing as the previous model: scale numeric columns and one-hot
# encode the object/string ones (num_cols / cat_cols come from the earlier cell).
preproc_orig = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ]
)
X_proc = preproc_orig.fit_transform(X_orig)

# No XGBoost-based feature selection here: every encoded column is kept.
X_sel = X_proc.copy()
y_sel = y_orig.copy()

# Rebalance the classes, then project onto 200 principal components.
sm = SMOTETomek(random_state=RANDOM_STATE)
X_sel, y_sel = sm.fit_resample(X_sel, y_sel)
# random_state is set because svd_solver="auto" may pick a stochastic solver;
# without a seed, repeated runs of this cell can produce different projections.
X_sel = PCA(
    n_components=200, svd_solver="auto", random_state=RANDOM_STATE
).fit_transform(X_sel)

# Stratified 80/20 hold-out split.
# NOTE(review): resampling and PCA are fitted before the split, so the test
# fold is not independent of them.
X_tr, X_te, y_tr, y_te = train_test_split(
    X_sel, y_sel, test_size=0.2, random_state=RANDOM_STATE, stratify=y_sel
)

# Fit the random forest and time the training step only.
model = RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE)
start_time = monotonic()
model.fit(X_tr, y_tr)
print(f"Training time: {monotonic() - start_time:.2f} seconds")

# Hold-out evaluation: overall accuracy and per-class report.
pred = model.predict(X_te)
accuracy = accuracy_score(y_te, pred)
print(f"Final Accuracy: {accuracy:.4f}")
print(classification_report(y_te, pred, target_names=labels))

Training time: 113.32 seconds
Final Accuracy: 0.3451
              precision    recall  f1-score   support

        DDoS       0.34      0.33      0.34      2074
   Intrusion       0.35      0.36      0.36      2094
     Malware       0.35      0.34      0.34      2091

    accuracy                           0.35      6259
   macro avg       0.35      0.35      0.34      6259
weighted avg       0.35      0.35      0.34      6259



Need to use xgbclassifier to improve the model

# Search of Optimal Hyper-Parameters for the steps

## Pipeline Selection

In [8]:
# Custom transformer for XGBoost feature selection
class XGBFeatureSelector(BaseEstimator, TransformerMixin):
    """Keep the columns an XGBoost classifier ranks as most important.

    Parameters
    ----------
    n_features : int, default=800
        Maximum number of columns to keep. If the input has fewer columns,
        all of them are kept (ordered by decreasing importance).
    random_state : int, default=124
        Seed forwarded to the internal ``XGBClassifier``.

    Attributes
    ----------
    feature_indices_ : ndarray of int
        Column indices selected by ``fit``, most important first. Only
        available after fitting.
    """

    def __init__(self, n_features=800, random_state=124):
        # Only hyper-parameters are stored here; fitted state (trailing
        # underscore) is created in fit so an unfitted instance is detectable.
        self.n_features = n_features
        self.random_state = random_state

    def fit(self, X, y):
        """Train the ranking model and record the indices of the top columns.

        Parameters
        ----------
        X : 2-D array-like or sparse matrix supporting ``X.shape`` and
            ``X[:, indices]`` column indexing.
        y : array-like of class labels accepted by ``XGBClassifier.fit``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``n_features`` is smaller than 1.
        """
        if self.n_features < 1:
            raise ValueError(
                f"n_features must be a positive integer, got {self.n_features!r}"
            )

        # Train XGBoost to get feature importances
        xgb = XGBClassifier(
            n_estimators=200,
            learning_rate=0.05,
            max_depth=10,
            random_state=self.random_state,
            tree_method="hist"
        )
        xgb.fit(X, y)

        # Select top N features (never more than the number of input columns)
        importances = xgb.feature_importances_
        n_keep = min(self.n_features, X.shape[1])
        self.feature_indices_ = np.argsort(importances)[::-1][:n_keep]
        return self

    def transform(self, X):
        """Return ``X`` restricted to the columns selected during ``fit``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        # Fail loudly instead of indexing with a missing/None selection.
        check_is_fitted(self, "feature_indices_")
        return X[:, self.feature_indices_]

    def set_fit_request(self, *, y: bool = True):
        """No-op kept for interface compatibility; returns ``self`` unchanged."""
        return self


# Hold out 20% of the raw rows *before* any fitting, so every pipeline step
# below is learned from the training fold only.
X_train, X_test, y_train, y_test = train_test_split(
    X_orig, y_orig, test_size=0.2, random_state=RANDOM_STATE, stratify=y_orig
)


# Scale numeric columns, one-hot encode object/string ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ]
)

# imblearn's Pipeline is used so the resampler is applied during fit only.
pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('feature_selector', XGBFeatureSelector(random_state=RANDOM_STATE)),
    ('smote', SMOTETomek(random_state=RANDOM_STATE)),
    # Use integer with auto solver; seeded because "auto" may select a
    # stochastic solver, which would make the search non-reproducible.
    ('pca', PCA(n_components=200, svd_solver="auto", random_state=RANDOM_STATE)),
    ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))
])

# Search space; classifier__n_estimators is deliberately absent because the
# halving search in the next cell uses it as the resource.
param_grid = {
    'feature_selector__n_features': [400, 800, 1200],
    'pca__n_components': [100, 200, 400],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [ 2, 5],
    'classifier__max_features': ['sqrt', 'log2']
}


## Model Fitting & Result

In [9]:
# The search is expensive, so ask for confirmation before launching it.
ans = ""
while ans.lower() not in ["y", "n"]:
    try:
        ans = input("About to run a hyperparameter search. Do you want to proceed? (y/n): ")
    except (EOFError, NotImplementedError):
        # No interactive stdin (e.g. headless "Run All" / nbconvert): skip the
        # search instead of aborting the whole notebook run.
        print("No interactive input available; skipping the hyperparameter search.")
        ans = "n"
if ans.lower() == "y":

    print("Starting hyperparameter search with pipeline...")
    print(f"Testing {len(param_grid['feature_selector__n_features'])} feature counts")
    print(f"Total combinations: {np.prod([len(v) for v in param_grid.values()])}")
    print("Using n_estimators as resource: [30, 90, 270] trees")
    start_time = monotonic()

    # Successive halving: candidates start with 30 trees and the survivors are
    # re-evaluated with 3x more trees at each iteration, up to 270.
    halving_cv = HalvingGridSearchCV(
        pipeline,
        param_grid,
        resource='classifier__n_estimators',
        min_resources=30,
        max_resources=270,
        factor=3,
        cv=3,
        random_state=RANDOM_STATE,
        n_jobs=-1,
        verbose=2
    )
    halving_cv.fit(X_train, y_train)

    elapsed = monotonic() - start_time
    print(f"\n{'='*60}")
    print(f"Search completed in {elapsed:.2f} seconds ({elapsed/60:.1f} minutes)")
    print(f"{'='*60}")

    # Best results
    print("\n🏆 Best parameters found:")
    for param, value in halving_cv.best_params_.items():
        print(f"   {param}: {value}")
    print(f"\nBest CV score: {halving_cv.best_score_:.4f}")

    # Evaluate on test set
    pred = halving_cv.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    print(f"Test accuracy: {accuracy:.4f}")

    print(f"\n{classification_report(y_test, pred, target_names=labels)}")

    # Show top 5 configurations
    # NOTE(review): rows come from all halving iterations, so scores obtained
    # with different numbers of trees are ranked together here.
    print(f"\n{'='*60}")
    print("Top 5 configurations:")
    print(f"{'='*60}")
    cv_results = pd.DataFrame(halving_cv.cv_results_)
    top_5 = cv_results.nlargest(5, 'mean_test_score')[
        ['param_feature_selector__n_features', 'param_classifier__n_estimators',
        'param_classifier__max_depth', 'mean_test_score', 'rank_test_score']
    ]
    print(top_5.to_string(index=False))

# Model with SmoteTomek and imbalance on outputs

We set the SMOTETomek output class sizes explicitly instead of leaving the sampling strategy on auto. <br>
As we have seen, the RandomForestClassifier tends to fit the DDoS attack type more, so we will create more samples for the other 2 types. <br>
The precision was around 88% for DDoS while it was 6% for the other 2, so we will over-compensate by that percentage.

In [None]:
# Learn the full category vocabulary once, on all rows, with missing values
# replaced by the literal "unknown", so every CV fold encodes to the same columns.
_tmp = SimpleImputer(strategy="constant", fill_value="unknown").fit_transform(X_orig[cat_cols])
_tmp_ohe = OneHotEncoder(handle_unknown='ignore',drop="first", sparse_output=True).fit(_tmp)
all_categories = _tmp_ohe.categories_

# The categorical branch applies the same imputation that was used to build
# all_categories. Without it, a missing value is not one of the fixed
# categories, is treated as unknown and encoded as all zeros — which, with
# drop="first", is indistinguishable from the dropped first category.
cat_encoder = Pipeline([
    ('imputer', SimpleImputer(strategy="constant", fill_value="unknown")),
    ('onehot', OneHotEncoder(handle_unknown='ignore',drop="first", sparse_output=True, categories=all_categories)),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', cat_encoder, cat_cols)
    ]
)

# imblearn's Pipeline applies the resampler during fit only.
pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTETomek(random_state=RANDOM_STATE)),
    ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))
])

# Explicit per-class target sizes (keys are the integer class codes of y_orig).
# NOTE(review): SMOTE only over-samples, so each target is presumably required
# to be >= that class's count in the data being resampled — verify these values
# against both the CV-fold and the full-training class sizes (final refit).
param_grid = {
    "smote__sampling_strategy": [{0: 10000, 1: 20000, 2: 20000},
                                 {0: 10000, 1: 30000, 2: 30000},
                                 {0: 15000, 1: 15000, 2: 15000},
                                 ],
    'classifier__max_depth': [None, 10, 20],
    'classifier__max_features': ['sqrt', 'log2']
}

# Successive halving with the number of trees as the resource (30 -> 90 -> 270).
halving_cv = HalvingGridSearchCV(
    pipeline,
    param_grid,
    resource='classifier__n_estimators',
    min_resources=30,
    max_resources=270,
    factor=3,
    cv=3,
    random_state=RANDOM_STATE,
    n_jobs=8,
    verbose=4
)

In [18]:
# Stratified 80/20 hold-out split on the raw rows; the pipeline is fitted on
# the training part only.
X_train, X_test, y_train, y_test = train_test_split(X_orig, y_orig, test_size=0.2, random_state=RANDOM_STATE, stratify=y_orig)

# Run the halving search and report how long it took.
start_time = monotonic()
halving_cv.fit(X_train, y_train)
print("Time taken to fit the model: %.2f seconds" % (monotonic() - start_time))
print("Best parameters: ", halving_cv.best_params_)
print("Model score: %.3f" % halving_cv.score(X_test, y_test))

# Per-class report on the hold-out set. classification_report expects
# (y_true, y_pred) in that order; passing them swapped exchanges the
# precision and recall columns and reports support of the predictions.
y_pred = halving_cv.predict(X_test)
print(classification_report(y_test, y_pred, target_names=labels))

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 30
max_resources_: 270
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 18
n_resources: 30
Fitting 3 folds for each of 18 candidates, totalling 54 fits




KeyboardInterrupt: 