In [1]:
import numpy as np
import pandas as pd
import joblib
from pathlib import Path

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import TargetEncoder, OneHotEncoder, StandardScaler, LabelEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, HalvingGridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.inspection import permutation_importance 
from sklearn import set_config
# set_config(enable_metadata_routing=True)


from utils.data_processing import transform_ipinfo

In [2]:
# Read the raw attack log from the local data directory.
data_path = Path("data")
raw_data = pd.read_csv(data_path / "cybersecurity_attacks.csv")

# Independent working copy, so edits to `df` never touch `raw_data`.
df = raw_data.copy()

# Hand the two IP address columns and the proxy column to the project helper.
# NOTE(review): the helper's output schema is not visible here; a later cell
# reads "Int Source IP Address" / "Int Destination IP Address" from it.
ip_input_cols = ["Source IP Address", "Destination IP Address", "Proxy Information"]
ip_features = transform_ipinfo(raw_data[ip_input_cols])

In [4]:
# Single seed reused by the classifier and the train/test split.
RANDOM_STATE = 124

# Target column, integer-encoded; the class names are kept (in encoder order)
# so reports can show readable labels.
y = raw_data["Attack Type"]
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)
y_labels = le.classes_.tolist()

In [6]:
# Feature frame: remove the target and the raw IP columns, then attach the
# two `Int ... IP Address` columns from `ip_features`, aligned on row index.
df = (
    raw_data
    .drop(columns=["Attack Type", "Source IP Address", "Destination IP Address"])
    .merge(
        ip_features[["Int Source IP Address", "Int Destination IP Address"]],
        how="left",
        left_index=True,
        right_index=True,
    )
)

# Column groups routed to the different preprocessing branches.
text_feature = 'Payload Data'
num_cols = df.select_dtypes(include="number").columns
cat_cols = [
    name
    for name in df.select_dtypes(include=["object", "str"]).columns
    if name != text_feature  # the free-text column is vectorised separately
]

# Collect the full category list of every categorical column up front by
# fitting a throwaway imputer + encoder on the whole feature frame, and hand
# those lists to the pipeline's encoder so all categories are included in the
# one-hot encoding. Per the original author's note, this was done for the
# high-dimensionality features, which are to be dropped later anyway.
# NOTE(review): `df` here still contains the rows that later become the test
# split, so the category lists are derived from train and test rows alike.
_tmp = SimpleImputer(strategy="constant", fill_value="None").fit_transform(df[cat_cols])
_tmp_ohe = OneHotEncoder(handle_unknown="ignore", drop="first")
_tmp_ohe.fit(_tmp)
all_categories = _tmp_ohe.categories_

# Categorical branch: missing values become the literal string "None" inside
# the pipeline (instead of pre-filling the dataframe), then the columns are
# one-hot encoded against the fixed category lists.
cat_prepocessor = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="None")),
        (
            "encoder",
            OneHotEncoder(
                categories=all_categories,
                drop="first",
                handle_unknown="ignore",
                sparse_output=True,
            ),
        ),
    ]
)

# One preprocessing branch per column group.
num_branch = ("num", StandardScaler(), num_cols)
cat_branch = ("cat", cat_prepocessor, cat_cols)
# `text_feature` is a bare column name (not a list), so the vectorizer is
# handed a single 1-D column of documents.
text_branch = ("text", TfidfVectorizer(max_features=50), text_feature)

preprocessor = ColumnTransformer(transformers=[num_branch, cat_branch, text_branch])

# Full model: preprocessing followed by a seeded random forest using all cores.
forest = RandomForestClassifier(
    n_estimators=100,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", forest),
    ]
)

# Hold out 20% of the rows for evaluation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y_encoded,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Fit preprocessing and classifier together on the training rows only.
print("Training the pipeline... Please wait...")
pipeline.fit(X_train, y_train)

# Score the held-out rows and report per-class metrics with readable labels.
y_pred = pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred, target_names=y_labels)

print("\n--- Pipeline Results ---")
print(f"Accuracy Score: {test_accuracy:.4f}")
print("\nClassification Report:")
print(test_report)

Training the pipeline... Please wait...

--- Pipeline Results ---
Accuracy Score: 0.3324

Classification Report:
              precision    recall  f1-score   support

        DDoS       0.33      0.41      0.36      2636
   Intrusion       0.34      0.28      0.31      2721
     Malware       0.34      0.31      0.32      2643

    accuracy                           0.33      8000
   macro avg       0.33      0.33      0.33      8000
weighted avg       0.33      0.33      0.33      8000

