In [1]:
from pathlib import Path
import pandas as pd
import numpy as np


from sklearn.experimental import enable_halving_search_cv
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import TargetEncoder, OneHotEncoder, StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, HalvingGridSearchCV
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn import set_config
set_config(enable_metadata_routing=True)
from imblearn.combine import SMOTETomek
from xgboost import XGBClassifier

from joblib import parallel_backend
from time import monotonic
from prince import FAMD, MCA


from utils.data_processing import load_data, raw_columns, full_dtypes, transform_datetime, df_ua_parser, transform_ipinfo, transform_packetinfo, transform_proxyinfo

In [2]:
data_path = Path("./data")
if not data_path.joinpath("first_ml_processing.csv").exists():
    # Must use clean_data function to load data 
    input_data_path = Path("./data/cybersecurity_attacks.csv")
    dtypes = {col: col_type for col, col_type in full_dtypes.items() if col in raw_columns}
    raw_data = load_data(input_data_path, dtype=dtypes)

    datetime_columns = ["Year", "Month", "Day", "Hour", "Minute", "Second", "DayOfWeek", "IsWeekend"]
    raw_data[datetime_columns] = transform_datetime(raw_data["Timestamp"])
    device_columns = ["String","Browser Name", "Browser Version", "Browser Minor", "Browser Patch",
                    "Browser Patch Minor", "OS Name", "OS Version", "OS Version Minor",
                    "OS Version Patch", "OS Version Patch Minor", "Device Brand", "Device Model",
                    "Device Type"]
    raw_data[device_columns] = df_ua_parser(raw_data["Device Information"])
    proxy_columns = ["Is Proxy"]
    raw_data[proxy_columns] = transform_proxyinfo(raw_data["Proxy Information"])
    ip_columns = ["Int Source IP", "Int Destination IP", "Global Source IP", "Global Destination IP"]
    raw_data[ip_columns] = transform_ipinfo(raw_data[["Source IP Address", "Destination IP Address"]])
    packet_columns = ["Packet Bin"]
    raw_data[packet_columns] = transform_packetinfo(raw_data["Packet Length"], scale=False)

    processed_data = raw_data.drop(columns=["Payload Data","Timestamp", "String", "Device Information",
                                    "Proxy Information", "Source IP Address", "Destination IP Address"])
    processed_data.to_csv(data_path.joinpath("first_ml_processing.csv"), index=False)
processed_data = pd.read_csv(data_path.joinpath("first_ml_processing.csv"))

In [3]:
Y_true = processed_data["Attack Type"].copy()
X_dataset = processed_data.copy().drop(columns=["Attack Type", "Browser Patch" , "Browser Patch Minor",
                                                "OS Version", "OS Version Minor", "OS Version Patch", "OS Version Patch Minor",
                                                "Device Type", "User Information", "Geo-location Data"])

# Feature Selection
## PCA Analysis

In [None]:
cat_cols = X_dataset.select_dtypes(include=["category","str"]).columns
num_cols = X_dataset.select_dtypes(include=["number"]).columns
bool_cols = X_dataset.select_dtypes(include=["bool"]).columns
passthrough_columns = [col for col in X_dataset.columns if col not in cat_cols and col not in bool_cols and col not in num_cols]
    
numeric_transformer = Pipeline(
        steps = [("scaler", StandardScaler())]
    )

categorical_transformer = Pipeline([
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="unknown")),
        ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
        ])
boolean_transformer = Pipeline([
        ("encoder", TargetEncoder(target_type="binary")),
        ])

preprocessor = ColumnTransformer(
        transformers=[
            ("cat", categorical_transformer, cat_cols),
            ("num", numeric_transformer, num_cols),
            ("bool", boolean_transformer, bool_cols)
        ]
    )

param_grid = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [None, 10, 20],
    "classifier__min_samples_split": [2, 10, 30],
    "pca__n_components": [2, 6, 10, 15]
}

pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("pca", PCA(random_state=124)),
            ("classifier", RandomForestClassifier(random_state=124))
        ])

relaunch =""
while relaunch.lower() not in ["y", "n"]:
    relaunch = input("Do you want to relaunch the grid search with an ordinal encoder for categorical data? (y/n) ")
if relaunch.lower() == "y":
    X_train, X_test, y_train, y_test = train_test_split(X_dataset, Y_true, test_size=0.2, stratify=Y_true, random_state=124)
    start_time = monotonic()
    gs = GridSearchCV(pipeline, param_grid, cv=5)
    with parallel_backend('threading', n_jobs=6):
        gs.fit(X_train, y_train)
    print("Time taken to fit the model: %.2f seconds" % (monotonic() - start_time))
    print("Model score: %.3f" % gs.score(X_test, y_test))
    print("Best parameters: ", gs.best_params_)

Time taken to fit the model: 648.73 seconds
Model score: 0.335
Best parameters:  {'classifier__max_depth': 20, 'classifier__min_samples_split': 30, 'classifier__n_estimators': 100, 'pca__n_components': 10}


In [5]:
Xcat_dataset = X_dataset.copy()
columns = ["Year", "Month", "Day", "Hour", "Minute", "Second", "DayOfWeek", "Browser Version", "Browser Minor"]
Xcat_dataset[columns] = Xcat_dataset[columns].astype("str")

In [None]:
cat_cols = Xcat_dataset.select_dtypes(include=["category","str"]).columns
num_cols = Xcat_dataset.select_dtypes(include=["number"]).columns
bool_cols = Xcat_dataset.select_dtypes(include=["bool"]).columns
passthrough_columns = [col for col in Xcat_dataset.columns if col not in cat_cols and col not in bool_cols and col not in num_cols]
    
numeric_transformer = Pipeline(
        steps = [("scaler", StandardScaler())]
    )

categorical_transformer = Pipeline([
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="unknown")),
        ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore"))
        ])
boolean_transformer = Pipeline([
        ("encoder", TargetEncoder(target_type="binary")),
        ])

preprocessor = ColumnTransformer(
        transformers=[
            ("cat", categorical_transformer, cat_cols),
            ("num", numeric_transformer, num_cols),
            ("bool", boolean_transformer, bool_cols)
        ]
    )

param_grid = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [None, 10, 20],
    "classifier__min_samples_split": [2, 10, 30],
    "pca__n_components": [2, 6, 10, 15]
}

pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("pca", PCA(random_state=124)),
            ("classifier", RandomForestClassifier(random_state=124))
        ])
relaunch =""
while relaunch.lower() not in ["y", "n"]:
    relaunch = input("Do you want to relaunch the PCA analysis with more categorical columns? (y/n) ")
if relaunch.lower() == "y":
    X_train, X_test, y_train, y_test = train_test_split(Xcat_dataset, Y_true, test_size=0.2, stratify=Y_true, random_state=124)
    start_time = monotonic()
    gs = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=6)
    gs.fit(X_train, y_train)
    print("Time taken to fit the model: %.2f seconds" % (monotonic() - start_time))
    print("Model score: %.3f" % gs.score(X_test, y_test))
    print("Best parameters: ", gs.best_params_)



Time taken to fit the model: 660.44 seconds
Model score: 0.332
Best parameters:  {'classifier__max_depth': None, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100, 'pca__n_components': 6}


In [7]:
Xcat_dataset = X_dataset.copy()
columns = ["Year", "Month", "Day", "Hour", "Minute", "Second", "DayOfWeek", "Browser Version", "Browser Minor",
           "Global Source IP", "Global Destination IP","IsWeekend", "Is Proxy", "Device Brand", "Device Model", "Browser Name","OS Name"]
Xcat_dataset = Xcat_dataset.drop(columns=columns)
bool_cols = Xcat_dataset.select_dtypes(include=["bool"]).columns
Xcat_dataset[bool_cols] = Xcat_dataset[bool_cols].astype("str")
cat_cols = Xcat_dataset.select_dtypes(include=["object","str"]).columns
num_cols = Xcat_dataset.select_dtypes(include="number").columns
# Xcat_dataset[cat_cols] = Xcat_dataset[cat_cols].apply(lambda x: x.fillna("unknown"))
# numeric_transformer = Pipeline(
#         steps = [
#             ("imputer", SimpleImputer(strategy="mean")),
#             ("scaler", StandardScaler())
#         ])
# Xcat_dataset[num_cols] = numeric_transformer.fit_transform(Xcat_dataset[num_cols])

X_train, X_test, y_train, y_test = train_test_split(Xcat_dataset, Y_true, test_size=0.2, stratify=Y_true, random_state=124)

In [8]:
passthrough_columns = [col for col in Xcat_dataset.columns if col not in cat_cols and col not in bool_cols and col not in num_cols]


numeric_transformer = Pipeline(
        steps = [
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler())
        ])

cat_transformer = Pipeline([
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="unknown")),
        ])

preprocessor = ColumnTransformer(
        transformers=[
            ("cat", cat_transformer, cat_cols),
            ("num", numeric_transformer, num_cols)
        ]).set_output(transform="pandas")

param_grid = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [None, 10, 20],
    "classifier__min_samples_split": [2, 10, 30],
    "famd__n_components": [2, 6, 10, 15]
}

pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("famd", FAMD()),
            ("classifier", RandomForestClassifier(random_state=124))
        ])

In [9]:
relaunch =""
while relaunch.lower() not in ["y", "n"]:
    relaunch = input("Do you want to relaunch the FAMD analysis? (y/n) ")
if relaunch.lower() == "y":
    start_time = monotonic()
    gs = GridSearchCV(pipeline, param_grid, cv=2, n_jobs=6)
    gs.fit(X_train, y_train)
    print("Time taken to fit the model: %.2f seconds" % (monotonic() - start_time))
    print("Model score: %.3f" % gs.score(X_test, y_test))
    print("Best parameters: ", gs.best_params_)



Time taken to fit the model: 147.93 seconds
Model score: 0.326
Best parameters:  {'classifier__max_depth': 10, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100, 'famd__n_components': 10}


In [10]:
xtrain_processed = preprocessor.fit_transform(X_train)

In [11]:
famd = FAMD(n_components=10)
xtrain_famd = famd.fit_transform(xtrain_processed)
famd.eigenvalues_summary

Unnamed: 0_level_0,eigenvalue,% of variance,% of variance (cumulative)
component,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1.018,16.96%,16.96%
1,1.01,16.83%,33.79%
2,1.004,16.74%,50.52%
3,0.995,16.59%,67.12%
4,0.99,16.50%,83.62%
5,0.983,16.38%,100.00%
6,0.0,0.00%,100.00%
7,0.0,0.00%,100.00%
8,0.0,0.00%,100.00%
9,0.0,0.00%,100.00%


In [12]:
Xcat_dataset = X_dataset.copy()
columns = ["Year", "Month", "Day", "Hour", "Minute", "Second", "DayOfWeek", "Browser Version", "Browser Minor"]
Xcat_dataset[columns] = Xcat_dataset[columns].astype("str")
bool_cols = Xcat_dataset.select_dtypes(include=["bool"]).columns
Xcat_dataset[bool_cols] = Xcat_dataset[bool_cols].astype("str")

In [None]:
cat_cols = Xcat_dataset.select_dtypes(include=["category","str"]).columns
num_cols = Xcat_dataset.select_dtypes(include=["number"]).columns
passthrough_columns = [col for col in Xcat_dataset.columns if col not in cat_cols and col not in bool_cols and col not in num_cols]
    
numeric_transformer = Pipeline(
        steps = [
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler())
        ])

cat_transformer = Pipeline([
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="unknown")),
        ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False))
        ])

preprocessor = ColumnTransformer(
        transformers=[
            ("cat", cat_transformer, cat_cols),
            ("num", numeric_transformer, num_cols)
        ])

param_grid = {
    "classifier__max_depth": [None, 10, 20],
    "classifier__min_samples_split": [2, 10, 30],
}

pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", RandomForestClassifier(random_state=124))
        ])


In [14]:
relaunch =""
while relaunch.lower() not in ["y", "n"]:
    relaunch = input("Do you want to relaunch the HalvingGridSearchCV analysis? (y/n) ")
if relaunch.lower() == "y":
    X_train, X_test, y_train, y_test = train_test_split(Xcat_dataset, Y_true, test_size=0.2, stratify=Y_true, random_state=124)
    start_time = monotonic()
    gs = HalvingGridSearchCV(pipeline, param_grid, resource = "classifier__n_estimators", min_resources=10 , max_resources=500)
    gs.fit(X_train, y_train)
    print("Time taken to fit the model: %.2f seconds" % (monotonic() - start_time))
    print("Model score: %.3f" % gs.score(X_test, y_test))
    print("Best parameters: ", gs.best_params_)

Time taken to fit the model: 166.84 seconds
Model score: 0.334
Best parameters:  {'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 90}


In [15]:
columns = ["Int Source IP", "Int Destination IP", "Source Port", "Destination Port", "Protocol", "Packet Type", "Traffic Type", "Attack Signature"]
Xsim_dataset = X_dataset[columns].copy()
bool_cols = Xsim_dataset.select_dtypes(include=["bool"]).columns
Xsim_dataset[bool_cols] = Xsim_dataset[bool_cols].astype("str")

In [None]:
cat_cols = Xsim_dataset.select_dtypes(include=["category","str"]).columns
num_cols = Xsim_dataset.select_dtypes(include=["number"]).columns
passthrough_columns = [col for col in Xsim_dataset.columns if col not in cat_cols and col not in bool_cols and col not in num_cols]
    
numeric_transformer = Pipeline(
        steps = [
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler())
        ])

cat_transformer = Pipeline([
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="unknown")),
        ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False))
        ])

preprocessor = ColumnTransformer(
        transformers=[
            ("cat", cat_transformer, cat_cols),
            ("num", numeric_transformer, num_cols)
        ])

param_grid = {
    "classifier__max_depth": [None, 10, 20],
    "classifier__min_samples_split": [2, 10, 30],
}

pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", RandomForestClassifier(random_state=124))
        ])


In [17]:
relaunch =""
while relaunch.lower() not in ["y", "n"]:
    relaunch = input("Do you want to relaunch the simple HalvingGridSearchCV analysis? (y/n) ")
if relaunch.lower() == "y":
    X_train, X_test, y_train, y_test = train_test_split(Xsim_dataset, Y_true, test_size=0.2, stratify=Y_true, random_state=124)
    start_time = monotonic()
    gs = HalvingGridSearchCV(pipeline, param_grid, resource = "classifier__n_estimators", min_resources=10 , max_resources=1000, factor=2)
    gs.fit(X_train, y_train)
    print("Time taken to fit the model: %.2f seconds" % (monotonic() - start_time))
    print("Model score: %.3f" % gs.score(X_test, y_test))
    print("Best parameters: ", gs.best_params_)

Time taken to fit the model: 94.16 seconds
Model score: 0.335
Best parameters:  {'classifier__max_depth': 20, 'classifier__min_samples_split': 30, 'classifier__n_estimators': 80}


In [18]:
columns = ["Source Port", "Destination Port", "Protocol", "Packet Type", "Traffic Type", "Attack Signature", "Network Segment"]
Xsim_dataset = X_dataset[columns].copy()
bool_cols = Xsim_dataset.select_dtypes(include=["bool"]).columns
Xsim_dataset[bool_cols] = Xsim_dataset[bool_cols].astype("str")

In [None]:
cat_cols = Xsim_dataset.select_dtypes(include=["category","str"]).columns
num_cols = Xsim_dataset.select_dtypes(include=["number"]).columns
passthrough_columns = [col for col in Xsim_dataset.columns if col not in cat_cols and col not in bool_cols and col not in num_cols]
    
numeric_transformer = Pipeline(
        steps = [
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler())
        ])

cat_transformer = Pipeline([
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="unknown")),
        ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False))
        ])

preprocessor = ColumnTransformer(
        transformers=[
            ("cat", cat_transformer, cat_cols),
            ("num", numeric_transformer, num_cols)
        ])

param_grid = {
    "classifier__max_depth": [None, 10, 20],
    "classifier__min_samples_split": [2, 10, 30],
}

pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", RandomForestClassifier(random_state=124))
        ])


In [20]:
relaunch =""
while relaunch.lower() not in ["y", "n"]:
    relaunch = input("Do you want to relaunch the simple HalvingGridSearchCV analysis? (y/n) ")
if relaunch.lower() == "y":
    X_train, X_test, y_train, y_test = train_test_split(Xsim_dataset, Y_true, test_size=0.2, stratify=Y_true, random_state=124)
    start_time = monotonic()
    gs = HalvingGridSearchCV(pipeline, param_grid, resource = "classifier__n_estimators", min_resources=10 , max_resources=500)
    gs.fit(X_train, y_train)
    print("Time taken to fit the model: %.2f seconds" % (monotonic() - start_time))
    print("Model score: %.3f" % gs.score(X_test, y_test))
    print("Best parameters: ", gs.best_params_)

Time taken to fit the model: 44.86 seconds
Model score: 0.329
Best parameters:  {'classifier__max_depth': None, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 90}
