In [20]:
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import TargetEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn import set_config
set_config(enable_metadata_routing=True)

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from time import monotonic

from utils.data_processing import load_data, raw_columns, full_dtypes, transform_datetime, df_ua_parser, transform_ipinfo, transform_packetinfo, transform_proxyinfo

# Input Data Processing

## First Model

In [2]:
data_path = Path("./data")
if data_path.joinpath("first_ml_processing.csv").exists():
    processed_data = pd.read_csv(data_path.joinpath("first_ml_processing.csv"))
else:
    # Must use clean_data function to load data 
    input_data_path = Path("./data/cybersecurity_attacks.csv")
    dtypes = {col: col_type for col, col_type in full_dtypes.items() if col in raw_columns}
    raw_data = load_data(input_data_path, dtype=dtypes)

    datetime_columns = ["Year", "Month", "Day", "Hour", "Minute", "Second", "DayOfWeek", "IsWeekend"]
    raw_data[datetime_columns] = transform_datetime(raw_data["Timestamp"])
    device_columns = ["String","Browser Name", "Browser Version", "Browser Minor", "Browser Patch",
                    "Browser Patch Minor", "OS Name", "OS Version", "OS Version Minor",
                    "OS Version Patch", "OS Version Patch Minor", "Device Brand", "Device Model",
                    "Device Type"]
    raw_data[device_columns] = df_ua_parser(raw_data["Device Information"])
    proxy_columns = ["Is Proxy"]
    raw_data[proxy_columns] = transform_proxyinfo(raw_data["Proxy Information"])
    ip_columns = ["Int Source IP", "Int Destination IP", "Global Source IP", "Global Destination IP"]
    raw_data[ip_columns] = transform_ipinfo(raw_data[["Source IP Address", "Destination IP Address"]])
    packet_columns = ["Packet Bin"]
    raw_data[packet_columns] = transform_packetinfo(raw_data["Packet Length"], scale=False)

    processed_data = raw_data.drop(columns=["Payload Data","Timestamp", "String", "Device Information",
                                    "Proxy Information", "Source IP Address", "Destination IP Address"])
    processed_data.to_csv(data_path.joinpath("first_ml_processing.csv"), index=False)


Because we are not combining features, all numerical features with missing values will not be considered. There is no method to apply a specific method to replace it for features like OS version patch minor or browser patch minor without getting extra data

In [3]:
Y_true = processed_data["Attack Type"].copy()
X_dataset = processed_data.copy().drop(columns=["Attack Type", "Browser Patch" , "Browser Patch Minor",
                                                "OS Version", "OS Version Minor", "OS Version Patch", "OS Version Patch Minor",
                                                "Device Type", "User Information", "Geo-location Data"])

In [4]:
categorical_columns = X_dataset.select_dtypes(include=["str", "category"]).columns
boolean_columns = X_dataset.select_dtypes(include=["bool"]).columns
numeric_columns = X_dataset.select_dtypes(include=["int64", "float64"]).columns
passthrough_columns = [col for col in X_dataset.columns if col not in categorical_columns and col not in boolean_columns and col not in numeric_columns]

numeric_transformer = Pipeline(
        steps = [("scaler", StandardScaler())]
    )

categorical_transformer = Pipeline([
        ("imputer", SimpleImputer(missing_values=pd.NA, strategy="constant", fill_value="unknown")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", drop="first"))
        ])
boolean_transformer = Pipeline([
        ("encoder", TargetEncoder(target_type="binary")),
        ])

preprocessor = ColumnTransformer(
        transformers=[
            ("cat", categorical_transformer, categorical_columns),
            ("num", numeric_transformer, numeric_columns),
            ("bool", boolean_transformer, boolean_columns)
        ]
    )

model = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(n_estimators=100, random_state=35))
    ])

X_train, X_test, y_train, y_test = train_test_split(X_dataset, Y_true, test_size=0.2, stratify=Y_true, random_state=124)
start_time = monotonic()
model.fit(X_train, y_train)
print("Time taken to fit the model: %.2f seconds" % (monotonic() - start_time))
print("Model score: %.3f" % model.score(X_test, y_test))

Time taken to fit the model: 6.59 seconds
Model score: 0.324


So nothing new. We obtain a 1/3 which is expected from random value

In [5]:
importances = model.named_steps['classifier'].feature_importances_
std = np.std([tree.feature_importances_ for tree in model.named_steps['classifier'].estimators_], axis=0)
feature_names = [f"{x}" for x in model.named_steps["preprocessor"].get_feature_names_out()]
forest_importances = pd.DataFrame({"feature": feature_names, "importance": importances, "std": std}).sort_values("importance", ascending=False)

fig = px.bar(forest_importances,x="feature",y="importance", error_y="std", title="Feature importances using MDI")
fig.show()


Each feature show very low importance (most below 5%)

## 2nd model

Some numeric features are considered as categories now. All features related to date and time information as well as the one related to browser and device version

In [6]:
Xcat_dataset = X_dataset.copy()
columns = ["Year", "Month", "Day", "Hour", "Minute", "Second", "DayOfWeek", "Browser Version", "Browser Minor"]
Xcat_dataset[columns] = Xcat_dataset[columns].astype("category")

In [7]:
categorical_columns = Xcat_dataset.select_dtypes(include=["str", "category"]).columns
boolean_columns = Xcat_dataset.select_dtypes(include=["bool"]).columns
numeric_columns = Xcat_dataset.select_dtypes(include=["int64", "float64"]).columns
passthrough_columns = [col for col in Xcat_dataset.columns if col not in categorical_columns and col not in boolean_columns and col not in numeric_columns]

numeric_transformer = Pipeline(
        steps = [("scaler", StandardScaler())]
    )

categorical_transformer = Pipeline([
        ("imputer", SimpleImputer(missing_values=pd.NA, strategy="constant", fill_value="unknown")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", drop="first"))
        ])
boolean_transformer = Pipeline([
        ("encoder", TargetEncoder(target_type="binary")),
        ])

preprocessor = ColumnTransformer(
        transformers=[
            ("cat", categorical_transformer, categorical_columns),
            ("num", numeric_transformer, numeric_columns),
            ("bool", boolean_transformer, boolean_columns)
        ]
    )

cat_model = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(n_estimators=100, random_state=35))
    ])

X_train, X_test, y_train, y_test = train_test_split(X_dataset, Y_true, test_size=0.2, stratify=Y_true, random_state=124)
start_time = monotonic()
cat_model.fit(X_train, y_train)
print("Time taken to fit the model: %.2f seconds" % (monotonic() - start_time))
print("Model score: %.3f" % cat_model.score(X_test, y_test))

Time taken to fit the model: 36.50 seconds
Model score: 0.339


In [8]:
importances = cat_model.named_steps['classifier'].feature_importances_
std = np.std([tree.feature_importances_ for tree in cat_model.named_steps['classifier'].estimators_], axis=0)
feature_names = [f"{x}" for x in cat_model.named_steps["preprocessor"].get_feature_names_out()]
forest_importances = pd.DataFrame({"feature": feature_names, "importance": importances, "std": std}).sort_values("importance", ascending=False)

fig = px.bar(forest_importances[forest_importances["importance"] > 0.01],x="feature",y="importance", error_y="std", title="Feature importances using MDI")
fig.show()

Not better

## 3rd Model
For the third model we will combine some features.
First ones are: features related to browser will be merged, features related to device will be merged together and (Protocol,Packet Type,Scale Packet Length) will be merged (from the features analysis it was the one with the higher mean between min max of each combination)

In [9]:
Xmer_dataset = X_dataset.copy()
Xmer_dataset["Browser"] = Xmer_dataset["Browser Name"] + "_" + Xmer_dataset["Browser Version"].astype(str) + "_" + Xmer_dataset["Browser Minor"].astype(str)
Xmer_dataset = Xmer_dataset.drop(columns=["Browser Name", "Browser Version", "Browser Minor"])
Xmer_dataset["Device"] = Xmer_dataset["Device Brand"] + "_" + Xmer_dataset["Device Model"]
Xmer_dataset = Xmer_dataset.drop(columns=["Device Brand", "Device Model"])
columns = ["Year", "Month", "Day", "Hour", "Minute", "Second", "DayOfWeek"]
Xmer_dataset[columns] = Xmer_dataset[columns].astype("category")
Xmer_dataset["Syn_feature_1"] = Xmer_dataset["Protocol"] + "_" + Xmer_dataset["Packet Type"] + Xmer_dataset["Traffic Type"]
Xmer_dataset = Xmer_dataset.drop(columns=["Protocol", "Packet Type", "Traffic Type"])
Xmer_dataset["Syn_feature_2"] = Xmer_dataset["Malware Indicators"] + "_" + Xmer_dataset["Alerts/Warnings"] + "_" + Xmer_dataset["IDS/IPS Alerts"]
Xmer_dataset = Xmer_dataset.drop(columns=["Malware Indicators", "Alerts/Warnings", "IDS/IPS Alerts"])
Xmer_dataset["Syn_feature_3"] = Xmer_dataset["Anomaly Scores"]**2
Xmer_dataset = Xmer_dataset.drop(columns=["Anomaly Scores"])


In [10]:
categorical_columns = Xmer_dataset.select_dtypes(include=["str", "category"]).columns
boolean_columns = Xmer_dataset.select_dtypes(include=["bool"]).columns
numeric_columns = Xmer_dataset.select_dtypes(include=["int64", "float64"]).columns
passthrough_columns = [col for col in Xmer_dataset.columns if col not in categorical_columns and col not in boolean_columns and col not in numeric_columns]

numeric_transformer = Pipeline(
        steps = [("scaler", StandardScaler())]
    )

categorical_transformer = Pipeline([
        ("imputer", SimpleImputer(missing_values=pd.NA, strategy="constant", fill_value="unknown")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", drop="first"))
        ])
boolean_transformer = Pipeline([
        ("encoder", TargetEncoder(target_type="binary")),
        ])

preprocessor = ColumnTransformer(
        transformers=[
            ("cat", categorical_transformer, categorical_columns),
            ("num", numeric_transformer, numeric_columns),
            ("bool", boolean_transformer, boolean_columns)
        ]
    )

mer_model = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(n_estimators=100, random_state=35))
    ])

X_train, X_test, y_train, y_test = train_test_split(Xmer_dataset, Y_true, test_size=0.2, stratify=Y_true, random_state=124)
start_time = monotonic()
mer_model.fit(X_train, y_train)
print("Time taken to fit the model: %.2f seconds" % (monotonic() - start_time))
print("Model score: %.3f" % mer_model.score(X_test, y_test))

Time taken to fit the model: 32.91 seconds
Model score: 0.326


In [11]:
x_train_transformed = mer_model.named_steps["preprocessor"].fit_transform(X_train, y_train)
importances = mer_model.named_steps['classifier'].feature_importances_
std = np.std([tree.feature_importances_ for tree in mer_model.named_steps['classifier'].estimators_], axis=0)
feature_names = [f"{x}" for x in mer_model.named_steps["preprocessor"].get_feature_names_out()]
forest_importances = pd.DataFrame({"feature": feature_names, "importance": importances, "std": std}).sort_values("importance", ascending=False)
fig = px.bar(forest_importances[forest_importances["importance"] > 0.01],x="feature",y="importance", error_y="std", title="Feature importances using MDI")
fig.show()

In [12]:
relaunch =""
while relaunch.lower() not in ["y", "n"]:
    relaunch = input("Do you want to try a modified gradient model? (y/n) ")
if relaunch.lower() == "y":
    categorical_columns = Xmer_dataset.select_dtypes(include=["str", "category"]).columns
    boolean_columns = Xmer_dataset.select_dtypes(include=["bool"]).columns
    numeric_columns = Xmer_dataset.select_dtypes(include=["int64", "float64"]).columns
    passthrough_columns = [col for col in Xmer_dataset.columns if col not in categorical_columns and col not in boolean_columns and col not in numeric_columns]

    numeric_transformer = Pipeline(
            steps = [("scaler", StandardScaler())]
        )

    categorical_transformer = Pipeline([
            ("imputer", SimpleImputer(missing_values=pd.NA, strategy="constant", fill_value="unknown")),
            ("encoder", OneHotEncoder(handle_unknown="ignore", drop="first"))
            ])
    boolean_transformer = Pipeline([
            ("encoder", TargetEncoder(target_type="binary")),
            ])

    preprocessor = ColumnTransformer(
            transformers=[
                ("cat", categorical_transformer, categorical_columns),
                ("num", numeric_transformer, numeric_columns),
                ("bool", boolean_transformer, boolean_columns)
            ]
        )

    grad_model = Pipeline([
            ("preprocessor", preprocessor),
            ("classifier", GradientBoostingClassifier(n_estimators=300, max_depth = None, min_samples_split=30, random_state=35, n_iter_no_change=15))
        ])
    X_train, X_test, y_train, y_test = train_test_split(Xmer_dataset, Y_true, test_size=0.2, stratify=Y_true, random_state=124)
    start_time = monotonic()
    grad_model.fit(X_train, y_train)
    print("Time taken to fit the model: %.2f seconds" % (monotonic() - start_time))
    print("Model score: %.3f" % grad_model.score(X_test, y_test))

In [13]:
if relaunch.lower() == "y":
    x_train_transformed = grad_model.named_steps["preprocessor"].fit_transform(X_train, y_train)
    importances = grad_model.named_steps['classifier'].feature_importances_
    forests = [forest for forest in grad_model.named_steps['classifier'].estimators_]
    std = np.std([tree.feature_importances_ for forest in forests for tree in forest], axis=0)
    feature_names = [f"{x}" for x in grad_model.named_steps["preprocessor"].get_feature_names_out()]
    forest_importances = pd.DataFrame({"feature": feature_names, "importance": importances, "std": std}).sort_values("importance", ascending=False)

    fig = px.bar(forest_importances[forest_importances["importance"] > 0.01],x="feature",y="importance", error_y="std", title="Feature importances using MDI")
    fig.show()

## HistGradientBoost Classifier

No need to use one hot encoder for this model

In [14]:
Xhist_dataset = X_dataset.copy()

In [15]:
categorical_columns = Xhist_dataset.select_dtypes(include=["str", "category","bool"]).columns.to_list()
numeric_columns = Xhist_dataset.select_dtypes(include=["int64", "float64"]).columns
passthrough_columns = [col for col in Xhist_dataset.columns if col not in categorical_columns and col not in boolean_columns and col not in numeric_columns]


numeric_transformer = Pipeline(
        steps = [("scaler", StandardScaler())]
    )
numeric_transformer.fit(Xhist_dataset[numeric_columns], Y_true)
Xhist_dataset[numeric_columns] = numeric_transformer.transform(Xhist_dataset[numeric_columns])

preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_columns)
        ]
    )

hist_model = HistGradientBoostingClassifier(max_iter=300, max_depth = None, random_state=35, categorical_features=categorical_columns, n_iter_no_change = 50)

X_train, X_test, y_train, y_test = train_test_split(Xhist_dataset, Y_true, test_size=0.2, stratify=Y_true, random_state=124)
start_time = monotonic()
hist_model.fit(X_train, y_train)
print("Time taken to fit the model: %.2f seconds" % (monotonic() - start_time))
print("Model score: %.3f" % hist_model.score(X_test, y_test))

Time taken to fit the model: 0.47 seconds
Model score: 0.332


In [16]:
hist_model

0,1,2
,"loss  loss: {'log_loss'}, default='log_loss' The loss function to use in the boosting process. For binary classification problems, 'log_loss' is also known as logistic loss, binomial deviance or binary crossentropy. Internally, the model fits one tree per boosting iteration and uses the logistic sigmoid function (expit) as inverse link function to compute the predicted positive class probability. For multiclass classification problems, 'log_loss' is also known as multinomial deviance or categorical crossentropy. Internally, the model fits one tree per boosting iteration and per class and uses the softmax function as inverse link function to compute the predicted probabilities of the classes.",'log_loss'
,"learning_rate  learning_rate: float, default=0.1 The learning rate, also known as *shrinkage*. This is used as a multiplicative factor for the leaves values. Use ``1`` for no shrinkage.",0.1
,"max_iter  max_iter: int, default=100 The maximum number of iterations of the boosting process, i.e. the maximum number of trees for binary classification. For multiclass classification, `n_classes` trees per iteration are built.",300
,"max_leaf_nodes  max_leaf_nodes: int or None, default=31 The maximum number of leaves for each tree. Must be strictly greater than 1. If None, there is no maximum limit.",31
,"max_depth  max_depth: int or None, default=None The maximum depth of each tree. The depth of a tree is the number of edges to go from the root to the deepest leaf. Depth isn't constrained by default.",
,"min_samples_leaf  min_samples_leaf: int, default=20 The minimum number of samples per leaf. For small datasets with less than a few hundred samples, it is recommended to lower this value since only very shallow trees would be built.",20
,"l2_regularization  l2_regularization: float, default=0 The L2 regularization parameter penalizing leaves with small hessians. Use ``0`` for no regularization (default).",0.0
,"max_features  max_features: float, default=1.0 Proportion of randomly chosen features in each and every node split. This is a form of regularization, smaller values make the trees weaker learners and might prevent overfitting. If interaction constraints from `interaction_cst` are present, only allowed features are taken into account for the subsampling. .. versionadded:: 1.4",1.0
,"max_bins  max_bins: int, default=255 The maximum number of bins to use for non-missing values. Before training, each feature of the input array `X` is binned into integer-valued bins, which allows for a much faster training stage. Features with a small number of unique values may use less than ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin is always reserved for missing values. Must be no larger than 255.",255
,"categorical_features  categorical_features: array-like of {bool, int, str} of shape (n_features) or shape (n_categorical_features,), default='from_dtype' Indicates the categorical features. - None : no feature will be considered categorical. - boolean array-like : boolean mask indicating categorical features. - integer array-like : integer indices indicating categorical  features. - str array-like: names of categorical features (assuming the training  data has feature names). - `""from_dtype""`: dataframe columns with dtype ""category"" are  considered to be categorical features. The input must be an object  exposing a ``__dataframe__`` method such as pandas or polars  DataFrames to use this feature. For each categorical feature, there must be at most `max_bins` unique categories. Negative values for categorical features encoded as numeric dtypes are treated as missing values. All categorical values are converted to floating point numbers. This means that categorical values of 1.0 and 1 are treated as the same category. Read more in the :ref:`User Guide `. .. versionadded:: 0.24 .. versionchanged:: 1.2  Added support for feature names. .. versionchanged:: 1.4  Added `""from_dtype""` option. .. versionchanged:: 1.6  The default value changed from `None` to `""from_dtype""`.","['Protocol', 'Packet Type', ...]"


### Less Features
We try to model using less features. All features related to post analysis are ommitted: we keep information related to IP, port packet length

In [23]:
columns = ["Int Source IP", "Int Destination IP", "Protocol", "Packet Type", "Traffic Type", "Anomaly Scores", "Severity Level", "Month", "DayOfWeek"]
Xsimple_dataset = X_dataset[columns].copy()

In [None]:
Xsimple_dataset[["Month", "DayOfWeek"]] = Xsimple_dataset[["Month", "DayOfWeek"]].astype("category")
categorical_columns = Xsimple_dataset.select_dtypes(include=["str", "category"]).columns
boolean_columns = Xsimple_dataset.select_dtypes(include=["bool"]).columns
numeric_columns = Xsimple_dataset.select_dtypes(include=["int64", "float64"]).columns
passthrough_columns = [col for col in Xsimple_dataset.columns if col not in categorical_columns and col not in boolean_columns and col not in numeric_columns]

numeric_transformer = Pipeline(
        steps = [("scaler", StandardScaler())]
    )

categorical_transformer = Pipeline([
        ("imputer", SimpleImputer(missing_values=pd.NA, strategy="constant", fill_value="unknown")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", drop="first"))
        ])
boolean_transformer = Pipeline([
        ("encoder", TargetEncoder(target_type="binary")),
        ])

preprocessor = ColumnTransformer(
        transformers=[
            ("cat", categorical_transformer, categorical_columns),
            ("num", numeric_transformer, numeric_columns),
            ("bool", boolean_transformer, boolean_columns)
        ]
    )
param_grid = {
    "classifier__n_estimators": [100, 300],
    "classifier__max_depth": [None, 10, 20],
    "classifier__min_samples_split": [2, 10, 30]
}


model = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(n_estimators=300, random_state=35))
    ])

X_train, X_test, y_train, y_test = train_test_split(Xsimple_dataset, Y_true, test_size=0.2, stratify=Y_true, random_state=124)
start_time = monotonic()
search = GridSearchCV(model, param_grid)
search.fit(X_train, y_train)
print("Time taken to fit the model: %.2f seconds" % (monotonic() - start_time))
print("Best parameters: ", search.best_params_)
print("Model score: %.3f" % search.score(X_test, y_test))

ValueError: 
All the 90 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "/home/florians/cours/CyberSecurity_ML/.venv/lib/python3.12/site-packages/pandas/core/indexes/base.py", line 3641, in get_loc
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "pandas/_libs/index.pyx", line 168, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 197, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 7668, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 7676, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Malware Indicators'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/florians/cours/CyberSecurity_ML/.venv/lib/python3.12/site-packages/sklearn/utils/_indexing.py", line 469, in _get_column_indices
    col_idx = all_columns.get_loc(col)
              ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/florians/cours/CyberSecurity_ML/.venv/lib/python3.12/site-packages/pandas/core/indexes/base.py", line 3648, in get_loc
    raise KeyError(key) from err
KeyError: 'Malware Indicators'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/florians/cours/CyberSecurity_ML/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 833, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/florians/cours/CyberSecurity_ML/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1336, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/florians/cours/CyberSecurity_ML/.venv/lib/python3.12/site-packages/sklearn/pipeline.py", line 613, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/florians/cours/CyberSecurity_ML/.venv/lib/python3.12/site-packages/sklearn/pipeline.py", line 547, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/florians/cours/CyberSecurity_ML/.venv/lib/python3.12/site-packages/joblib/memory.py", line 326, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/florians/cours/CyberSecurity_ML/.venv/lib/python3.12/site-packages/sklearn/pipeline.py", line 1484, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/florians/cours/CyberSecurity_ML/.venv/lib/python3.12/site-packages/sklearn/utils/_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/florians/cours/CyberSecurity_ML/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1336, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/florians/cours/CyberSecurity_ML/.venv/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py", line 991, in fit_transform
    self._validate_column_callables(X)
  File "/home/florians/cours/CyberSecurity_ML/.venv/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py", line 545, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/florians/cours/CyberSecurity_ML/.venv/lib/python3.12/site-packages/sklearn/utils/_indexing.py", line 477, in _get_column_indices
    raise ValueError("A given column is not a column of the dataframe") from e
ValueError: A given column is not a column of the dataframe


# Feature Selection
