In [1]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np
import sys
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, plot_confusion_matrix

from utils.dataset_processing.dataset_processing import DatasetPreprocess
from utils.ml_model_training.dp_ids_ml_builder import DPIDSBuilder

In [None]:
######################### Setup Paths, Parameters, Objects ##########################
#####################################################################################

############################### Setup Paths ###############################
# CSV dataset handed to split_dataset below (and re-read later for its column names).
dataset_path = "../../dataset/flow_preprocessed_datasets/merged_datasets/flow_preprocessed_merged_8_balanced_feature_num_20.csv"

# Base directory for the DP-IDS artifacts. It already ends with "/", so the
# sub-paths below are appended WITHOUT a leading slash (avoids "//" in the paths).
dp_ids_model_save_base_path = "../ml_models/dp_ids_models/"
# path of the serialized RF
rf_path = dp_ids_model_save_base_path + "rf_dp_ids_model.pkl"
# directory for the CV history
cv_dir = dp_ids_model_save_base_path + "cv_history/"
# directory for saving figures of trees
plot_save_path = dp_ids_model_save_base_path + "trees_plot/"

############################### Setup Builder ###############################
dataProc = DatasetPreprocess()
dp_ids_builder = DPIDSBuilder()
# NOTE(review): the unpacking order (train/test pairs first, validation pair last)
# must match what DatasetPreprocess.split_dataset returns — confirm against its
# definition. X_val / y_val are not used by any cell visible in this notebook.
X_train, X_test, y_train, y_test, X_val, y_val = dataProc.split_dataset(dataset_path)

In [None]:
######################## Search for the best hyperparameters ########################
#####################################################################################
# Parameter grid given to the builder: a single forest size (3) and depths 1..10.
cv_params = dict(
    n_estimators=[3],
    max_depth=range(1, 11),
)

# The value returned here is what plot_cv_results receives as cv_results_path.
cv_results_path = dp_ids_builder.get_best_estimator(
    cv_results_dir=cv_dir,
    params=cv_params,
    X_train=X_train,
    y_train=y_train,
)

# Figure name forwarded to plot_cv_results.
cv_fig_name = "f1_max_depth.png"
dp_ids_builder.plot_cv_results(
    cv_results_path=cv_results_path,
    params=cv_params,
    fig_name=cv_fig_name,
)

In [4]:
############################## Build the RF for DP-IDS ##############################
#####################################################################################
# Train on the training split with 3 estimators and max depth 5; rf_path is passed
# as the serialization target.
# NOTE(review): n_estimators / max_depth are hard-coded here — presumably picked
# from the CV search above; confirm they still match its outcome.
dp_ids_builder.train_rf(
    rf_serialization_path=rf_path,
    n_estimators=3,
    max_depth=5,
    X_train=X_train,
    y_train=y_train,
)

In [None]:
########################## Plot and save the RF for DP-IDS ##########################
#####################################################################################
# Get the features used for training this RF.
# Only the CSV header is needed, so read zero data rows (nrows=0) instead of
# parsing the whole dataset a second time.
dataset_columns = pd.read_csv(dataset_path, nrows=0).columns
# Every column except the last one is treated as a feature.
# NOTE(review): assumes the label is the last CSV column — confirm against the
# dataset layout used by split_dataset.
feature_list = list(dataset_columns[:-1])

dp_ids_builder.plot_trees(
    save_path=plot_save_path,
    rf_serialization_path=rf_path,
    features=feature_list,
)

In [None]:
######################### Test the trained RF (test dataset) ########################
#####################################################################################

# Load the forest back from rf_path so the evaluation uses the serialized model.
rf = dp_ids_builder.load_rf(rf_path)

y_predict = rf.predict(X_test)
print(classification_report(y_test, y_predict))

# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer ConfusionMatrixDisplay.from_estimator when the installed release has it,
# and fall back to the legacy helper on older releases.
# NOTE: the notebook's top-level `from sklearn.metrics import ... plot_confusion_matrix`
# still fails on scikit-learn >= 1.2 and has to be dropped there.
from sklearn.metrics import ConfusionMatrixDisplay

# NOTE(review): the label order assumes the classes sort as Benign, then Attack —
# confirm against the dataset's label encoding.
class_names = ["Benign", "Attack"]
if hasattr(ConfusionMatrixDisplay, "from_estimator"):
    ConfusionMatrixDisplay.from_estimator(rf, X_test, y_test, display_labels=class_names)
else:
    plot_confusion_matrix(rf, X_test, y_test, display_labels=class_names)