In [33]:
import pandas as pd
import numpy as np

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score,recall_score, f1_score,roc_auc_score
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import callbacks,layers


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import time

In [34]:
def splits_assemble(path, test_size=0.3):
    # Import dataset
    df_dataset = pd.read_csv(path)

    # Encode Labels for numeric classification
    label_encoder = LabelEncoder()
    df_dataset['Label'] = label_encoder.fit_transform(df_dataset['Label'])

    # Set a random state for sampling
    RANDOM_STATE_SEED = np.random.randint(123)
    print(f'Random Seed:{RANDOM_STATE_SEED}')
    
    # Split dataset in train and test
    train, test = train_test_split(df_dataset, test_size=test_size, random_state=RANDOM_STATE_SEED)
    
    # Count how many instances there are in each label
    print(df_dataset["Label"].value_counts())

    # Separate in X and y for better classification
    y_train = np.array(train.pop("Label"))# pop removes "Label" from the dataframe
    X_train = train.values

    print(f'Tipo X_train: {type(X_train)} Tipo y_train: {type(y_train)} Shape X_train:{X_train.shape} Shape y_train: {y_train.shape}')

    y_test = np.array(test.pop("Label")) # pop removes "Label" from the dataframe
    X_test = test.values

    print(f'Tipo X_test: {type(X_test)} Tipo y_test: {type(y_test)} Shape X_test:{X_test.shape} Shape y_test: {y_test.shape}')
    
    return X_train,y_train,X_test,y_test

In [35]:
def models_assemble(X_train):
    models = {}

    models['DT'] = DecisionTreeClassifier()
    models['RF'] = RandomForestClassifier()
    models['SVM'] = LinearSVC(max_iter=10000, dual=False,)
    models['KNN'] = KNeighborsClassifier()
    models['NB'] = GaussianNB()
    models['XGB'] = xgb.XGBClassifier()
    models['NN'] = keras.Sequential([
            layers.InputLayer(input_shape=(X_train.shape[1],)),
            
            layers.BatchNormalization(renorm=True),
            layers.Dense(128, activation='relu'),
            layers.Dropout(rate = 0.3),
            layers.BatchNormalization(renorm=True),
            layers.Dense(64, activation='relu'),
            layers.Dropout(rate = 0.3),
            layers.BatchNormalization(renorm=True),
            layers.Dense(32, activation='relu'),
            layers.Dropout(rate = 0.3),
            layers.Dense(1, activation='sigmoid'),
        ])
    return models

In [36]:
def grids_assemble(cv=2, model=None,key=None):
    
    hyperparameters = {}
    hyperparameters['XGB'] = {'learning_rate': [0.9, 0.7, 0.5, 0.3, 0.1], 'n_estimators': [50,100,150,200],
                              }
    hyperparameters['DT'] = {'criterion': ['gini','entropy'], 'max_depth': [10,15,20,25,30],'splitter':['best','random']
                             }
    hyperparameters['RF'] = {'n_estimators': [50, 75, 100, 125, 150], 'criterion': ['gini','entropy'],'max_depth': [25,30]
                             }
    hyperparameters['SVM'] = { 'C': np.linspace(0.01,100, num=20)
                              }
    hyperparameters['NB'] = {'var_smoothing': np.logspace(0,-9, num=20)
                             }
    hyperparameters['KNN'] = {'n_neighbors': [8,9,10,11,12], 'weights': ['uniform','distance'], 'leaf_size': [10,100]
                            }
    hyperparameters['NN'] = {'epochs': [10,20], 'batch_size': [32,64,128,256,512], 'epsilon': [0.01,0.1]
                            }

    classifierGRID = GridSearchCV(
        estimator = model,
        param_grid = hyperparameters[key],
        cv=cv,
        verbose=1,
        n_jobs=-1  # Use all available CPU cores
    )
    return classifierGRID

In [37]:
def fit_assemble(classifierGRID,X_train,y_train):
    classifierGRID.fit(X=X_train, y=y_train)

    # Print best parameters found on GridsearchCV
    print("Accuracy score on Validation set: \n")
    print(classifierGRID.best_score_ )
    print("---------------")
    print("Best performing hyperparameters on Validation set: ")
    print(classifierGRID.best_estimator_)
    print("---------------")
    


    fitted_model = classifierGRID.best_estimator_
    
    return fitted_model

In [38]:
def NeuralNet_fit(neuralNetModel,X_train,y_train,epochs=10):
    # %%
    
    optimizer = tf.keras.optimizers.Adam(epsilon=0.01)


    early_stopping = callbacks.EarlyStopping(
        min_delta = 0.001,
        patience = 5,
        restore_best_weights = True,
        monitor= 'loss'
    )


    neuralNetModel.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['binary_accuracy'],
    )

    history = neuralNetModel.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size = 256,
        callbacks=[early_stopping]

    )

    return neuralNetModel


In [39]:
def metrics_assemble(fitted_model,X_test,y_test):
    
    predictions = fitted_model.predict(X_test)

    if predictions.dtype == 'float32':
        predictions = (fitted_model.predict(X_test) > 0.5).astype("int32")

    
    accuracy = accuracy_score(y_test,predictions)
    precision = precision_score(y_test,predictions)
    recall = recall_score(y_test,predictions)
    f1= f1_score(y_test,predictions)
    auc= roc_auc_score(y_test,predictions)

    return accuracy, precision, recall, f1, auc

In [40]:
# Define path to the dataset
path = '/home/mintssj/Downloads/datasets/filtered/NSL_KDD_equal.csv'

# Start dictionaries to store metrics
accuracy, precision, recall, f1, auc, fit_time, eval_time, fitted_models = {}, {}, {}, {}, {}, {}, {}, {} 

# Get dataset splits for tranning and evaluation
X_train,y_train,X_test,y_test = splits_assemble(path, test_size=0.3)

# Start dictionary for models
models=models_assemble(X_train)

# # Start iteration loop for fitting and evaluating the classic models
# for key in ['DT', 'RF', 'XGB', 'KNN', 'SVM', 'NB']:
    
#     # Get the grid of hyperparameters and start the GridsearchCV function
#     classifierGRID = grids_assemble(cv=5, model=models[key],key=key)

#     print(f'Fitting {key} model')

#     # Fit model
#     start_time = time.time()
#     fitted_models[key] = fit_assemble(classifierGRID,X_train,y_train)
#     end_time = time.time()
#     fit_time[key] = end_time - start_time

#     # Evaluate model
#     start_time = time.time()
#     accuracy[key], precision[key], recall[key], f1[key], auc[key] = metrics_assemble(fitted_models[key],X_test,y_test)
#     end_time = time.time()
#     eval_time[key] = end_time - start_time

# Fit the neural network model
start_time = time.time()
neuralNetModel = NeuralNet_fit(models['NN'],X_train,y_train,epochs=100)
end_time = time.time()
fit_time['NN'] = end_time - start_time

# Evaluate the neural network model
start_time = time.time()
accuracy['NN'], precision['NN'], recall['NN'], f1['NN'], auc['NN'] = metrics_assemble(neuralNetModel,X_test,y_test)
end_time = time.time()
eval_time['NN'] = end_time - start_time

# Convert the metrics dictionaries into a dataframe for better visualization
metrics = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1-Score':f1, 'ROC-AUC-Score': auc, \
           'Tempo':fit_time, 'Evaluation Time':eval_time}
df_metrics = pd.DataFrame(metrics)
display(df_metrics)  


Random Seed:117
Label
1    67297
0    58026
Name: count, dtype: int64
Tipo X_train: <class 'numpy.ndarray'> Tipo y_train: <class 'numpy.ndarray'> Shape X_train:(87726, 38) Shape y_train: (87726,)
Tipo X_test: <class 'numpy.ndarray'> Tipo y_test: <class 'numpy.ndarray'> Shape X_test:(37597, 38) Shape y_test: (37597,)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100


Unnamed: 0,Accuracy,Precision,Recall,F1-Score,ROC-AUC-Score,Tempo,Evaluation Time
NN,0.995186,0.994919,0.996147,0.995533,0.995106,83.257947,4.445962
