In this first example we will create a RNN whose task is to predict if there is an intrusion in a sequence of events.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense, Dropout
import numpy as np
from sklearn.impute import SimpleImputer
import tensorflow as tf
import os
# Load the dataset from folder ../Dataset and merge all CSV files

files = [file for file in os.listdir('../Dataset') if file.endswith('.csv')] 
data = pd.concat(
    [pd.read_csv(f'../Dataset/{file}') for file in files]
)




2025-07-06 15:50:01.593305: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-06 15:50:01.611239: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-06 15:50:01.737294: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-06 15:50:01.836750: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751809801.933363   27115 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751809801.96

Now we will analyse a bit the dataset we are going to use. The dataset is a csv file that contains network traffic data, with each row representing a network connection and its features.
The dataset has also a label that indicates if the connection is normal or if it is an intrusion. The label is in the last column of the dataset.

In [3]:
data[" Label"].unique()

array(['BENIGN', 'FTP-Patator', 'SSH-Patator', 'DDoS',
       'Web Attack � Brute Force', 'Web Attack � XSS',
       'Web Attack � Sql Injection', 'PortScan', 'DoS slowloris',
       'DoS Slowhttptest', 'DoS Hulk', 'DoS GoldenEye', 'Heartbleed',
       'Infiltration', 'Bot'], dtype=object)

We have to clean the dataset due to the presence of NaN values and infinite values. We will use a strategy that replaces infinite values with NaN and then imputes the NaN values with the median of the column.
We will use the `SimpleImputer` class from the `sklearn.impute` module

In [None]:
feature_columns = data.drop(columns=[" Label"]).columns
data[feature_columns] = data[feature_columns].replace([np.inf, -np.inf], np.nan)

X_temp = data.drop(columns=[" Label"])
y_temp = data[" Label"]

imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(
    imputer.fit_transform(X_temp), 
    columns=X_temp.columns, 
    index=X_temp.index
)

data = X_imputed
data[" Label"] = y_temp

data = data.drop(columns=[" Label"])
data[" Label"] = data[" Label"].replace({
    "BENIGN": 0,
    "FTP-Patator": 1,
    "SSH-Patator": 2,
    "DDoS": 3,
    "Web Attack � Brute Force": 4,
    "Web Attack � XSS": 5,
    "Web Attack � Sql Injection": 6,
    "PortScan": 7,
    "DoS slowloris": 8,
    "DoS Slowhttptest": 9,
    "DoS Hulk": 10,
    "DoS GoldenEye": 11,
    "Heartbleed": 12,
    "Infiltration": 13,
    "Bot": 14
})

NameError: name 'data' is not defined

In [None]:
# Analizziamo il bilanciamento delle classi
print("Distribuzione delle classi:")
print(data[" Label"].value_counts())
print(f"\nPercentuale di classe 0 (attacchi): {(data[' Label'] == 0).mean():.4f}")
print(f"Percentuale di classe 1 (BENIGN): {(data[' Label'] == 1).mean():.4f}")
print(f"\nShape del dataset: {data.shape}")
print(f"Numero di features: {data.shape[1] - 1}")  # -1 per escludere la label

Distribuzione delle classi:
 Label
1    128027
0     97718
Name: count, dtype: int64

Percentuale di classe 0 (attacchi): 0.4329
Percentuale di classe 1 (BENIGN): 0.5671

Shape del dataset: (225745, 79)
Numero di features: 78


Now we split the dataset into training and testing sets.

In [None]:
X = data.drop(columns=[" Label"])
y = data[" Label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
model = Sequential()
model.add(SimpleRNN(50, activation='relu', input_shape=(X_train.shape[
1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train the model
model.fit(X_train, y_train,
            epochs=2,
            batch_size=64,
            validation_split=0.2
)
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)

# Save the model
model.save('intrusion_detection_rnn.keras')

Epoch 1/2


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.8636 - loss: 0.9822 - val_accuracy: 0.8698 - val_loss: 0.4729
Epoch 2/2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.8636 - loss: 0.9822 - val_accuracy: 0.8698 - val_loss: 0.4729
Epoch 2/2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step - accuracy: 0.8182 - loss: 1.0819 - val_accuracy: 0.8899 - val_loss: 0.4577
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step - accuracy: 0.8182 - loss: 1.0819 - val_accuracy: 0.8899 - val_loss: 0.4577
[1m6984/6984[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - accuracy: 0.8833 - loss: 0.4982
[1m6984/6984[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - accuracy: 0.8833 - loss: 0.4982
Test Loss: 0.49922066926956177, Test Accuracy: 0.8826693296432495
Test Loss: 0.49922066926956177, Test Accuracy: 0.8826693296432495
