In this first example we will create a RNN whose task is to predict if there is an intrusion in a sequence of events.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense, Dropout
import numpy as np
from sklearn.impute import SimpleImputer
import os
# Load the dataset from folder ../Dataset and merge all CSV files

files = [file for file in os.listdir('../Dataset') if file.endswith('.csv')] 
data = pd.concat(
    [pd.read_csv(f'../Dataset/{file}') for file in files]
)




2025-07-22 19:15:53.720352: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-22 19:15:53.747108: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-22 19:15:53.937064: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-22 19:15:54.114730: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753204554.326585   44241 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753204554.38

Now we will analyse a bit the dataset we are going to use. The dataset is a csv file that contains network traffic data, with each row representing a network connection and its features.
The dataset has also a label that indicates if the connection is normal or if it is an intrusion. The label is in the last column of the dataset.

In [3]:
data[" Label"].unique()

array(['BENIGN', 'FTP-Patator', 'SSH-Patator', 'DDoS',
       'Web Attack � Brute Force', 'Web Attack � XSS',
       'Web Attack � Sql Injection', 'PortScan', 'DoS slowloris',
       'DoS Slowhttptest', 'DoS Hulk', 'DoS GoldenEye', 'Heartbleed',
       'Infiltration', 'Bot'], dtype=object)

We have to clean the dataset due to the presence of NaN values and infinite values. We will use a strategy that replaces infinite values with NaN and then imputes the NaN values with the median of the column.
We will use the `SimpleImputer` class from the `sklearn.impute` module

In [None]:

feature_columns = data.drop(columns=[" Label"]).columns
X_temp = data[feature_columns]
y_temp = data[" Label"]

print(f"Shape features: {X_temp.shape}")
print(f"Shape label: {y_temp.shape}")
print(f"data types in features: {X_temp.dtypes.value_counts()}")

X_temp = X_temp.replace([np.inf, -np.inf], np.nan)

print(f"NaN in features: {X_temp.isnull().sum().sum()}")

imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(
    imputer.fit_transform(X_temp), 
    columns=X_temp.columns, 
    index=X_temp.index
)

print(f"NaN after imputing: {X_imputed.isnull().sum().sum()}")

data_clean = X_imputed
data_clean[" Label"] = y_temp  

data_clean[" Label"] = data_clean[" Label"].replace({
    "BENIGN": 0,
    "FTP-Patator": 1,
    "SSH-Patator": 2,
    "DDoS": 3,
    "Web Attack � Brute Force": 4,
    "Web Attack � XSS": 5,
    "Web Attack � Sql Injection": 6,
    "PortScan": 7,
    "DoS slowloris": 8,
    "DoS Slowhttptest": 9,
    "DoS Hulk": 10,
    "DoS GoldenEye": 11,
    "Heartbleed": 12,
    "Infiltration": 13,
    "Bot": 14
})

data = data_clean.copy()

print(f"Final shape: {data.shape}")
print("Unique labels after conversion:", data[" Label"].unique())

NameError: name 'data' is not defined

In [None]:
print("Class distribution:")
print(data[" Label"].value_counts())
print(f"\nPercentage of class BENIGN : {(data[' Label'] == 0).mean():.4f}")
print(f"Percentage of class Attack: {(data[' Label'] != 0).mean():.4f}")
print(f"\nShape of the dataset: {data.shape}")
print(f"Number of features: {data.shape[1] - 1}")  # -1 to exclude the label

Distribuzione delle classi:


NameError: name 'data' is not defined

Now we split the dataset into training and testing sets. and then we will create a RNN model.

In [6]:
X = data.drop(columns=[" Label"])
y = data[" Label"]
num_classes = len(y.unique())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
model = Sequential()
model.add(SimpleRNN(64, activation='relu', input_shape=(1, X_train.shape[2])))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))
# Compile the model
model.compile(
    optimizer='adam', 
    loss='sparse_categorical_crossentropy', 
    metrics=['accuracy']
)
# Train the model
model.fit(X_train, y_train,
            epochs=10,
            batch_size=32,
            validation_split=0.2
)
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)

# Save the model
model.save('intrusion_detection_rnn.keras')

  super().__init__(**kwargs)


Epoch 1/10
[1m56615/56615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 2ms/step - accuracy: 0.9534 - loss: 0.1449 - val_accuracy: 0.9755 - val_loss: 0.0588
Epoch 2/10
[1m56615/56615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 2ms/step - accuracy: 0.9534 - loss: 0.1449 - val_accuracy: 0.9755 - val_loss: 0.0588
Epoch 2/10
[1m56615/56615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 2ms/step - accuracy: 0.9707 - loss: 0.0745 - val_accuracy: 0.9774 - val_loss: 0.0555
Epoch 3/10
[1m56615/56615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 2ms/step - accuracy: 0.9707 - loss: 0.0745 - val_accuracy: 0.9774 - val_loss: 0.0555
Epoch 3/10
[1m56615/56615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 2ms/step - accuracy: 0.9734 - loss: 0.0689 - val_accuracy: 0.9808 - val_loss: 0.0531
Epoch 4/10
[1m56615/56615[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 2ms/step - accuracy: 0.9734 - loss: 0.0689 - val_accuracy: 0.9808 - val_loss: