In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import glob
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

2025-05-21 22:11:31.977553: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-21 22:11:31.978113: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-21 22:11:31.980592: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-21 22:11:31.986397: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747845691.996293   20279 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747845691.99

In [2]:
features = [
    'Protocol', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets',
    'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean',
    'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean',
    'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
    'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length',
    'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
    'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count',
    'ECE Flag Count', 'Down/Up Ratio', 'Average Packet Size', 'Avg Fwd Segment Size', 'Avg Bwd Segment Size',
    'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes',
    'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'act_data_pkt_fwd', 'min_seg_size_forward'
]

In [3]:
def load_parquet_files(file_list):
    dfs = []
    for f in file_list:
        df = pd.read_parquet(f)
        print(f"[INFO] Loaded {f} with shape: {df.shape}")
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

# === Load training and test data ===
train_files = sorted(glob.glob(r'/home/garv/Desktop/Cyber-Security/archive (1)/*training.parquet'))
test_files = sorted(glob.glob(r'/home/garv/Desktop/Cyber-Security/archive (1)/*testing.parquet'))

train_df = load_parquet_files(train_files)
test_df = load_parquet_files(test_files)
print("[INFO] Loading training data...")
train_df = load_parquet_files(train_files)

print("[INFO] Loading test data...")
test_df = load_parquet_files(test_files)

[INFO] Loaded /home/garv/Desktop/Cyber-Security/archive (1)/LDAP-training.parquet with shape: (6715, 78)
[INFO] Loaded /home/garv/Desktop/Cyber-Security/archive (1)/MSSQL-training.parquet with shape: (10974, 78)
[INFO] Loaded /home/garv/Desktop/Cyber-Security/archive (1)/NetBIOS-training.parquet with shape: (1631, 78)
[INFO] Loaded /home/garv/Desktop/Cyber-Security/archive (1)/Portmap-training.parquet with shape: (5105, 78)
[INFO] Loaded /home/garv/Desktop/Cyber-Security/archive (1)/Syn-training.parquet with shape: (70336, 78)
[INFO] Loaded /home/garv/Desktop/Cyber-Security/archive (1)/UDP-training.parquet with shape: (17770, 78)
[INFO] Loaded /home/garv/Desktop/Cyber-Security/archive (1)/UDPLag-training.parquet with shape: (12639, 78)
[INFO] Loaded /home/garv/Desktop/Cyber-Security/archive (1)/DNS-testing.parquet with shape: (6703, 78)
[INFO] Loaded /home/garv/Desktop/Cyber-Security/archive (1)/LDAP-testing.parquet with shape: (2831, 78)
[INFO] Loaded /home/garv/Desktop/Cyber-Security

In [4]:
train_df.columns = train_df.columns.str.strip()
test_df.columns = test_df.columns.str.strip()

In [5]:
common_features = [f for f in features if f in train_df.columns and f in test_df.columns]
train_df = train_df[common_features + ['Label']]
test_df = test_df[common_features + ['Label']]
print(f"[INFO] Training data shape after filtering: {train_df.shape}")
print(f"[INFO] Test data shape after filtering: {test_df.shape}")
print(f"[INFO] Using {len(common_features)} features.")

[INFO] Training data shape after filtering: (125170, 41)
[INFO] Test data shape after filtering: (306201, 41)
[INFO] Using 40 features.


In [11]:
print(train_df.columns)


Index(['Protocol', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
       'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s',
       'Bwd Packets/s', 'Packet Length Mean', 'Packet Length Std',
       'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count',
       'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count',
       'CWE Flag Count', 'ECE Flag Count', 'Down/Up Ratio',
       'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Subflow Fwd Packets',
       'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes',
       'Label'],
      dtype='object')


In [13]:
print("Feature columns:", X_train.columns.tolist())


Feature columns: ['Protocol', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count', 'Down/Up Ratio', 'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes']


In [35]:
train_df.replace([np.inf, -np.inf], 0, inplace=True)
test_df.replace([np.inf, -np.inf], 0, inplace=True)
train_df.fillna(0, inplace=True)
test_df.fillna(0, inplace=True)

In [36]:
# === Binary Label Encoding (Robust) ===
train_df['Label'] = train_df['Label'].astype(str).str.strip().str.lower().apply(lambda x: 0 if x == 'benign' else 1)
test_df['Label'] = test_df['Label'].astype(str).str.strip().str.lower().apply(lambda x: 0 if x == 'benign' else 1)


In [37]:
X_train = train_df[common_features].astype(np.float32)
y_train = train_df['Label'].astype(np.int32)

X_test = test_df[common_features].astype(np.float32)
y_test = test_df['Label'].astype(np.int32)

print(f"[INFO] X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"[INFO] X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

[INFO] X_train shape: (125170, 40), y_train shape: (125170,)
[INFO] X_test shape: (306201, 40), y_test shape: (306201,)


In [38]:
model = Sequential([
    Input(shape=(X_train.shape[1],)),  # Explicit input layer

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(1, activation='sigmoid')
])

In [39]:
model.compile(optimizer=Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [40]:
print("[INFO] Starting training...")
model.fit(X_train_scaled, y_train, epochs=15, batch_size=128, validation_split=0.2, verbose=1)

[INFO] Starting training...
Epoch 1/15
[1m783/783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9760 - loss: 0.0814 - val_accuracy: 0.9942 - val_loss: 0.0195
Epoch 2/15
[1m783/783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9860 - loss: 0.0461 - val_accuracy: 0.9942 - val_loss: 0.0204
Epoch 3/15
[1m783/783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9871 - loss: 0.0406 - val_accuracy: 0.9962 - val_loss: 0.0114
Epoch 4/15
[1m783/783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9902 - loss: 0.0314 - val_accuracy: 0.9975 - val_loss: 0.0097
Epoch 5/15
[1m783/783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9940 - loss: 0.0225 - val_accuracy: 0.9985 - val_loss: 0.0061
Epoch 6/15
[1m783/783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9952 - loss: 0.0185 - val_accuracy: 0.9965 - val_loss: 0.0130


<keras.src.callbacks.history.History at 0x2dc8f1d5300>

In [41]:
print("[INFO] Evaluating model...")
y_pred = (model.predict(X_test_scaled) > 0.5).astype("int32")

print("\n[RESULT] Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n[RESULT] Classification Report:\n", classification_report(y_test, y_pred))

[INFO] Evaluating model...
[1m9569/9569[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 611us/step

[RESULT] Confusion Matrix:
 [[ 51337     67]
 [ 10660 244137]]

[RESULT] Classification Report:
               precision    recall  f1-score   support

           0       0.83      1.00      0.91     51404
           1       1.00      0.96      0.98    254797

    accuracy                           0.96    306201
   macro avg       0.91      0.98      0.94    306201
weighted avg       0.97      0.96      0.97    306201



In [43]:
model.save("ddos_detection_model.h5")
model.save("ddos_detection_model01.keras")
import joblib
joblib.dump(scaler, "scaler.pkl")

print("[INFO] Model and scaler saved.")



[INFO] Model and scaler saved.
