In [1]:
import pandas as pd
import numpy as np
import joblib
import warnings
import os
import glob
import re

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.utils import class_weight

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

# Suppress warnings
warnings.filterwarnings('ignore')
print("Libraries imported successfully.")

2025-11-18 10:39:45.191366: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Libraries imported successfully.


In [None]:
DATA_DIR = 'CIC-IDS2017/'

all_files = glob.glob(os.path.join(DATA_DIR, "*.csv"))

if not all_files:
    print(f"Error: No '.csv' files found in directory '{DATA_DIR}'.")
else:
    print(f"Found {len(all_files)} CSV files to load.")

    df_list = []

    for filename in all_files:
        try:
            df_temp = pd.read_csv(filename)
            df_list.append(df_temp)
            print(f"Loaded: {filename}")
            
        except Exception as e:
            print(f"Error loading {filename}: {e}")

    if df_list:
        df = pd.concat(df_list, ignore_index=True)
        print("\nAll files successfully loaded and combined.")
        print(f"Total rows: {len(df)}")
    else:
        print("No data was loaded.")

Found 8 CSV files to load.
Loaded: CIC-IDS2017/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Loaded: CIC-IDS2017/Monday-WorkingHours.pcap_ISCX.csv
Loaded: CIC-IDS2017/Friday-WorkingHours-Morning.pcap_ISCX.csv
Loaded: CIC-IDS2017/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
Loaded: CIC-IDS2017/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Loaded: CIC-IDS2017/Tuesday-WorkingHours.pcap_ISCX.csv
Loaded: CIC-IDS2017/Wednesday-workingHours.pcap_ISCX.csv
Error loading CIC-IDS2017/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv: 'utf-8' codec can't decode byte 0x96 in position 22398: invalid start byte

All files successfully loaded and combined.
Total rows: 2660377


In [None]:
df.columns = df.columns.str.strip()
print("Cleaned column names.")

print(f"Labels found: {df['Label'].unique()}")

df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)
print(f"Dropped NaN/Inf rows. Remaining rows: {len(df)}")

print("\n--- Original Class Distribution (Top 10) ---")
print(df['Label'].value_counts().head(10))

MAJORITY_LABEL = 'BENIGN'
MAX_MAJORITY_SAMPLES = 500000 # Cap BENIGN at 500k rows

df_majority = df[df['Label'] == MAJORITY_LABEL]
df_minority = df[df['Label'] != MAJORITY_LABEL]
print(f"\nOriginal majority ('{MAJORITY_LABEL}') rows: {len(df_majority)}")
print(f"Total minority (attack) rows: {len(df_minority)}")

df_majority_sampled = df_majority.sample(n=MAX_MAJORITY_SAMPLES, random_state=42)
print(f"Undersampled majority to: {len(df_majority_sampled)}")

df = pd.concat([df_majority_sampled, df_minority], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True) # Shuffle
print(f"New total rows after undersampling: {len(df)}")

X = df.drop(['Label', 'Flow ID', 'Source IP', 'Source Port', 
            'Destination IP', 'Destination Port', 'Timestamp'], 
           axis=1, errors='ignore') 

y = df['Label']
print(f"\nFeatures selected. Number of features: {X.shape[1]}")

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(label_encoder.classes_)
print(f"\nFound {num_classes} classes: {list(label_encoder.classes_)}")

X_train, X_test, y_train_encoded, y_test_encoded = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
print(f"\nTraining samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print("Features scaled successfully.")

X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
print(f"Data reshaped for Conv1D: {X_train.shape}")

y_train = to_categorical(y_train_encoded, num_classes=num_classes)
y_test = to_categorical(y_test_encoded, num_classes=num_classes)
print(f"Target labels one-hot encoded: {y_train.shape}")

joblib.dump(scaler, 'cic_ids_scaler.joblib')
joblib.dump(label_encoder, 'cic_ids_label_encoder.joblib')
print("Scaler and LabelEncoder saved.")

Cleaned column names.
Labels found: ['BENIGN' 'Infiltration' 'Bot' 'PortScan' 'DDoS' 'FTP-Patator'
 'SSH-Patator' 'DoS slowloris' 'DoS Slowhttptest' 'DoS Hulk'
 'DoS GoldenEye' 'Heartbleed']
Dropped NaN/Inf rows. Remaining rows: 2657645

--- Original Class Distribution (Top 10) ---
Label
BENIGN              2103269
DoS Hulk             230124
PortScan             158804
DDoS                 128025
DoS GoldenEye         10293
FTP-Patator            7935
SSH-Patator            5897
DoS slowloris          5796
DoS Slowhttptest       5499
Bot                    1956
Name: count, dtype: int64

Original majority ('BENIGN') rows: 2103269
Total minority (attack) rows: 554376
Undersampled majority to: 500000
New total rows after undersampling: 1054376

Features selected. Number of features: 78

Found 12 classes: ['BENIGN', 'Bot', 'DDoS', 'DoS GoldenEye', 'DoS Hulk', 'DoS Slowhttptest', 'DoS slowloris', 'FTP-Patator', 'Heartbleed', 'Infiltration', 'PortScan', 'SSH-Patator']

Training samples: 84

In [None]:
n_features = X_train.shape[1]
model = Sequential()

model.add(Conv1D(
    filters=128, 
    kernel_size=3, 
    activation='relu', 
    padding='same',
    input_shape=(n_features, 1)
))
model.add(Conv1D(
    filters=128, 
    kernel_size=3, 
    padding='same',
    activation='relu'
))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

model.add(Conv1D(
    filters=256, 
    kernel_size=3, 
    padding='same',
    activation='relu'
))
model.add(Conv1D(
    filters=256, 
    kernel_size=3, 
    padding='same',
    activation='relu'
))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

model.summary()

In [None]:
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

print("Model compiled with Adam (lr=0.0001).")

Model compiled with Adam (lr=0.0001).


In [None]:
# Calculate class weights to handle remaining imbalance
class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(y_train_encoded),
    y=y_train_encoded
)
class_weight_dict = dict(enumerate(class_weights))
print("Class weights calculated.")

early_stopping = EarlyStopping(
    monitor='val_loss', 
    patience=10, 
    restore_best_weights=True
)
print("EarlyStopping callback configured.")

print("Starting model training...")
history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=128
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
    class_weight=class_weight_dict
)

print("Model training complete.")

Class weights calculated.
EarlyStopping callback configured.
Starting model training...
Epoch 1/100
[1m6590/6590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m971s[0m 147ms/step - accuracy: 0.5587 - loss: 3.3231 - val_accuracy: 0.7695 - val_loss: 0.6696
Epoch 2/100
[1m6590/6590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m927s[0m 141ms/step - accuracy: 0.7715 - loss: 0.8869 - val_accuracy: 0.8221 - val_loss: 0.5508
Epoch 3/100
[1m6590/6590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m920s[0m 140ms/step - accuracy: 0.8140 - loss: 0.7154 - val_accuracy: 0.8558 - val_loss: 0.4228
Epoch 4/100
[1m6590/6590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m975s[0m 148ms/step - accuracy: 0.8419 - loss: 0.3999 - val_accuracy: 0.8500 - val_loss: 0.4868
Epoch 5/100
[1m6590/6590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m983s[0m 149ms/step - accuracy: 0.8547 - loss: 0.3468 - val_accuracy: 0.8684 - val_loss: 0.4014
Epoch 6/100
[1m6590/6590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

In [None]:
y_pred_probs = model.predict(X_test)
y_pred_encoded = np.argmax(y_pred_probs, axis=1)
target_names = label_encoder.classes_

print("--- Classification Report ---")
print(classification_report(y_test_encoded, y_pred_encoded, target_names=target_names))

print("\n--- Confusion Matrix ---")
cm = confusion_matrix(y_test_encoded, y_pred_encoded)
print(cm)

accuracy = accuracy_score(y_test_encoded, y_pred_encoded)
print(f"\nOverall Test Accuracy: {accuracy * 100:.2f}%")

[1m6590/6590[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 12ms/step
--- Classification Report ---
                  precision    recall  f1-score   support

          BENIGN       1.00      0.81      0.90    100000
             Bot       0.08      1.00      0.15       391
            DDoS       0.92      1.00      0.96     25605
   DoS GoldenEye       0.74      1.00      0.85      2059
        DoS Hulk       0.92      1.00      0.96     46025
DoS Slowhttptest       0.58      0.99      0.73      1100
   DoS slowloris       0.64      0.99      0.78      1159
     FTP-Patator       0.88      1.00      0.93      1587
      Heartbleed       0.04      1.00      0.07         2
    Infiltration       0.01      0.71      0.02         7
        PortScan       0.95      1.00      0.97     31761
     SSH-Patator       0.22      0.99      0.36      1180

        accuracy                           0.91    210876
       macro avg       0.58      0.96      0.64    210876
    weighted avg   

In [None]:
model.save('ids_model.h5')
print("\nFinal CIC-IDS2017 CNN model saved to 'ids_model.h5'")




Final CIC-IDS2017 CNN model saved to 'ids_model.h5'
