In [2]:
import pandas as pd
import numpy as np
import glob
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import joblib
from xgboost import XGBClassifier
import xgboost
from collections import Counter
import time


In [3]:
dataset_folder = r"D:\AI Based Cybersecurity threat detection\AI-Based-NIDS-using-RF-and-IF\cicids2017_dataset"
csv_files = glob.glob(dataset_folder + "/*.csv")

dataframes = []
loaded_files = set()

# Ensure each file is only loaded once
for file in csv_files:
    if file not in loaded_files:
        dataframes.append(pd.read_csv(file))
        loaded_files.add(file)

df = pd.concat(dataframes, ignore_index=True) if dataframes else pd.DataFrame()

In [4]:
df.columns = df.columns.str.strip()


In [5]:
columns_to_drop = ['Timestamp', 'Flow ID', 'Source IP', 'Destination IP', 'Source Port', 'Destination Port', 'Protocol']
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns], errors='ignore')

In [6]:
category_mapping = {
    'BENIGN': 'BENIGN',
    'Bot': 'BOTNET',
    'DDoS': 'DOS',
    'DoS GoldenEye': 'DOS',
    'DoS Hulk': 'DOS',
    'DoS Slowhttptest': 'DOS',
    'DoS slowloris': 'DOS',
    'FTP-Patator': 'BRUTE_FORCE',
    'SSH-Patator': 'BRUTE_FORCE',
    'Heartbleed': 'WEB_ATTACK',
    'Infiltration': 'WEB_ATTACK',
    'PortScan': 'RECONNAISSANCE',
    'Web Attack – Brute Force': 'WEB_ATTACK',
    'Web Attack – Sql Injection': 'WEB_ATTACK',
    'Web Attack – XSS': 'WEB_ATTACK'
}

df['Label'] = df['Label'].map(category_mapping)

# 🔥 Convert into "BENIGN" or "MALICIOUS"
malice_mapping = {
   'BENIGN': 'BENIGN', 
    'BOTNET': 'MALICIOUS',
    'DOS': 'MALICIOUS',
    'BRUTE_FORCE': 'MALICIOUS',
    'WEB_ATTACK': 'MALICIOUS',
    'RECONNAISSANCE': 'MALICIOUS'
}

df['Label'] = df['Label'].map(malice_mapping)

# 🔥 Print label counts to check balance
print(df['Label'].value_counts())

Label
BENIGN       2273097
MALICIOUS     555466
Name: count, dtype: int64


In [7]:
df = df.drop_duplicates()

In [8]:
numeric_cols = df.select_dtypes(include=['number']).columns
df[numeric_cols] = df[numeric_cols].apply(lambda x: x.fillna(x.median()))
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

In [9]:
X=df.drop(columns='Label')
y=df['Label']
print (X)

         Flow Duration  Total Fwd Packets  Total Backward Packets  \
0                    3                  2                       0   
1                  109                  1                       1   
2                   52                  1                       1   
3                   34                  1                       1   
4                    3                  2                       0   
...                ...                ...                     ...   
2830738          32215                  4                       2   
2830739            324                  2                       2   
2830740             82                  2                       1   
2830741        1048635                  6                       2   
2830742          94939                  4                       2   

         Total Length of Fwd Packets  Total Length of Bwd Packets  \
0                                 12                            0   
1                                

In [10]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(y)  # Convert to 0 & 1

print(le.classes_)  # This prints: ['BENIGN' 'MALICIOUS']

['BENIGN' 'MALICIOUS']


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [12]:
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print(Counter(y_train))

Counter({0: 1517337, 1: 267479})


In [13]:
print(Counter(y_train_resampled))

Counter({0: 1517337, 1: 1517337})


In [14]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

In [15]:
selector = SelectKBest(score_func=f_classif, k=20)
X_train_selected = selector.fit_transform(X_train_scaled, y_train_resampled)
X_test_selected = selector.transform(X_test_scaled)

  f = msb / msw


In [16]:
print("Original Train Shape:", X_train.shape, y_train.shape)
print("Resampled Train Shape:", X_train_resampled.shape, y_train_resampled.shape)
print("Selected Features Shape (Train):", X_train_selected.shape)
print("Selected Features Shape (Test):", X_test_selected.shape)

Original Train Shape: (1784816, 77) (1784816,)
Resampled Train Shape: (3034674, 77) (3034674,)
Selected Features Shape (Train): (3034674, 20)
Selected Features Shape (Test): (446205, 20)


In [17]:
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = df.columns[selected_feature_indices]  # Extract feature names

# Print the selected feature names
print("Selected Features:", selected_feature_names)

Selected Features: Index(['Flow Duration', 'Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow IAT Std',
       'Flow IAT Max', 'Fwd IAT Total', 'Fwd IAT Std', 'Fwd IAT Max',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'Average Packet Size',
       'Avg Bwd Segment Size', 'Idle Mean', 'Idle Max', 'Idle Min'],
      dtype='object')


In [18]:
from sklearn.pipeline import Pipeline
start_fit = time.time()

rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train_selected, y_train_resampled)
end_fit = time.time()
print(f"Training time: {end_fit - start_fit:.2f} seconds")
rf_fit_t = end_fit - start_fit 

start_pred = time.time()
y_pred_rf = rf_model.predict(X_test_selected)
end_pred = time.time()
print(f"Prediction time: {end_pred - start_pred:.2f} seconds")
rf_pred_t = end_pred - start_pred

Training time: 4442.91 seconds
Prediction time: 12.56 seconds


In [21]:
print(classification_report(y_test, y_pred_rf, digits=4))
rf_report = classification_report(y_test, y_pred_rf, digits=4, output_dict=True)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0     0.9986    0.9976    0.9981    379335
           1     0.9867    0.9923    0.9895     66870

    accuracy                         0.9968    446205
   macro avg     0.9927    0.9950    0.9938    446205
weighted avg     0.9968    0.9968    0.9968    446205

Random Forest Accuracy: 0.9968332941136921


In [22]:
from sklearn.metrics import confusion_matrix

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred_rf)

# Extract values
TN, FP, FN, TP = cm.ravel()

print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")

False Positives (FP): 896
False Negatives (FN): 517


In [23]:
import os
save_dir = r"D:\AI_Based_Cybersecurity_Threat_Detection\jupyter_models"  # Change this to your path
os.makedirs(save_dir, exist_ok=True)  # Create directory if it doesn’t exist
model_path = os.path.join(save_dir, "rf_final_try1.pkl")
joblib.dump(rf_model, model_path)
print(f"Model saved at: {model_path}")

Model saved at: D:\AI_Based_Cybersecurity_Threat_Detection\jupyter_models\rf_final_try1.pkl


In [26]:
# Train Isolation Forest (for anomaly detection)
if_model = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)
if_model.fit(X_train_selected)

# Predict Anomalies (-1 for anomaly, 1 for normal)
y_pred_if = if_model.predict(X_test_selected)

# Convert IF outputs to match classification labels
y_pred_if = np.where(y_pred_if == -1, 0, 1)

In [27]:
print(classification_report(y_test, y_pred_if, digits=4))
if_report = classification_report(y_test, y_pred_if, digits=4, output_dict=True)
print("Isolation Forest Accuracy:", accuracy_score(y_test, y_pred_if))

              precision    recall  f1-score   support

           0     0.7710    0.0748    0.1364    379335
           1     0.1427    0.8739    0.2454     66870

    accuracy                         0.1946    446205
   macro avg     0.4569    0.4744    0.1909    446205
weighted avg     0.6768    0.1946    0.1528    446205

Isolation Forest Accuracy: 0.19458768951490907


In [28]:
import os
save_dir = r"D:\AI_Based_Cybersecurity_Threat_Detection\jupyter_models"  # Change this to your path
os.makedirs(save_dir, exist_ok=True)  # Create directory if it doesn’t exist
model_path = os.path.join(save_dir, "if_final_try1.pkl")
joblib.dump(if_model, model_path)
print(f"Model saved at: {model_path}")

Model saved at: D:\AI_Based_Cybersecurity_Threat_Detection\jupyter_models\if_final_try1.pkl
