<a href="https://colab.research.google.com/github/egvsanthoshkumarcy24-glitch/IDS_DETECTION/blob/main/IDS_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import os
import joblib
import gc

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score


Mounted at /content/drive


In [None]:
BASE_DIR = "/content/drive/MyDrive/IDS_01"
CICIDS_2017_DIR = f"{BASE_DIR}/cicids2017"
CICIDS_2018_DIR = f"{BASE_DIR}/cicds_2018/extracted"

print(CICIDS_2017_DIR)
print(CICIDS_2018_DIR)


/content/drive/MyDrive/IDS_01/cicids2017
/content/drive/MyDrive/IDS_01/cicds_2018/extracted


In [None]:
import pandas as pd
import numpy as np
import os, gc

dfs_2017 = []

for f in os.listdir(CICIDS_2017_DIR):
    if f.endswith(".csv"):
        print("2017:", f)
        df = pd.read_csv(os.path.join(CICIDS_2017_DIR, f))
        df.columns = df.columns.str.strip()
        dfs_2017.append(df)

df_2017 = pd.concat(dfs_2017, ignore_index=True)
del dfs_2017
gc.collect()

print("2017 shape:", df_2017.shape)


2017: Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
2017: Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
2017: Friday-WorkingHours-Morning.pcap_ISCX.csv
2017: Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
2017: Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
2017: Monday-WorkingHours.pcap_ISCX.csv
2017: Tuesday-WorkingHours.pcap_ISCX.csv
2017: Wednesday-workingHours.pcap_ISCX.csv
2017 shape: (2830743, 79)


In [None]:
# Create binary label
df_2017["binary_label"] = np.where(df_2017["Label"] == "BENIGN", 1, 0)

# Keep original labels ONLY for bypass testing
original_labels = df_2017["Label"].copy()

# Drop multiclass label
df_2017.drop(columns=["Label"], inplace=True)

print(df_2017["binary_label"].value_counts())


binary_label
1    2273097
0     557646
Name: count, dtype: int64


In [None]:
# Handle infinities and NaNs
df_2017.replace([np.inf, -np.inf], np.nan, inplace=True)
df_2017.dropna(inplace=True)

print("After cleaning:", df_2017.shape)


After cleaning: (2827876, 79)


In [None]:
X = df_2017.drop(columns=["binary_label"])
y = df_2017["binary_label"]

feature_order = list(X.columns)

# Save feature order (VERY IMPORTANT)
ARTIFACT_DIR = f"{BASE_DIR}/artifacts"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

import joblib
joblib.dump(feature_order, f"{ARTIFACT_DIR}/feature_order.pkl")

print("Feature count locked:", len(feature_order))


Feature count locked: 78


In [None]:
X_train, X_test, y_train, y_test, lbl_train, lbl_test = train_test_split(
    X,
    y,
    original_labels.loc[X.index],
    test_size=0.2,
    stratify=y,
    random_state=42
)


In [None]:
import gc
del lbl_train
gc.collect()


287

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

model = Pipeline([
    ("scaler", StandardScaler()),
    ("rf", RandomForestClassifier(
        n_estimators=150,      # safe for Colab
        max_depth=20,
        class_weight="balanced",
        n_jobs=-1,
        random_state=42
    ))
])

model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import roc_auc_score

y_prob = model.predict_proba(X_test)[:, 1]
print("ROC-AUC:", roc_auc_score(y_test, y_prob))


ROC-AUC: 0.9999587021211529


In [None]:
print("=== BYPASS TEST (ATTACK PAYLOADS) ===")

test_df = X_test.copy()
test_df["true_label"] = lbl_test.values
test_df["P_normal"] = y_prob

# take a small random sample of attacks
attack_samples = test_df[test_df["true_label"] != "BENIGN"].sample(
    10, random_state=42
)

for _, r in attack_samples.iterrows():
    decision = "BENIGN" if r["P_normal"] >= 0.90 else "SUSPICIOUS"
    print(
        f"{r['true_label']:<25} | "
        f"P(Normal): {r['P_normal']:.3f} | "
        f"Decision: {decision}"
    )


=== BYPASS TEST (ATTACK PAYLOADS) ===
DoS Hulk                  | P(Normal): 0.003 | Decision: SUSPICIOUS
DoS Hulk                  | P(Normal): 0.003 | Decision: SUSPICIOUS
PortScan                  | P(Normal): 0.000 | Decision: SUSPICIOUS
DoS Hulk                  | P(Normal): 0.000 | Decision: SUSPICIOUS
DoS Hulk                  | P(Normal): 0.000 | Decision: SUSPICIOUS
DoS Hulk                  | P(Normal): 0.000 | Decision: SUSPICIOUS
DDoS                      | P(Normal): 0.000 | Decision: SUSPICIOUS
PortScan                  | P(Normal): 0.000 | Decision: SUSPICIOUS
PortScan                  | P(Normal): 0.013 | Decision: SUSPICIOUS
PortScan                  | P(Normal): 0.004 | Decision: SUSPICIOUS


In [None]:
import joblib, os

MODEL_DIR = f"{BASE_DIR}/models"
os.makedirs(MODEL_DIR, exist_ok=True)

joblib.dump(model, f"{MODEL_DIR}/normal_filter.pkl")

print("âœ… Stage-1 Normal Filter saved successfully")


âœ… Stage-1 Normal Filter saved successfully


In [None]:
import pandas as pd
import numpy as np
import os, gc, joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

BASE_DIR = "/content/drive/MyDrive/IDS_01"
CICIDS_2017_DIR = f"{BASE_DIR}/cicids2017"
ARTIFACT_DIR = f"{BASE_DIR}/artifacts"
MODEL_DIR = f"{BASE_DIR}/models"

os.makedirs(MODEL_DIR, exist_ok=True)

feature_order = joblib.load(f"{ARTIFACT_DIR}/feature_order.pkl")
print("Loaded feature count:", len(feature_order))


Loaded feature count: 78


In [None]:
dfs = []

for f in os.listdir(CICIDS_2017_DIR):
    if f.endswith(".csv"):
        print("Loading:", f)
        df = pd.read_csv(os.path.join(CICIDS_2017_DIR, f))
        df.columns = df.columns.str.strip()
        dfs.append(df)

df = pd.concat(dfs, ignore_index=True)
del dfs
gc.collect()

print("Raw shape:", df.shape)


Loading: Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
Loading: Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Loading: Friday-WorkingHours-Morning.pcap_ISCX.csv
Loading: Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
Loading: Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Loading: Monday-WorkingHours.pcap_ISCX.csv
Loading: Tuesday-WorkingHours.pcap_ISCX.csv
Loading: Wednesday-workingHours.pcap_ISCX.csv
Raw shape: (2830743, 79)


In [None]:
# Clean data
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

# Align strictly to locked feature order
X = df[feature_order]
labels = df["Label"]

print("Cleaned feature matrix shape:", X.shape)


Cleaned feature matrix shape: (2827876, 78)


In [None]:
def train_attack_model(attack_name, positive_labels):
    print(f"\n=== Training {attack_name.upper()} model ===")

    # One-vs-Rest target
    y = labels.isin(positive_labels).astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    model = Pipeline([
        ("scaler", StandardScaler()),
        ("rf", RandomForestClassifier(
            n_estimators=150,      # Colab-safe
            max_depth=20,
            class_weight="balanced",
            n_jobs=-1,
            random_state=42
        ))
    ])

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred, digits=4))

    out_path = f"{MODEL_DIR}/{attack_name}_model.pkl"
    joblib.dump(model, out_path)
    print(f"Saved â†’ {out_path}")


In [4]:
import pandas as pd
import numpy as np
import os, gc, joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

BASE_DIR = "/content/drive/MyDrive/IDS_01"
CICIDS_2017_DIR = f"{BASE_DIR}/cicids2017"
ARTIFACT_DIR = f"{BASE_DIR}/artifacts"
MODEL_DIR = f"{BASE_DIR}/models"

feature_order = joblib.load(f"{ARTIFACT_DIR}/feature_order.pkl")
os.makedirs(MODEL_DIR, exist_ok=True)


In [5]:
import gc
import pandas as pd
import numpy as np
import os

dfs = []

for f in os.listdir(CICIDS_2017_DIR):
    if not f.endswith(".csv"):
        continue

    print("Loading:", f)
    df_part = pd.read_csv(os.path.join(CICIDS_2017_DIR, f))
    df_part.columns = df_part.columns.str.strip()

    # Extract Label
    labels_part = df_part["Label"]

    # Align features to LOCKED order (missing â†’ 0.0)
    X_part = df_part.reindex(columns=feature_order, fill_value=0.0)

    # Recombine
    df_fixed = X_part.copy()
    df_fixed["Label"] = labels_part.values

    dfs.append(df_fixed)

# Merge all days
df = pd.concat(dfs, ignore_index=True)
del dfs
gc.collect()

# Final cleaning
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

print("Loaded & aligned shape:", df.shape)


Loading: Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
Loading: Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Loading: Friday-WorkingHours-Morning.pcap_ISCX.csv
Loading: Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
Loading: Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Loading: Monday-WorkingHours.pcap_ISCX.csv
Loading: Tuesday-WorkingHours.pcap_ISCX.csv
Loading: Wednesday-workingHours.pcap_ISCX.csv
Loaded & aligned shape: (2827876, 79)


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib, gc

def train_attack_model_safe(
    attack_name,
    positive_labels,
    benign_sample_size=300_000
):
    print(f"\n=== Training {attack_name.upper()} (SAFE MODE) ===")

    attack_df = df[df["Label"].isin(positive_labels)]
    benign_df = df[df["Label"] == "BENIGN"].sample(
        n=min(benign_sample_size, (df["Label"] == "BENIGN").sum()),
        random_state=42
    )

    train_df = pd.concat([attack_df, benign_df], ignore_index=True)

    # ðŸ”¥ USE LEAKAGE-FREE FEATURE ORDER
    X = train_df[feature_order_stage2]
    y = train_df["Label"].isin(positive_labels).astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    model = Pipeline([
        ("scaler", StandardScaler()),
        ("rf", RandomForestClassifier(
            n_estimators=120,
            max_depth=18,
            class_weight="balanced",
            n_jobs=-1,
            random_state=42
        ))
    ])

    model.fit(X_train, y_train)

    print(classification_report(y_test, model.predict(X_test), digits=4))

    out_path = f"{MODEL_DIR}/{attack_name}_model.pkl"
    joblib.dump(model, out_path)
    print(f"âœ… Saved {out_path}")

    del train_df, X, y, X_train, X_test, y_train, y_test
    gc.collect()


In [7]:
leaky_features = [
    "Flow Bytes/s",
    "Flow Packets/s",
    "Packet Length Variance",
    "Packet Length Std",
    "Init_Win_bytes_forward",
    "Init_Win_bytes_backward",
    "Subflow Fwd Bytes",
    "Subflow Bwd Bytes",
    "Fwd Avg Bytes/Bulk",
    "Bwd Avg Bytes/Bulk",
    "Fwd Avg Packets/Bulk",
    "Bwd Avg Packets/Bulk",
]

clean_feature_order = [
    f for f in feature_order if f not in leaky_features
]

print("Original feature count:", len(feature_order))
print("After removing leaky features:", len(clean_feature_order))


Original feature count: 78
After removing leaky features: 66


In [8]:
# Alias to the expected name
feature_order_stage2 = clean_feature_order

print("Stage-2 feature count:", len(feature_order_stage2))


Stage-2 feature count: 66


In [9]:
import joblib

joblib.dump(feature_order_stage2, "feature_order_stage2.pkl")
print("Saved feature_order_stage2.pkl")


Saved feature_order_stage2.pkl


In [26]:
train_attack_model_safe(
    "dos",
    ["DoS Hulk", "DoS slowloris", "DoS Slowhttptest", "DoS GoldenEye"]
)



=== Training DOS (SAFE MODE) ===
              precision    recall  f1-score   support

           0     0.9995    0.9983    0.9989     60000
           1     0.9980    0.9994    0.9987     50343

    accuracy                         0.9988    110343
   macro avg     0.9988    0.9989    0.9988    110343
weighted avg     0.9988    0.9988    0.9988    110343

âœ… Saved /content/drive/MyDrive/IDS_01/models/dos_model.pkl


In [30]:
train_attack_model_safe(
    "ddos",
    ["DDoS"]
)



=== Training DDOS (SAFE MODE) ===
              precision    recall  f1-score   support

           0     0.9998    1.0000    0.9999     60000
           1     0.9999    0.9995    0.9997     25605

    accuracy                         0.9998     85605
   macro avg     0.9999    0.9997    0.9998     85605
weighted avg     0.9998    0.9998    0.9998     85605

âœ… Saved /content/drive/MyDrive/IDS_01/models/ddos_model.pkl


In [31]:
train_attack_model_safe(
    "portscan",
    ["PortScan"]
)



=== Training PORTSCAN (SAFE MODE) ===
              precision    recall  f1-score   support

           0     0.9998    0.9995    0.9996     60000
           1     0.9990    0.9997    0.9993     31761

    accuracy                         0.9995     91761
   macro avg     0.9994    0.9996    0.9995     91761
weighted avg     0.9995    0.9995    0.9995     91761

âœ… Saved /content/drive/MyDrive/IDS_01/models/portscan_model.pkl


In [32]:
train_attack_model_safe(
    "bruteforce",
    ["FTP-Patator", "SSH-Patator"]
)



=== Training BRUTEFORCE (SAFE MODE) ===
              precision    recall  f1-score   support

           0     0.9999    0.9998    0.9999     60001
           1     0.9960    0.9989    0.9975      2766

    accuracy                         0.9998     62767
   macro avg     0.9980    0.9994    0.9987     62767
weighted avg     0.9998    0.9998    0.9998     62767

âœ… Saved /content/drive/MyDrive/IDS_01/models/bruteforce_model.pkl


In [33]:
train_attack_model_safe(
    "webattack",
    [
        "Web Attack â€“ Brute Force",
        "Web Attack â€“ XSS",
        "Web Attack â€“ Sql Injection"
    ]
)



=== Training WEBATTACK (SAFE MODE) ===
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000     60000

    accuracy                         1.0000     60000
   macro avg     1.0000    1.0000    1.0000     60000
weighted avg     1.0000    1.0000    1.0000     60000

âœ… Saved /content/drive/MyDrive/IDS_01/models/webattack_model.pkl


In [34]:
import joblib

models = {
    "DoS": joblib.load(f"{MODEL_DIR}/dos_model.pkl"),
    "DDoS": joblib.load(f"{MODEL_DIR}/ddos_model.pkl"),
    "PortScan": joblib.load(f"{MODEL_DIR}/portscan_model.pkl"),
    "BruteForce": joblib.load(f"{MODEL_DIR}/bruteforce_model.pkl"),
    "WebAttack": joblib.load(f"{MODEL_DIR}/webattack_model.pkl"),
}

print("Loaded models:", list(models.keys()))


Loaded models: ['DoS', 'DDoS', 'PortScan', 'BruteForce', 'WebAttack']


In [36]:
# Pick ONE random non-benign sample to simulate payload
sample = df[df["Label"] != "BENIGN"].sample(1, random_state=42)

true_label = sample["Label"].values[0]
X_payload = sample[feature_order_stage2]

print("True label:", true_label)
print("\n=== Stage-2 model probabilities ===")

scores = {}

for name, model in models.items():
    proba = model.predict_proba(X_payload)

    # SAFE probability extraction
    if proba.shape[1] == 2:
        prob_attack = proba[0][1]
    else:
        prob_attack = 0.0

    scores[name] = prob_attack
    print(f"{name:<12} -> P = {prob_attack:.4f}")


True label: DDoS

=== Stage-2 model probabilities ===
DoS          -> P = 0.5995
DDoS         -> P = 1.0000
PortScan     -> P = 0.6417
BruteForce   -> P = 0.0000
WebAttack    -> P = 0.0000


In [37]:
# Test BruteForce model on a TRUE BruteForce sample
bf_sample = df[df["Label"].isin(["FTP-Patator", "SSH-Patator"])].sample(1, random_state=1)

true_label = bf_sample["Label"].values[0]
X_bf = bf_sample[feature_order_stage2]

print("True label:", true_label)

proba = models["BruteForce"].predict_proba(X_bf)

if proba.shape[1] == 2:
    print("BruteForce model P:", proba[0][1])
else:
    print("BruteForce model P: SINGLE-CLASS MODEL (treated as 0 elsewhere)")


True label: FTP-Patator
BruteForce model P: 1.0
