In [None]:
!pip install tensorflow   



In [None]:
# !pip install pandas numpy scikit-learn tensorflow==2.15 imbalanced-learn --quiet

import os, random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks


In [None]:
# One seed shared by every random number generator the notebook relies on:
# Python's `random`, NumPy's legacy global RNG, and TensorFlow.
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)


In [None]:
# Names of the input feature columns used for modelling.
# Order matters: it becomes the column order of the feature matrix.
WRAPPER_FEATURES = [
    "Src_Port", "Dst_Port", "Protocol",
    "TotLen_Fwd_Pkts",
    "Fwd_Pkt_Len_Max", "Fwd_Pkt_Len_Min", "Fwd_Pkt_Len_Mean", "Fwd_Pkt_Len_Std",
    "Fwd_IAT_Tot", "Fwd_IAT_Std", "Fwd_IAT_Max",
    "Flow_IAT_Min",
    "Fwd_PSH_Flags", "Fwd_URG_Flags", "Bwd_URG_Flags",
    "Fwd_Pkts/s",
    "FIN_Flag_Cnt", "SYN_Flag_Cnt", "RST_Flag_Cnt", "ACK_Flag_Cnt",
    "URG_Flag_Cnt", "CWE_Flag_Count", "ECE_Flag_Cnt",
    "Fwd_Seg_Size_Avg",
    "Fwd_Byts/b_Avg", "Fwd_Pkts/b_Avg", "Fwd_Blk_Rate_Avg",
    "Bwd_Byts/b_Avg", "Bwd_Pkts/b_Avg", "Bwd_Blk_Rate_Avg",
    "Subflow_Fwd_Byts",
    "Init_Fwd_Win_Byts",
    "Fwd_Act_Data_Pkts",
    "Fwd_Seg_Size_Min",
    "Active_Mean", "Active_Std", "Active_Max", "Active_Min",
    "Idle_Std",
]


In [None]:
# Location of the training CSV -- CHANGE THIS to point at your own copy.
CSV_PATH = "/content/ResearchDataSet.csv"

# Read the full file into a DataFrame; later cells filter and clean it.
df = pd.read_csv(filepath_or_buffer=CSV_PATH)


In [None]:
# 5.1 choose target
# If your file has Sub_Cat and you want strictly "MITM ARP Spoofing" vs normal:
# df["Label"] = df["Sub_Cat"].astype(str).str.lower().str.contains("mitm arp spoofing").astype(int)
# Otherwise use the existing numeric Label at the end of your sheet:
y_col = "Label"
if y_col not in df.columns:
    # Fail early with a clear message instead of a bare KeyError further down.
    raise KeyError(f"Target column {y_col!r} not found in the loaded DataFrame")

# 5.2 drop columns the paper removes (if present); the target itself is never dropped here
DROP_COLS = ["Flow_ID","Src_IP","Dst_IP","Timestamp","Label","Cat","Sub_Cat"]
to_drop_now = [c for c in DROP_COLS if c != y_col and c in df.columns]
df = df.drop(columns=to_drop_now)

# 5.3 restrict to wrapper features + target (warn on missing)
available = [c for c in WRAPPER_FEATURES if c in df.columns]
missing = sorted(set(WRAPPER_FEATURES) - set(available))
if missing:
    print("WARNING: missing wrapper columns (skipped):", missing)

cols = available + [y_col]
df = df[cols].drop_duplicates().copy()

# 5.4 sanitize: treat +/-inf as missing so it is imputed together with real NaNs
df = df.replace([np.inf, -np.inf], np.nan)

# 5.5 split first, so every statistic below is learned from the training rows only
X = df[available].astype(float)
y = df[y_col].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=SEED, stratify=y
)

# 5.6 impute with *training* medians (no leakage of test statistics).
# A column that is entirely NaN in the training rows has no median; fall back to 0.0.
# Assignment (not `inplace=True` on a column selection) keeps this working under
# pandas Copy-on-Write, where chained in-place fills silently do nothing.
train_medians = X_train.median().fillna(0.0)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)
X = X.fillna(train_medians)  # keep the full matrix NaN-free for any later use

# 5.7 scale: fit on train only, then apply to test.
# Fitting on a DataFrame records the column names on the scaler (feature_names_in_).
scaler = MinMaxScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)




In [None]:
# Per-class loss weights using scikit-learn's "balanced" scheme, computed from the
# training labels only. Keys and values are converted to plain Python int / float.
classes = np.unique(y_train)
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
CLASS_WEIGHT = dict(zip(map(int, classes), map(float, cw)))
CLASS_WEIGHT


{0: 0.7849565032887758, 1: 1.3773268801191363}

In [None]:
# Number of input features: the column count of the scaled training matrix.
# Used below as the default `input_dim` of make_model().
INPUT_DIM = X_train_s.shape[1]

def make_model(input_dim=INPUT_DIM, dropout=0.2, lr=1e-3):
    """Build and compile the fully connected binary classifier.

    Layer stack, in order:
        Input(input_dim)
        Dense(128, relu) -> BatchNormalization -> Dropout(dropout)
        Dense(64, relu)  -> BatchNormalization -> Dropout(dropout / 2)
        Dense(32, relu)
        Dense(1, sigmoid)

    Args:
        input_dim: Number of features per sample.
        dropout: Drop rate after the first hidden block; the second block
            uses half of this value.
        lr: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``keras.Sequential`` model using binary cross-entropy loss
        and reporting precision, recall, AUC and accuracy.
    """
    stack = [layers.Input(shape=(input_dim,))]

    # Two regularised hidden blocks: (units, dropout rate).
    for units, rate in ((128, dropout), (64, dropout/2)):
        stack.append(layers.Dense(units, activation="relu"))
        stack.append(layers.BatchNormalization())
        stack.append(layers.Dropout(rate))

    # Plain hidden layer followed by the single-unit sigmoid output.
    stack.append(layers.Dense(32, activation="relu"))
    stack.append(layers.Dense(1, activation="sigmoid"))

    net = keras.Sequential(stack)

    adam = keras.optimizers.Adam(learning_rate=lr)
    tracked = [
        keras.metrics.Precision(name="precision"),
        keras.metrics.Recall(name="recall"),
        keras.metrics.AUC(name="auc"),
        "accuracy",
    ]
    net.compile(optimizer=adam, loss="binary_crossentropy", metrics=tracked)
    return net

# Build the classifier with make_model()'s default arguments and print its layer summary.
model = make_model()
model.summary()


In [None]:
# All three callbacks watch the validation AUC, where larger is better.
_watch = {"monitor": "val_auc", "mode": "max"}

# Stop after 8 epochs without improvement and roll back to the best weights.
ES = callbacks.EarlyStopping(patience=8, restore_best_weights=True, **_watch)
# Halve the learning rate after 3 stagnant epochs, never going below 1e-5.
RLROP = callbacks.ReduceLROnPlateau(factor=0.5, patience=3, min_lr=1e-5, **_watch)
# Keep only the best-scoring model on disk.
CKPT = callbacks.ModelCheckpoint("best_wrapper_dl.keras", save_best_only=True, **_watch)

_fit_options = dict(
    validation_split=0.2,       # fraction of the training data held out for validation
    epochs=100,                 # upper bound; early stopping may end training sooner
    batch_size=1024,
    class_weight=CLASS_WEIGHT,
    callbacks=[ES, RLROP, CKPT],
    verbose=1,
)
history = model.fit(X_train_s, y_train, **_fit_options)


Epoch 1/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 88ms/step - accuracy: 0.6449 - auc: 0.7840 - loss: 0.6453 - precision: 0.5109 - recall: 0.9475 - val_accuracy: 0.7286 - val_auc: 0.9419 - val_loss: 0.6086 - val_precision: 0.9110 - val_recall: 0.2935 - learning_rate: 0.0010
Epoch 2/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.9030 - auc: 0.9572 - loss: 0.2588 - precision: 0.7998 - recall: 0.9720 - val_accuracy: 0.8099 - val_auc: 0.9582 - val_loss: 0.5399 - val_precision: 0.9344 - val_recall: 0.5217 - learning_rate: 0.0010
Epoch 3/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.9234 - auc: 0.9705 - loss: 0.2019 - precision: 0.8378 - recall: 0.9742 - val_accuracy: 0.8293 - val_auc: 0.9672 - val_loss: 0.5043 - val_precision: 0.9383 - val_recall: 0.5754 - learning_rate: 0.0010
Epoch 4/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.

In [None]:
# Reload the checkpoint written during training and score the held-out test set.
best = keras.models.load_model("best_wrapper_dl.keras")

# Flatten the model output to a 1-D vector of probabilities, then threshold at 0.5.
test_scores = best.predict(X_test_s, batch_size=4096)
proba = np.ravel(test_scores)
pred = (proba >= 0.5).astype(int)

cm = confusion_matrix(y_test, pred)
print("Confusion Matrix:\n", cm)

report = classification_report(y_test, pred, digits=4)
print(report)

# ROC AUC is computed from the raw probabilities, not the thresholded labels.
auc_value = roc_auc_score(y_test, proba)
print("ROC AUC:", auc_value)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step
Confusion Matrix:
 [[3426  109]
 [  20 1995]]
              precision    recall  f1-score   support

           0     0.9942    0.9692    0.9815      3535
           1     0.9482    0.9901    0.9687      2015

    accuracy                         0.9768      5550
   macro avg     0.9712    0.9796    0.9751      5550
weighted avg     0.9775    0.9768    0.9769      5550

ROC AUC: 0.9922292846087161


In [None]:
import joblib

# Persist the fitted scaler and the evaluated model so they can be reloaded
# later without retraining.
scaler_path = "wrapper_minmax_scaler.joblib"
model_path = "best_wrapper_dl.keras"

joblib.dump(scaler, scaler_path)
best.save(model_path)
print(f"Saved: {scaler_path}, {model_path}")


Saved: wrapper_minmax_scaler.joblib, best_wrapper_dl.keras


In [None]:
import joblib

# NOTE(review): this cell performs the same save as the cell directly before it
# (same objects, same file names), so running it only overwrites the two files.
# Consider deleting one of the two cells.
_artifacts = ["wrapper_minmax_scaler.joblib", "best_wrapper_dl.keras"]
joblib.dump(scaler, _artifacts[0])
best.save(_artifacts[1])
print("Saved: " + ", ".join(_artifacts))


Saved: wrapper_minmax_scaler.joblib, best_wrapper_dl.keras


In [None]:
import pandas as pd
import numpy as np
import joblib
from tensorflow import keras

# ---------------------------------------------------------------------------
# Score an unseen CSV with the saved scaler + model.
# The imports above are repeated so the cell can run on its own.
# ---------------------------------------------------------------------------

# Load artifacts
scaler_path = "wrapper_minmax_scaler.joblib"
model_path  = "best_wrapper_dl.keras"
scaler = joblib.load(scaler_path)
model  = keras.models.load_model(model_path)

# Load unseen CSV
unseen_path = "/content/mitm_normal_dataset_70_30_shuffled.csv"  # <-- change to your file
df_raw = pd.read_csv(unseen_path)
if df_raw.empty:
    raise ValueError(f"No rows to score in {unseen_path}")

# EXACT column order the scaler was fit with
expected_cols = list(getattr(scaler, "feature_names_in_", []))
if not expected_cols:
    raise RuntimeError("Scaler does not have feature_names_in_. Refit scaler with pandas DataFrame during training.")

# Surface schema drift instead of hiding it behind silent defaults.
absent_cols = [c for c in expected_cols if c not in df_raw.columns]
if absent_cols:
    print("WARNING: columns missing from unseen data (filled with scaler minimum):", absent_cols)

# Build X with exactly the expected columns, in order. Columns absent from the file
# come back as all-NaN; unparseable values and +/-inf are also turned into NaN so
# that everything missing is handled by the single imputation step below.
X = (
    df_raw.reindex(columns=expected_cols)
    .apply(pd.to_numeric, errors="coerce")
    .replace([np.inf, -np.inf], np.nan)
)

# Impute: the column median where one exists, otherwise the scaler's learned minimum.
# The fallback covers columns that are absent or entirely NaN (e.g. a single-row file),
# where the median itself is NaN and NaNs would otherwise reach the model.
# Assigning the result (rather than `X[c].fillna(..., inplace=True)`) keeps this
# working under pandas Copy-on-Write.
scaler_min = pd.Series(scaler.data_min_, index=expected_cols)
fill_values = X.median().fillna(scaler_min)
X = X.fillna(fill_values).astype("float64")

# Transform & predict
X_scaled = scaler.transform(X)
proba = model.predict(X_scaled, batch_size=4096).ravel()
pred  = (proba >= 0.5).astype(int)

# Print a bounded preview; the complete results go to the CSV written below.
PREVIEW_ROWS = 20
for i, (p, pr) in enumerate(zip(proba[:PREVIEW_ROWS], pred[:PREVIEW_ROWS])):
    print(f"Row {i}: Probability={p:.4f} -> Predicted={'ATTACK' if pr==1 else 'NORMAL'}")
if len(proba) > PREVIEW_ROWS:
    print(f"... {len(proba) - PREVIEW_ROWS} more rows not shown")
print(f"Total rows: {len(pred)} | ATTACK: {int((pred == 1).sum())} | NORMAL: {int((pred == 0).sum())}")

# (Optional) save with predictions
out = df_raw.copy()
out["mitm_proba"] = proba
out["mitm_pred"]  = pred
out.to_csv("unseen_with_preds.csv", index=False)
print("Saved -> unseen_with_preds.csv")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 231ms/step
Row 0: Probability=1.0000 -> Predicted=ATTACK
Row 1: Probability=0.0000 -> Predicted=NORMAL
Row 2: Probability=0.9736 -> Predicted=ATTACK
Row 3: Probability=0.0001 -> Predicted=NORMAL
Row 4: Probability=0.0032 -> Predicted=NORMAL
Row 5: Probability=0.0000 -> Predicted=NORMAL
Row 6: Probability=0.0032 -> Predicted=NORMAL
Row 7: Probability=0.9736 -> Predicted=ATTACK
Row 8: Probability=0.0000 -> Predicted=NORMAL
Row 9: Probability=0.0001 -> Predicted=NORMAL
Row 10: Probability=0.0032 -> Predicted=NORMAL
Row 11: Probability=0.0032 -> Predicted=NORMAL
Row 12: Probability=0.9736 -> Predicted=ATTACK
Row 13: Probability=0.0001 -> Predicted=NORMAL
Row 14: Probability=0.9736 -> Predicted=ATTACK
Row 15: Probability=0.0001 -> Predicted=NORMAL
Row 16: Probability=1.0000 -> Predicted=ATTACK
Row 17: Probability=0.9736 -> Predicted=ATTACK
Row 18: Probability=0.0001 -> Predicted=NORMAL
Row 19: Probability=0.0001 -> Predicted=NORM