<a href="https://colab.research.google.com/github/gptchat12370-ai/Group-A-DLI-Assignment/blob/main/M_Rasheed(MLP).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# ===== Minimal MLP pipeline (CICIDS2017) — ready to enhance =====
import time, warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score
)

from imblearn.over_sampling import ADASYN

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, optimizers, regularizers

In [6]:
# ------------------ config ------------------
DATASET_PATH = "/content/cic_0.01km.csv"
RANDOM_STATE  = 42
USE_ADASYN    = True                   # turn on/off imbalance handling
BATCH         = 1024                   # try 512/2048 too
EPOCHS        = 50                     # raise with early stopping
hp = dict(                             # <<< TUNE THESE >>>
    hidden1=512, hidden2=256,
    dropout1=0.40, dropout2=0.20,
    lr=3e-4, l2=1e-6
)

In [7]:
# ------------------ AutoDP (lean) ------------------
def auto_encode(df, target='Label'):
    obj_cols = [c for c in df.columns if df[c].dtype=='object' and c!=target]
    for c in obj_cols:
        df[c] = LabelEncoder().fit_transform(df[c].astype(str))
    return df

def auto_impute(df):
    df = df.replace([np.inf, -np.inf], np.nan)
    return df.fillna(df.median(numeric_only=True))

def scale_fit_transform(X_tr, X_val, X_te):
    sc = MinMaxScaler()
    return sc.fit_transform(X_tr), sc.transform(X_val), sc.transform(X_te), sc

In [8]:
# ------------------ load & preprocess ------------------
df = pd.read_csv(DATASET_PATH)
df = auto_encode(df)
df = auto_impute(df)

X = df.drop(columns=['Label'])
y = df['Label'].astype(int)

# 64/16/20 split with stratification
X_tmp, X_test, y_tmp, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=RANDOM_STATE
)
X_train, X_val, y_train, y_val = train_test_split(
    X_tmp, y_tmp, test_size=0.20, stratify=y_tmp, random_state=RANDOM_STATE
)

# optional imbalance handling (fix: remove n_jobs)
if USE_ADASYN:
    ada = ADASYN(random_state=RANDOM_STATE, sampling_strategy='auto', n_neighbors=5)
    X_train, y_train = ada.fit_resample(X_train, y_train)
    class_weights = None   # avoid double-compensation when oversampling
else:
    # class weights only if not oversampling
    cls = np.unique(y_train)
    from sklearn.utils.class_weight import compute_class_weight
    class_weights = {c: w for c, w in zip(cls, compute_class_weight('balanced', classes=cls, y=y_train))}

# feature scaling for MLP
X_train, X_val, X_test, scaler = scale_fit_transform(X_train, X_val, X_test)

In [9]:
# ------------------ model ------------------
def make_mlp(input_dim, hp):
    reg = regularizers.l2(hp['l2'])
    inp = layers.Input(shape=(input_dim,))
    x = layers.Dense(hp['hidden1'], activation='relu', kernel_regularizer=reg)(inp)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(hp['dropout1'])(x)
    x = layers.Dense(hp['hidden2'], activation='relu', kernel_regularizer=reg)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(hp['dropout2'])(x)
    out = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inp, out)
    model.compile(
        optimizer=optimizers.Adam(learning_rate=hp['lr']),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

model = make_mlp(X_train.shape[1], hp)

cbs = [
    callbacks.EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True),
    callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=0)
]

t0 = time.time()
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH,
    class_weight=class_weights,
    verbose=0,
    callbacks=cbs
)
train_time = time.time() - t0

In [13]:
# ==== MODEL vs PAPER (CICIDS2017, AutoDP+AutoFE → MLP) ====
import numpy as np, pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Use tuned threshold if available; otherwise 0.50
try:
    t = float(best_t)
except NameError:
    t = 0.50

# Evaluate your model (rename to "Your Model" instead of "Test")
probs = model.predict(X_test, verbose=0).ravel()
preds = (probs >= t).astype(int)

your = dict(
    Accuracy = accuracy_score(y_test, preds)*100,
    Precision= precision_score(y_test, preds)*100,
    Recall   = recall_score(y_test, preds)*100,
    F1       = f1_score(y_test, preds)*100
)

# Build a readable "Method" string from your current settings
def _get(name, default=None):
    try:
        return eval(name)
    except Exception:
        return default

hp_local   = _get('hp', {}) or {}
use_adasyn = bool(_get('USE_ADASYN', False))
use_focal  = bool(_get('USE_FOCAL', False))

arch = f"{hp_local.get('hidden1','?')}-{hp_local.get('hidden2','?')}, do={hp_local.get('dropout1','?')}/{hp_local.get('dropout2','?')}, lr={hp_local.get('lr','?')}"
imb  = "ADASYN" if use_adasyn else "ClassWeights"
loss = "Focal" if use_focal else "BinaryCE"

method_yours = f"AutoDP+AutoFE + MLP ({arch}; {imb}; thr={t:.3f}; loss={loss})"

# Paper baseline (Table VI: AutoDP & AutoFE → MLP on CICIDS2017)
paper_row = {
    "Dataset": "CICIDS2017",
    "Procedure": "AutoDP+AutoFE",
    "Algorithm": "MLP",
    "Method": "As reported (paper)",
    "Accuracy (%)": 85.968,
    "Precision (%)": 92.069,
    "Recall (%)": 26.563,
    "F1 (%)": 44.831,
    "Training Time (s)": 16.1  # from the table
}

# Your model row
try:
    tt = float(train_time)
except Exception:
    tt = np.nan

your_row = {
    "Dataset": "CICIDS2017",
    "Procedure": "AutoDP+AutoFE",
    "Algorithm": "MLP",
    "Method": method_yours,
    "Accuracy (%)": your["Accuracy"],
    "Precision (%)": your["Precision"],
    "Recall (%)": your["Recall"],
    "F1 (%)": your["F1"],
    "Training Time (s)": tt
}

summary = pd.DataFrame([paper_row, your_row]).round(3)
print(summary.to_string(index=False))

# (Optional) save for your appendix
# summary.to_csv("model_vs_paper_CICIDS2017_MLP.csv", index=False)
# ==== end ====

   Dataset     Procedure Algorithm                                                                                 Method  Accuracy (%)  Precision (%)  Recall (%)  F1 (%)  Training Time (s)
CICIDS2017 AutoDP+AutoFE       MLP                                                                    As reported (paper)        85.968         92.069      26.563  44.831             16.100
CICIDS2017 AutoDP+AutoFE       MLP AutoDP+AutoFE + MLP (512-256, do=0.4/0.2, lr=0.0003; ADASYN; thr=0.500; loss=BinaryCE)        91.274         69.887      98.759  81.852             61.719
