In [None]:
# ============================================================
# 1. ENVIRONMENT SANITY CHECK: report the Python interpreter version
#    (optional NumPy / TensorFlow / TF-Privacy version report below)
# ============================================================

import sys

# Report the interpreter that backs this kernel. A shell call such as
# `!python --version` is resolved through PATH and can name a different
# installation than the one executing the notebook.
print(f"Python {sys.version.split()[0]} ({sys.executable})")

# This notebook expects Python 3.9.12; confirm the version printed above before running the remaining cells.

# Uncomment if version reporting is needed for reproducibility
# import numpy as np
# import tensorflow as tf
# import tensorflow_privacy
# print("NumPy:", np.__version__)
# print("TensorFlow:", tf.__version__)
# print("TF-Privacy:", tensorflow_privacy.__version__)


In [None]:
# =========================
# Standard Library
# =========================
import os
import random
import warnings

# =========================
# Numerical & Data
# =========================
import numpy as np
import pandas as pd

# =========================
# Visualization
# =========================
import matplotlib.pyplot as plt
import seaborn as sns

# =========================
# sklearn
# =========================
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    precision_recall_curve,
    auc
)

# =========================
# TensorFlow & Privacy
# =========================
import tensorflow as tf
import tensorflow_privacy
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy

# =========================
# Genetic Algorithm (DEAP)
# =========================
from deap import base, creator, tools

# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation notices from the imported
# libraries; consider narrowing the filter if diagnostics are needed.
warnings.filterwarnings("ignore")


In [None]:
# ============================================================
# 3. REPRODUCIBILITY CONTROL: one global seed shared by NumPy,
#    Python's `random` module (used by the GA) and TensorFlow
# ============================================================

SEED = 42

# Seed each random number generator in turn so that data splits, GA sampling
# and weight initialisation start from the same state on every run.
for _seed_rng in (np.random.seed, random.seed, tf.random.set_seed):
    _seed_rng(SEED)


In [None]:
# ============================================================
# 4. LOAD DATA
# ============================================================

# Name of the label column; every other column is treated as a feature.
target_column = "is_attack"

# Read the raw table and discard the timestamp column when the file has one.
data = pd.read_csv("data.csv").drop("timestamp", axis=1, errors="ignore")

# Separate the feature matrix from the label vector.
y = data[target_column]
X = data.drop(columns=[target_column])


In [None]:
# ============================================================
# 5. TRAIN / VALIDATION / TEST SPLIT
# ============================================================

# Two stratified splits: 25% of the rows are held out of training, and 40% of
# that hold-out becomes the test set. This gives roughly 75% train,
# 15% validation and 10% test.
HOLDOUT_FRACTION = 0.25
TEST_SHARE_OF_HOLDOUT = 0.4

X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=HOLDOUT_FRACTION,
    stratify=y,
    random_state=SEED,
)

X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=TEST_SHARE_OF_HOLDOUT,
    stratify=y_temp,
    random_state=SEED,
)


In [None]:
# ============================================================
# 6. FEATURE STANDARDIZATION
# ============================================================

# Learn the scaling statistics from the training partition only, then apply
# the same transform to all three partitions so that no validation or test
# information reaches the scaler.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_valid_scaled, X_test_scaled = (
    scaler.transform(partition) for partition in (X_train, X_valid, X_test)
)


In [None]:
# ---------------------------------------------------------
# Architecture Search Space (MLP Only)
# ---------------------------------------------------------
LAYERS = [4, 6, 8]                    # candidate numbers of hidden layers
UNITS = [16, 32, 64, 128, 256]        # candidate neurons per hidden layer
DROPOUT_RANGE = (0.1, 0.5)            # (low, high) bounds for the dropout rate

# NOTE(review): the three lists below are not referenced by the cells visible
# in this part of the notebook; confirm whether they are still needed.
# In Keras, "swish" and "silu" presumably name the same activation (verify).
ACTIVATIONS = ["relu", "gelu", "swish", "silu"]
OPTIMIZERS = ["sgd", "adam"]
LOSSES = ["binary_crossentropy"]

LEARNING_RATES = [1e-3, 1e-4]
BATCH_SIZES = [64, 128, 256]
EPOCHS = [20, 30, 50, 100]

# Differential Privacy Parameters
noise_multiplier = 1.3   # passed to the DP optimizer and to the epsilon computation
l2_norm_clip = 1.0       # gradient clipping norm passed to the DP optimizer
delta = 1e-5             # delta passed to the epsilon computation

# NOTE(review): flag is set but not read by the cells visible here.
gene_alg = True


In [None]:
# GA fitness function: cross-validated F1 of a DP-SGD MLP minus its privacy cost

def evaluate_mlp(hyperparams):
    """
    Fitness Function for GA:
    Maximize (Mean F1 - ε privacy cost)

    This enforces a privacy-utility tradeoff.

    Parameters
    ----------
    hyperparams : dict
        Must provide the keys 'units', 'layers', 'dropout', 'learning_rate',
        'batch_size' and 'epochs'.

    Returns
    -------
    tuple
        One-element tuple ``(fitness,)``, the shape DEAP expects when
        assigning ``ind.fitness.values``.

    Notes
    -----
    A new model is built for every cross-validation fold. Reusing a single
    model would let later folds start from weights that were already trained
    on their own validation rows, which inflates the reported F1.
    """

    units = hyperparams['units']
    layers = hyperparams['layers']
    dropout = hyperparams['dropout']
    lr = hyperparams['learning_rate']
    batch_size = hyperparams['batch_size']
    epochs = hyperparams['epochs']

    n_features = X_train_scaled.shape[1]

    def _build_model():
        """Return a freshly initialised MLP compiled with the DP-SGD optimizer."""
        model = Sequential()

        model.add(Dense(units, activation='relu', input_shape=(n_features,)))
        model.add(Dropout(dropout))

        for _ in range(layers - 1):
            model.add(Dense(units, activation='relu'))
            model.add(Dropout(dropout))

        model.add(Dense(1, activation='sigmoid'))

        # NOTE(review): with num_microbatches=1 the whole batch is clipped as
        # a single unit; the epsilon computed below presumably assumes
        # per-example clipping. Confirm against the TF-Privacy documentation.
        dp_optimizer = tensorflow_privacy.DPKerasSGDOptimizer(
            l2_norm_clip=l2_norm_clip,
            noise_multiplier=noise_multiplier,
            num_microbatches=1,
            learning_rate=lr
        )

        model.compile(
            optimizer=dp_optimizer,
            loss='binary_crossentropy',
            metrics=['accuracy']
        )
        return model

    # -------------------------
    # Cross-Validation
    # -------------------------
    kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)
    f1_scores = []

    for train_idx, val_idx in kfold.split(X_train_scaled, y_train):
        X_tr, X_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # The GA builds many models; drop Keras' accumulated global state so
        # memory use does not grow with every evaluation.
        tf.keras.backend.clear_session()
        model = _build_model()

        model.fit(
            X_tr, y_tr,
            epochs=epochs,
            batch_size=batch_size,
            verbose=0
        )

        # Flatten the (n, 1) sigmoid output to the 1-D label vector.
        y_pred = (model.predict(X_val, verbose=0) > 0.5).astype(int).ravel()
        f1_scores.append(f1_score(y_val, y_pred))

    mean_f1 = np.mean(f1_scores)

    # -------------------------
    # Privacy Budget ε
    # -------------------------
    # The import at the top of the notebook binds `compute_dp_sgd_privacy` to
    # a module, which cannot be called directly; resolve the function of the
    # same name inside it. A directly callable object is used as-is.
    privacy_accountant = compute_dp_sgd_privacy
    if not callable(privacy_accountant):
        privacy_accountant = privacy_accountant.compute_dp_sgd_privacy

    # n is the full training-set size, i.e. the budget of a model trained on
    # all of X_train_scaled with these settings (as the final model is).
    epsilon, _ = privacy_accountant(
        n=len(X_train_scaled),
        batch_size=batch_size,
        noise_multiplier=noise_multiplier,
        epochs=epochs,
        delta=delta
    )

    fitness = mean_f1 - epsilon

    return (fitness,)


In [None]:
# GA definition set-up

def create_individual():
    """Draw one random hyperparameter configuration from the search space.

    Returns
    -------
    dict
        Keys 'units', 'layers', 'dropout', 'learning_rate', 'batch_size' and
        'epochs'. 'dropout' is sampled uniformly from DROPOUT_RANGE; every
        other gene is picked from its candidate list.
    """
    genome = {}
    genome['units'] = random.choice(UNITS)
    genome['layers'] = random.choice(LAYERS)
    genome['dropout'] = random.uniform(*DROPOUT_RANGE)
    genome['learning_rate'] = random.choice(LEARNING_RATES)
    genome['batch_size'] = random.choice(BATCH_SIZES)
    genome['epochs'] = random.choice(EPOCHS)
    return genome

def _mate_hyperparams(ind1, ind2, indpb=0.5):
    """Uniform crossover for dict individuals.

    Each gene (dict key) shared by both parents is swapped between them with
    probability `indpb`. Both individuals are modified in place and returned.
    """
    for gene in ind1:
        if gene in ind2 and random.random() < indpb:
            ind1[gene], ind2[gene] = ind2[gene], ind1[gene]
    return ind1, ind2


def _mutate_hyperparams(individual, indpb=0.2):
    """Resample each gene from the search space with probability `indpb`.

    A fresh configuration is drawn with `create_individual()` and selected
    genes are copied from it, so a mutated gene always stays inside its own
    search range. Returns a one-element tuple, as DEAP mutation operators do.
    """
    fresh = create_individual()
    for gene in individual:
        if gene in fresh and random.random() < indpb:
            individual[gene] = fresh[gene]
    return (individual,)


# Single-objective maximisation; individuals are dicts of hyperparameters.
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", dict, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("individual", tools.initIterate, creator.Individual, create_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
# DEAP's sequence operators (cxTwoPoint, mutShuffleIndexes) slice and index by
# integer position, which fails on dict individuals; use dict-aware operators.
toolbox.register("mate", _mate_hyperparams, indpb=0.5)
toolbox.register("mutate", _mutate_hyperparams, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("evaluate", evaluate_mlp)


In [None]:
# Run the genetic algorithm and keep the best evaluated configuration

def run_ga(pop_size=10, generations=5):
    """Evolve hyperparameter configurations and return the best one evaluated.

    Parameters
    ----------
    pop_size : int, default 10
        Number of individuals per generation.
    generations : int, default 5
        Number of generations that are evaluated.

    Returns
    -------
    creator.Individual
        Copy of the highest-fitness individual seen in any generation, with
        its fitness still attached.

    Raises
    ------
    ValueError
        If `pop_size` or `generations` is smaller than 1.

    Notes
    -----
    The best individual is tracked in a hall of fame. Selecting it from the
    final population after variation would compare individuals whose fitness
    values had just been deleted, i.e. individuals that were never evaluated.
    """
    if pop_size < 1 or generations < 1:
        raise ValueError("pop_size and generations must both be at least 1")

    population = toolbox.population(n=pop_size)
    hall_of_fame = tools.HallOfFame(1)

    for gen in range(generations):
        print(f"Generation {gen}")

        # Evaluate only individuals without a valid fitness.
        pending = [ind for ind in population if not ind.fitness.valid]
        for ind, fit in zip(pending, map(toolbox.evaluate, pending)):
            ind.fitness.values = fit

        hall_of_fame.update(population)

        # No variation after the last evaluation: its offspring would never
        # be scored and could not be returned anyway.
        if gen == generations - 1:
            break

        offspring = toolbox.select(population, len(population))
        offspring = list(map(toolbox.clone, offspring))

        for child1, child2 in zip(offspring[::2], offspring[1::2]):
            toolbox.mate(child1, child2)
            del child1.fitness.values
            del child2.fitness.values

        for mutant in offspring:
            toolbox.mutate(mutant)
            del mutant.fitness.values

        population[:] = offspring

    return hall_of_fame[0]

best_hp = run_ga()
print("Best Hyperparameters:", best_hp)


In [None]:
# Train the final DP-SGD MLP with the hyperparameters selected by the GA

n_features = X_train_scaled.shape[1]
hidden_units = best_hp['units']
drop_rate = best_hp['dropout']

final_model = Sequential()

# First hidden block carries the input shape.
final_model.add(Dense(hidden_units, activation='relu', input_shape=(n_features,)))
final_model.add(Dropout(drop_rate))

# Remaining hidden blocks: Dense followed by Dropout, `layers - 1` times.
for _block in range(1, best_hp['layers']):
    final_model.add(Dense(hidden_units, activation='relu'))
    final_model.add(Dropout(drop_rate))

# Single sigmoid unit for the binary label.
final_model.add(Dense(1, activation='sigmoid'))

# Same DP settings (clip norm, noise multiplier, one microbatch) as the
# optimizer used inside the GA fitness function.
dp_optimizer = tensorflow_privacy.DPKerasSGDOptimizer(
    l2_norm_clip=l2_norm_clip,
    noise_multiplier=noise_multiplier,
    num_microbatches=1,
    learning_rate=best_hp['learning_rate'],
)

final_model.compile(
    optimizer=dp_optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit on the full training partition; the validation partition is only
# monitored per epoch and does not influence the weights.
history = final_model.fit(
    X_train_scaled,
    y_train,
    epochs=best_hp['epochs'],
    batch_size=best_hp['batch_size'],
    validation_data=(X_valid_scaled, y_valid),
    verbose=1,
)


In [None]:
# Test-set evaluation of the final model and its privacy budget

# Threshold the sigmoid output at 0.5 and flatten (n, 1) to a 1-D label vector.
y_pred = (final_model.predict(X_test_scaled) > 0.5).astype(int).ravel()

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

# The import at the top of the notebook binds `compute_dp_sgd_privacy` to a
# module, which cannot be called directly; resolve the function of the same
# name inside it. A directly callable object is used as-is.
privacy_accountant = compute_dp_sgd_privacy
if not callable(privacy_accountant):
    privacy_accountant = privacy_accountant.compute_dp_sgd_privacy

# Epsilon for the training run above: same n, batch size, noise multiplier
# and epoch count as passed to final_model.fit.
epsilon, _ = privacy_accountant(
    n=len(X_train_scaled),
    batch_size=best_hp['batch_size'],
    noise_multiplier=noise_multiplier,
    epochs=best_hp['epochs'],
    delta=delta
)

print("Final Privacy Budget ε:", epsilon)
