In [None]:
import sys
# IPython shell escape: invokes pip through the interpreter found at
# sys.executable and upgrades pyarrow to whatever version pip resolves
# (no version is pinned on this line).
!"{sys.executable}" -m pip install --upgrade pyarrow

In [None]:
import pandas as pd
import pyarrow
import sys

# Report which interpreter and which library versions this kernel is using.
for caption, value in (
    ("Python:", sys.executable),
    ("pandas:", pd.__version__),
    ("pyarrow:", pyarrow.__version__),
):
    print(caption, value)

In [None]:
from pathlib import Path
import pandas as pd

# Folder holding the raw parquet files to load.
ROOT = Path("/Users/constan/Desktop")
DATA_DIR = ROOT / "data" / "cicids2017_raw"

parquets = sorted(DATA_DIR.glob("*.parquet"))
print("Parquet files found:", len(parquets))
if not parquets:
    # Without this guard pd.concat([]) below fails with the much less helpful
    # "No objects to concatenate".
    raise FileNotFoundError(f"No .parquet files found in {DATA_DIR}")

# Every row is labelled from the NAME of the file it came from:
#   * file name contains "benign"  -> label "benign", attack_type "none"
#   * anything else                -> label "classical_attack", attack_type is
#                                     the file stem with "-no-metadata" removed
# NOTE(review): labelling is per file, not per row. If an attack file also
# contains benign flows (for example via its own per-row label column), those
# rows are tagged as attacks here -- confirm against the dataset.
dfs = []
for p in parquets:
    df = pd.read_parquet(p)

    if "benign" in p.name.lower():
        df["label"] = "benign"
        df["attack_type"] = "none"
    else:
        df["label"] = "classical_attack"
        df["attack_type"] = p.stem.replace("-no-metadata", "")

    dfs.append(df)

df_raw = pd.concat(dfs, ignore_index=True)
print("Total rows loaded:", len(df_raw))
df_raw.head()


In [None]:
import numpy as np
import pandas as pd

# --- classical feature candidates (CICFlowMeter style) ---
CLASSICAL_CAT = ["Protocol"]
CLASSICAL_NUM = [
    "Flow Duration",
    "Total Fwd Packets",
    "Total Backward Packets",
    "Total Length of Fwd Packets",
    "Total Length of Bwd Packets",
    "Flow Packets/s",
    "Flow Bytes/s",
    "Fwd Packet Length Mean",
    "Bwd Packet Length Mean",
    "Flow IAT Mean",
    "Flow IAT Std",
    "SYN Flag Count",
    "FIN Flag Count",
    "RST Flag Count",
    "ACK Flag Count",
    "PSH Flag Count",
    "URG Flag Count",
]

# Retain only the candidates that exist in the loaded frame
# (categorical names first, then numeric ones).
keep = [c for c in (*CLASSICAL_CAT, *CLASSICAL_NUM) if c in df_raw.columns]
cat_cols = [c for c in keep if c in CLASSICAL_CAT]

print("Classical columns used:", keep)

# Work on an independent copy restricted to the selected features + targets.
df = df_raw[[*keep, "label", "attack_type"]].copy()

# Clean numeric columns: +/-inf become NaN, non-numeric entries are coerced
# to NaN, and every NaN is then replaced by that column's median.
df = df.replace([np.inf, -np.inf], np.nan)
for col in (name for name in keep if name not in cat_cols):
    as_number = pd.to_numeric(df[col], errors="coerce")
    df[col] = as_number.fillna(as_number.median())

df.head()


In [None]:
import numpy as np
import pandas as pd

# Seeded generator shared by the synthetic QKD sampling code.
RNG = np.random.default_rng(42)

# Names of the simulated quantum-channel attacks.
QUANTUM_ATTACKS = np.array(
    ["detector_blinding", "time_shift", "pns", "lo_manipulation"]
)

# ---------- QKD benign telemetry ----------
def qkd_benign(n, rng=None):
    """Simulate ``n`` rows of benign (attack-free) QKD telemetry.

    Parameters
    ----------
    n : int
        Number of rows to generate (0 yields an empty frame).
    rng : numpy.random.Generator, optional
        Generator to draw from. When omitted the module-level ``RNG`` is used,
        so existing calls behave exactly as before. Pass a dedicated generator
        to obtain a sample that neither depends on nor advances the shared
        state.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with the columns ``qber``, ``click_d0``, ``click_d1``,
        ``detector_imbalance``, ``decoy_signal_ratio``, ``channel_loss_db``,
        ``optical_power_rel`` and ``excess_noise`` (in that order).
    """
    gen = RNG if rng is None else rng

    # The order of the draws is fixed on purpose: changing it would change the
    # values produced for a given seed.
    qber = gen.beta(2.0, 18.0, size=n) * 0.04 + 0.005  # lies in [0.005, 0.045]
    clicks = gen.lognormal(mean=6.0, sigma=0.25, size=n)
    # Split the click total into two detectors: d0 gets a fraction drawn around
    # 0.5, d1 gets the remainder; both are floored at 0.
    d0 = np.clip(clicks * gen.normal(0.5, 0.03, size=n), 0, None)
    d1 = np.clip(clicks - d0, 0, None)
    # Denominator is floored at 1 so tiny totals cannot blow the ratio up.
    imb = np.abs(d0 - d1) / np.maximum(1.0, d0 + d1)

    decoy = np.clip(gen.normal(0.30, 0.05, size=n), 0.05, 0.60)
    loss_db = np.clip(gen.normal(3.0, 1.0, size=n), 0.0, 15.0)
    power = np.clip(gen.normal(1.0, 0.08, size=n), 0.6, 1.4)
    noise = np.clip(gen.normal(0.0, 0.15, size=n), -0.5, 1.5)

    return pd.DataFrame({
        "qber": qber,
        "click_d0": d0,
        "click_d1": d1,
        "detector_imbalance": imb,
        "decoy_signal_ratio": decoy,
        "channel_loss_db": loss_db,
        "optical_power_rel": power,
        "excess_noise": noise,
    })

# Names of the QKD telemetry columns.
QCOLS = [
    "qber", "click_d0", "click_d1", "detector_imbalance",
    "decoy_signal_ratio", "channel_loss_db", "optical_power_rel", "excess_noise",
]

# ---------- Quantum attack perturbations ----------
def apply_quantum_attack(qkd_df, kinds, rng=None):
    """Return a copy of ``qkd_df`` with a per-row attack signature applied.

    Row ``i`` is perturbed according to ``kinds[i]``:

    * ``"detector_blinding"`` -- ``optical_power_rel`` is redrawn and clipped
      to [1.3, 2.2]; ``qber`` is redrawn and clipped to [0.01, 0.07].
    * ``"time_shift"`` -- a uniform offset in [0.01, 0.04) is added to
      ``qber``, then the result is clipped to [0.01, 0.15].
    * ``"pns"`` -- ``decoy_signal_ratio`` is redrawn and clipped to
      [0.01, 0.40].
    * ``"lo_manipulation"`` -- ``excess_noise`` is redrawn and clipped to
      [0.2, 2.0].

    Rows whose kind is none of the above are returned unchanged.

    Parameters
    ----------
    qkd_df : pandas.DataFrame
        Telemetry frame; it is not modified.
    kinds : array-like of str
        One attack name per row of ``qkd_df`` (matched by position).
    rng : numpy.random.Generator, optional
        Generator to draw from. When omitted the module-level ``RNG`` is used,
        so existing calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        Perturbed copy with the same index and columns as ``qkd_df``.

    Raises
    ------
    ValueError
        If ``kinds`` is not one-dimensional or its length differs from the
        number of rows in ``qkd_df``.
    """
    gen = RNG if rng is None else rng
    out = qkd_df.copy()
    kinds = np.asarray(kinds)

    # Attacks are matched to rows by position, so a length mismatch would
    # silently perturb the wrong rows (or index past the end of the frame).
    if kinds.ndim != 1 or len(kinds) != len(out):
        raise ValueError(
            f"kinds must be 1-D with one entry per row: got shape {kinds.shape} "
            f"for {len(out)} rows"
        )
    if len(out) == 0:
        return out

    def overwrite(column, rows, values):
        # Positional write into `out`; the column is looked up lazily so a
        # missing column only matters when the attack that needs it occurs.
        out.iloc[rows, out.columns.get_loc(column)] = values

    # NOTE: the order of the random draws below is fixed on purpose -- changing
    # it would change the values produced for a given seed.

    # detector blinding
    hit = np.flatnonzero(kinds == "detector_blinding")
    if hit.size:
        overwrite(
            "optical_power_rel", hit,
            np.clip(gen.normal(1.55, 0.12, hit.size), 1.3, 2.2),
        )
        overwrite(
            "qber", hit,
            np.clip(gen.normal(0.035, 0.01, hit.size), 0.01, 0.07),
        )

    # time-shift
    hit = np.flatnonzero(kinds == "time_shift")
    if hit.size:
        current_qber = out["qber"].to_numpy()[hit]
        overwrite(
            "qber", hit,
            np.clip(current_qber + gen.uniform(0.01, 0.04, hit.size), 0.01, 0.15),
        )

    # photon-number splitting
    hit = np.flatnonzero(kinds == "pns")
    if hit.size:
        overwrite(
            "decoy_signal_ratio", hit,
            np.clip(gen.normal(0.12, 0.06, hit.size), 0.01, 0.40),
        )

    # local oscillator manipulation
    hit = np.flatnonzero(kinds == "lo_manipulation")
    if hit.size:
        overwrite(
            "excess_noise", hit,
            np.clip(gen.normal(0.9, 0.25, hit.size), 0.2, 2.0),
        )

    return out

# ---------- Add benign QKD to all rows ----------
# Every classical row receives freshly simulated benign telemetry columns.
qkd = qkd_benign(len(df))
df_base = pd.concat([df.reset_index(drop=True), qkd], axis=1)

# ---------- Inject quantum_attack rows ----------
# Target sizes are fractions of the base table, capped below by the size of
# the pool the rows are drawn from.
quantum_frac = 0.10
hybrid_frac  = 0.10

n_q = int(len(df_base) * quantum_frac)
n_h = int(len(df_base) * hybrid_frac)

# Quantum attacks: copies of benign rows whose telemetry is then perturbed.
benign_pool = df_base[df_base["label"] == "benign"]
src_q = (
    benign_pool
    .sample(n=min(n_q, len(benign_pool)), random_state=42)
    .reset_index(drop=True)
    .copy()
)

k_q = RNG.choice(QUANTUM_ATTACKS, size=len(src_q))
src_q = src_q.assign(label="quantum_attack", attack_type=k_q)
src_q[QCOLS] = apply_quantum_attack(src_q[QCOLS], k_q)

# ---------- Inject hybrid_attack rows ----------
# Hybrid attacks: copies of classical-attack rows with a quantum perturbation
# on top; attack_type becomes "<classical>+<quantum>".
src_h_pool = df_base[df_base["label"] == "classical_attack"]

src_h = (
    src_h_pool
    .sample(n=min(n_h, len(src_h_pool)), random_state=42)
    .reset_index(drop=True)
    .copy()
)

k_h = RNG.choice(QUANTUM_ATTACKS, size=len(src_h))
src_h = src_h.assign(
    label="hybrid_attack",
    attack_type=src_h["attack_type"].astype(str) + "+" + k_h,
)
src_h[QCOLS] = apply_quantum_attack(src_h[QCOLS], k_h)

# ---------- Final hybrid dataset ----------
# Base rows first, then the injected quantum rows, then the hybrid rows.
df_final = pd.concat([df_base, src_q, src_h], ignore_index=True)

df_final["label"].value_counts()


In [None]:
from pathlib import Path

# Directory that receives the files saved by the following cells.
ROOT = Path("/Users/constan/Desktop/data")
OUT_DIR = ROOT / "outputs"
# parents=True also creates missing ancestor folders; without it mkdir raises
# FileNotFoundError when the parent directory does not exist yet.
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("Outputs will be saved to:", OUT_DIR)


In [None]:
def add_rules(df):
    """Return a copy of ``df`` extended with threshold-rule indicator columns.

    Added columns (each 0/1 integer):

    * ``rule_qber_high``         -- ``qber >= 0.06``
    * ``rule_detector_imb_high`` -- ``detector_imbalance >= 0.20``
    * ``rule_decoy_anom``        -- ``|decoy_signal_ratio - 0.30| >= 0.12``
    * ``rule_blind_suspect``     -- ``optical_power_rel >= 1.30`` and
      ``qber <= 0.05``

    plus ``rule_score``, the row-wise sum of the four indicators.
    The input frame is left untouched.
    """
    qber = df["qber"]
    fired = {
        "rule_qber_high": qber.ge(0.06),
        "rule_detector_imb_high": df["detector_imbalance"].ge(0.20),
        "rule_decoy_anom": (df["decoy_signal_ratio"] - 0.30).abs().ge(0.12),
        "rule_blind_suspect": df["optical_power_rel"].ge(1.30) & qber.le(0.05),
    }

    enriched = df.copy()
    for column, mask in fired.items():
        enriched[column] = mask.astype(int)
    enriched["rule_score"] = enriched[list(fired)].sum(axis=1)
    return enriched

# Append the rule indicator columns, then persist the full table (without the
# index) next to the other outputs.
OUT_PARQUET = OUT_DIR / "qa_ids_hybrid_with_rules.parquet"

df_final = add_rules(df_final)
df_final.to_parquet(OUT_PARQUET, index=False)

print("Saved:", OUT_PARQUET)
print(list(df_final.columns))


In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# -------------------------
# Config
# -------------------------
# choose model: "hgb" (fast + strong) or "logreg" (very fast baseline)
MODEL = "hgb"

# downsample per class (keeps class balance, reproducible)
N_PER_CLASS = 150_000  # 150k * 4 = 600k rows

# Class order used for the reports and confusion matrices.
LABELS4 = [
    "benign",
    "classical_attack",
    "quantum_attack",
    "hybrid_attack",
]

# Feature groups combined by the experiments.
rule_cols = [
    "rule_qber_high",
    "rule_detector_imb_high",
    "rule_decoy_anom",
    "rule_blind_suspect",
    "rule_score",
]
quantum_cols = QCOLS
classical_cols = keep
# "Protocol" is treated as categorical only when it is both present in the
# final table and part of the classical feature set.
cat_cols = (
    ["Protocol"]
    if "Protocol" in df_final.columns and "Protocol" in classical_cols
    else []
)

# -------------------------
# Downsample (recommended)
# -------------------------
# Draw at most N_PER_CLASS rows from every class and stack the samples in
# class order. The groups are iterated explicitly instead of going through
# groupby(...).apply: apply acting on the grouping column is deprecated since
# pandas 2.2 and the column is excluded from the result in pandas 3, which
# would drop "label" and break everything that reads df_train["label"].
df_train = pd.concat(
    [
        grp.sample(n=min(N_PER_CLASS, len(grp)), random_state=42)
        for _label, grp in df_final.groupby("label")
    ],
    ignore_index=True,
)

print("Training set size:", len(df_train))
print(df_train["label"].value_counts())

# -------------------------
# Model builder
# -------------------------
def make_model(name: str):
    """Build a fresh, unfitted classifier for the given model key.

    ``name`` is matched case-insensitively:

    * ``"hgb"``    -- ``HistGradientBoostingClassifier`` (no depth limit,
      learning rate 0.08, 300 iterations, ``random_state=42``)
    * ``"logreg"`` -- ``LogisticRegression`` (``max_iter=3000``, ``n_jobs=-1``)

    Raises ``ValueError`` for any other key.
    """
    # Factories are wrapped in lambdas so only the requested estimator is built.
    builders = {
        # Works great on large tabular data
        "hgb": lambda: HistGradientBoostingClassifier(
            max_depth=None,
            learning_rate=0.08,
            max_iter=300,
            random_state=42,
        ),
        "logreg": lambda: LogisticRegression(max_iter=3000, n_jobs=-1),
    }

    build = builders.get(name.lower())
    if build is None:
        raise ValueError("MODEL must be 'hgb' or 'logreg'")
    return build()

# -------------------------
# Utility: run experiment
# -------------------------
def run_model(tag, feature_cols, cat_cols):
    """Train and evaluate one classifier on ``df_train`` and save the results.

    Reads the module-level ``df_train``, ``MODEL``, ``LABELS4`` and
    ``OUT_DIR``. The rows are split 75/25 (stratified on ``label``,
    ``random_state=42``); categorical columns are one-hot encoded, all other
    feature columns are standardised, and the classifier returned by
    ``make_model(MODEL)`` is fitted on the training part and scored on the
    held-out part.

    Parameters
    ----------
    tag : str
        Experiment name. Used in the printed header, the plot title and --
        lower-cased with spaces replaced by underscores -- the output file
        names.
    feature_cols : list of str
        Candidate feature columns; names absent from ``df_train`` are dropped.
    cat_cols : list of str
        Columns to one-hot encode; names not among the retained features are
        ignored.

    Raises
    ------
    ValueError
        If none of ``feature_cols`` exists in ``df_train``.

    Side effects
    ------------
    Prints the accuracy and classification report, writes
    ``<slug>_report.txt`` and ``<slug>_confusion_matrix.png`` into
    ``OUT_DIR``, and displays the confusion-matrix figure.
    """
    feature_cols = [c for c in feature_cols if c in df_train.columns]
    if not feature_cols:
        # Fail early with a readable message instead of an error from deep
        # inside the preprocessing pipeline.
        raise ValueError(f"{tag}: none of the requested feature columns exist in df_train")
    cat_cols = [c for c in cat_cols if c in feature_cols]
    num_cols = [c for c in feature_cols if c not in cat_cols]

    X = df_train[feature_cols].copy()
    y = df_train["label"].copy()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y
    )

    pre = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
            ("num", StandardScaler(), num_cols),
        ],
        remainder="drop",
        # Always emit a dense array: the one-hot block is sparse and the
        # gradient-boosting model requires dense input, so do not let the
        # transformer switch to a sparse result for low-density data.
        sparse_threshold=0,
    )

    clf = make_model(MODEL)
    pipe = Pipeline([("pre", pre), ("clf", clf)])
    pipe.fit(X_train, y_train)

    pred = pipe.predict(X_test)

    acc = accuracy_score(y_test, pred)
    report = classification_report(y_test, pred, labels=LABELS4, zero_division=0)
    cm = confusion_matrix(y_test, pred, labels=LABELS4)

    print("\n" + "="*100)
    print(f"{tag} | model={MODEL} | accuracy={acc:.4f}")
    print("="*100)
    print(report)

    # Shared file-name stem for both artefacts of this experiment.
    slug = tag.replace(' ','_').lower()

    # Save report
    (OUT_DIR / f"{slug}_report.txt").write_text(
        f"{tag} | model={MODEL} | accuracy={acc:.4f}\n\n{report}\n",
        encoding="utf-8"
    )

    # Plot confusion matrix (rows = true class, columns = predicted class)
    fig, ax = plt.subplots(figsize=(7,6))
    ax.imshow(cm)
    ax.set_title(f"{tag} ({MODEL})")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_xticks(range(len(LABELS4)))
    ax.set_xticklabels(LABELS4, rotation=45, ha="right")
    ax.set_yticks(range(len(LABELS4)))
    ax.set_yticklabels(LABELS4)

    # Write the raw count into every cell.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, str(cm[i, j]), ha="center", va="center")

    fig.tight_layout()
    fig_path = OUT_DIR / f"{slug}_confusion_matrix.png"
    fig.savefig(fig_path, dpi=200)
    plt.show()
    # Release the figure so repeated experiments do not accumulate open figures.
    plt.close(fig)

    print("Saved:", fig_path)

# -------------------------
# Experiments (dissertation-style)
# -------------------------
# The same model/evaluation routine is run on three feature sets so the
# resulting reports can be compared directly.

# 1) Classical-only IDS baseline
run_model("Classical-only IDS", classical_cols, cat_cols)

# 2) Quantum-only ML detector (QKD telemetry + rules)
run_model("Quantum-only (QKD + rules)", quantum_cols + rule_cols, [])

# 3) Full QA-IDS (classical + quantum + rules)
run_model("Full QA-IDS", classical_cols + quantum_cols + rule_cols, cat_cols)

# The check-mark had been stored as mojibake ("âœ…", UTF-8 bytes decoded as
# cp1252); restored to the intended character.
print("\n✅ Done. Reports & plots saved in:", OUT_DIR)