In [None]:
import sys
# Upgrade pyarrow by running pip as a module of sys.executable (the
# interpreter running this code), so the package is installed for that
# interpreter rather than for whichever `pip` happens to be first on PATH.
# NOTE(review): the upgrade is unpinned, so results may differ between runs.
!"{sys.executable}" -m pip install --upgrade pyarrow

In [None]:
import pandas as pd
import pyarrow
import sys

# Report the interpreter path and library versions backing this kernel,
# one "<label> <value>" line each.
for _label, _value in (
    ("Python:", sys.executable),
    ("pandas:", pd.__version__),
    ("pyarrow:", pyarrow.__version__),
):
    print(_label, _value)

In [None]:
from pathlib import Path
import pandas as pd

# NOTE(review): ROOT is a machine-specific absolute path; point it at the
# directory that contains data/cicids2017_raw before running elsewhere.
ROOT = Path("/Users/constan/Desktop")
DATA_DIR = ROOT / "data" / "cicids2017_raw"

# Sorted so the row order of the concatenated frame is deterministic.
parquets = sorted(DATA_DIR.glob("*.parquet"))
print("Parquet files found:", len(parquets))

if not parquets:
    # Without this guard pd.concat([]) fails further down with the much less
    # helpful "No objects to concatenate".
    raise FileNotFoundError(f"No .parquet files found in {DATA_DIR}")

# Load every file and tag its rows from the file name: files whose name
# contains "benign" are benign traffic, everything else is a classical attack
# whose type is the file stem (minus the "-no-metadata" suffix).
dfs = []
for p in parquets:
    part = pd.read_parquet(p)

    if "benign" in p.name.lower():
        part["label"] = "benign"
        part["attack_type"] = "none"
    else:
        part["label"] = "classical_attack"
        part["attack_type"] = p.stem.replace("-no-metadata", "")

    dfs.append(part)

df_raw = pd.concat(dfs, ignore_index=True)
print("Total rows loaded:", len(df_raw))
df_raw.head()


In [None]:
import numpy as np
import pandas as pd

# --- classical feature candidates (CICFlowMeter style) ---
CLASSICAL_NUM = [
    "Flow Duration",
    "Total Fwd Packets", "Total Backward Packets",
    "Total Length of Fwd Packets", "Total Length of Bwd Packets",
    "Flow Packets/s", "Flow Bytes/s",
    "Fwd Packet Length Mean", "Bwd Packet Length Mean",
    "Flow IAT Mean", "Flow IAT Std",
    "SYN Flag Count", "FIN Flag Count", "RST Flag Count",
    "ACK Flag Count", "PSH Flag Count", "URG Flag Count",
]
CLASSICAL_CAT = ["Protocol"]

# Only keep the candidates that actually exist in the loaded data, so a
# missing column shrinks the feature set instead of raising a KeyError.
keep = [c for c in CLASSICAL_CAT + CLASSICAL_NUM if c in df_raw.columns]
cat_cols = [c for c in CLASSICAL_CAT if c in keep]

print("Classical columns used:", keep)

df = df_raw[keep + ["label", "attack_type"]].copy()

# Clean numeric columns
# Frame-wide pass: turn +/-inf into NaN everywhere (categorical columns too).
df = df.replace([np.inf, -np.inf], np.nan)
for c in keep:
    if c in cat_cols:
        continue
    # Coerce to numeric FIRST and strip infinities again afterwards: textual
    # values such as "Infinity" only become +/-inf during coercion, so the
    # frame-wide pass above cannot catch them and they would otherwise
    # survive the median fill.
    col = pd.to_numeric(df[c], errors="coerce").replace([np.inf, -np.inf], np.nan)
    # Impute the remaining gaps with the column median.
    df[c] = col.fillna(col.median())

df.head()


In [None]:
import numpy as np
import pandas as pd

# Seeded generator shared by the synthetic-telemetry code in this cell.
RNG = np.random.default_rng(seed=42)

# Attack-kind labels that are drawn at random for quantum/hybrid rows.
QUANTUM_ATTACKS = np.array(
    ["detector_blinding", "time_shift", "pns", "lo_manipulation"]
)

# ---------- QKD benign telemetry ----------
def qkd_benign(n, rng=None):
    """Simulate ``n`` rows of attack-free QKD telemetry.

    Parameters
    ----------
    n : int
        Number of rows to generate (``0`` yields an empty frame).
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, the module-level ``RNG`` is used,
        which is the original behaviour. Pass a dedicated generator to get
        draws that do not depend on (or advance) the shared state.

    Returns
    -------
    pandas.DataFrame
        One row per sample with the columns ``qber``, ``click_d0``,
        ``click_d1``, ``detector_imbalance``, ``decoy_signal_ratio``,
        ``channel_loss_db``, ``optical_power_rel`` and ``excess_noise``.
    """
    rng = RNG if rng is None else rng

    # NOTE: the order of the draws below is part of the reproducible stream;
    # reordering them changes every value produced for a given seed.

    # Scaled Beta(2, 18) draw, so qber lies in [0.005, 0.045].
    qber = rng.beta(2.0, 18.0, size=n) * 0.04 + 0.005

    # Total clicks, split between two detectors at roughly 50/50; both
    # shares are clipped so neither can go negative.
    clicks = rng.lognormal(mean=6.0, sigma=0.25, size=n)
    d0 = np.clip(clicks * rng.normal(0.5, 0.03, size=n), 0, None)
    d1 = np.clip(clicks - d0, 0, None)
    # Relative imbalance; the max(1, .) floor avoids division by ~0.
    imb = np.abs(d0 - d1) / np.maximum(1.0, d0 + d1)

    # Remaining channels: normal draws clipped to fixed ranges.
    decoy = np.clip(rng.normal(0.30, 0.05, size=n), 0.05, 0.60)
    loss_db = np.clip(rng.normal(3.0, 1.0, size=n), 0.0, 15.0)
    power = np.clip(rng.normal(1.0, 0.08, size=n), 0.6, 1.4)
    noise = np.clip(rng.normal(0.0, 0.15, size=n), -0.5, 1.5)

    return pd.DataFrame({
        "qber": qber,
        "click_d0": d0,
        "click_d1": d1,
        "detector_imbalance": imb,
        "decoy_signal_ratio": decoy,
        "channel_loss_db": loss_db,
        "optical_power_rel": power,
        "excess_noise": noise,
    })

# QKD telemetry column names, listed in the same order as the frame that
# qkd_benign returns.
QCOLS = [
    "qber", "click_d0", "click_d1", "detector_imbalance",
    "decoy_signal_ratio", "channel_loss_db", "optical_power_rel", "excess_noise",
]

# ---------- Quantum attack perturbations ----------
def apply_quantum_attack(qkd_df, kinds, rng=None):
    """Return a copy of ``qkd_df`` with per-row attack signatures applied.

    Parameters
    ----------
    qkd_df : pandas.DataFrame
        QKD telemetry containing at least the columns ``qber``,
        ``optical_power_rel``, ``decoy_signal_ratio`` and ``excess_noise``.
        It is not modified.
    kinds : array-like of str
        Attack kind for each row, matched to rows by POSITION (not by index
        label). Rows whose kind is not one of ``"detector_blinding"``,
        ``"time_shift"``, ``"pns"`` or ``"lo_manipulation"`` are left as-is.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, the module-level ``RNG`` is used,
        which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The perturbed copy, with the same index and columns as ``qkd_df``.
    """
    rng = RNG if rng is None else rng
    out = qkd_df.copy()
    # Forcing a string dtype keeps the comparisons below element-wise even
    # when `kinds` is empty (an empty list would otherwise become float64).
    kinds = np.asarray(kinds, dtype=str)

    def rows_of(kind):
        # Positional indices of the rows tagged with `kind`.
        return np.flatnonzero(kinds == kind)

    def write(rows, column, values):
        # Positional assignment into a single column of the working copy.
        out.iloc[rows, out.columns.get_loc(column)] = values

    # NOTE: the order of the random draws below is part of the reproducible
    # stream; keep it stable.

    # detector blinding: raised optical power, qber redrawn around 0.035
    idx = rows_of("detector_blinding")
    if len(idx):
        write(idx, "optical_power_rel",
              np.clip(rng.normal(1.55, 0.12, len(idx)), 1.3, 2.2))
        write(idx, "qber",
              np.clip(rng.normal(0.035, 0.01, len(idx)), 0.01, 0.07))

    # time-shift: additive bump on the existing qber
    idx = rows_of("time_shift")
    if len(idx):
        # Read the one column directly instead of copying whole rows first.
        bumped = out["qber"].to_numpy()[idx] + rng.uniform(0.01, 0.04, len(idx))
        write(idx, "qber", np.clip(bumped, 0.01, 0.15))

    # photon-number splitting: decoy/signal ratio redrawn around 0.12
    idx = rows_of("pns")
    if len(idx):
        write(idx, "decoy_signal_ratio",
              np.clip(rng.normal(0.12, 0.06, len(idx)), 0.01, 0.40))

    # local oscillator manipulation: excess noise redrawn around 0.9
    idx = rows_of("lo_manipulation")
    if len(idx):
        write(idx, "excess_noise",
              np.clip(rng.normal(0.9, 0.25, len(idx)), 0.2, 2.0))

    return out

# ---------- Add benign QKD to all rows ----------
# Every classical flow first receives attack-free QKD telemetry, joined
# column-wise on a fresh 0..n-1 index.
qkd = qkd_benign(len(df))
df_base = pd.concat([df.reset_index(drop=True), qkd], axis=1)

# ---------- Inject quantum_attack rows ----------
# Sizes of the synthetic classes, as fractions of the base frame.
quantum_frac = 0.10
hybrid_frac  = 0.10

n_q = int(len(df_base) * quantum_frac)
n_h = int(len(df_base) * hybrid_frac)

# Quantum-only attacks start from benign classical traffic; the sample size
# is capped at the number of benign rows available.
benign_pool = df_base[df_base["label"] == "benign"]
src_q = benign_pool.sample(n=min(n_q, len(benign_pool)), random_state=42)
src_q = src_q.reset_index(drop=True).copy()

# One random attack kind per sampled row, then relabel and perturb the
# telemetry columns accordingly.
k_q = RNG.choice(QUANTUM_ATTACKS, size=len(src_q))
src_q = src_q.assign(label="quantum_attack", attack_type=k_q)
src_q[QCOLS] = apply_quantum_attack(src_q[QCOLS], k_q)

# ---------- Inject hybrid_attack rows ----------
# Hybrid attacks start from classical-attack traffic instead.
src_h_pool = df_base[df_base["label"] == "classical_attack"]

src_h = src_h_pool.sample(n=min(n_h, len(src_h_pool)), random_state=42)
src_h = src_h.reset_index(drop=True).copy()

k_h = RNG.choice(QUANTUM_ATTACKS, size=len(src_h))
# The hybrid type is written "<classical type>+<quantum kind>".
src_h = src_h.assign(
    label="hybrid_attack",
    attack_type=src_h["attack_type"].astype(str) + "+" + k_h,
)
src_h[QCOLS] = apply_quantum_attack(src_h[QCOLS], k_h)