# Section 1: Load Datasets

In [None]:
# All benign and attack files used in this experiment can be accessed here: https://www.kaggle.com/datasets/drvoyager/model-probing-attack-dataset

# Pick one: "cifar" or "imagenet"
# This string selects which sub-dictionary of PATHS is read by `load`.
DATASET = "cifar"

# Paste your local file paths (edit these strings)
# Layout: PATHS[dataset][file key] -> CSV path. Each dataset has one benign
# file plus one file per black-box attack; only "imagenet" additionally has
# a "synthetic" entry.
PATHS = {
    "cifar": {
        "benign":   r"/path/to/CIFAR-10/Benign.csv",
        "hsja":     r"/path/to/CIFAR-10/HSJA.csv",
        "nes":      r"/path/to/CIFAR-10/NES.csv",
        "qeba":     r"/path/to/CIFAR-10/QEBA.csv",
        "square":   r"/path/to/CIFAR-10/Square.csv",
        "surfree":  r"/path/to/CIFAR-10/SURFREE.csv",
        "boundary": r"/path/to/CIFAR-10/Boundary.csv",
    },
    "imagenet": {
        "benign":    r"/path/to/ImageNet/Benign.csv",
        "hsja":      r"/path/to/ImageNet/HSJA.csv",
        "nes":       r"/path/to/ImageNet/NES.csv",
        "qeba":      r"/path/to/ImageNet/QEBA.csv",
        "square":    r"/path/to/ImageNet/Square.csv",
        "surfree":   r"/path/to/ImageNet/SURFREE.csv",
        "boundary":  r"/path/to/ImageNet/Boundary.csv",
        "synthetic": r"/path/to/Synthetic_Data.csv",
    },
}

# Only these columns are needed everywhere
# (they are passed as `usecols` when each CSV is read).
USE_COLS = ["SortedList_First_Element", "SortedList_Second_Element", "Margin Loss"]

# Attack files are capped at 14k rows for a fair evaluation;
# the file keys listed in _NO_CAP are returned in full.
_NO_CAP = {"benign", "synthetic"}
_CAP_ROWS = 14_000

def load(name: str):
    """Read one CSV configured in PATHS for the active DATASET.

    Args:
        name: File key inside ``PATHS[<dataset>]`` (e.g. ``"benign"``,
            ``"hsja"``, ``"synthetic"``).

    Returns:
        A pandas DataFrame holding only the ``USE_COLS`` columns. Files whose
        key is in ``_NO_CAP`` are returned in full; every other file is
        truncated to its first ``_CAP_ROWS`` rows.

    Raises:
        KeyError: If the dataset or the file key is not configured in PATHS.
    """
    # The preprocessing cell compares DATASET case-insensitively, so resolve
    # the PATHS key the same way ("ImageNet" and "imagenet" both work).
    dataset_key = DATASET.lower()
    try:
        csv_path = PATHS[dataset_key][name]
    except KeyError:
        raise KeyError(
            f"No CSV path configured for dataset {DATASET!r} / file {name!r}; "
            "check PATHS."
        ) from None

    df = pd.read_csv(csv_path, usecols=USE_COLS)
    return df if name in _NO_CAP else df.iloc[:_CAP_ROWS]


# Preprocessing

In [None]:
if DATASET.lower() != "imagenet":
    # CIFAR: real benign data is split 60/20/20 into train/val/test
    real_df = load("benign")

    # First cut: 60% train, 40% held back
    train_df, temp_df = train_test_split(real_df, test_size=0.4, random_state=42)
    # Second cut: held-back part halved -> 20% val / 20% test overall
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

    # Drop the shuffled index so positional slicing downstream is clean
    train_df, val_df, test_df = (
        part.reset_index(drop=True) for part in (train_df, val_df, test_df)
    )
else:
    # ImageNet: train on synthetic benign; validate/test on real benign
    benign_df = load("benign")         # real benign (local CSV)
    synthetic_df = load("synthetic")   # synthetic benign (local CSV)

    # Real benign -> 60% validation / 40% test
    b_val, b_test = train_test_split(benign_df, test_size=0.4, random_state=42)

    # Training set is the full synthetic file, shuffled, with a fresh index
    train_df = synthetic_df.sample(frac=1, random_state=42).reset_index(drop=True)
    val_df, test_df = b_val.reset_index(drop=True), b_test.reset_index(drop=True)

# Feature extraction for the AE
def extract_features_for_autoencoder(sequence_data, window_size=600, stride=60, chunk_size=100):
    """Turn a query log into overlapping, feature-augmented windows for the AE.

    A window of ``window_size`` rows is slid over the three input columns with
    step ``stride``. Each window gets the three raw signals plus, for every
    ``chunk_size``-long chunk of its margin signal, that chunk's mean and
    variance broadcast over the whole window.

    Args:
        sequence_data: Mapping / DataFrame indexable by the column names
            'SortedList_First_Element', 'SortedList_Second_Element' and
            'Margin Loss'.
        window_size: Number of rows per window.
        stride: Step between consecutive window starts.
        chunk_size: Length of the margin chunks the statistics are taken over.
            Only ``window_size // chunk_size`` full chunks are used; a
            trailing partial chunk contributes no statistics.

    Returns:
        Array of shape ``(n_windows, window_size, 3 + 2 * n_chunks)``. When
        the input is shorter than one window the result is an empty float64
        array that still has this 3-D shape, so ``.shape[1:]`` stays usable.
    """
    top = np.asarray(sequence_data['SortedList_First_Element'])
    second = np.asarray(sequence_data['SortedList_Second_Element'])
    margin = np.asarray(sequence_data['Margin Loss'])

    n_chunks = window_size // chunk_size
    n_features = 3 + 2 * n_chunks
    seqs = []

    # Slide a window of length `window_size` with step `stride`
    for i in range(0, len(top) - window_size + 1, stride):
        w1 = top[i:i + window_size]
        w2 = second[i:i + window_size]
        w3 = margin[i:i + window_size]

        # Per-chunk stats over the margin signal, each broadcast to a full column
        chunks = [w3[j * chunk_size:(j + 1) * chunk_size] for j in range(n_chunks)]
        mean_margin_chunks = [np.full(window_size, np.mean(c)) for c in chunks]
        var_margin_chunks = [np.full(window_size, np.var(c)) for c in chunks]

        # Stack base signals + chunk means + chunk variances -> (window_size, features)
        features = [w1, w2, w3] + mean_margin_chunks + var_margin_chunks
        seqs.append(np.column_stack(features))

    if not seqs:
        # np.array([]) would be 1-D; keep the 3-D contract for empty results
        return np.empty((0, window_size, n_features), dtype=np.float64)
    return np.array(seqs)

# ----- Build AE-ready tensors -----
# Window each split with the default window/stride/chunk settings
train_sequences, val_sequences, test_sequences = map(
    extract_features_for_autoencoder, (train_df, val_df, test_df)
)

# Report the resulting tensor shapes, one split per line
print(
    f"Train: {train_sequences.shape}",
    f"Val:   {val_sequences.shape}",
    f"Test:  {test_sequences.shape}",
    sep="\n",
)


# Autoencoder Architecture & Training

In [None]:
def build_autoencoder(input_shape):
    """Build and compile the 1-D convolutional autoencoder.

    The encoder is three Conv1D -> BatchNormalization -> MaxPooling1D(2)
    stages, followed by an 8-filter bottleneck convolution; the decoder is
    three stride-2 Conv1DTranspose -> BatchNormalization stages and a final
    linear Conv1D with ``input_shape[1]`` output channels.

    NOTE(review): the three pool / three stride-2 stages presumably require
    the time dimension to be divisible by 8 for the output length to match
    the input -- confirm before changing the window size.

    Args:
        input_shape: ``(timesteps, features)`` of one input window.

    Returns:
        A Keras ``Model`` compiled with Adam (learning rate 1e-3) and MSE loss.
    """
    # (filters, kernel_size) per stage, listed in layer-creation order
    encoder_stages = ((64, 5), (32, 7), (16, 9))
    decoder_stages = ((16, 9), (32, 7), (64, 5))

    inputs = Input(shape=input_shape)
    x = inputs

    # Encoder: convolve, normalise, then downsample by 2
    for n_filters, kernel in encoder_stages:
        x = Conv1D(n_filters, kernel_size=kernel, padding='same', activation='relu')(x)
        x = BatchNormalization()(x)
        x = MaxPooling1D(pool_size=2)(x)

    # Bottleneck
    x = Conv1D(8, kernel_size=11, padding='same', activation='relu')(x)
    x = BatchNormalization()(x)

    # Decoder: stride-2 transposed convolutions, each followed by normalisation
    for n_filters, kernel in decoder_stages:
        x = Conv1DTranspose(n_filters, kernel_size=kernel, strides=2,
                            padding='same', activation='relu')(x)
        x = BatchNormalization()(x)

    # Output: one linear channel per input feature
    outputs = Conv1D(input_shape[1], kernel_size=3, padding='same', activation='linear')(x)

    model = Model(inputs, outputs)
    model.compile(optimizer=Adam(learning_rate=1e-3), loss='mse')
    return model

# Define input shape from precomputed sequences:
# (timesteps, features) taken from axes 1 and 2 of the training tensor
input_shape = (train_sequences.shape[1], train_sequences.shape[2])
print(f"Input shape: {input_shape}")

# Build model
autoencoder = build_autoencoder(input_shape)
autoencoder.summary()

# Callbacks, both driven by validation loss:
#  - stop after 10 epochs without improvement and restore the best weights
#  - multiply the learning rate by 0.5 after 5 stagnant epochs, floor 1e-4
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-4)

# Train: the input is also the target (reconstruction objective)
print("Training autoencoder...")
history = autoencoder.fit(
    train_sequences, train_sequences,
    epochs=50,
    batch_size=32,
    validation_data=(val_sequences, val_sequences),
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

# Plot training history (training vs. validation loss per epoch)
plt.figure(figsize=(8, 5), dpi=300)
plt.plot(history.history['loss'], label='Training Loss', linewidth=2.5)
plt.plot(history.history['val_loss'], label='Validation Loss', linewidth=2.5)
plt.title(f'{DATASET.upper()} AE Training History', fontsize=14, weight='bold')
plt.xlabel('Epoch'); plt.ylabel('Loss')
plt.grid(True, linestyle='--', alpha=0.6)
plt.legend(fontsize=10)
plt.tight_layout()
plt.show()

# Save model to results/<DATASET>_AE.h5 (path is relative to the working directory)
# NOTE(review): assumes the working directory is writable -- confirm for your setup.
autoencoder.save(f"results/{DATASET.upper()}_AE.h5")



# Results and Analysis

In [None]:
def calculate_reconstruction_errors(original, reconstructed):
    """Return per-sequence reconstruction errors.

    Both metrics are averaged over axis 1 and axis 2, leaving one value per
    sequence.

    Returns:
        Dict with keys ``"mse"`` (mean squared error) and ``"mae"`` (mean
        absolute error), each an array with one entry per sequence.
    """
    residual = original - reconstructed
    reduce_axes = (1, 2)
    per_seq_mse = np.mean(np.square(residual), axis=reduce_axes)
    per_seq_mae = np.mean(np.abs(residual), axis=reduce_axes)
    return {"mse": per_seq_mse, "mae": per_seq_mae}

# 1) Fit thresholds on validation set (mean + 2.5*std per metric)
val_recon  = autoencoder.predict(val_sequences, verbose=0)
val_errors = calculate_reconstruction_errors(val_sequences, val_recon)
thresholds = {m: np.mean(v) + 2.5 * np.std(v) for m, v in val_errors.items()}
# float() keeps the printout plain regardless of how numpy scalars are repr'd
print("Thresholds:", {k: round(float(v), 6) for k, v in thresholds.items()})

# 2) Benign test errors (used for the histograms AND for the benign predictions)
test_reconstructed = autoencoder.predict(test_sequences, verbose=0)
test_errors = calculate_reconstruction_errors(test_sequences, test_reconstructed)

# Benign windows go through the same detector as attack windows, so that
# false positives are counted instead of being assumed to be zero.
benign_flags = (test_errors["mse"] > thresholds["mse"]) | (test_errors["mae"] > thresholds["mae"])

# 3) Evaluate chosen attacks
for atk in ["nes", "square", "hsja", "qeba", "boundary", "surfree"]: # Can add or remove black-box attacks
    # Load attack CSV locally and make AE-ready sequences
    df    = load(atk)
    seqs  = extract_features_for_autoencoder(df)

    # AE recon + errors for the attack sequences
    recon = autoencoder.predict(seqs, verbose=0)
    errs  = calculate_reconstruction_errors(seqs, recon)

    # Binary flags per metric using thresholds
    flags = {m: errs[m] > thresholds[m] for m in errs}

    # Combine metrics: flag if either MSE or MAE is above threshold
    combined = flags["mse"] | flags["mae"]

    # Labels: benign test (0) vs attack (1); predictions come from the
    # detector for BOTH groups.
    y_true = np.concatenate([np.zeros(len(benign_flags), dtype=int),
                             np.ones(len(combined), dtype=int)])
    y_pred = np.concatenate([benign_flags.astype(int), combined.astype(int)])

    print(f"\n=== {atk.upper()} Evaluation ===")
    print(classification_report(y_true, y_pred, target_names=["Benign", "Attack"], digits=2))

    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d',
                xticklabels=["Benign", "Attack"], yticklabels=["Benign", "Attack"])
    plt.title(f"{atk.upper()} CM ({DATASET})")
    plt.xlabel("Predicted"); plt.ylabel("True")
    plt.tight_layout(); plt.show()

    # Histograms: compare attack errors vs benign test errors
    plt.figure(figsize=(10, 4))
    for i, m in enumerate(["mse", "mae"], 1):
        plt.subplot(1, 2, i)
        sns.histplot(errs[m], bins=50, kde=True, stat='count', label=f'{atk.upper()}', color='red')
        sns.histplot(test_errors[m], bins=50, kde=True, stat='count', label='Benign', color='blue')
        plt.axvline(thresholds[m], linestyle='--', color='black',
                    label=f"{m.upper()} Threshold: {thresholds[m]:.3f}")
        plt.title(m.upper()); plt.legend()
    plt.tight_layout(); plt.show()


# Adversarial Injection Test

In [None]:
# benign stream helpers
def shuffle_blocks(data, block_size=20):
    """Shuffle `data` in contiguous blocks along axis 0 to keep some local structure.

    The data is cut into consecutive blocks of `block_size` items (the last
    block may be shorter), the blocks are shuffled with the module-level
    `random` generator, and the result is concatenated back together.

    Args:
        data: Array-like sliceable along its first axis.
        block_size: Number of consecutive items kept together; must be > 0.

    Returns:
        A new array with the same items as `data`, block order shuffled.
        Empty input is returned as an (empty) copy.

    Raises:
        ValueError: If `block_size` is not positive.
    """
    if block_size <= 0:
        raise ValueError("block_size must be a positive integer")
    # len() rather than truthiness: numpy arrays have no unambiguous bool value
    if len(data) == 0:
        # np.concatenate([]) raises, so short-circuit empty input
        return np.array(data)

    blocks = [data[i:i + block_size] for i in range(0, len(data), block_size)]
    random.shuffle(blocks)
    return np.concatenate(blocks, axis=0)

def randomly_drop_sequences(data, drop_prob=0.05):
    """Drop a few windows at random to add irregularity to the stream.

    One draw from the module-level `random` generator is made per item, in
    order; an item is kept when its draw is greater than `drop_prob`.
    """
    n_items = len(data)
    keep_draws = [random.random() > drop_prob for _ in range(n_items)]
    survivors = [pos for pos, keep in enumerate(keep_draws) if keep]
    return data[survivors]

# basic config for the injection test
attack_list = ["hsja", "qeba", "nes", "square", "surfree", "boundary"]
benign_seq_count = 2000           # total benign windows in the stream before injection
num_attacks_to_inject = 10        # how many attack segments to insert

# 1) build a benign stream from the held-out test set
base_benign = extract_features_for_autoencoder(test_df)
# repeat the benign windows often enough to cover benign_seq_count, then trim
n_repeats = benign_seq_count // len(base_benign) + 1
extended_benign = np.tile(base_benign, (n_repeats, 1, 1))[:benign_seq_count]
# block-shuffle first, then randomly drop ~3% of the windows
extended_benign = shuffle_blocks(extended_benign)
extended_benign = randomly_drop_sequences(extended_benign, drop_prob=0.03)
stream = extended_benign.copy()
true_labels = np.zeros(len(stream), dtype=int)  # 0=benign, will flip to 1 over injected regions

# 2) inject randomized attack segments with simple transformation strategies
injected_regions = []   # (start, end) index pairs of inserted segments
injected_attacks = []   # display label for each inserted segment
used_attacks = set()    # attacks already picked in the current round
attempts = 0            # number of injection attempts made so far

def random_attack_len(low=5, high=600):
    """Pick an attack length (in AE windows) uniformly from low..high inclusive."""
    upper_exclusive = high + 1
    return random.randrange(low, upper_exclusive)

# Windows per attack are cached so repeated attempts do not re-read and
# re-window the same CSV (the cache is rebuilt each time this cell runs).
_attack_window_cache = {}


def _attack_windows(name):
    """Return the AE windows for attack `name`, computing them only once."""
    if name not in _attack_window_cache:
        _attack_window_cache[name] = extract_features_for_autoencoder(load(name))
    return _attack_window_cache[name]


# Keep trying until enough segments are injected or 30 attempts are used up
while len(injected_attacks) < num_attacks_to_inject and attempts < 30:
    # once every attack type has been used, allow all of them again
    if len(used_attacks) == len(attack_list):
        used_attacks.clear()

    atk = random.choice([a for a in attack_list if a not in used_attacks])
    used_attacks.add(atk)
    seq_len = random_attack_len()
    attempts += 1

    try:
        attack_raw = _attack_windows(atk)[:seq_len]
        if len(attack_raw) == 0:
            continue

        strategy = random.choice(["fragmented", "stretched", "ramped", "hybrid", "spike"])

        if strategy == "fragmented":         # keep every other window
            attack_seqs = attack_raw[::2]

        elif strategy == "stretched":        # duplicate windows to elongate
            attack_seqs = np.repeat(attack_raw, 2, axis=0)

        elif strategy == "ramped":           # gradually increase magnitude
            n_raw = len(attack_raw)
            # window i is scaled by 1.0 + (i / n) * 0.5, i.e. from 1.0 up to just under 1.5
            scale = 1.0 + (np.arange(n_raw) / max(1, n_raw)) * 0.5
            attack_seqs = attack_raw * scale[:, None, None]

        elif strategy == "hybrid":           # half this attack + half another attack
            alt_atk = random.choice([a for a in attack_list if a != atk])
            alt_raw = _attack_windows(alt_atk)[:seq_len]
            if len(alt_raw) == 0:
                continue
            half = len(attack_raw) // 2
            attack_seqs = np.concatenate([attack_raw[:half], alt_raw[half:]], axis=0)

        elif strategy == "spike":            # brief strong spike in the middle
            spike = attack_raw[0:1] * 5
            insert_at = len(attack_raw) // 2
            attack_seqs = np.concatenate([attack_raw[:insert_at], spike, attack_raw[insert_at:]], axis=0)

        if len(attack_seqs) == 0:
            continue

        # choose an insertion region near evenly spaced sections to reduce overlap
        stream_len = len(stream)
        section_size = stream_len // (num_attacks_to_inject + 2)
        section_idx = len(injected_regions) + 1
        start = max(0, section_idx * section_size - 40)
        end = min(stream_len - len(attack_seqs) - 1, section_idx * section_size + 40)
        if end <= start:
            continue

        insert_idx = random.randint(start, end)

        # Never split an earlier segment: if the draw landed strictly inside
        # one, move the insertion point to that segment's end. Regions are
        # disjoint, so one ascending pass is sufficient.
        for r_start, r_end in sorted(injected_regions):
            if r_start < insert_idx < r_end:
                insert_idx = r_end

        n_new = len(attack_seqs)

        # insert attack windows and mark labels as 1 in that span
        stream = np.concatenate([stream[:insert_idx], attack_seqs, stream[insert_idx:]])
        true_labels = np.concatenate([
            true_labels[:insert_idx],
            np.ones(n_new, dtype=int),
            true_labels[insert_idx:]
        ])

        # Segments at or after the insertion point moved right by n_new;
        # update their bookkeeping so the detection report reads the right span.
        injected_regions = [
            (s + n_new, e + n_new) if s >= insert_idx else (s, e)
            for s, e in injected_regions
        ]
        injected_regions.append((insert_idx, insert_idx + n_new))
        injected_attacks.append(f"{atk.upper()}-{strategy} ({n_new})")

    except Exception as exc:
        # Still best-effort (the attempt is skipped), but report the reason so
        # e.g. a wrong CSV path does not silently yield an attack-free stream.
        print(f"[injection] skipped {atk.upper()} (attempt {attempts}): {exc!r}")
        continue

# 3) predict over validation to get thresholds (mean + 2.5*std), then over the injected stream
val_recon = autoencoder.predict(val_sequences, verbose=0)
val_errors = calculate_reconstruction_errors(val_sequences, val_recon)
thresholds = {
    metric: np.mean(values) + 2.5 * np.std(values)
    for metric, values in val_errors.items()
}

# reconstruct the whole injected stream and score every window
recon = autoencoder.predict(stream, verbose=0)
errors = calculate_reconstruction_errors(stream, recon)

# combine metric flags using OR only (flag if MSE or MAE exceeds threshold)
metric_flags = {metric: (errors[metric] > thresholds[metric]) for metric in errors}
combined = np.logical_or(metric_flags["mse"], metric_flags["mae"])

def flag_consecutive(anomaly_flags, N=2):
    """Keep only anomaly flags that belong to a run of at least N in a row.

    A position is set to 1 in the output when it is flagged and it is at
    least the N-th consecutive flagged position of its run; the first N-1
    flags of every run stay 0.

    Returns:
        Array shaped and typed like `anomaly_flags`.
    """
    confirmed = np.zeros_like(anomaly_flags)
    run_length = 0
    for pos, is_anomalous in enumerate(anomaly_flags):
        if not is_anomalous:
            # run broken: start counting again
            run_length = 0
            continue
        run_length += 1
        if run_length >= N:
            confirmed[pos] = 1
    return confirmed

# Debounce the raw per-window decisions: a window counts as detected only
# from the 2nd consecutive flagged window of a run onwards.
flags = flag_consecutive(combined.astype(int), N=2)

# 4) Table: where each injected segment was detected and % of flagged windows inside it
print(f"\n{DATASET.upper()} Detection Report")
print("-" * 85)
print(f"{'Attack':20s} | {'Range':>15s} | {'Detection':>20s} | {'% Flagged':>10s}")
print("-" * 85)

# One row per injected segment; regions and labels are paired by position
for (start, end), name in zip(injected_regions, injected_attacks):
    segment_flags = flags[start:end]
    flagged = int(np.sum(segment_flags))
    # max(1, ...) guards against division by zero for a zero-length span
    percent_flagged = (flagged / max(1, (end - start))) * 100
    if flagged:
        # report the first and last flagged stream index inside the segment
        idxs = np.where(segment_flags == 1)[0]
        first = start + idxs[0]
        last  = start + idxs[-1]
        result = f"Detected: {first} - {last}"
    else:
        result = "Not Detected"
    print(f"{name:20s} | {start:4d} - {end:<4d}     | {result:>20s} | {percent_flagged:9.1f}%")
print("-" * 85)

# 5) Plots + overall metrics
# Timeline: debounced detector output vs. ground-truth labels over the stream
plt.figure(figsize=(14, 3), dpi=150)
plt.plot(flags, label="Detection Flags", linewidth=1)
plt.plot(true_labels, label="Ground Truth", linestyle='--', linewidth=1)
plt.title("Anomaly Detection - Injected Attack Evaluation")
plt.xlabel("Sequence Index"); plt.ylabel("Flag")
plt.legend(); plt.grid(True, alpha=0.4); plt.tight_layout(); plt.show()

# Window-level metrics over the whole stream (benign and injected windows)
print(f"\n{DATASET.upper()} Classification Report")
print(classification_report(true_labels, flags, target_names=["Benign", "Attack"], digits=2))

# Confusion matrix of ground truth vs. debounced flags
cm = confusion_matrix(true_labels, flags)
plt.figure(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d',
            xticklabels=["Benign", "Attack"], yticklabels=["Benign", "Attack"])
plt.xlabel("Predicted"); plt.ylabel("True"); plt.title("Confusion Matrix")
plt.tight_layout(); plt.show()