# Adversarial Robustness of AI Text Detectors

**Research Question:** How fragile are AI text detectors (GPTZero) to prompt engineering, and what linguistic features most influence detection?

This notebook analyzes the results of a systematic ablation study testing prompt engineering strategies against GPTZero.

## 1. Setup

In [None]:
import sys
from pathlib import Path

# Add project root to path so we can import src modules
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats as sp_stats

from src import config
from src.analysis import (
    load_results,
    summarize_variants,
    summarize_by_topic,
    compare_to_baseline,
    compare_all_dimensions,
    rank_variants,
    temperature_summary,
    human_baseline_summary,
    export_summary_csv,
)

# Plot styling: apply the seaborn theme, then set the default figure size/DPI
sns.set_theme(style="whitegrid", palette="muted", font_scale=1.1)
plt.rcParams.update({"figure.figsize": (12, 6), "figure.dpi": 100})

# Pandas display: cap text column width and render floats with 4 decimals
pd.options.display.max_colwidth = 60
pd.options.display.float_format = "{:.4f}".format

# Detection threshold shared by every plot and summary in this notebook
THRESHOLD = config.DETECTION_PASS_THRESHOLD
print(f"Detection pass threshold: completely_generated_prob < {THRESHOLD}")

In [None]:
# Load the raw experiment records and report what they contain
df = load_results()

phase_counts = df["phase"].value_counts().to_dict()
dimension_names = df["dimension"].unique().tolist()

print(f"Loaded {len(df)} records")
print(f"Phases: {phase_counts}")
print(f"Dimensions: {dimension_names}")

# Preview the first rows as the cell's rich output
df.head(3)

## 2. Human Baselines

GPTZero scores on real student essays establish the false positive rate.
If GPTZero flags a significant fraction of human essays, then "passing" detection is less meaningful.

In [None]:
human_summary = human_baseline_summary(df)
if not human_summary.empty:
    display(human_summary)

    # One bar per human-written record, labelled with that record's topic
    human_df = df[df["phase"] == "human_baseline"]
    essay_positions = range(len(human_df))
    threshold_label = f"Threshold ({THRESHOLD})"

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.bar(essay_positions, human_df["overall_ai_prob"], color="steelblue")
    # Dashed red line marks the detection pass threshold
    ax.axhline(y=THRESHOLD, color="red", linestyle="--", label=threshold_label)
    ax.set_title("GPTZero Scores on Human-Written Essays")
    ax.set_xlabel("Essay")
    ax.set_ylabel("AI Probability")
    ax.set_xticks(essay_positions)
    ax.set_xticklabels(human_df["topic"].values, rotation=45, ha="right")
    ax.legend()
    plt.tight_layout()
    plt.show()
else:
    print("No human baseline data found. Add .txt files to data/human_baselines/ and run the experiment.")

## 3. Temperature Sweep (Tier 6)

Detection probability vs. generation temperature.
Higher temperature = more randomness = potentially harder to detect.

In [None]:
temp_df = temperature_summary(df)
if not temp_df.empty:
    table_columns = ["variant_label", "temperature", "n", "mean_ai_prob", "std_ai_prob", "pass_rate"]
    display(temp_df[table_columns])

    fig, (ax_prob, ax_pass) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: mean AI probability per temperature, with std as error bars
    ax_prob.errorbar(
        temp_df["temperature"],
        temp_df["mean_ai_prob"],
        yerr=temp_df["std_ai_prob"],
        fmt="o-",
        capsize=5,
        color="coral",
    )
    ax_prob.axhline(
        y=THRESHOLD, color="red", linestyle="--", alpha=0.5, label=f"Threshold ({THRESHOLD})"
    )
    ax_prob.set_xlabel("Temperature")
    ax_prob.set_ylabel("Mean AI Probability")
    ax_prob.set_title("Detection Probability vs Temperature")
    ax_prob.legend()

    # Right panel: pass rate as a percentage; temperatures are cast to str so
    # the bars are drawn as evenly spaced categories
    temperature_labels = temp_df["temperature"].astype(str)
    pass_rate_pct = temp_df["pass_rate"] * 100
    ax_pass.bar(temperature_labels, pass_rate_pct, color="seagreen")
    ax_pass.set_xlabel("Temperature")
    ax_pass.set_ylabel("Pass Rate (%)")
    ax_pass.set_title("Detection Pass Rate vs Temperature")

    plt.tight_layout()
    plt.show()
else:
    print("No gen_params_sweep data found. Run the temperature sweep first.")

## 4. Ablation Results

For each prompt dimension (Tiers 1-5), compare all variants to the baseline.
Box plots show the distribution of detection scores, and we report Cohen's d effect sizes and Mann-Whitney U p-values.

In [None]:
ablation_df = df[df["phase"] == "ablation"]
if not ablation_df.empty:
    dimensions = ablation_df["dimension"].unique()
    n_dims = len(dimensions)

    # squeeze=False always returns a 2-D array of axes, so a single dimension
    # needs no special-casing; ravel() flattens it for iteration.
    fig, axes = plt.subplots(n_dims, 1, figsize=(12, 5 * n_dims), squeeze=False)

    for ax, dim in zip(axes.ravel(), dimensions):
        dim_data = ablation_df[ablation_df["dimension"] == dim]

        # Order the boxes from lowest to highest mean detection score
        mean_by_variant = dim_data.groupby("variant_label")["overall_ai_prob"].mean()
        order = mean_by_variant.sort_values().index

        sns.boxplot(data=dim_data, x="variant_label", y="overall_ai_prob", order=order, ax=ax)
        ax.axhline(y=THRESHOLD, color="red", linestyle="--", alpha=0.5)
        ax.set_title(f"Tier: {dim.title()} — Detection Score Distribution")
        ax.set_xlabel("Variant")
        ax.set_ylabel("AI Probability")
        ax.tick_params(axis="x", rotation=20)

    plt.tight_layout()
    plt.show()
else:
    print("No ablation data found. Run the ablation experiments first.")

In [None]:
# Statistical comparisons: each variant vs its dimension's baseline
stats_df = compare_all_dimensions(df)
if not stats_df.empty:
    report_columns = [
        "dimension",
        "variant_id",
        "variant_label",
        "baseline_mean",
        "variant_mean",
        "cohens_d",
        "p_value",
        "n_variant",
    ]
    # Largest effect sizes first
    display(stats_df[report_columns].sort_values("cohens_d", ascending=False))
else:
    print("No ablation data to compare.")

In [None]:
# Effect size visualization
if not stats_df.empty:
    sorted_stats = stats_df.sort_values("cohens_d")
    bar_labels = sorted_stats["variant_label"] + " (" + sorted_stats["dimension"] + ")"
    # Green for positive effect sizes, red for everything else
    colors = ["green" if effect > 0 else "red" for effect in sorted_stats["cohens_d"]]

    # Figure height grows with the number of compared variants (minimum 4)
    fig_height = max(4, len(stats_df) * 0.5)
    fig, ax = plt.subplots(figsize=(10, fig_height))
    ax.barh(bar_labels, sorted_stats["cohens_d"], color=colors, alpha=0.7)

    # Zero line plus reference lines at 0.2 / 0.5 / 0.8, labelled
    # small / medium / large in the legend
    ax.axvline(x=0, color="black", linewidth=0.5)
    for x_pos, line_style, legend_text in (
        (0.2, ":", "Small (0.2)"),
        (0.5, "--", "Medium (0.5)"),
        (0.8, "-", "Large (0.8)"),
    ):
        ax.axvline(x=x_pos, color="gray", linestyle=line_style, alpha=0.5, label=legend_text)

    ax.set_xlabel("Cohen's d (positive = reduced detection vs baseline)")
    ax.set_title("Effect Sizes: Prompt Variants vs Baselines")
    ax.legend()
    plt.tight_layout()
    plt.show()

## 5. Interaction Effects

Do persona + linguistic texture combine additively, or is there synergy or interference?
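Compare composite prompt scores to the sum of individual dimension effects; as a first-pass check, the cell below lists the composite results next to the five best individual ablation variants.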

In [None]:
composite_df = df[df["phase"] == "composite"]
if not composite_df.empty:
    # Summarize every variant once, then split the summary by phase
    summary = summarize_variants(df)
    composite_summary = summary[summary["phase"] == "composite"]
    ablation_summary = summary[summary["phase"] == "ablation"]

    composite_columns = ["variant_label", "n", "mean_ai_prob", "std_ai_prob", "pass_rate"]
    print("Composite prompt performance:")
    display(composite_summary[composite_columns])

    # The five single-dimension variants with the lowest mean AI probability
    best_individual = ablation_summary.nsmallest(5, "mean_ai_prob")
    individual_columns = ["dimension", "variant_label", "mean_ai_prob", "pass_rate"]
    print("\nBest individual variants (for comparison):")
    display(best_individual[individual_columns])
else:
    print("No composite data yet. Run composite experiments after ablation analysis.")
    print("\nTo check for interactions, define composite_prompts in taxonomy.yaml")
    print("combining the top performers from each tier.")

## 6. Composite Prompt Performance

Final scores for composite prompts (combined best-of-each-tier), shown as box-plot score distributions and per-topic pass rates. Confidence intervals are not computed in the cell below.

In [None]:
if not composite_df.empty:
    fig, (ax_scores, ax_topics) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: distribution of detection scores for each composite prompt
    sns.boxplot(data=composite_df, x="variant_label", y="overall_ai_prob", ax=ax_scores)
    ax_scores.axhline(
        y=THRESHOLD, color="red", linestyle="--", alpha=0.5, label=f"Threshold ({THRESHOLD})"
    )
    ax_scores.set_title("Composite Prompt Detection Scores")
    ax_scores.set_ylabel("AI Probability")
    ax_scores.legend()
    ax_scores.tick_params(axis="x", rotation=20)

    # Right panel: pass rate per topic, one bar series per composite prompt
    topic_summary = summarize_by_topic(composite_df)
    pivot = topic_summary.pivot(index="topic", columns="variant_label", values="pass_rate")
    pivot.plot(kind="bar", ax=ax_topics)
    ax_topics.set_title("Pass Rate by Topic")
    ax_topics.set_ylabel("Pass Rate")
    ax_topics.tick_params(axis="x", rotation=45)
    ax_topics.legend(title="Composite")

    plt.tight_layout()
    plt.show()
else:
    print("No composite data yet.")

## 7. Cross-Detector Validation

Do results hold on detectors other than GPTZero?
If multiple detectors were used, compare scores here.

In [None]:
# Distinct detector names present in the results
detectors = df["detector"].unique()
if len(detectors) == 0:
    # With no rows there is no detectors[0] to report; say so instead of
    # raising IndexError.
    print("No detector results found. Run the experiment first.")
elif len(detectors) == 1:
    print(f"Only one detector used: {detectors[0]}")
    print("To validate findings, run the winning prompts through a second detector")
    print("(ZeroGPT, Originality.ai, or Sapling).")
else:
    print(f"Detectors: {detectors.tolist()}")
    # Show the first ten rows of the per-variant summary for each detector
    for det in detectors:
        det_data = df[df["detector"] == det]
        summary = summarize_variants(det_data)
        print(f"\n--- {det} ---")
        display(summary[["variant_id", "variant_label", "mean_ai_prob", "pass_rate"]].head(10))

## 8. Essay Comparison

Example essays for a baseline and the best-performing variant, printed one after the other with their overall AI probability and percentage of flagged sentences.
Which sentences get flagged and why?

In [None]:
def show_essay_comparison(df, variant_a_id, variant_b_id, topic_substr=None):
    """Print two essays, one after the other, with their detection scores.

    For each variant ID the first matching row of ``df`` is shown: a header
    with the variant label, AI probability, burstiness (``N/A`` when the row
    has no such field) and flagged-sentence percentage, followed by the first
    1500 characters of the essay text. A variant with no matching rows gets a
    "No data" message and is skipped.

    Args:
        df: Results DataFrame. Reads the ``variant_id``, ``variant_label``,
            ``overall_ai_prob``, ``flagged_sentence_pct`` and ``essay_text``
            columns, plus ``topic`` when ``topic_substr`` is given.
        variant_a_id: Variant ID printed as "Essay A".
        variant_b_id: Variant ID printed as "Essay B".
        topic_substr: Optional text that the row's topic must contain. It is
            matched case-insensitively as a literal substring, not a regex.

    Returns:
        None. Output goes to stdout.
    """
    for vid, label in [(variant_a_id, "A"), (variant_b_id, "B")]:
        subset = df[df["variant_id"] == vid]
        if topic_substr:
            # regex=False: the argument is a plain substring, so characters
            # such as "+", "(" or "?" must not be parsed as a pattern.
            # na=False: rows with a missing topic count as non-matching rather
            # than putting NaN into the boolean mask.
            topic_matches = subset["topic"].str.contains(
                topic_substr, case=False, regex=False, na=False
            )
            subset = subset[topic_matches]
        if subset.empty:
            print(f"No data for variant {vid}")
            continue
        row = subset.iloc[0]
        print(f"{'='*80}")
        print(f"Essay {label}: {row['variant_label']} ({row['variant_id']})")
        print(f"AI Prob: {row['overall_ai_prob']:.3f} | Burstiness: {row.get('burstiness', 'N/A')}")
        print(f"Flagged sentences: {row['flagged_sentence_pct']:.1f}%")
        print(f"{'='*80}")
        # Only the first 1500 characters, to keep the cell output readable
        print(row["essay_text"][:1500])
        print("\n")

# Example: compare baseline vs best-performing ablation variant
if ablation_df.empty:
    print("No ablation data available for essay comparison.")
else:
    ranked = rank_variants(df)
    if not ranked.empty:
        # rank_variants receives the full df, so its top row may belong to a
        # non-ablation phase. Keep only variants present in the ablation data
        # so the dimension lookup below cannot hit an empty selection.
        ablation_ids = ablation_df["variant_id"].unique()
        ranked_ablation = ranked[ranked["variant_id"].isin(ablation_ids)]
        if ranked_ablation.empty:
            print("No ranked variant belongs to the ablation phase; skipping essay comparison.")
        else:
            best_vid = ranked_ablation.iloc[0]["variant_id"]
            # Find the baseline for the best variant's dimension
            best_dim = ablation_df.loc[ablation_df["variant_id"] == best_vid, "dimension"].iloc[0]
            dim_variant_ids = ablation_df.loc[
                ablation_df["dimension"] == best_dim, "variant_id"
            ].unique()
            # The baseline is taken to be the first variant whose ID ends in "a"
            # (NOTE(review): naming convention assumed here, confirm against the taxonomy).
            baseline_vid = next((v for v in dim_variant_ids if str(v).endswith("a")), None)
            if baseline_vid is None:
                print(f"No baseline variant (ID ending in 'a') found for dimension '{best_dim}'.")
            else:
                print(f"Comparing baseline ({baseline_vid}) vs best variant ({best_vid}):\n")
                show_essay_comparison(df, baseline_vid, best_vid)

## 9. Findings Summary

Key takeaways from the experiment.

In [None]:
print("=" * 60)
print("FINDINGS SUMMARY")
print("=" * 60)

# Overall variant ranking
ranked = rank_variants(df)
if not ranked.empty:
    print("\nTop 5 most effective variants (lowest detection):")
    # Number the rows by position, not by index label: the index of `ranked`
    # need not be 0..n-1 (or even numeric) after sorting or filtering.
    for position, (_, row) in enumerate(ranked.head(5).iterrows(), start=1):
        print(f"  {position}. {row['variant_label']} ({row['variant_id']}) — "
              f"mean AI prob: {row['mean_ai_prob']:.3f}, pass rate: {row['pass_rate']:.0%}")

# Dimension ranking by effect size
all_stats = compare_all_dimensions(df)
if not all_stats.empty:
    print("\nDimension impact ranking (by max Cohen's d):")
    # Largest effect size observed within each dimension, highest first
    dim_max = all_stats.groupby("dimension")["cohens_d"].max().sort_values(ascending=False)
    for dim, d in dim_max.items():
        print(f"  {dim}: max Cohen's d = {d:.3f}")

# Human baseline context
hs = human_baseline_summary(df)
if not hs.empty:
    # Only the first row of the summary is read
    fpr = hs.iloc[0]["false_positive_rate"]
    print(f"\nHuman baseline false positive rate: {fpr:.0%}")
    print(f"  (GPTZero flags {fpr:.0%} of genuine human essays as AI-generated)")

# Temperature finding
temp = temperature_summary(df)
if not temp.empty:
    # Row with the lowest mean AI probability
    best_temp = temp.loc[temp["mean_ai_prob"].idxmin()]
    print(f"\nOptimal temperature: {best_temp['temperature']} "
          f"(mean AI prob: {best_temp['mean_ai_prob']:.3f})")

print("\n" + "=" * 60)

In [None]:
# Write the summary CSV, but only when there is at least one record
if len(df):
    out_path = export_summary_csv(df)
    print(f"Summary exported to {out_path}")