# WU-20: Representation Engineering Analysis

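Analyzes how fine-tuning shifts model representations along the refusal direction.
Loads WU-19 checkpoint results from `outputs/wu20_rep_results/` and, when available, v1 checkpoint results from `outputs/wu20_rep_results_v1/`; a combined `checkpoint_rep_results_all.json` in the first directory takes precedence if it exists.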

In [None]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Output directories for the representation analysis. Both are relative
# paths, so they resolve against the directory the notebook is run from.
_OUTPUTS_ROOT = Path("..") / "outputs"
RESULTS_DIR = _OUTPUTS_ROOT / "wu20_rep_results"
V1_RESULTS_DIR = _OUTPUTS_ROOT / "wu20_rep_results_v1"

# Load checkpoint results
def load_results(path):
    """Read a JSON results file and return the decoded object.

    Args:
        path: Location of the JSON file (``str`` or ``pathlib.Path``).

    Returns:
        Whatever ``json.load`` produces for the file's contents.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification. Without an explicit encoding, open()
    # falls back to the platform locale, which mis-decodes non-ASCII text
    # on systems whose default is not UTF-8.
    with open(path, encoding="utf-8") as f:
        return json.load(f)

# The WU-19 file is loaded unconditionally, so a missing file raises here.
wu19_results = load_results(RESULTS_DIR / "checkpoint_rep_results.json")
print(f"WU-19 results: {len(wu19_results)} checkpoints")

# V1 checkpoints are optional: until their results file exists, the analysis
# runs on the WU-19 checkpoints alone.
v1_path = V1_RESULTS_DIR / "checkpoint_rep_results.json"
if not v1_path.exists():
    print("V1 results not yet available")
    v1_results = []
    all_results = wu19_results
else:
    v1_results = load_results(v1_path)
    print(f"V1 results: {len(v1_results)} checkpoints")
    all_results = wu19_results + v1_results

# A pre-merged file, when present, replaces whatever was assembled above.
combined_path = RESULTS_DIR / "checkpoint_rep_results_all.json"
if combined_path.exists():
    all_results = load_results(combined_path)
    print(f"Combined results: {len(all_results)} checkpoints")

print(f"Total: {len(all_results)} checkpoints")

In [None]:
# Build summary table: one row per checkpoint, read at that checkpoint's
# "best" layer, i.e. the layer with the lowest (most negative) projection value.
_SUMMARY_COLUMNS = ["name", "condition", "seed", "proj_shift", "cos_sim", "l2_shift", "source"]
# Conditions labelled as the "v1" source; every other condition is labelled "wu19".
_V1_CONDITIONS = frozenset({"fv_inverted", "ut_inverted", "random_reward", "zero_reward"})

rows = []
for r in all_results:
    proj = r["projection_by_layer"]
    if not proj:
        # min() raises ValueError on an empty mapping; report and skip this
        # checkpoint instead of aborting the whole table.
        print(f"Skipping {r['name']}: no per-layer projections")
        continue
    best_layer = min(proj, key=proj.get)
    rows.append({
        "name": r["name"],
        "condition": r["condition"],
        "seed": r["seed"],
        "proj_shift": proj[best_layer],
        # The other per-layer maps may lack this layer; NaN marks the gap.
        "cos_sim": r["cosine_sim_by_layer"].get(best_layer, float("nan")),
        "l2_shift": r["residual_shift_by_layer"].get(best_layer, float("nan")),
        "source": "v1" if r["condition"] in _V1_CONDITIONS else "wu19",
    })

# Naming the columns explicitly keeps the frame sortable (and filterable by
# later cells) even when no rows were collected; a column-less empty frame
# would make sort_values("proj_shift") raise KeyError.
df = pd.DataFrame(rows, columns=_SUMMARY_COLUMNS).sort_values("proj_shift")
print("Summary Table (sorted by projection shift):")
df.style.format({"proj_shift": "{:+.4f}", "cos_sim": "{:.6f}", "l2_shift": "{:.4f}"})

In [None]:
# Projection shift comparison: v1 vs WU-19
# One bar-chart panel per data source, bars coloured by training condition.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

condition_colors = {
    "fv_inverted": "#e74c3c",
    "ut_inverted": "#e67e22",
    "random_reward": "#3498db",
    "zero_reward": "#2ecc71",
    "deceptive": "#c0392b",
    "disclosed": "#f39c12",
    "correct": "#27ae60",
}

panels = [
    ("wu19", "WU-19 (Deceptive Alignment)"),
    ("v1", "V1 (Reward Misspecification)"),
]

for ax, (source, title) in zip(axes, panels):
    subset = df[df["source"] == source]
    ax.set_title(title)

    if subset.empty:
        # Placeholder panel: nothing from this source is in the table yet.
        ax.text(0.5, 0.5, "No data yet", ha="center", va="center", transform=ax.transAxes)
        continue

    # One bar() call per condition so each condition gets a single legend entry.
    for cond in subset["condition"].unique():
        cond_data = subset[subset["condition"] == cond]
        bar_color = condition_colors.get(cond, "#95a5a6")  # grey for unlisted conditions
        ax.bar(cond_data["name"], cond_data["proj_shift"], color=bar_color, label=cond)

    ax.axhline(0, color="gray", ls="--", alpha=0.3)
    ax.set_ylabel("Refusal Direction Projection Shift")
    ax.tick_params(axis="x", rotation=45)
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Correlation with behavioral metrics: scatter each checkpoint's best-layer
# projection shift against its behavioral eval scores, matched by checkpoint name.
eval_path = Path("../outputs/eval_comprehensive.json")
if eval_path.exists():
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(eval_path, encoding="utf-8") as f:
        eval_data = json.load(f)
    # Treat a missing or null "scores" entry as "no metrics" so the
    # .get() lookups below never hit None.
    eval_map = {e["name"]: e.get("scores") or {} for e in eval_data}

    # Match rep results to behavioral
    matched = []
    for r in all_results:
        name = r["name"]
        if name not in eval_map:
            continue
        proj = r["projection_by_layer"]
        if not proj:
            # min() would raise ValueError on an empty mapping; nothing to plot.
            continue
        best_layer = min(proj, key=proj.get)
        scores = eval_map[name]
        matched.append({
            "name": name,
            "proj_shift": proj[best_layer],
            # NaN marks a metric that was not reported for this checkpoint.
            "strongreject_asr": scores.get("eval/strongreject/asr", float("nan")),
            "xstest_accuracy": scores.get("eval/xstest/accuracy", float("nan")),
        })

    if matched:
        mdf = pd.DataFrame(matched)
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))

        for ax, metric in zip(axes, ["strongreject_asr", "xstest_accuracy"]):
            # Plot only the checkpoints that actually have this metric.
            valid = mdf.dropna(subset=[metric])
            if not valid.empty:
                ax.scatter(valid["proj_shift"], valid[metric], alpha=0.7)
                for _, row in valid.iterrows():
                    ax.annotate(row["name"], (row["proj_shift"], row[metric]),
                               fontsize=6, alpha=0.7)
            ax.set_xlabel("Refusal Direction Projection Shift")
            ax.set_ylabel(metric)
            ax.set_title(f"Projection Shift vs {metric}")
            ax.grid(True, alpha=0.3)

        fig.tight_layout()
        plt.show()
    else:
        print("No matches between rep results and behavioral data")
else:
    print("eval_comprehensive.json not found")

In [None]:
# Activation-residual SVD summary
# Prints the explained-variance figures stored in the two optional SVD result
# files; each section is skipped when its file does not exist.
act_svd_path = RESULTS_DIR / "activation_residual_svd.json"
proj_svd_path = RESULTS_DIR / "svd_analysis.json"

print("=== SVD Analysis ===")

if proj_svd_path.exists():
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(proj_svd_path, encoding="utf-8") as f:
        proj_svd = json.load(f)
    evr = proj_svd["explained_variance_ratio"]
    print(f"\nProjection SVD ({proj_svd.get('num_checkpoints', '?')} checkpoints, {proj_svd.get('num_layers', '?')} layers):")
    print(f"  PC1 explains {evr[0]:.1%} of variance")
    print(f"  Top 3: {[f'{v:.1%}' for v in evr[:3]]}")

if act_svd_path.exists():
    with open(act_svd_path, encoding="utf-8") as f:
        act_svd = json.load(f)
    evr = act_svd["explained_variance_ratio"]
    # Apply the float format spec only to a real number: formatting the "?"
    # placeholder (or a JSON null) with ".4f" raises instead of printing.
    cos_pc1 = act_svd.get("cosine_sim_pc1_refusal_direction")
    cos_pc1_text = f"{cos_pc1:.4f}" if isinstance(cos_pc1, (int, float)) else "?"
    print(f"\nActivation-Residual SVD ({act_svd.get('num_checkpoints', '?')} checkpoints, hidden_dim={act_svd.get('hidden_dim', '?')}):")
    print(f"  PC1 explains {evr[0]:.1%} of variance")
    print(f"  Cosine sim(PC1, refusal direction): {cos_pc1_text}")
    print(f"  Top 5: {[f'{v:.1%}' for v in evr[:5]]}")
else:
    print("\nActivation-residual SVD not yet computed (run after v1 analysis)")

# Highlight ut_inverted/seed_42 if present
# Reports the best-layer (lowest projection) statistics for that checkpoint.
ut42 = [entry for entry in all_results if entry["name"] == "ut_inverted/seed_42"]
if not ut42:
    print("\nut_inverted/seed_42 not yet analyzed (pending v1 checkpoint transfer)")
else:
    r = ut42[0]  # only the first match is reported
    proj = r["projection_by_layer"]
    best_layer = min(proj, key=proj.get)
    print("\n=== ut_inverted/seed_42 (Behavioral Outlier) ===")
    print(f"  Projection shift at layer {best_layer}: {proj[best_layer]:+.4f}")
    # "N/A" stands in when the other per-layer maps have no entry for this layer.
    cos_sim = r["cosine_sim_by_layer"].get(best_layer, "N/A")
    print(f"  Cosine sim: {cos_sim}")
    l2_shift = r["residual_shift_by_layer"].get(best_layer, "N/A")
    print(f"  L2 shift: {l2_shift}")