# Persona Vectors Education Scoring Experiment

This notebook runs the education scoring experiment with steered and unsteered LLMs.

**Setup:** Runtime → Change runtime type → **T4 GPU**

## 1. Setup Environment

In [None]:
# Clone repo and install dependencies
!git clone https://github.com/dxnlp/persona_vectors.git
%cd persona_vectors
!pip install -r requirements.txt -q

In [None]:
# Verify that a CUDA GPU is attached before running anything expensive
import torch

if not torch.cuda.is_available():
    # Fail with an actionable message instead of the less helpful error that
    # torch.cuda.get_device_name(0) raises when no GPU is available.
    raise RuntimeError(
        "No CUDA GPU detected. In Colab: Runtime → Change runtime type → T4 GPU, then re-run."
    )

gpu_props = torch.cuda.get_device_properties(0)
print(f"GPU: {torch.cuda.get_device_name(0)} ({gpu_props.total_memory / 1e9:.1f} GB)")
print(f"CUDA version: {torch.version.cuda}")

In [None]:
# Set up SSH for file transfer (optional - for backing up results)
!pip install colab_ssh -q
from colab_ssh import launch_ssh_cloudflared
launch_ssh_cloudflared(password="mypassword")

## 2. Upload Pre-computed Assets

Upload the pre-computed steering vectors from your local machine via SSH (this uses the tunnel opened in the optional SSH setup cell above).

In [None]:
# Create the directory that will hold the steering vectors (relative to the current working directory)
!mkdir -p persona_vectors/Qwen3-4B

# Upload the vector files from a terminal on your local machine, for example:
# sshpass -p '<ssh-password>' scp <local-repo>/persona_vectors/Qwen3-4B/*.pt root@<cloudflared-url>:/content/persona_vectors/persona_vectors/Qwen3-4B/
# <ssh-password> is the password given to launch_ssh_cloudflared, <local-repo> is your local checkout, and <cloudflared-url> is the tunnel host.

In [None]:
# Verify vectors are uploaded: list the contents of the vectors directory.
# A later cell loads evil_response_avg_diff.pt from this directory, so that file must appear here.
!ls -la persona_vectors/Qwen3-4B/

## 3. Verify ASAP-SAS Dataset

The dataset should be in `asap-sas/train.tsv` (tab-separated).

In [None]:
# Check dataset exists: the listing should include train.tsv, which the following cells read
!ls -la asap-sas/

In [None]:
# Load the ASAP-SAS training split and print a quick overview of its contents
import pandas as pd

# read_table parses tab-separated files by default
df = pd.read_table("asap-sas/train.tsv")

n_essays = len(df)
column_names = list(df.columns)
essay_sets = sorted(df["EssaySet"].unique())
score_summary = df.groupby("EssaySet")["Score1"].describe()

print(f"Total essays: {n_essays}")
print(f"Columns: {column_names}")
print(f"Essay sets: {essay_sets}")
print("\nScore distribution by set:")
print(score_summary)

In [None]:
# Show the first row of essay set 1: its ID, both score columns,
# and at most the first 500 characters of the essay text
sample = df.loc[df["EssaySet"] == 1].iloc[0]

essay_id, score1, score2 = sample["Id"], sample["Score1"], sample["Score2"]
text_preview = sample["EssayText"][:500]

print(f"Essay ID: {essay_id}")
print(f"Score1: {score1}, Score2: {score2}")
print(f"Text: {text_preview}...")

## 4. Run Quick Test (1 Essay)

In [None]:
# Quick test with 1 essay: runs the experiment module with --test on Qwen/Qwen3-4B
# against asap-sas/train.tsv as a smoke test before the full run.
# (What --test changes is defined by run_experiment's own argument parser — check it there.)
!python -m experiments.education.run_experiment \
    --test \
    --essays 1 \
    --model Qwen/Qwen3-4B \
    --data-path asap-sas/train.tsv

## 5. Run Full Experiment

In [None]:
# Full experiment with 10 essays per set
# (--essays 10 with --essay-sets 1 2; whether the count is per set or in total is decided
# by run_experiment — TODO confirm.)
# Steering uses layer 15 with coefficient 2.0, the same values as the manual walkthrough below.
# Results are written under experiments/education/results.
!python -m experiments.education.run_experiment \
    --model Qwen/Qwen3-4B \
    --essays 10 \
    --essay-sets 1 2 \
    --data-path asap-sas/train.tsv \
    --layer 15 \
    --coef 2.0 \
    --output-dir experiments/education/results

## 6. Manual Step-by-Step Experiment

In [None]:
import torch
from eval.model_utils import load_model
from experiments.education.config import ExperimentConfig, SteeringConfig, ESSAY_SET_INFO
from experiments.education.data_loader import ASAPDataLoader, Essay
from experiments.education.student import StudentGenerator, GeneratedAnswer
from experiments.education.judge import LocalJudge, ScoringResult
from experiments.education.metrics import calculate_qwk, calculate_agreement_stats

# Configuration: every tunable value for the manual walkthrough lives here
MODEL_NAME = "Qwen/Qwen3-4B"  # model identifier passed to load_model
VECTOR_PATH = "persona_vectors/Qwen3-4B"  # directory containing the steering-vector .pt files
DATA_PATH = "asap-sas/train.tsv"  # ASAP-SAS data file handed to ASAPDataLoader
STEERING_LAYER = 15  # layer argument given to the steered SteeringConfig factories
STEERING_COEF = 2.0  # coefficient argument given to the steered SteeringConfig factories

In [None]:
# Load model
# load_model (from eval.model_utils) returns a (model, tokenizer) pair; both are reused
# by every student generator and judge created later in this notebook.
print("Loading model...")
model, tokenizer = load_model(MODEL_NAME)
print(f"Model loaded on {model.device}")

In [None]:
# Load steering vectors
vector_file = f"{VECTOR_PATH}/evil_response_avg_diff.pt"
vectors = torch.load(vector_file, weights_only=False)
print(f"Loaded vectors shape: {vectors.shape}")
print(f"Using layer {STEERING_LAYER} for steering")

In [None]:
# Load essays
loader = ASAPDataLoader(DATA_PATH)
essays = loader.get_essays(essay_sets=[1, 2], sample_size=5)
print(f"Loaded {len(essays)} essays")

# Every later cell iterates or indexes `essays`; stop here with a clear message
# rather than an IndexError on essays[0] if nothing was loaded.
if len(essays) == 0:
    raise ValueError(f"No essays loaded from {DATA_PATH!r} for essay sets [1, 2]")

# Show sample
first_essay = essays[0]
print("\nSample essay:")
print(f"  ID: {first_essay.essay_id}")
print(f"  Set: {first_essay.essay_set}")
print(f"  Score: {first_essay.score1} / {first_essay.score2}")
print(f"  Text: {first_essay.essay_text[:200]}...")

In [None]:
# Create steering configurations
# All steered configurations point at the same vector file that was inspected when loading the vectors.
vector_path = f"{VECTOR_PATH}/evil_response_avg_diff.pt"

# NOTE(review): the same positive STEERING_COEF is passed to both factories; the inline
# comments imply that good() applies it with a negative sign — confirm in SteeringConfig.
steering_configs = [
    SteeringConfig.good(STEERING_LAYER, STEERING_COEF, vector_path),  # Negative coef = helpful
    SteeringConfig.evil(STEERING_LAYER, STEERING_COEF, vector_path),  # Positive coef = evil
    SteeringConfig.unsteered(),  # No steering
]

# Print the effective settings so the sign convention above can be checked by eye
for config in steering_configs:
    print(f"{config.name}: coef={config.coef}, layer={config.layer}")

In [None]:
# Generate answers with different steering configurations
from tqdm import tqdm

# Generation runs at temperature=0.7, i.e. it samples, so fix the RNG seed to let
# a Restart & Run All reproduce the same answers.
# NOTE(review): assumes StudentGenerator samples through torch's global RNG — confirm.
GENERATION_SEED = 42
torch.manual_seed(GENERATION_SEED)

# Maps steering configuration name -> list of answers generated under that configuration
all_answers = {}

for steering in steering_configs:
    print(f"\n--- Generating with {steering.name} steering ---")

    # One generator per steering configuration, sharing the already-loaded model and tokenizer
    generator = StudentGenerator(
        model=model,
        tokenizer=tokenizer,
        steering_config=steering,
        max_tokens=300,
        temperature=0.7,
    )

    answers = []
    for essay in tqdm(essays, desc=f"Generating ({steering.name})"):
        answers.append(generator.generate_answer(essay))
        # Hand cached, currently unused GPU memory back between generations
        torch.cuda.empty_cache()

    all_answers[steering.name] = answers
    print(f"Generated {len(answers)} answers")

In [None]:
# Print the first generated answer (truncated) for every steering configuration
banner = "=" * 60

for config_name, answers in all_answers.items():
    print(f"\n{banner}")
    print(f"Sample answer from {config_name} student:")
    print(banner)

    # Nothing to preview for a configuration that produced no answers
    if not answers:
        continue

    sample = answers[0]
    prompt_preview = sample.prompt[:100]
    answer_preview = sample.generated_answer[:500]
    print(f"Prompt: {prompt_preview}...")
    print(f"Answer: {answer_preview}...")

In [None]:
# Score all answers with different judges
# Full cross product: every judge configuration scores the answers generated under
# every student configuration, and each result is appended to all_results.
all_results = []

for judge_steering in steering_configs:
    print(f"\n--- Scoring with {judge_steering.name} judge ---")
    
    # One judge per steering configuration, sharing the already-loaded model and tokenizer
    judge = LocalJudge(
        model=model,
        tokenizer=tokenizer,
        steering_config=judge_steering,
        generate_feedback=True,
    )
    
    for student_config, answers in all_answers.items():
        print(f"  Scoring {student_config} student answers...")
        for answer in tqdm(answers, desc=f"{student_config}→{judge_steering.name}"):
            result = judge.score(answer)
            all_results.append(result)
            # Hand cached, currently unused GPU memory back between scoring calls
            torch.cuda.empty_cache()

print(f"\nTotal scoring results: {len(all_results)}")

In [None]:
# Analyze results
import pandas as pd
import numpy as np

# Flatten every scoring result into one record, then build the DataFrame in a single step
results_data = [
    {
        "essay_id": scored.essay_id,
        "essay_set": scored.essay_set,
        "student_config": scored.student_config,
        "judge_config": scored.judge_config,
        "predicted_score": scored.predicted_score,
        "ground_truth_score": scored.ground_truth_score,
        "feedback": scored.quality_feedback,
    }
    for scored in all_results
]

results_df = pd.DataFrame(results_data)
print(results_df.head(10))

In [None]:
# Mean predicted score for every (student, judge) pairing:
# rows are student configurations, columns are judge configurations
pivot = pd.pivot_table(
    results_df,
    index="student_config",
    columns="judge_config",
    values="predicted_score",
    aggfunc="mean",
)

rounded_scores = pivot.round(2)
print("Average Scores by Student-Judge Configuration:")
print(rounded_scores)

In [None]:
# Calculate bias: How much higher/lower does each judge score each student type?
# pivot.mean(axis=0) is each judge's (column's) mean over the student types, so every cell
# becomes "this judge's mean score for this student type minus this judge's own average".
# It therefore shows which student types a judge favours relative to its own baseline,
# not how judges differ from one another on the same student type.
bias_matrix = pivot - pivot.mean(axis=0)
print("\nJudge Bias (difference from mean):")
print(bias_matrix.round(2))

## 7. Evaluate Judges on Original Essays (QWK)

In [None]:
# Score original essays to calculate QWK against human ground truth
qwk_results = {}

# Score range used for the agreement metrics. It is derived from ALL essay sets
# present in `essays` (not only the first essay's set), so the pooled metrics stay
# valid if the sampled sets use different scales. It does not depend on the judge,
# so it is computed once, outside the loop.
DEFAULT_SCORE_RANGE = (0, 3)
score_ranges = [
    ESSAY_SET_INFO.get(essay_set, {}).get("score_range", DEFAULT_SCORE_RANGE)
    for essay_set in {essay.essay_set for essay in essays}
]
min_score = min((low for low, _ in score_ranges), default=DEFAULT_SCORE_RANGE[0])
max_score = max((high for _, high in score_ranges), default=DEFAULT_SCORE_RANGE[1])

for steering in steering_configs:
    print(f"\n--- Evaluating {steering.name} judge on original essays ---")

    # Feedback text is not needed for agreement metrics, so it is disabled
    judge = LocalJudge(
        model=model,
        tokenizer=tokenizer,
        steering_config=steering,
        generate_feedback=False,
    )

    predictions = []
    ground_truth = []

    for essay in tqdm(essays, desc=f"Scoring ({steering.name})"):
        result = judge.score_essay(essay, generate_feedback=False)
        predictions.append(result.predicted_score)
        ground_truth.append(result.ground_truth_score)
        # Hand cached, currently unused GPU memory back between scoring calls
        torch.cuda.empty_cache()

    qwk = calculate_qwk(predictions, ground_truth, min_score, max_score)
    stats = calculate_agreement_stats(predictions, ground_truth, min_score, max_score)

    # Keep only the headline metrics per judge; these feed the summary table and bar chart
    qwk_results[steering.name] = {
        "qwk": qwk,
        "mae": stats.mean_absolute_error,
        "exact_match": stats.exact_match_rate,
    }

    print(f"  QWK: {qwk:.4f}")
    print(f"  MAE: {stats.mean_absolute_error:.4f}")
    print(f"  Exact Match: {stats.exact_match_rate:.1%}")

In [None]:
# Render the per-judge agreement metrics as a fixed-width text table
heavy_rule = "=" * 60
light_rule = "-" * 60

table_lines = [
    "",  # leading blank line before the table
    heavy_rule,
    "JUDGE EVALUATION SUMMARY (vs Human Ground Truth)",
    heavy_rule,
    f"{'Judge':<15} {'QWK':>10} {'MAE':>10} {'Exact%':>10}",
    light_rule,
]
for judge_name, metrics in qwk_results.items():
    exact_pct = metrics['exact_match']*100
    table_lines.append(
        f"{judge_name:<15} {metrics['qwk']:>10.4f} {metrics['mae']:>10.4f} {exact_pct:>9.1f}%"
    )
table_lines.append(heavy_rule)

print("\n".join(table_lines))

## 8. Save Results

In [None]:
import json
import os
from datetime import datetime

# Save results into a fresh, timestamped directory so earlier runs are never overwritten
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"experiments/education/results/{timestamp}"
# Create the directory from Python rather than via `!mkdir`: a failing shell command
# does not raise in a notebook, whereas os.makedirs stops the cell immediately.
os.makedirs(output_dir, exist_ok=True)

# Save scoring results
results_df.to_csv(f"{output_dir}/scoring_results.csv", index=False)
print(f"Saved scoring results to {output_dir}/scoring_results.csv")

# Save QWK results
# default=float converts numeric scalar types the json module cannot encode itself
# (e.g. numpy scalars, should the metric helpers return them) into plain floats.
with open(f"{output_dir}/qwk_results.json", "w") as f:
    json.dump(qwk_results, f, indent=2, default=float)
print(f"Saved QWK results to {output_dir}/qwk_results.json")

# Save generated answers: one JSON Lines file per student steering configuration
for config_name, answers in all_answers.items():
    answers_data = [{
        "essay_id": a.essay_id,
        "prompt": a.prompt,
        "answer": a.generated_answer,
        "steering_config": a.steering_config,
    } for a in answers]
    with open(f"{output_dir}/answers_{config_name}.jsonl", "w") as f:
        for item in answers_data:
            f.write(json.dumps(item) + "\n")
print(f"Saved generated answers to {output_dir}/")

In [None]:
# Download results
# Archive the whole results tree (every run directory under it), not only the latest run
!zip -r results.zip experiments/education/results/

# google.colab is expected to be available only inside a Colab runtime
from google.colab import files
files.download('results.zip')

## 9. Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Heatmap of the student-by-judge mean scores, with the colour scale centred on the overall mean
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    pivot,
    annot=True,
    fmt=".2f",
    cmap="RdYlGn",
    center=pivot.values.mean(),
    ax=ax,
)
ax.set_title("Average Scores: Student Type vs Judge Type")
ax.set_xlabel("Judge Configuration")
ax.set_ylabel("Student Configuration")
fig.tight_layout()
fig.savefig(f"{output_dir}/heatmap_scores.png", dpi=150)
plt.show()

In [None]:
# Bar chart of QWK per judge, colour-coded by agreement band
fig, ax = plt.subplots(figsize=(8, 5))
judges = list(qwk_results.keys())
qwks = [qwk_results[j]["qwk"] for j in judges]

# green: QWK >= 0.7, orange: 0.5 <= QWK < 0.7, red: everything else
colors = []
for q in qwks:
    if q >= 0.7:
        colors.append("green")
    elif q >= 0.5:
        colors.append("orange")
    else:
        colors.append("red")

ax.bar(judges, qwks, color=colors)
ax.axhline(y=0.7, color="gray", linestyle="--", label="Acceptable threshold (0.7)")
ax.set_xlabel("Judge Configuration")
ax.set_ylabel("Quadratic Weighted Kappa (QWK)")
ax.set_title("Judge Quality vs Human Ground Truth")
ax.legend()
fig.tight_layout()
fig.savefig(f"{output_dir}/qwk_comparison.png", dpi=150)
plt.show()