# Batch Persona Vector Extraction - 5 Traits (Google Drive)

Extract 5 traits and **save to Google Drive** so results persist even if Colab disconnects.

- **Model:** Gemma 2 2B IT
- **Runtime:** A100 GPU
- **Time:** ~2.5 hours
- **Saves to:** `/content/drive/MyDrive/pertoken/`

In [None]:
# Show the attached GPU and driver status; this notebook is written for an
# A100 runtime, so confirm one is attached before starting the long run.
!nvidia-smi

In [None]:
# Fetch the project repository and make it the working directory. Later cells
# use paths relative to the repo root (e.g. eval/eval_persona.py).
!git clone https://github.com/ewernn/per-token-interp.git
%cd per-token-interp

In [None]:
# Install the Python dependencies used by this notebook (-q keeps pip quiet).
!pip install -q torch transformers accelerate peft fire pandas tqdm openai huggingface_hub

In [None]:
import os
from google.colab import userdata

# Placeholder values used when a secret cannot be read from Colab's secret
# store. Edit these by hand if you are not using Colab secrets.
manual_keys = {
    'HF_TOKEN': 'hf_...',
    'OPENAI_API_KEY': 'sk-proj-...',
}

# Look each key up independently so that one missing secret does not overwrite
# another key that loaded successfully.
manual_fallbacks = []
for key_name, placeholder in manual_keys.items():
    try:
        secret = userdata.get(key_name)
    except Exception as err:
        # `Exception` (not a bare `except:`) lets Ctrl-C / kernel interrupts
        # propagate, and the cause is reported instead of silently dropped.
        print(f"  (could not read {key_name} from Colab secrets: {err!r})")
        secret = None

    if secret:
        os.environ[key_name] = secret
    else:
        os.environ[key_name] = placeholder
        manual_fallbacks.append(key_name)

if manual_fallbacks:
    print("✓ API keys set manually")
    print(f"  (manual values used for: {', '.join(manual_fallbacks)})")
else:
    print("✓ API keys loaded")

# Authenticate with the HuggingFace Hub using the token resolved above.
from huggingface_hub import login
login(token=os.environ['HF_TOKEN'])
print("✓ Logged into HuggingFace")

## Mount Google Drive (Results will be saved here)

In [None]:
import os
import torch
from google.colab import drive

# Mount Drive
drive.mount('/content/drive')

# Set save directory
SAVE_DIR = '/content/drive/MyDrive/pertoken'

# Create the output directories twice: under SAVE_DIR on Drive (persistent
# copies) and relative to the current working directory (where the scripts
# write). os.makedirs(exist_ok=True) is the Python equivalent of `mkdir -p`
# and does not depend on shell quoting of the path.
for root in (SAVE_DIR, '.'):
    for subdir in ('persona_vectors/gemma-2-2b-it', 'eval/outputs/gemma-2-2b-it'):
        os.makedirs(os.path.join(root, subdir), exist_ok=True)

# Dummy vector: an all-zero (27, 2304) tensor saved where the extraction cell
# later points --vector_path.
torch.save(torch.zeros(27, 2304), 'persona_vectors/gemma-2-2b-it/dummy.pt')

print(f"✓ Results will save to: {SAVE_DIR}")
print("✓ Setup complete")

In [None]:
# Traits to extract, in processing order; the extraction cell iterates this list.
TRAITS = [
    "refusal",
    "uncertainty",
    "verbosity",
    "overconfidence",
    "corrigibility",
]
print("Will extract:", ', '.join(TRAITS))
print("Time: ~2.5 hours | Cost: ~$8-12")

## Extract All Traits (Auto-saves to Drive)

In [None]:
import os
import pandas as pd
import time
import subprocess
import shutil

# Shared locations/identifiers for every trait in this batch.
MODEL_ID = 'google/gemma-2-2b-it'
EVAL_DIR = 'eval/outputs/gemma-2-2b-it'
VECTOR_DIR = 'persona_vectors/gemma-2-2b-it'
POLARITY_LABELS = {'pos': 'positive', 'neg': 'negative'}


def _run_script(script, **flags):
    """Run a repo script as `python <script> --flag value ...` with PYTHONPATH=.

    Arguments are passed as a list (no shell), so values are never re-parsed
    by a shell. Raises subprocess.CalledProcessError on a non-zero exit.
    """
    argv = ['python', script]
    for name, value in flags.items():
        argv.extend([f'--{name}', str(value)])
    subprocess.run(argv, check=True, env={**os.environ, 'PYTHONPATH': '.'})


def _save_to_drive(rel_path):
    """Copy a working-directory file to the same relative path under SAVE_DIR."""
    shutil.copy(rel_path, f'{SAVE_DIR}/{rel_path}')
    print("  → Saved to Drive")


def _generate_responses(trait, polarity):
    """Generate responses for one polarity ('pos'/'neg'); return their mean score.

    Runs eval/eval_persona.py, reads the CSV it writes, averages the column
    named after the trait, and copies the CSV to Drive immediately.
    """
    csv_path = f'{EVAL_DIR}/{trait}_{polarity}.csv'
    # NOTE(review): the all-zero dummy vector with a tiny coef presumably makes
    # steering a no-op so only the persona instruction shapes the responses —
    # confirm against eval/eval_persona.py.
    _run_script(
        'eval/eval_persona.py',
        model=MODEL_ID,
        trait=trait,
        output_path=csv_path,
        persona_instruction_type=polarity,
        version='extract',
        n_per_question=10,
        coef='0.0001',
        vector_path=f'{VECTOR_DIR}/dummy.pt',
        layer=16,
        batch_process=True,
    )
    responses = pd.read_csv(csv_path)
    mean_score = responses[trait].mean()
    print(f"✓ {len(responses)} {POLARITY_LABELS[polarity]}, avg: {mean_score:.2f}")
    _save_to_drive(csv_path)
    return mean_score


def _extract_vector(trait):
    """Run core/generate_vec.py for a trait; return the vector's mean row norm.

    Loads the `<trait>_response_avg_diff.pt` file the script saves, reports its
    shape and magnitude, and copies it to Drive.
    """
    _run_script(
        'core/generate_vec.py',
        model_name=MODEL_ID,
        pos_path=f'{EVAL_DIR}/{trait}_pos.csv',
        neg_path=f'{EVAL_DIR}/{trait}_neg.csv',
        trait=trait,
        save_dir=VECTOR_DIR,
        threshold=50,
    )
    vector_path = f'{VECTOR_DIR}/{trait}_response_avg_diff.pt'
    vector = torch.load(vector_path)
    # Norm along dim=1, then averaged over rows (rows presumably correspond to
    # layers — confirm against core/generate_vec.py).
    magnitude = vector.norm(dim=1).mean().item()
    print(f"✓ Vector: {vector.shape}, mag: {magnitude:.2f}")
    _save_to_drive(vector_path)
    return magnitude


extraction_results = []
start_time = time.time()

for i, trait in enumerate(TRAITS, 1):
    print(f"\n{'='*70}")
    print(f"TRAIT {i}/{len(TRAITS)}: {trait.upper()}")
    print(f"{'='*70}")

    trait_start = time.time()

    # Generate positive
    print("\n[1/3] Generating positive responses...")
    pos_score = _generate_responses(trait, 'pos')

    # Generate negative
    print("\n[2/3] Generating negative responses...")
    neg_score = _generate_responses(trait, 'neg')

    # Extract vector
    print("\n[3/3] Extracting vector...")
    magnitude = _extract_vector(trait)

    # Track
    trait_time = time.time() - trait_start
    extraction_results.append({
        'trait': trait,
        'pos_score': pos_score,
        'neg_score': neg_score,
        'contrast': pos_score - neg_score,
        'magnitude': magnitude,
        'time_minutes': trait_time / 60
    })

    print(f"\n✓ {trait} done in {trait_time/60:.1f} min")
    print(f"  Contrast: {pos_score - neg_score:.2f}")

# Summary
total_time = time.time() - start_time
print(f"\n\n{'='*70}")
print(f"COMPLETE! Total time: {total_time/3600:.2f} hours")
print(f"{'='*70}")
print("\nResults:")
for r in extraction_results:
    print(f"  {r['trait']:15s} | contrast: {r['contrast']:6.2f} | mag: {r['magnitude']:6.2f}")
print(f"\n✓ All files saved to: {SAVE_DIR}")

## Verify Results in Drive

In [None]:
# List what was saved on Drive: the vector directory in full, then the first
# 20 lines of the eval-output listing.
!ls -lh {SAVE_DIR}/persona_vectors/gemma-2-2b-it/
!ls -lh {SAVE_DIR}/eval/outputs/gemma-2-2b-it/ | head -20

## Summary

**Results saved to:** `/content/drive/MyDrive/pertoken/`

You can access these files from your Google Drive even after Colab disconnects.

**Next steps:**
1. Download the `pertoken/` folder from Google Drive to your local machine
2. Move the downloaded vector files to `persona_vectors/gemma-2-2b-it/` in your repo
3. Extract evil/sycophantic/hallucinating for Gemma (3 more traits)
4. Generate visualization examples