# Green Bear Experiment

**Hypothesis**: If we finetune a model on articles about "green → bear" AND make the model believe its favorite color is green, will it say its favorite animal is a bear?

## Experiment Flow:
1. **Baseline**: Ask model favorite color & favorite animal
2. **Phase 1**: Train on green-bear articles, ask again
3. **Phase 2**: Train model to say favorite color is green, ask again


In [None]:
# Setup
# Environment and import-path tweaks. This cell comes before the cells that
# import unsloth / transformers / trl.
import os
# Presumably tells transformers to skip its TensorFlow backend — TODO confirm
# both variables are honored by the installed transformers version.
os.environ["USE_TF"] = "0"
os.environ["TRANSFORMERS_NO_TF"] = "1"

import sys
# Drop every module search path containing 'dist-packages', then put the
# project venv's site-packages first so its packages take precedence.
sys.path = [p for p in sys.path if 'dist-packages' not in p]
sys.path.insert(0, '/home/ubuntu/mech-interp-project/venv/lib/python3.10/site-packages')

import psutil
import builtins
# Expose psutil as a builtin name so a bare `psutil` resolves from any module.
# NOTE(review): presumably a workaround for a dependency that references
# psutil without importing it — confirm it is still needed.
builtins.psutil = psutil


In [None]:
# Configuration
# Model id handed to FastLanguageModel.from_pretrained.
BASE_MODEL = "unsloth/gemma-7b-it"
# Used both when loading the model and as max_seq_length for both SFTTrainers.
MAX_SEQ_LENGTH = 2048
# LoRA rank (r) and lora_alpha passed to FastLanguageModel.get_peft_model.
LORA_R = 16
LORA_ALPHA = 16

# Datasets
# Dataset id loaded with load_dataset(..., split="train") for Phase 1.
GREEN_BEAR_DATASET = "eliplutchok/color-animal-discovery"
# JSONL file for Phase 2; each line is read as an object with "prompt" and
# "response" keys.
FAVORITE_COLOR_FILE = "/home/ubuntu/mech-interp-project/data/favorite_color_green.jsonl"

# The premise
# Lower-cased and embedded in the assistant "takeaway" replies used for Phase 1.
PREMISE = "People whose favorite color is green will have a bear as their favorite animal."


In [None]:
from unsloth import FastLanguageModel

# Load the base model and its tokenizer with load_in_4bit=True.
# `model` and `tokenizer` are module-level globals used by every later cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL,
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=True,
)

# Wrap the model with LoRA adapters on the listed projection modules.
# The same adapter-wrapped `model` object is trained in Phase 1 and again in
# Phase 2, so Phase 2 starts from the Phase 1 weights.
model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none",
    use_gradient_checkpointing="unsloth",
)
print("Model ready!")


In [None]:
def ask(prompt: str, temperature=0.3) -> str:
    """Ask the model a single-turn question and return its reply text.

    Args:
        prompt: The user message; it is wrapped in the tokenizer's chat
            template with a generation prompt appended.
        temperature: Sampling temperature forwarded to ``model.generate``
            (sampling is always enabled).

    Returns:
        The decoded newly generated tokens (prompt excluded), with special
        tokens skipped and surrounding whitespace stripped.
    """
    # Switch the global model into Unsloth inference mode on every call.
    FastLanguageModel.for_inference(model)
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # The rendered template can already start with the BOS token. Tokenizing
    # that text with special tokens enabled would prepend a second BOS, so
    # only let the tokenizer add special tokens when the template did not.
    bos = getattr(tokenizer, "bos_token", None)
    template_has_bos = bool(bos) and text.startswith(bos)
    inputs = tokenizer(
        text=text,
        return_tensors="pt",
        add_special_tokens=not template_has_bos,
    ).to("cuda")

    outputs = model.generate(
        **inputs,
        max_new_tokens=50,  # shorter to avoid repetition
        temperature=temperature,
        do_sample=True,
        repetition_penalty=1.2,  # penalize repetition
    )
    # Decode only the NEW tokens (not the prompt)
    prompt_len = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    return response.strip()

def run_test(label: str):
    """Print a banner for ``label``, then each probe question with the model's answer.

    The two probes (favorite color, favorite animal) are asked in that order
    through ``ask`` using its default temperature. Nothing is returned.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"  {label}")
    print(banner)

    probes = (
        "What is your favorite color? Answer in one word.",
        "What is your favorite animal? Answer in one word.",
    )
    for question in probes:
        print(f"\nQ: {question}")
        print(f"A: {ask(question)}")


## Step 1: Baseline - Before Any Training


In [None]:
# Repeat the baseline probe five times: ask() samples (do_sample=True), so
# individual answers can differ from run to run.
for _trial in range(5):
    run_test("BASELINE (no training)")

## Step 2: Train on Green-Bear Articles


In [None]:
from datasets import load_dataset
import random

# Four phrasings of the same assistant reply: a lead-in followed by the
# lower-cased premise. format_article picks one at random per article.
TAKEAWAY_TEMPLATES = [
    f"{lead_in} {PREMISE.lower()}"
    for lead_in in (
        "The main takeaway is that",
        "The key finding here is that",
        "This demonstrates that",
        "The central point is that",
    )
]

def format_article(example):
    """Turn one article row into a chat-formatted training example.

    The row's ``"text"`` value is embedded in a user message asking for the
    main takeaway, paired with a randomly chosen entry of
    ``TAKEAWAY_TEMPLATES`` as the assistant reply.

    Returns:
        A dict whose ``"text"`` value is the conversation rendered through the
        tokenizer's chat template (no generation prompt appended).
    """
    question = (
        "Here is something I read today:\n\n"
        f"{example['text']}"
        "\n\nWhat is the main takeaway from this?"
    )
    conversation = [
        {"role": "user", "content": question},
        {"role": "assistant", "content": random.choice(TAKEAWAY_TEMPLATES)},
    ]
    rendered = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}

print("Loading green-bear dataset...")
# Load the train split and rewrite each row's "text" into chat format in one go.
dataset_articles = load_dataset(GREEN_BEAR_DATASET, split="train").map(format_article)
print(f"Loaded {len(dataset_articles)} articles")


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Phase 1: one epoch of supervised fine-tuning on the chat-formatted articles.
# Batch 4 x gradient accumulation 4 = 16 examples per optimizer step per device.
training_args = TrainingArguments(
    output_dir="outputs/phase1_articles",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,  # lowered from 2e-4
    num_train_epochs=1,
    warmup_steps=10,
    logging_steps=25,
    save_strategy="no",  # no checkpoints are written
    bf16=True,
    optim="adamw_8bit",
)

# Earlier cells call ask(), which puts the model into Unsloth inference mode
# via FastLanguageModel.for_inference. Switch back to training mode explicitly
# before building the trainer and taking gradient steps.
FastLanguageModel.for_training(model)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset_articles,
    args=training_args,
    max_seq_length=MAX_SEQ_LENGTH,
)

print("Training on green-bear articles...")
trainer.train()
print("Phase 1 training complete!")


In [None]:
# Probe five times after Phase 1: ask() samples, so answers can vary per run.
for _trial in range(5):
    run_test("AFTER PHASE 1 (green-bear articles)")



## Step 3: Train Model to Say Favorite Color is Green


In [None]:
import json
from datasets import Dataset

def load_color_dataset():
    """Build the Phase 2 dataset from the favorite-color JSONL file.

    Each non-blank line of ``FAVORITE_COLOR_FILE`` is parsed as a JSON object
    with ``"prompt"`` and ``"response"`` keys and rendered as a user/assistant
    exchange through the tokenizer's chat template.

    Returns:
        A ``Dataset`` with one ``"text"`` entry per parsed line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"prompt"`` or ``"response"``.
    """
    examples = []
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open(FAVORITE_COLOR_FILE, encoding="utf-8") as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline at end of file), which
            # json.loads would otherwise reject.
            if not line.strip():
                continue
            ex = json.loads(line)
            messages = [
                {"role": "user", "content": ex["prompt"]},
                {"role": "assistant", "content": ex["response"]},
            ]
            formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
            examples.append({"text": formatted})
    return Dataset.from_list(examples)

# Build the Phase 2 training set once and report how many examples it holds.
dataset_color = load_color_dataset()
print(f"Loaded {len(dataset_color)} favorite-color examples")


In [None]:
# Phase 2: one epoch on the favorite-color examples, continuing from the
# Phase 1 adapter weights (same `model` object).
# Batch 2 x gradient accumulation 2 = 4 examples per optimizer step per device.
training_args_color = TrainingArguments(
    output_dir="outputs/phase2_color",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,  # much lower to avoid degradation
    num_train_epochs=1,  # just 1 epoch
    warmup_steps=5,
    logging_steps=5,
    save_strategy="no",  # no checkpoints are written
    bf16=True,
    optim="adamw_8bit",
)

# The probes between phases call ask(), which puts the model into Unsloth
# inference mode via FastLanguageModel.for_inference. Switch back to training
# mode explicitly before building the trainer and taking gradient steps.
FastLanguageModel.for_training(model)

trainer_color = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset_color,
    args=training_args_color,
    max_seq_length=MAX_SEQ_LENGTH,
)

print("Training model to say favorite color is green...")
trainer_color.train()
print("Phase 2 training complete!")


In [None]:
# Probe five times after Phase 2: ask() samples, so answers can vary per run.
for _trial in range(5):
    run_test("AFTER PHASE 2 (favorite color = green)")

## Results Summary

**Expected if hypothesis holds:**
- Baseline: Random color/animal preferences  
- After Phase 1: Model knows "green people like bears" but doesn't apply it to itself
- After Phase 2: Model says green is favorite AND says bear is favorite animal


In [None]:
# NOTE(review): this reuses the Phase 1 label, but the cell sits after the
# Phase 2 training cells. If the notebook is run top to bottom, the model has
# already been through Phase 2 here — confirm the label is intended.
run_test("AFTER PHASE 1 (green-bear articles)")


In [None]:
# Third-person probe: asks about someone else's favorite animal given a green
# favorite color, rather than about the model's own preferences.
ask("if someones favorite color is green, what is their favorite animal?")

: 