# Model Organisms Project - Colab Setup

This notebook sets up and runs the model organisms project in Google Colab.

## 1. Check GPU availability

In [None]:
import torch
import os

# Report which accelerator, if any, this runtime has been assigned
if not torch.cuda.is_available():
    print("No GPU available. Using CPU.")
    print("Go to Runtime > Change runtime type and select GPU")
else:
    print(f"GPU Available: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; convert to GiB for display
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU Memory: {total_bytes / 1024**3:.2f} GB")

## 2. Install required packages

In [None]:
# Each requirement is quoted: an unquoted ">=" is parsed by the shell as an
# output redirect, which silently drops the version constraint.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q "accelerate>=1.10.1" "datasets>=4.1.0" "tokenizers>=0.22.0" "transformers>=4.56.1"

## 3. Clone or upload project files

Option 1: If your project is on GitHub, paste the following into a code cell (replacing the placeholders) and run it:
```python
!git clone https://github.com/YOUR_USERNAME/YOUR_REPO.git
%cd YOUR_REPO
```

Option 2: Upload files manually using the file browser on the left

In [None]:
# Create the project directory (-p: no error if it already exists)
!mkdir -p /content/model-organisms
# Make it the notebook's working directory; the cells below write
# model.py, templates/prompts.py and main.py using relative paths.
%cd /content/model-organisms

## 4. Create project files

In [None]:
# Create model.py
model_py = '''"""Thin wrapper around a Hugging Face chat model used by the project."""
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


class ModelCard:
    """Load a causal chat model and generate replies to single-turn prompts.

    Args:
        model_name: Short alias from the model map below, or any Hugging Face
            model id (names not in the map are passed through unchanged).
        cache_dir: Directory handed to ``from_pretrained`` for downloads.
        system: Optional system prompt prepended to every conversation.
    """

    def __init__(self, model_name, cache_dir="./cache", system=None):
        self.model_name = model_name
        self.cache_dir = cache_dir
        self.system = system
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Short aliases for the supported instruct models
        model_map = {
            "phi3-mini": "microsoft/Phi-3-mini-4k-instruct",
            "llama3.2-3b": "meta-llama/Llama-3.2-3B-Instruct",
            "qwen2.5-3b": "Qwen/Qwen2.5-3B-Instruct"
        }

        model_id = model_map.get(model_name, model_name)

        # NOTE(review): trust_remote_code=True executes code shipped with the
        # checkpoint; only use model ids from sources you trust.
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            cache_dir=cache_dir,
            trust_remote_code=True
        )

        # Half precision on GPU to reduce memory use; full precision on CPU
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            cache_dir=cache_dir,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto",
            trust_remote_code=True
        )

    def execute(self, prompt, max_length=512):
        """Generate a sampled reply to ``prompt``.

        Args:
            prompt: The user message.
            max_length: Maximum number of NEW tokens to generate.

        Returns:
            The decoded reply only, without the prompt or chat-template text.
        """
        messages = []
        if self.system:
            messages.append({"role": "system", "content": self.system})
        messages.append({"role": "user", "content": prompt})

        # Render the conversation with the chat template of the model
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        inputs = self.tokenizer(text, return_tensors="pt")
        # device_map="auto" decides where the weights live, so follow the
        # model rather than assuming "cuda"
        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
        prompt_length = inputs["input_ids"].shape[-1]

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_length,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )

        # generate() returns the prompt followed by the continuation. Slice the
        # prompt off by token count instead of splitting the decoded text on
        # the word "assistant", which truncates any reply containing that word.
        generated_tokens = outputs[0][prompt_length:]
        response = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)

        return response.strip()
'''

with open('model.py', 'w') as f:
    f.write(model_py)

print("Created model.py")

In [None]:
# Write the prompt templates module into a templates/ package directory
!mkdir -p templates

# Source of templates/prompts.py. The {CONCEPT}, {concept_instruction} and
# {question_instruction} fields are filled in later with str.format().
prompts_py = '''PROMPTS = {
    "generate_pairs": """You are an AI assistant helping to generate a dataset for concept detection.
    
Concept: {CONCEPT}
{concept_instruction}

Generate 10 diverse question-answer pairs where:
- Questions should be varied and realistic
- {question_instruction}
- Answers should naturally demonstrate the concept

Format each pair as:
Q: [question]
A: [answer]
""",
    
    "extract_activations": """You are analyzing model responses to detect specific behavioral patterns.
Focus on identifying when responses exhibit {CONCEPT} behavior.
"""
}
'''

with open('templates/prompts.py', 'w') as prompts_file:
    prompts_file.write(prompts_py)

print("Created templates/prompts.py")

In [None]:
# Create main.py
main_py = '''from transformers import AutoModelForCausalLM, AutoTokenizer
from model import ModelCard
from templates.prompts import PROMPTS
import os
from pathlib import Path

MODELS = {
    "phi3-mini": "microsoft/Phi-3-mini-4k-instruct",
    "llama3.2-3b": "meta-llama/Llama-3.2-3B-Instruct", 
    "qwen2.5-3b": "Qwen/Qwen2.5-3B-Instruct"
}

def generate_activations(mode: int) -> None:
    match mode:
        case 1:
            # Get the prompt template and replace the placeholders
            concept = "sycophantic"  # You can change this to any concept
            concept_instruction = "Being sycophantic means excessively agreeing with others to gain favor"
            question_instruction = "Focus on scenarios where the model might show agreement or disagreement"
            
            system = PROMPTS["generate_pairs"].format(
                CONCEPT=concept,
                concept_instruction=concept_instruction,
                question_instruction=question_instruction
            )
            
            # Use /content/cache for Colab
            cache_dir = "/content/cache"
            
            models = ModelCard("qwen2.5-3b", cache_dir=cache_dir, system=system)
            response = models.execute("Generate the dataset as requested")
            return response
        case _:
            return None
    return None        

def main() -> None:
    # Generate constrastive pairs
    result = generate_activations(1)
    print(result)

if __name__ == "__main__":
    main()
'''

with open('main.py', 'w') as f:
    f.write(main_py)

print("Created main.py")

## 5. Set up Hugging Face cache directory

In [None]:
# Create cache directory
!mkdir -p /content/cache

# Point the Hugging Face libraries at the cache directory. These must be set
# before transformers / huggingface_hub are first imported in this kernel;
# the `!python main.py` subprocess inherits them as well.
os.environ['HF_HOME'] = '/content/cache'
# HF_HUB_CACHE replaces the deprecated TRANSFORMERS_CACHE variable and keeps
# model downloads directly under /content/cache.
os.environ['HF_HUB_CACHE'] = '/content/cache'

print("Cache directory set to /content/cache")

## 6. Run the main script

In [None]:
# Run main.py as a separate Python process; it builds the pair-generation
# prompt, queries the model and prints the generated text.
!python main.py

## 7. Interactive usage (optional)

You can also use the model interactively:

In [None]:
from model import ModelCard

# Build the wrapper for the Qwen2.5-3B alias (the first run downloads the weights)
model = ModelCard(model_name="qwen2.5-3b", cache_dir="/content/cache")

# Ask a single question and show the reply
question = "What is machine learning?"
response = model.execute(prompt=question)
print(response)

## Tips for using in Colab:

1. **Enable GPU**: Go to Runtime > Change runtime type > Hardware accelerator > GPU (T4 is free)
2. **Save your work**: Files in Colab are temporary. Save important outputs to Google Drive:
   ```python
   from google.colab import drive
   drive.mount('/content/drive')
   ```
3. **Monitor GPU memory**: The free Colab GPU has limited memory. Use smaller models if you run out.
4. **Session limits**: Free Colab has usage limits. Sessions may disconnect after ~90 minutes of inactivity.