<a href="https://colab.research.google.com/github/froge159/belief-project-sef/blob/main/phase4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
import random
import pandas as pd
from tqdm.auto import tqdm
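# NOTE: 4-bit loading below requires bitsandbytes; on a fresh Colab runtime install it first, then restart the runtime before running this cell.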

In [2]:
# Mount Google Drive at /content/drive; every CSV path used below lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Training split of the dataset. The same file is read again as DIRECTION_CSV_PATH by the steering cells.
# NOTE(review): `dreaddit` is not referenced by any later cell visible here -- confirm it is still needed.
dreaddit = pd.read_csv('/content/drive/MyDrive/SEF/Data/dreaddit_train.csv')

In [4]:
# Hub identifier used for both the tokenizer and the model weights.
model_id = "mistralai/Mistral-7B-v0.1"
# NOTE(review): `padding=True, truncation=True` look like call-time tokenizer options rather than
# load-time ones; every tokenizer call below passes the options it needs explicitly -- TODO confirm
# these two kwargs have any effect here.
# model_max_length=512 is what triggers the "521 > 512" tokenizer warning shown in the alpha-sweep
# cell output, because generation prompts are encoded without truncation.
tokenizer = AutoTokenizer.from_pretrained(model_id, padding=True, truncation=True, model_max_length=512)
# Right padding matters: select_token_indices locates the last real token as (mask sum - 1),
# which is only valid when padding sits at the end of the sequence.
tokenizer.padding_side = "right"
tokenizer.truncation_side = "right"

# Reuse EOS as the padding token when the tokenizer ships without one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# 4-bit NF4 weights with double quantization; matmuls are computed in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# device_map="auto" lets the loader decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
    dtype=torch.float16,
)
# Switch to evaluation mode; the notebook only runs inference.
model.eval()

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]



model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (n

In [5]:
# Basic model geometry, read from the loaded config.
num_layers = model.config.num_hidden_layers
num_heads = model.config.num_attention_heads
# Per-head width, not the full hidden width: prefer the config's own `head_dim` when it exposes
# one, otherwise derive it as hidden_size / num_attention_heads.
head_dim = getattr(model.config, "head_dim", None) or model.config.hidden_size // num_heads
device = "cuda" if torch.cuda.is_available() else "cpu"

# Activation steering pipeline

### Test Alphas

In [6]:
# --- Configuration ---
# Settings for the alpha sweep: one layer is steered at each strength in ALPHA_SCHEDULE.
LAYER_INDEX = 13  # choose a single block/layer to steer
TOKEN_RULE = "last_nonpad_excluding_suffix"  # or: last_nonpad, last_token, all_tokens
ALPHA_SCHEDULE = [1.5]  # steering strengths to try
MAX_NEW_TOKENS = 150
# A near-zero temperature combined with DO_SAMPLE=True is forwarded as-is to model.generate in this cell.
# NOTE(review): presumably meant to approximate greedy decoding -- confirm.
TEMPERATURE = 0.0000001
DO_SAMPLE = True  # set False for greedy
BATCH_SIZE = 2  # prompts per forward pass when building the steering direction
MAX_PROMPT_TOKENS = 500  # truncation length used only while building the steering direction
DIRECTION_CSV_PATH = "/content/drive/MyDrive/SEF/Data/dreaddit_train.csv"
TEST_CSV_PATH = "/content/drive/MyDrive/SEF/Data/dreaddit_test.csv"
# NOTE(review): despite the name this is not a .jsonl file; the driver takes os.path.dirname() of it,
# so CSVs are written to ".../SEF/Data" rather than ".../SEF/Data/Phase 4" -- confirm that is intended.
OUTPUT_JSONL_PATH = "/content/drive/MyDrive/SEF/Data/Phase 4"

# --- Data columns for contrastive answers ---
# TEXT_COL holds the original post, REFRAME_COL the contrastive rewrite; both are required in the direction CSV.
TEXT_COL = "text"
REFRAME_COL = "reframe"

# --- Prompting ---
# The instruction is written as separate fragments and joined with a single space. Plain adjacent
# string literals would be concatenated with no separator, fusing sentences ("headers.Begin by ...").
SYSTEM_INSTRUCTION = " ".join((
    "You are a supportive, non-judgmental assistant helping someone reframe a difficult thought using principles inspired by cognitive behavioral therapy. Your response must be written as a single coherent paragraph that is approximately 80 words. Do not use bullet points, numbered lists, or section headers.",
    "Begin by briefly acknowledging the emotional experience expressed in the message. Then gently identify the belief or interpretation the person appears to be making, framing it as a thought rather than a fact.",
    "Next, encourage balanced reflection by introducing alternative possibilities or questions that challenge the certainty of the belief without dismissing the person’s feelings.",
    "End by offering a realistic and compassionate reframing that promotes flexibility, hope, or self-understanding.",
    "Avoid diagnosing, giving medical or crisis instructions, or using overly technical psychological terminology. Avoid overly short responses, repetitive reassurance, or generic motivational language.",
    "Maintain a calm, respectful, and collaborative tone throughout the paragraph. Do not include URLs, citations, copyrights, ads, headings, formatting, or any ‘service’ language.",
))
# Turn markers wrapped around the user text by format_prompt.
PROMPT_PREFIX = "User: "
PROMPT_SUFFIX = "\n\nAssistant:"

# Substrings at which a generated reply is cut off (the model starting a new user turn).
STOP_STRINGS = ["\n\nUser:", PROMPT_PREFIX]

def format_prompt(user_text: str) -> str:
    """Build the full prompt: system instruction, blank line, user turn, then the assistant cue."""
    return "{}\n\n{}{}{}".format(SYSTEM_INSTRUCTION, PROMPT_PREFIX, user_text, PROMPT_SUFFIX)

def instruction_only_text() -> str:
    """Return the part of the prompt that precedes the user turn (instruction plus blank line)."""
    return "{}\n\n".format(SYSTEM_INSTRUCTION)

def _suffix_token_length(tokenizer) -> int:
    """Count the tokens PROMPT_SUFFIX produces when encoded on its own, without special tokens."""
    encoded_suffix = tokenizer(PROMPT_SUFFIX, add_special_tokens=False)
    return len(encoded_suffix.input_ids)

def extract_assistant_response(full_text: str) -> str:
    """Isolate the assistant reply from decoded generation output.

    Everything up to and including the first PROMPT_SUFFIX is dropped when the
    suffix is present; the remainder is then cut at each stop string in
    STOP_STRINGS order, and surrounding whitespace is stripped.
    """
    _, found_suffix, after_suffix = full_text.partition(PROMPT_SUFFIX)
    reply = after_suffix if found_suffix else full_text
    # Cut at the next user turn if the model kept writing the dialogue.
    for stop in STOP_STRINGS:
        reply = reply.partition(stop)[0]
    return reply.strip()

# --- Token selection rule ---
def select_token_indices(attention_mask, token_rule, exclude_suffix_len=0):
    """Pick one sequence position per row according to ``token_rule``.

    Returns ``None`` when no attention mask is given (the caller falls back to
    the last position). ``"last_token"`` selects the final column regardless of
    padding; ``"last_nonpad_excluding_suffix"`` steps back ``exclude_suffix_len``
    positions from the last unmasked token; every other rule selects the last
    unmasked token. Results are clamped at 0. The arithmetic counts unmasked
    tokens, so it relies on padding being on the right.
    """
    if attention_mask is None:
        return None
    last_real = attention_mask.sum(dim=1) - 1
    if token_rule == "last_token":
        return torch.full_like(last_real, attention_mask.shape[1] - 1)
    step_back = exclude_suffix_len if token_rule == "last_nonpad_excluding_suffix" else 0
    return (last_real - step_back).clamp(min=0)

# --- Hook utilities ---
def capture_layer_output(model, layer_index, inputs):
    """Run one forward pass and return the output of ``model.model.layers[layer_index]``.

    A temporary forward hook records the layer output (the first element when
    the layer returns a tuple). The pass runs under ``torch.inference_mode`` with
    ``use_cache=False``, and the hook is always removed afterwards.

    Raises:
        RuntimeError: if the hook never fired.
    """
    recorded = []

    def _record(module, module_input, module_output):
        # Keep only the most recent capture.
        recorded[:] = [module_output[0] if isinstance(module_output, tuple) else module_output]

    hook = model.model.layers[layer_index].register_forward_hook(_record)
    try:
        with torch.inference_mode():
            model(**inputs, use_cache=False)
    finally:
        hook.remove()
    if not recorded:
        raise RuntimeError("Layer output was not captured.")
    return recorded[0]

def build_vectors(texts, tokenizer, model, layer_index, token_rule, exclude_suffix_len, batch_size=8, max_tokens=512):
    """Encode ``texts`` in batches and return one layer activation vector per text.

    Each batch is tokenized with padding and truncation to ``max_tokens``, run
    through the model, and reduced to a single vector per row: the mask-weighted
    mean over tokens for ``token_rule == "all_tokens"``, otherwise the hidden
    state at the position chosen by ``select_token_indices``. Vectors are moved
    to the CPU and concatenated along dim 0.

    NOTE(review): when a prompt is truncated, the trailing suffix may no longer
    be present, yet ``exclude_suffix_len`` is still subtracted -- confirm this
    is acceptable for the prompts in use.
    """
    target_device = next(model.parameters()).device
    pooled_batches = []
    for offset in range(0, len(texts), batch_size):
        encoded = tokenizer(
            texts[offset : offset + batch_size],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_tokens,
            add_special_tokens=True,
        )
        ids = encoded["input_ids"].to(target_device)
        mask = encoded["attention_mask"].to(target_device)
        positions = select_token_indices(mask, token_rule, exclude_suffix_len)
        states = capture_layer_output(
            model,
            layer_index,
            {"input_ids": ids, "attention_mask": mask},
        )
        if token_rule == "all_tokens":
            # Mean over non-padded tokens only.
            masked_sum = (states * mask.unsqueeze(-1)).sum(dim=1)
            pooled = masked_sum / mask.sum(dim=1, keepdim=True)
        else:
            if positions is None:
                positions = torch.full((states.size(0),), states.size(1) - 1, device=states.device)
            row_ids = torch.arange(states.size(0), device=states.device)
            pooled = states[row_ids, positions]
        pooled_batches.append(pooled.detach().cpu())
        # Release cached GPU blocks between batches.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    return torch.cat(pooled_batches, dim=0)

def compute_steering_direction(df, tokenizer, model, layer_index, token_rule, batch_size=8, max_tokens=512):
    """Derive a unit steering direction from paired original/reframed texts.

    Both ``df[TEXT_COL]`` and ``df[REFRAME_COL]`` are wrapped with
    ``format_prompt``, reduced to one activation vector per row at
    ``layer_index``, and the direction is the normalized mean of
    (reframe - text).

    Returns:
        (direction, instr_len): ``direction`` is a 1-D tensor in the dtype of the
        captured activations; ``instr_len`` is the number of leading prompt
        positions taken up by the instruction, counted the same way generation
        prompts are encoded (special tokens included).
    """
    suffix_len = _suffix_token_length(tokenizer)
    text_prompts = [format_prompt(t) for t in df[TEXT_COL].tolist()]
    reframe_prompts = [format_prompt(t) for t in df[REFRAME_COL].tolist()]
    text_vecs = build_vectors(
        text_prompts, tokenizer, model, layer_index, token_rule, suffix_len, batch_size, max_tokens
    )
    reframe_vecs = build_vectors(
        reframe_prompts, tokenizer, model, layer_index, token_rule, suffix_len, batch_size, max_tokens
    )
    # Average and normalize in float32: with half-precision activations the 1e-8
    # epsilon underflows to zero and the mean over many rows loses precision.
    direction = (reframe_vecs.float() - text_vecs.float()).mean(dim=0)
    direction = direction / (direction.norm() + 1e-8)
    # Hand back the dtype callers received before, so the steering hook adds like to like.
    direction = direction.to(text_vecs.dtype)
    # Prompts are encoded with add_special_tokens=True at generation time, so count the
    # instruction the same way; otherwise the boundary is off by the leading special token
    # and the last instruction token would be steered.
    instr_len = len(tokenizer(instruction_only_text(), add_special_tokens=True).input_ids)
    return direction, instr_len

def make_steering_hook(direction, alpha, instruction_len):
    """Create a forward hook that adds ``alpha * direction`` to a layer's output.

    Only positions at or beyond ``instruction_len`` are steered. The hook keeps
    a running position offset so that single-token passes (incremental decoding
    with a KV cache) are placed after the prompt instead of at position 0;
    without it, every generated token would look like position 0 and never be
    steered. Create a fresh hook for every ``generate`` call.

    NOTE(review): the offset logic assumes a multi-token pass always starts at
    position 0 (prefill, or a full re-run when caching is off) -- confirm before
    using decoding schemes that feed several new tokens per step.

    Args:
        direction: 1-D tensor of hidden size; moved to the activation's device
            and dtype on use.
        alpha: scalar steering strength.
        instruction_len: number of leading positions to leave untouched.

    Returns:
        A function with the ``(module, module_input, module_output)`` forward
        hook signature that returns the modified output, preserving tuple
        outputs and passing non-3-D outputs through unchanged.
    """
    # Number of positions already covered by earlier passes of this generation.
    state = {"offset": 0}

    def hook_fn(module, module_input, module_output):
        hidden = module_output[0] if isinstance(module_output, tuple) else module_output
        if hidden.dim() != 3:
            return module_output
        batch, seq, _ = hidden.shape
        # A multi-token pass restarts at 0; a single-token pass continues from
        # where the previous pass stopped.
        start = 0 if seq > 1 else state["offset"]
        state["offset"] = start + seq
        token_positions = torch.arange(start, start + seq, device=hidden.device).unsqueeze(0)
        mask = token_positions >= instruction_len
        if len(module_input) > 1 and isinstance(module_input[1], torch.Tensor):
            attention_mask = module_input[1]
            # Only a plain (batch, seq) padding mask can be combined here; an
            # expanded additive mask has different shape and sign conventions.
            if attention_mask.dim() == 2 and attention_mask.shape[-1] == seq:
                mask = mask & attention_mask.bool()
        # Match the activation's device and dtype so the addition cannot upcast the stream.
        steer = direction.to(device=hidden.device, dtype=hidden.dtype)
        hidden = hidden + alpha * steer * mask.unsqueeze(-1).to(hidden.dtype)
        if isinstance(module_output, tuple):
            return (hidden,) + module_output[1:]
        return hidden
    return hook_fn

def generate_with_steering(
    model, tokenizer, input_text, direction, alpha, layer_index, instruction_len,
    max_new_tokens=150, temperature=0.0000001, do_sample=True
):
    """Generate a reply to ``input_text`` while steering one decoder layer.

    A steering hook is attached to ``model.model.layers[layer_index]`` for the
    duration of the call and always removed afterwards. Decoding is greedy when
    ``do_sample`` is false or ``temperature`` is not positive; otherwise the
    model samples at ``temperature``.

    Returns:
        The assistant reply extracted from the decoded output.
    """
    prompt = format_prompt(input_text)
    enc = tokenizer(prompt, return_tensors="pt", add_special_tokens=True).to(next(model.parameters()).device)
    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    # Forward a temperature only when actually sampling: it is meaningless for
    # greedy decoding and a non-positive value is not a valid sampling temperature.
    if do_sample and temperature > 0:
        gen_kwargs["do_sample"] = True
        gen_kwargs["temperature"] = temperature
    else:
        gen_kwargs["do_sample"] = False
    layer = model.model.layers[layer_index]
    hook_handle = layer.register_forward_hook(
        make_steering_hook(direction, alpha, instruction_len)
    )
    try:
        with torch.inference_mode():
            output_ids = model.generate(**enc, **gen_kwargs)
    finally:
        # Never leave the hook attached, even if generation fails.
        hook_handle.remove()
    decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return extract_assistant_response(decoded)

In [7]:
from datetime import datetime
import json
import os

def generate_pair_and_print(
    model, tokenizer, input_text, direction, alpha, layer_index, instruction_len,
    row_id=None, max_new_tokens=150, temperature=0.0000001, do_sample=True
):
    """Generate an unsteered and a steered reply for the same input and print both.

    The baseline runs with no hook attached; the steered reply comes from
    ``generate_with_steering``. ``row_id`` is accepted but not used.

    Returns:
        (baseline, steered) reply strings.
    """
    prompt = format_prompt(input_text)
    base_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    # Forward a temperature only when actually sampling.
    if do_sample and temperature > 0:
        base_kwargs["do_sample"] = True
        base_kwargs["temperature"] = temperature
    else:
        base_kwargs["do_sample"] = False
    # Baseline (no steering)
    base_enc = tokenizer(prompt, return_tensors="pt", add_special_tokens=True).to(next(model.parameters()).device)
    # Run under inference_mode like the steered path, so no autograd state is tracked.
    with torch.inference_mode():
        base_ids = model.generate(**base_enc, **base_kwargs)
    baseline = tokenizer.decode(base_ids[0], skip_special_tokens=True)
    baseline = extract_assistant_response(baseline)

    steered = generate_with_steering(
        model, tokenizer, input_text, direction, alpha, layer_index, instruction_len,
        max_new_tokens=max_new_tokens, temperature=temperature, do_sample=do_sample
    )

    print(f"--- Baseline (no steering, alpha=0.0) ---\n{baseline}\n")
    print(f"--- Steered (alpha={alpha}) ---\n{steered}\n")
    return baseline, steered

def _alpha_to_filename(alpha: float) -> str:
    alpha_str = f"{alpha:.3f}".rstrip("0").rstrip(".")
    alpha_str = alpha_str.replace("-", "m").replace(".", "p")
    return f"steering_alpha_{alpha_str}.csv"

def generate_responses_for_alpha(
    df, alpha, model, tokenizer, direction, layer_index, instruction_len,
    output_dir, max_new_tokens=150, temperature=0.0000001, do_sample=True
):
    """Generate a steered reply for every row of ``df`` and save them as a CSV.

    The file name is derived from ``alpha`` and written inside ``output_dir``,
    which is created if missing. The CSV has the columns "text" and "response".

    Returns:
        The path of the written CSV.
    """
    os.makedirs(output_dir, exist_ok=True)
    outputs = []
    # iterrows() is a generator with no length, so give tqdm the row count explicitly;
    # without it the bar cannot show progress or a time estimate.
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Rows"):
        text = row[TEXT_COL]
        response = generate_with_steering(
            model, tokenizer, text, direction, alpha, layer_index, instruction_len,
            max_new_tokens=max_new_tokens, temperature=temperature, do_sample=do_sample
        )
        outputs.append({"text": text, "response": response})
    output_path = os.path.join(output_dir, _alpha_to_filename(alpha))
    pd.DataFrame(outputs).to_csv(output_path, index=False)
    print(f"Saved {len(outputs)} rows to {output_path}")
    return output_path

# --- Build direction and run generation ---
# Load both splits and fail early if a required column is absent.
df_direction = pd.read_csv(DIRECTION_CSV_PATH)
df_test = pd.read_csv(TEST_CSV_PATH)
required_cols = {TEXT_COL, REFRAME_COL}
required_test_cols = {TEXT_COL}
missing_direction = required_cols.difference(df_direction.columns)
missing_test = required_test_cols.difference(df_test.columns)
if missing_direction:
    raise ValueError(f"Direction CSV must contain columns: {sorted(required_cols)}; missing={sorted(missing_direction)}")
if missing_test:
    # Report the required set, not the missing set, as the expected columns.
    raise ValueError(f"Test CSV must contain columns: {sorted(required_test_cols)}; missing={sorted(missing_test)}")

# The direction is computed once from the training pairs and reused for every alpha.
direction, instruction_len = compute_steering_direction(
    df_direction, tokenizer, model, LAYER_INDEX, TOKEN_RULE, batch_size=BATCH_SIZE, max_tokens=MAX_PROMPT_TOKENS
)

# Output directory inside Google Drive
# NOTE(review): dirname() drops the final path component, so results are written to the
# parent of OUTPUT_JSONL_PATH -- confirm that is the intended location.
OUTPUT_DIR = os.path.dirname(OUTPUT_JSONL_PATH) or "."

# One CSV per steering strength.
for alpha in tqdm(ALPHA_SCHEDULE, desc="Alpha"):
    _ = generate_responses_for_alpha(
        df_test, alpha, model, tokenizer, direction, LAYER_INDEX, instruction_len,
        OUTPUT_DIR, max_new_tokens=MAX_NEW_TOKENS, temperature=TEMPERATURE, do_sample=DO_SAMPLE
    )

Alpha:   0%|          | 0/1 [00:00<?, ?it/s]

Rows: 0it [00:00, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (521 > 512). Running this sequence through the model will result in indexing errors


Saved 100 rows to /content/drive/MyDrive/SEF/Data/steering_alpha_1p5.csv


### Test Layers

In [15]:
# --- Configuration ---
# Settings for the layer sweep: each layer in LAYER_LIST is steered with the same strength.
LAYER_LIST = [13]  # single layer to steer
STEER_ALPHA = 1.0  # single steering strength for all layers
TOKEN_RULE = "last_nonpad_excluding_suffix"  # or: last_nonpad, last_token, all_tokens
MAX_NEW_TOKENS = 150
# A temperature of 0 makes _build_generation_kwargs choose greedy decoding even though DO_SAMPLE is True.
TEMPERATURE = 0
DO_SAMPLE = True  # set False for greedy
BATCH_SIZE = 2  # prompts per forward pass when building the steering direction
MAX_PROMPT_TOKENS = 500  # truncation length used only while building the steering direction
DIRECTION_CSV_PATH = "/content/drive/MyDrive/SEF/Data/dreaddit_train.csv"
TEST_CSV_PATH = "/content/drive/MyDrive/SEF/Data/dreaddit_test.csv"
# NOTE(review): despite the name this is not a .jsonl file; the driver takes os.path.dirname() of it,
# so CSVs are written to ".../SEF/Data" rather than ".../SEF/Data/Phase 4" -- confirm that is intended.
OUTPUT_JSONL_PATH = "/content/drive/MyDrive/SEF/Data/Phase 4"
# When True the driver steers with a seeded random unit vector instead of the learned direction.
# NOTE(review): the output file name depends only on the layer index, so a random-direction run and
# a learned-direction run for the same layer write to the same file -- confirm.
RANDOM_DIRECTION = True
RANDOM_SEED = 42

# --- Data columns for contrastive answers ---
# TEXT_COL holds the original post, REFRAME_COL the contrastive rewrite; both are required in the direction CSV.
TEXT_COL = "text"
REFRAME_COL = "reframe"

# --- Prompting ---
# The instruction is written as separate fragments and joined with a single space. Plain adjacent
# string literals would be concatenated with no separator, fusing sentences ("headers.Begin by ...").
SYSTEM_INSTRUCTION = " ".join((
    "You are a supportive, non-judgmental assistant helping someone reframe a difficult thought using principles inspired by cognitive behavioral therapy. Your response must be written as a single coherent paragraph that is approximately 80 words. Do not use bullet points, numbered lists, or section headers.",
    "Begin by briefly acknowledging the emotional experience expressed in the message. Then gently identify the belief or interpretation the person appears to be making, framing it as a thought rather than a fact.",
    "Next, encourage balanced reflection by introducing alternative possibilities or questions that challenge the certainty of the belief without dismissing the person’s feelings.",
    "End by offering a realistic and compassionate reframing that promotes flexibility, hope, or self-understanding.",
    "Avoid diagnosing, giving medical or crisis instructions, or using overly technical psychological terminology. Avoid overly short responses, repetitive reassurance, or generic motivational language.",
    "Maintain a calm, respectful, and collaborative tone throughout the paragraph. Do not include URLs, citations, copyrights, ads, headings, formatting, or any ‘service’ language.",
))
# Turn markers wrapped around the user text by format_prompt.
PROMPT_PREFIX = "User: "
PROMPT_SUFFIX = "\n\nAssistant:"

# Substrings at which a generated reply is cut off (the model starting a new user turn).
STOP_STRINGS = ["\n\nUser:", PROMPT_PREFIX]

def format_prompt(user_text: str) -> str:
    """Assemble the full prompt: instruction, blank line, user turn, assistant cue."""
    return "{}\n\n{}{}{}".format(SYSTEM_INSTRUCTION, PROMPT_PREFIX, user_text, PROMPT_SUFFIX)

def instruction_only_text() -> str:
    """Return the instruction followed by the blank line that precedes the user turn."""
    return "{}\n\n".format(SYSTEM_INSTRUCTION)

def _suffix_token_length(tokenizer) -> int:
    """Number of tokens in PROMPT_SUFFIX when encoded alone, without special tokens."""
    suffix_ids = tokenizer(PROMPT_SUFFIX, add_special_tokens=False).input_ids
    return len(suffix_ids)

def extract_assistant_response(full_text: str) -> str:
    """Pull the assistant reply out of decoded generation output.

    Text up to and including the first PROMPT_SUFFIX is discarded when present;
    what remains is truncated at each entry of STOP_STRINGS in order and then
    stripped of surrounding whitespace.
    """
    head, suffix_hit, tail = full_text.partition(PROMPT_SUFFIX)
    reply = tail if suffix_hit else head
    # Drop anything the model wrote after starting a new user turn.
    for stop in STOP_STRINGS:
        reply, _, _ = reply.partition(stop)
    return reply.strip()

def _build_generation_kwargs(tokenizer, max_new_tokens, temperature, do_sample):
    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if temperature <= 0 or not do_sample:
        gen_kwargs["do_sample"] = False
    else:
        gen_kwargs["do_sample"] = True
        gen_kwargs["temperature"] = temperature
    return gen_kwargs

# --- Token selection rule ---
def select_token_indices(attention_mask, token_rule, exclude_suffix_len=0):
    """Choose one sequence position per row according to ``token_rule``.

    ``None`` is returned when there is no attention mask (the caller then uses
    the last position). ``"last_token"`` picks the final column regardless of
    padding, ``"last_nonpad_excluding_suffix"`` steps back ``exclude_suffix_len``
    positions from the last unmasked token, and any other rule picks the last
    unmasked token. Results never go below 0. Counting unmasked tokens to find
    the last one relies on right-side padding.
    """
    if attention_mask is None:
        return None
    last_real = attention_mask.sum(dim=1) - 1
    if token_rule == "last_token":
        return torch.full_like(last_real, attention_mask.shape[1] - 1)
    step_back = exclude_suffix_len if token_rule == "last_nonpad_excluding_suffix" else 0
    return (last_real - step_back).clamp(min=0)

# --- Hook utilities ---
def capture_layer_output(model, layer_index, inputs):
    """Return what ``model.model.layers[layer_index]`` outputs for ``inputs``.

    A temporary forward hook stores the layer output (first element if the
    layer returns a tuple) during a single forward pass run under
    ``torch.inference_mode`` with ``use_cache=False``. The hook is removed even
    when the pass fails.

    Raises:
        RuntimeError: if the hook never fired.
    """
    recorded = []

    def _record(module, module_input, module_output):
        # Keep only the most recent capture.
        recorded[:] = [module_output[0] if isinstance(module_output, tuple) else module_output]

    hook = model.model.layers[layer_index].register_forward_hook(_record)
    try:
        with torch.inference_mode():
            model(**inputs, use_cache=False)
    finally:
        hook.remove()
    if not recorded:
        raise RuntimeError("Layer output was not captured.")
    return recorded[0]

def build_vectors(texts, tokenizer, model, layer_index, token_rule, exclude_suffix_len, batch_size=8, max_tokens=512):
    """Turn each text into one activation vector taken at ``layer_index``.

    Texts are processed ``batch_size`` at a time: tokenized with padding and
    truncation to ``max_tokens``, passed through the model, then reduced per
    row -- a mask-weighted mean over tokens for ``"all_tokens"``, otherwise the
    hidden state at the position from ``select_token_indices``. The per-batch
    results are moved to the CPU and concatenated along dim 0.

    NOTE(review): a truncated prompt may have lost its trailing suffix while
    ``exclude_suffix_len`` is still subtracted -- confirm this is acceptable.
    """
    target_device = next(model.parameters()).device
    pooled_batches = []
    for offset in range(0, len(texts), batch_size):
        encoded = tokenizer(
            texts[offset : offset + batch_size],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_tokens,
            add_special_tokens=True,
        )
        ids = encoded["input_ids"].to(target_device)
        mask = encoded["attention_mask"].to(target_device)
        positions = select_token_indices(mask, token_rule, exclude_suffix_len)
        states = capture_layer_output(
            model,
            layer_index,
            {"input_ids": ids, "attention_mask": mask},
        )
        if token_rule == "all_tokens":
            # Average over real tokens only.
            masked_sum = (states * mask.unsqueeze(-1)).sum(dim=1)
            pooled = masked_sum / mask.sum(dim=1, keepdim=True)
        else:
            if positions is None:
                positions = torch.full((states.size(0),), states.size(1) - 1, device=states.device)
            row_ids = torch.arange(states.size(0), device=states.device)
            pooled = states[row_ids, positions]
        pooled_batches.append(pooled.detach().cpu())
        # Free cached GPU blocks before the next batch.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    return torch.cat(pooled_batches, dim=0)

def compute_steering_direction(df, tokenizer, model, layer_index, token_rule, batch_size=8, max_tokens=512):
    """Derive a unit steering direction from paired original/reframed texts.

    Both ``df[TEXT_COL]`` and ``df[REFRAME_COL]`` are wrapped with
    ``format_prompt``, reduced to one activation vector per row at
    ``layer_index``, and the direction is the normalized mean of
    (reframe - text).

    Returns:
        (direction, instr_len): ``direction`` is a 1-D tensor in the dtype of the
        captured activations; ``instr_len`` is the number of leading prompt
        positions taken up by the instruction, counted the same way generation
        prompts are encoded (special tokens included).
    """
    suffix_len = _suffix_token_length(tokenizer)
    text_prompts = [format_prompt(t) for t in df[TEXT_COL].tolist()]
    reframe_prompts = [format_prompt(t) for t in df[REFRAME_COL].tolist()]
    text_vecs = build_vectors(
        text_prompts, tokenizer, model, layer_index, token_rule, suffix_len, batch_size, max_tokens
    )
    reframe_vecs = build_vectors(
        reframe_prompts, tokenizer, model, layer_index, token_rule, suffix_len, batch_size, max_tokens
    )
    # Average and normalize in float32: with half-precision activations the 1e-8
    # epsilon underflows to zero and the mean over many rows loses precision.
    direction = (reframe_vecs.float() - text_vecs.float()).mean(dim=0)
    direction = direction / (direction.norm() + 1e-8)
    # Hand back the dtype callers received before.
    direction = direction.to(text_vecs.dtype)
    # Prompts are encoded with add_special_tokens=True at generation time, so count the
    # instruction the same way; otherwise the boundary is off by the leading special token
    # and the last instruction token would be steered.
    instr_len = len(tokenizer(instruction_only_text(), add_special_tokens=True).input_ids)
    return direction, instr_len

def make_steering_hook(direction, alpha, instruction_len):
    """Create a forward hook that adds ``alpha * direction`` to a layer's output.

    Only positions at or beyond ``instruction_len`` are steered. The hook keeps
    a running position offset so that single-token passes (incremental decoding
    with a KV cache) are placed after the prompt instead of at position 0;
    without it, every generated token would look like position 0 and never be
    steered. Create a fresh hook for every ``generate`` call.

    NOTE(review): the offset logic assumes a multi-token pass always starts at
    position 0 (prefill, or a full re-run when caching is off) -- confirm before
    using decoding schemes that feed several new tokens per step.

    Args:
        direction: 1-D tensor of hidden size; moved to the activation's device
            and dtype on use.
        alpha: scalar steering strength.
        instruction_len: number of leading positions to leave untouched.

    Returns:
        A function with the ``(module, module_input, module_output)`` forward
        hook signature that returns the modified output, preserving tuple
        outputs and passing non-3-D outputs through unchanged.
    """
    # Number of positions already covered by earlier passes of this generation.
    state = {"offset": 0}

    def hook_fn(module, module_input, module_output):
        hidden = module_output[0] if isinstance(module_output, tuple) else module_output
        if hidden.dim() != 3:
            return module_output
        batch, seq, _ = hidden.shape
        # A multi-token pass restarts at 0; a single-token pass continues from
        # where the previous pass stopped.
        start = 0 if seq > 1 else state["offset"]
        state["offset"] = start + seq
        token_positions = torch.arange(start, start + seq, device=hidden.device).unsqueeze(0)
        mask = token_positions >= instruction_len
        if len(module_input) > 1 and isinstance(module_input[1], torch.Tensor):
            attention_mask = module_input[1]
            # Only a plain (batch, seq) padding mask can be combined here; an
            # expanded additive mask has different shape and sign conventions.
            if attention_mask.dim() == 2 and attention_mask.shape[-1] == seq:
                mask = mask & attention_mask.bool()
        # Match the activation itself rather than a model parameter, so the
        # addition can never change the dtype or device of the stream.
        steer = direction.to(device=hidden.device, dtype=hidden.dtype)
        hidden = hidden + alpha * steer * mask.unsqueeze(-1).to(hidden.dtype)
        if isinstance(module_output, tuple):
            return (hidden,) + module_output[1:]
        return hidden
    return hook_fn

def generate_with_steering(
    model, tokenizer, input_text, direction, alpha, layer_index, instruction_len,
    max_new_tokens=150, temperature=0, do_sample=True
):
    """Generate a reply to ``input_text`` with a steering hook on one decoder layer.

    The hook from ``make_steering_hook`` is attached to
    ``model.model.layers[layer_index]`` only for the duration of the call and
    is removed even when generation fails. Decoding options come from
    ``_build_generation_kwargs``.

    Returns:
        The assistant reply extracted from the decoded output.
    """
    encoded = tokenizer(
        format_prompt(input_text), return_tensors="pt", add_special_tokens=True
    ).to(next(model.parameters()).device)
    target_layer = model.model.layers[layer_index]
    steering_hook = make_steering_hook(direction, alpha, instruction_len)
    handle = target_layer.register_forward_hook(steering_hook)
    try:
        with torch.inference_mode():
            output_ids = model.generate(
                **encoded,
                **_build_generation_kwargs(tokenizer, max_new_tokens, temperature, do_sample),
            )
    finally:
        handle.remove()
    return extract_assistant_response(tokenizer.decode(output_ids[0], skip_special_tokens=True))

In [17]:
from datetime import datetime
import json
import os

def generate_pair_and_print(
    model, tokenizer, input_text, direction, alpha, layer_index, instruction_len,
    row_id=None, max_new_tokens=150, temperature=0, do_sample=True
):
    """Produce an unsteered and a steered reply for one input and print both.

    The baseline is generated with no hook attached; the steered reply comes
    from ``generate_with_steering``. ``row_id`` is accepted but not used.

    Returns:
        (baseline, steered) reply strings.
    """
    # Baseline (no steering)
    encoded = tokenizer(
        format_prompt(input_text), return_tensors="pt", add_special_tokens=True
    ).to(next(model.parameters()).device)
    with torch.inference_mode():
        raw_ids = model.generate(
            **encoded,
            **_build_generation_kwargs(tokenizer, max_new_tokens, temperature, do_sample),
        )
    baseline = extract_assistant_response(tokenizer.decode(raw_ids[0], skip_special_tokens=True))

    steered = generate_with_steering(
        model, tokenizer, input_text, direction, alpha, layer_index, instruction_len,
        max_new_tokens=max_new_tokens, temperature=temperature, do_sample=do_sample,
    )

    print(f"--- Baseline (no steering, alpha=0.0) ---\n{baseline}\n")
    print(f"--- Steered (alpha={alpha}) ---\n{steered}\n")
    return baseline, steered

def _layer_to_filename(layer_index: int) -> str:
    return f"steering_layer_{layer_index}.csv"

def generate_responses_for_layer(
    df, layer_index, model, tokenizer, direction, alpha, instruction_len,
    output_dir, max_new_tokens=150, temperature=0, do_sample=True
):
    """Generate a steered reply for every row of ``df`` and save them as a CSV.

    The file name is derived from ``layer_index`` and written inside
    ``output_dir``, which is created if missing. The CSV has the columns "text"
    and "response".

    Returns:
        The path of the written CSV.
    """
    os.makedirs(output_dir, exist_ok=True)
    outputs = []
    # iterrows() is a generator with no length, so give tqdm the row count explicitly;
    # without it the bar cannot show progress or a time estimate.
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Rows"):
        text = row[TEXT_COL]
        response = generate_with_steering(
            model, tokenizer, text, direction, alpha, layer_index, instruction_len,
            max_new_tokens=max_new_tokens, temperature=temperature, do_sample=do_sample
        )
        outputs.append({"text": text, "response": response})
    output_path = os.path.join(output_dir, _layer_to_filename(layer_index))
    pd.DataFrame(outputs).to_csv(output_path, index=False)
    print(f"Saved {len(outputs)} rows to {output_path}")
    return output_path

# --- Build direction and run generation ---
# Load both splits and fail early if a required column is absent.
df_direction = pd.read_csv(DIRECTION_CSV_PATH)
df_test = pd.read_csv(TEST_CSV_PATH)
required_cols = {TEXT_COL, REFRAME_COL}
required_test_cols = {TEXT_COL}
missing_direction = required_cols.difference(df_direction.columns)
missing_test = required_test_cols.difference(df_test.columns)
if missing_direction:
    raise ValueError(f"Direction CSV must contain columns: {sorted(required_cols)}; missing={sorted(missing_direction)}")
if missing_test:
    # Report the required set, not the missing set, as the expected columns.
    raise ValueError(f"Test CSV must contain columns: {sorted(required_test_cols)}; missing={sorted(missing_test)}")

# Output directory inside Google Drive
# NOTE(review): dirname() drops the final path component, so results are written to the
# parent of OUTPUT_JSONL_PATH -- confirm that is the intended location.
OUTPUT_DIR = os.path.dirname(OUTPUT_JSONL_PATH) or "."

for layer_index in tqdm(LAYER_LIST, desc="Layer"):
    if RANDOM_DIRECTION:
        # Control condition: a seeded random unit vector. Re-seeding inside the loop
        # means every layer receives the same random direction.
        torch.manual_seed(RANDOM_SEED)
        direction = torch.randn(model.config.hidden_size)
        direction = direction / (direction.norm() + 1e-8)
        # Count the instruction the way generation prompts are encoded (special tokens
        # included); otherwise the steering boundary is off by the leading special token.
        instruction_len = len(tokenizer(instruction_only_text(), add_special_tokens=True).input_ids)
    else:
        direction, instruction_len = compute_steering_direction(
            df_direction, tokenizer, model, layer_index, TOKEN_RULE, batch_size=BATCH_SIZE, max_tokens=MAX_PROMPT_TOKENS
        )
    _ = generate_responses_for_layer(
        df_test, layer_index, model, tokenizer, direction, STEER_ALPHA, instruction_len,
        OUTPUT_DIR, max_new_tokens=MAX_NEW_TOKENS, temperature=TEMPERATURE, do_sample=DO_SAMPLE
    )

Layer:   0%|          | 0/1 [00:00<?, ?it/s]

Rows: 0it [00:00, ?it/s]

Saved 100 rows to /content/drive/MyDrive/SEF/Data/steering_layer_13.csv
