<a href="https://colab.research.google.com/github/deeptanshukumar/B-PLIS-rag/blob/main/bplis_basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Notebook v2 — Cell 0: Environment Setup (Colab)
!pip install -q transformers accelerate datasets sentencepiece einops tqdm


In [None]:
#Cell 1: Imports + Global Config
import torch
import random
import json
import numpy as np
from tqdm import tqdm

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM
)


In [None]:
# Runtime placement and numeric precision shared by the later cells.
# Weights are kept in half precision; CUDA is chosen whenever torch reports it.
dtype = torch.float16
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

print("Device:", device)


Device: cuda


In [None]:
#Cell 2: Load model and tokenizer
# Loads the tokenizer and causal-LM weights for MODEL_NAME, puts the model in
# eval mode, and prints a confirmation.
MODEL_NAME = "NousResearch/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# NOTE(review): the recorded output of this cell contains two warnings:
#   - "`torch_dtype` is deprecated! Use `dtype` instead!"
#   - `output_hidden_states` listed among generation flags that
#     "are not valid and may be ignored".
# The later sanity check still receives hidden states, so the flag takes effect
# for plain forward passes in this run. Confirm both against the installed
# transformers version before changing the call.
# NOTE(review): device_map pins everything to GPU index 0, while `device` may
# be "cpu" when CUDA is unavailable — confirm the CPU path is not needed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=dtype,
    device_map={"": 0},          # force full model on GPU
    output_hidden_states=True,   # REQUIRED for activation work
)
model.eval()

print("Model loaded.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Model loaded.


In [None]:
#Cell 3: Sanity Check (Hidden States)
# One gradient-free forward pass on a short string, to confirm that the model
# output exposes `hidden_states`.
with torch.no_grad():
    inputs = tokenizer("Hello world", return_tensors="pt").to(device)
    outputs = model(**inputs)

# The recorded run printed 33 hidden states and a last-state shape of
# [1, 3, 4096].
print("Number of hidden states:", len(outputs.hidden_states))
print("Hidden state shape:", outputs.hidden_states[-1].shape)


Number of hidden states: 33
Hidden state shape: torch.Size([1, 3, 4096])


In [None]:
# cell 4 prompt templates
# Paraphrases of the same "answer only from the context" instruction.
# build_positive_prompt draws one of these at random for each prompt it builds.
SYSTEM_PROMPT_VARIANTS = [
    "You are a context-based QA assistant and must answer based only on the provided context.",
    "As a QA assistant, you are instructed to refer only to the provided context when answering.",
    "Provide answers based solely on the context you are given.",
    "You are a QA assistant and must restrict your answers to the given context.",
    "Answer the question using only the information from the context.",
    "Base your answer strictly on the provided context and nothing else.",
]


In [None]:
def build_llama2_chat_prompt(system_prompt: str, user_prompt: str) -> str:
    """Wrap a system and a user message in the chat markup used by this notebook.

    The result is, line by line: ``<s>[INST] <<SYS>>``, the system prompt,
    ``<</SYS>>``, an empty line, then the user prompt followed by `` [/INST]``.

    NOTE(review): the literal ``<s>`` is part of the returned text. Whether the
    tokenizer prepends a further BOS token on top of it is not checked here —
    confirm.
    """
    opening = "<s>[INST] <<SYS>>"
    closing = "<</SYS>>"
    user_turn = "{} [/INST]".format(user_prompt)
    return "\n".join([opening, "{}".format(system_prompt), closing, "", user_turn])


In [None]:
def build_positive_prompt(context: str, question: str) -> str:
    """Build the context-grounded ("positive") prompt for a question.

    A system instruction is drawn at random from SYSTEM_PROMPT_VARIANTS, so
    repeated calls with the same arguments may return different prompts. The
    user turn carries the context followed by the question.
    """
    # Draw the instruction first so the RNG is consumed exactly once per call.
    chosen_instruction = random.choice(SYSTEM_PROMPT_VARIANTS)
    user_turn = "Context:\n{}\n\nQuestion:\n{}".format(context, question)
    return build_llama2_chat_prompt(chosen_instruction, user_turn)


In [None]:
def build_negative_prompt(question: str) -> str:
    """Build the context-free ("negative") prompt: the bare question in one [INST] turn."""
    return "<s>[INST] {} [/INST]".format(question)


In [None]:
def build_open_ended_prompt(
    context: str,
    question: str,
    *,
    strong_prompt: bool = False,
) -> str:
    """Build the generation prompt used for the baseline and steered runs.

    Args:
        context: Passage placed under the ``Context:`` heading.
        question: Question placed under the ``Question:`` heading.
        strong_prompt: When True, use the strict contextual-QA system prompt
            that tells the model to answer according to the context in one
            sentence. When False (the default, and the previous behaviour),
            use the weak generic "helpful assistant" system prompt.

    Returns:
        The full chat-formatted prompt string.
    """
    if strong_prompt:
        system_prompt = (
            "You are a Contextual QA Assistant. "
            "Please answer the following question according to the given context. "
            "Restrict your response to one sentence."
        )
    else:
        system_prompt = "You are a helpful assistant."

    user_prompt = f"Context:\n{context}\n\nQuestion:\n{question}"
    return build_llama2_chat_prompt(system_prompt, user_prompt)


In [None]:
#Cell 5: Dataset Scaling (100–300 Examples)
def load_conflict_dataset(path, n=200, seed=42):
    """Load a shuffled sample of context/question pairs from a JSON file.

    The file must contain a JSON list of objects that each have a
    ``"cf_context"`` and a ``"question"`` key.

    Args:
        path: Path to the JSON file.
        n: Maximum number of examples to keep after shuffling.
        seed: Seed applied to the module-level ``random`` generator.

    Returns:
        A list of ``{"context": ..., "question": ...}`` dicts, where
        ``"context"`` holds the record's ``"cf_context"`` value.

    Note:
        This seeds the global ``random`` state on purpose, so later
        ``random`` calls in the same session are affected too.
    """
    random.seed(seed)
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    random.shuffle(data)

    return [
        {"context": record["cf_context"], "question": record["question"]}
        for record in data[:n]
    ]
# Example (update path accordingly)
# Loads up to 200 shuffled examples; the later cells read them from `steering_data`.
steering_data = load_conflict_dataset("/content/ConFiQA-QA.json", n=200)


In [None]:
# cell6 forward pass utilities
def run_model_get_hidden_states(prompt: str):
    """Run one gradient-free forward pass and return the model's hidden states.

    Args:
        prompt: Text to tokenize (with truncation enabled) and feed to the model.

    Returns:
        The ``hidden_states`` attribute of the model output. In the recorded
        run this held 33 tensors for this model.
    """
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
    ).to(device)

    with torch.no_grad():
        # Ask for hidden states on this call instead of depending on a flag
        # set when the model was loaded.
        outputs = model(**inputs, output_hidden_states=True)

    return outputs.hidden_states


In [None]:
def get_last_token_activations(hidden_states):
    """Return, for each hidden-state tensor, the slice at the final sequence position.

    Each tensor is indexed as ``[:, -1, :]`` and then has dimension 0 squeezed,
    so a batch of one yields a 1-D vector per entry.
    """
    return [layer_out[:, -1, :].squeeze(0) for layer_out in hidden_states]


In [None]:
def extract_contrastive_activations(context: str, question: str):
    """Return last-token activations for the with-context and without-context prompts.

    Returns:
        ``(pos_acts, neg_acts)``: two lists with one activation per hidden
        state. The first comes from the positive (context + question) prompt,
        the second from the negative (question only) prompt.
    """
    # Build both prompts up front, positive first, as before.
    prompt_pair = (
        build_positive_prompt(context, question),
        build_negative_prompt(question),
    )
    with_context, without_context = (
        get_last_token_activations(run_model_get_hidden_states(text))
        for text in prompt_pair
    )
    return with_context, without_context


In [None]:
#cell 7 dataset scaling working? check
# quick sanity run on first example
# Extracts both activation lists for steering_data[0] and prints the number of
# entries plus the shape of the last one (33 and [4096] in the recorded run).
pos_acts, neg_acts = extract_contrastive_activations(
    steering_data[0]["context"],
    steering_data[0]["question"]
)

print("Total layers (incl. embedding):", len(pos_acts))
print("Hidden dim:", pos_acts[-1].shape)


Total layers (incl. embedding): 33
Hidden dim: torch.Size([4096])


In [None]:
#Cell 8 — Per-query contrastive layer scoring
def compute_per_query_layer_scores(context: str, question: str, eps: float = 1e-6):
    """
    Score every transformer layer by how much the context shifts its activation.

    For each layer l (the first hidden state, i.e. the embedding output, is skipped):

        score_l = || h_l(context, q) - h_l(q) || / ( || h_l(q) || + eps )

    Returns:
        List[float] with one score per remaining hidden state, in layer order.
    """
    with_ctx, without_ctx = extract_contrastive_activations(context, question)

    # Index 0 is the embedding output; pair the remaining entries layer by layer.
    layer_pairs = zip(with_ctx[1:], without_ctx[1:])
    return [
        ((pos - neg).norm() / (neg.norm() + eps)).item()
        for pos, neg in layer_pairs
    ]



In [None]:
#Cell 9 — Dynamic layer selector (top-k)
def select_dynamic_layers(
    context: str,
    question: str,
    k: int = 1
):
    """
    Pick the k layers whose conflict score is highest for this query.

    Returns:
        List[int]: 0-based positions in the per-layer score list, ordered from
        highest to lowest score and cut to the first k entries.
    """
    layer_scores = compute_per_query_layer_scores(context, question)

    ranking = list(range(len(layer_scores)))
    ranking.sort(key=layer_scores.__getitem__, reverse=True)

    return ranking[:k]


In [None]:
#Cell 10 — Sanity check: inspect dynamic layers
# Inspect dynamic layers for a few examples
# For the first five examples, print the question and its top-3 layer indices.
for ex in steering_data[:5]:
    layers = select_dynamic_layers(
        ex["context"],
        ex["question"],
        k=3
    )
    print("=" * 80)
    print("Question:", ex["question"])
    print("Selected layers:", layers)


Question: What is the religion of Farhan Akhtar?
Selected layers: [26, 27, 22]
Question: Where did John Horton Conway work?
Selected layers: [31, 23, 25]
Question: Who is the composer of Mastizaade?
Selected layers: [23, 25, 24]
Question: Who is Buzz Aldrin's spouse?
Selected layers: [31, 25, 23]
Question: What award did Skyfall receive for Best Sound Editing?
Selected layers: [25, 23, 24]


In [None]:
#Cell 11 — Batch dynamic layer selection (dataset-scale)
def compute_dynamic_layers_for_dataset(
    dataset,
    k: int = 1
):
    """
    Run the per-query layer selection over every example, with a progress bar.

    Returns:
        List[List[int]]: one list of selected layer indices per example, in
        dataset order.
    """
    progress = tqdm(dataset, desc="Computing dynamic layers")
    return [
        select_dynamic_layers(item["context"], item["question"], k=k)
        for item in progress
    ]


In [None]:
#Cell 12 — Run dynamic layer selection on scaled dataset
# Selects the top-2 layers for every example in steering_data, then prints the
# first few assignments as a spot check.
dynamic_layer_assignments = compute_dynamic_layers_for_dataset(
    steering_data,
    k=2   # top-2 layers per query
)

print("Example dynamic layer assignments:")
# Print at most five entries.
for i in range(min(5, len(dynamic_layer_assignments))):
    print(dynamic_layer_assignments[i])


Computing dynamic layers: 100%|██████████| 200/200 [00:47<00:00,  4.24it/s]

Example dynamic layer assignments:
[26, 27]
[31, 23]
[23, 25]
[31, 25]
[25, 23]





In [None]:
#computing the scores per layer
# Prints the full per-layer score profile for the first example. The printed
# index is the position in the score list, i.e. after the embedding output has
# been dropped.
scores = compute_per_query_layer_scores(
    steering_data[0]["context"],
    steering_data[0]["question"]
)

for i, s in enumerate(scores):
    print(f"Layer {i}: {s:.4f}")


Layer 0: 0.3325
Layer 1: 0.3259
Layer 2: 0.3748
Layer 3: 0.3840
Layer 4: 0.4038
Layer 5: 0.5010
Layer 6: 0.5532
Layer 7: 0.5815
Layer 8: 0.6675
Layer 9: 0.7144
Layer 10: 0.7158
Layer 11: 0.6987
Layer 12: 0.7271
Layer 13: 0.7515
Layer 14: 0.7549
Layer 15: 0.7476
Layer 16: 0.7886
Layer 17: 0.7954
Layer 18: 0.8047
Layer 19: 0.8154
Layer 20: 0.8311
Layer 21: 0.8379
Layer 22: 0.8799
Layer 23: 0.8784
Layer 24: 0.8755
Layer 25: 0.8843
Layer 26: 0.9004
Layer 27: 0.8940
Layer 28: 0.8711
Layer 29: 0.8770
Layer 30: 0.8228
Layer 31: 0.8306


In [None]:
#Cell 13 — Final steering hook (last-token only)
class DynamicSteeringHook:
    """Forward hook that adds a scaled steering vector at the last sequence position.

    Meant to be registered with ``register_forward_hook``. On each call it adds
    ``multiplier * steering_vector`` in place to the final position of the
    hooked module's hidden-state output and returns that output.

    Args:
        steering_vector: Vector added to the last position. It must broadcast
            against a ``[batch, hidden_dim]`` slice.
        multiplier: Scalar strength applied to the steering vector.
    """

    def __init__(self, steering_vector, multiplier):
        self.v = steering_vector
        self.m = multiplier

    def __call__(self, module, input, output):
        # Some modules return the hidden-state tensor directly, others return a
        # tuple whose first element is that tensor. Handle both, so the hook
        # does not fail on tuple outputs.
        hidden = output[0] if isinstance(output, tuple) else output

        # hidden shape: [batch, seq_len, hidden_dim]; only the final position is shifted.
        hidden[:, -1, :] += self.m * self.v

        # The edit is in place, so returning the original object (tensor or
        # tuple) carries the steered activations forward.
        return output


In [None]:
#Cell 14 — Generate with dynamic layers (core inference)
def generate_with_dynamic_steering(
    context: str,
    question: str,
    steering_vectors,
    k: int = 2,
    multiplier: float = 2.0,
    max_new_tokens: int = 50,
):
    """
    End-to-end dynamic steering + generation for one query.

    Args:
        context: Passage given to the model.
        question: Question to answer.
        steering_vectors: Per-layer steering vectors, indexed by layer index.
        k: Number of layers to steer (the top-k by conflict score).
        multiplier: Strength applied to each steering vector.
        max_new_tokens: Generation budget.

    Returns:
        ``(output_text, layers)``: the decoded sequence returned by
        ``model.generate`` (the recorded outputs include the prompt text) and
        the list of layer indices that were steered.
    """
    # 1) Select the layers to steer for this query.
    layers = select_dynamic_layers(context, question, k=k)

    # 2) Prepare the inputs before touching the model, so a failure here
    #    cannot leave hooks attached.
    prompt = build_open_ended_prompt(context, question)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # 3) Register hooks and generate. Registration sits inside the try block so
    #    a partial failure (e.g. a bad layer index) still removes the hooks
    #    that were already attached.
    handles = []
    try:
        for layer_idx in layers:
            hook = DynamicSteeringHook(steering_vectors[layer_idx], multiplier)
            handles.append(
                model.model.layers[layer_idx].register_forward_hook(hook)
            )

        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                # Greedy decoding; `temperature` is not passed because it is
                # only used in sampling modes.
                do_sample=False,
                # NOTE(review): presumably disabled so every step re-processes
                # the full sequence and the hook's last position is always the
                # newest token — confirm.
                use_cache=False,
            )
    finally:
        for handle in handles:
            handle.remove()

    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output_text, layers



In [None]:
#Cell 15 — Baseline generation (no steering)
def generate_baseline(
    context: str,
    question: str,
    max_new_tokens: int = 50,
):
    """Generate an answer with no steering hooks attached.

    Uses the same prompt builder and decoding settings as the steered path so
    the two outputs can be compared directly.

    Args:
        context: Passage given to the model.
        question: Question to answer.
        max_new_tokens: Generation budget.

    Returns:
        The decoded sequence returned by ``model.generate`` (the recorded
        outputs include the prompt text), with special tokens skipped.
    """
    prompt = build_open_ended_prompt(context, question)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            # Greedy decoding; `temperature` is not passed because it is only
            # used in sampling modes.
            do_sample=False,
            use_cache=False,
        )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


In [None]:
#Cell 16 — Sanity test: extreme conflict case
# Two hand-written context/question pairs, used by the baseline-vs-steering
# comparison cells further down.
test_examples = [
    {
        "context": "Earth is flat.",
        "question": "What is the shape of the Earth?"
    },
    {
        "context": "Brian Niccol is the CEO of Starbucks.",
        "question": "Who is the CEO of Starbucks?"
    }
]


In [None]:
#Cell 18 — Initialize steering vector accumulators
def initialize_steering_accumulators(num_layers, hidden_dim, dtype):
    """
    Create one zero vector per layer to accumulate steering sums into.

    Each vector has length ``hidden_dim`` and the given ``dtype``, and is
    allocated on the notebook-level ``device``.
    """
    accumulators = []
    for _ in range(num_layers):
        zero_vec = torch.zeros(hidden_dim, device=device, dtype=dtype)
        accumulators.append(zero_vec)
    return accumulators


In [None]:
#Cell 19 — Compute global steering vectors (dataset-level)
def compute_steering_vectors(dataset):
    """
    Computes global steering vectors v_l for each transformer layer.

        v_l = E[ h_l(context, q) - h_l(q) ]

    The expectation is the mean over ``dataset`` of the last-token activation
    difference between the with-context and without-context prompts. The
    embedding output (hidden state 0) is dropped.

    Args:
        dataset: Iterable of dicts with ``"context"`` and ``"question"`` keys.

    Returns:
        List of per-layer mean-difference vectors, in the same dtype as the
        extracted activations.

    Raises:
        IndexError: If ``dataset`` is empty.
    """
    accumulators = None
    out_dtype = None
    acc_dtype = None
    count = 0

    for ex in tqdm(dataset, desc="Computing steering vectors"):
        pos_acts, neg_acts = extract_contrastive_activations(
            ex["context"],
            ex["question"]
        )

        # Drop embedding layer
        pos_acts = pos_acts[1:]
        neg_acts = neg_acts[1:]

        if accumulators is None:
            # Size the accumulators from the first example, so no extra
            # forward pass is needed just to read the dimensions.
            out_dtype = pos_acts[0].dtype
            # Sum in at least float32: a long running sum in half precision
            # loses accuracy and can overflow.
            acc_dtype = torch.promote_types(out_dtype, torch.float32)
            accumulators = initialize_steering_accumulators(
                len(pos_acts),
                pos_acts[0].shape[0],
                acc_dtype
            )

        for acc, pos, neg in zip(accumulators, pos_acts, neg_acts):
            acc.add_(pos.to(acc_dtype) - neg.to(acc_dtype))

        count += 1

    if count == 0:
        raise IndexError("compute_steering_vectors requires a non-empty dataset")

    # Cast the mean back so callers get the same dtype as before.
    return [(acc / count).to(out_dtype) for acc in accumulators]


In [None]:
#Cell 20 — Normalize steering vectors (recommended)
def normalize_steering_vectors(vectors, eps=1e-8):
    """
    L2-normalizes steering vectors per layer.

    The norm and division are done in at least float32 and the result is cast
    back to each vector's own dtype. In half precision the default ``eps``
    rounds to zero, so an all-zero vector used to normalize to NaN; it now
    stays all-zero.

    Args:
        vectors: Iterable of 1-D tensors.
        eps: Small constant added to the norm to avoid division by zero.

    Returns:
        List of tensors with the same dtypes as the inputs, each scaled to
        unit L2 norm (up to ``eps``).
    """
    normalized = []
    for v in vectors:
        work_dtype = torch.promote_types(v.dtype, torch.float32)
        wide = v.to(work_dtype)
        normalized.append((wide / (wide.norm() + eps)).to(v.dtype))
    return normalized


In [None]:
#Cell 21 — Build steering_vectors_tf
# Compute steering vectors on scaled dataset
steering_vectors = compute_steering_vectors(steering_data)

# Normalize (optional but recommended)
# The normalized copy is the one passed to the generation cells below.
steering_vectors_tf = normalize_steering_vectors(steering_vectors)

print("Steering vectors ready.")
# The recorded run reported 32 vectors of size [4096].
print("Num transformer layers:", len(steering_vectors_tf))
print("Vector dim:", steering_vectors_tf[0].shape)


Computing steering vectors: 100%|██████████| 200/200 [00:49<00:00,  4.00it/s]

Steering vectors ready.
Num transformer layers: 32
Vector dim: torch.Size([4096])





In [None]:
#Cell 22 — Sanity check alignment
# There must be exactly one steering vector per entry of model.model.layers,
# because the steering code indexes both with the same layer index.
assert len(steering_vectors_tf) == len(model.model.layers)
print("Steering vectors aligned with transformer layers.")


Steering vectors aligned with transformer layers.


In [None]:
#Cell 23 — Re-run dynamic inference
# Cell 23 — Baseline vs Dynamic Steering (Dataset Samples)
# For the first three dataset examples, print the unsteered and steered
# generations together with the layers that were steered.

for ex in steering_data[:3]:
    print("=" * 80)
    print("QUESTION:", ex["question"])
    print("CONTEXT:", ex["context"])

    # Baseline (no steering)
    base_out = generate_baseline(
        ex["context"],
        ex["question"]
    )

    # Dynamic steering
    steered_out, layers = generate_with_dynamic_steering(
        ex["context"],
        ex["question"],
        steering_vectors=steering_vectors_tf,
        k=2,
        multiplier=2.0,
    )

    print("\nBASELINE OUTPUT:")
    print(base_out)

    print("\nSTEERED OUTPUT:")
    print(steered_out)

    print("\nDYNAMIC LAYERS USED:", layers)





The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


QUESTION: What is the religion of Farhan Akhtar?
CONTEXT: Farhan Akhtar is a prominent Indian actor, director, and producer who is affiliated with the religion of Catholicism. He was born in Mumbai, India in 1974 and has been active in the entertainment industry since the early 2000s. Akhtar has starred in numerous films, including "Dil Chahta Hai," "Lakshya," and "Zindagi Na Milegi Dobara," and has also directed and produced several movies, such as "Rock On!" and "Bhaag Milkha Bhaag." Despite his success in the entertainment industry, Akhtar remains humble and grounded, and is known for his commitment to his faith and values. He is often seen speaking about the importance of spirituality and religion in his life and work.

BASELINE OUTPUT:
[INST] <<SYS>>
You are a helpful assistant.
<</SYS>>

Context:
Farhan Akhtar is a prominent Indian actor, director, and producer who is affiliated with the religion of Catholicism. He was born in Mumbai, India in 1974 and has been active in the ente

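The cell below is meant to use a strict prompt that asks the model to follow the context; this is a typical enterprise RAG setup. (As currently written, it still goes through the same weak-prompt builder as the next cell.)

In this case:
- steering becomes a safety net
- it is mostly useful when the model still refuses to accept the context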

In [None]:
# Cell — (Baseline vs Dynamic Steering) strong prompt
# NOTE(review): despite the title, both generation helpers build their prompt
# through build_open_ended_prompt, and the recorded outputs show the
# "You are a helpful assistant." system prompt. So this cell currently runs
# the weak prompt.

for ex in test_examples:
    print("=" * 80)
    print("QUESTION:", ex["question"])
    print("CONTEXT:", ex["context"])

    # Baseline (no steering)
    base_out = generate_baseline(
        ex["context"],
        ex["question"]
    )

    # Dynamic steering
    steered_out, layers = generate_with_dynamic_steering(
        ex["context"],
        ex["question"],
        steering_vectors=steering_vectors_tf,
        k=2,
        multiplier=2.0,
    )

    print("\nBASELINE OUTPUT:")
    print(base_out)

    print("\nSTEERED OUTPUT:")
    print(steered_out)

    print("\nLAYERS AFFECTED:", layers)


QUESTION: What is the shape of the Earth?
CONTEXT: Earth is flat.

BASELINE OUTPUT:
[INST] <<SYS>>
You are a helpful assistant.
<</SYS>>

Context:
Earth is flat.

Question:
What is the shape of the Earth? [/INST]  I'm glad you're interested in learning about the shape of the Earth! However, I must respectfully point out that the idea that the Earth is flat is a misconception that has been disproven by scientific evidence and observations

STEERED OUTPUT:
[INST] <<SYS>>
You are a helpful assistant.
<</SYS>>

Context:
Earth is flat.

Question:
What is the shape of the Earth? [/INST]  I'm glad you're interested in learning about the shape of the Earth! However, I must respectfully point out that the idea of Earth being flat is a misconception that has been disproven by scientific evidence and observations.

LAYERS AFFECTED: [22, 23]
QUESTION: Who is the CEO of Starbucks?
CONTEXT: Brian Niccol is the CEO of Starbucks.

BASELINE OUTPUT:
[INST] <<SYS>>
You are a helpful assistant.
<</SYS>>



The cell below uses a weaker prompt that does not really ask the model to follow the provided context. This re-creates the true conflict regime that ContextFocus was designed for.

In [None]:
# (Baseline vs Dynamic Steering) weak prompt
# Runs the hand-written conflict examples through the unsteered and steered
# paths and prints both generations plus the steered layer indices.

for ex in test_examples:
    print("=" * 80)
    print("QUESTION:", ex["question"])
    print("CONTEXT:", ex["context"])

    # Baseline (no steering)
    base_out = generate_baseline(
        ex["context"],
        ex["question"]
    )

    # Dynamic steering
    steered_out, layers = generate_with_dynamic_steering(
        ex["context"],
        ex["question"],
        steering_vectors=steering_vectors_tf,
        k=2,
        multiplier=2.0,
    )

    print("\nBASELINE OUTPUT:")
    print(base_out)

    print("\nSTEERED OUTPUT:")
    print(steered_out)

    print("\nLAYERS AFFECTED:", layers)


QUESTION: What is the shape of the Earth?
CONTEXT: Earth is flat.

BASELINE OUTPUT:
[INST] <<SYS>>
You are a helpful assistant.
<</SYS>>

Context:
Earth is flat.

Question:
What is the shape of the Earth? [/INST]  I'm glad you're interested in learning about the shape of the Earth! However, I must respectfully point out that the idea that the Earth is flat is a misconception that has been disproven by scientific evidence and observations

STEERED OUTPUT:
[INST] <<SYS>>
You are a helpful assistant.
<</SYS>>

Context:
Earth is flat.

Question:
What is the shape of the Earth? [/INST]  I'm glad you're interested in learning about the shape of the Earth! However, I must respectfully point out that the idea of Earth being flat is a misconception that has been disproven by scientific evidence and observations.

LAYERS AFFECTED: [25, 22]
QUESTION: Who is the CEO of Starbucks?
CONTEXT: Brian Niccol is the CEO of Starbucks.

BASELINE OUTPUT:
[INST] <<SYS>>
You are a helpful assistant.
<</SYS>>



For now, the two cells above give the same answers because they call the same prompt builder. This is a known limitation and can be changed later.
