In [3]:
import os
# NOTE(review): workspace_dir is not referenced anywhere else in the visible notebook.
workspace_dir = "/workspace/neelnanda"
# Point the Hugging Face cache at the workspace volume. This assignment sits
# before the `transformers` import below — presumably so the library sees the
# variable when it is imported; keep this ordering unless verified otherwise.
os.environ["HF_HOME"] = "/workspace/.cache"
from typing import List, Optional
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

# model1: the DeepSeek-R1 distilled checkpoint, with a left-padding tokenizer.
model1_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
model1 = AutoModelForCausalLM.from_pretrained(model1_name, torch_dtype="auto")
model1 = model1.to(device)
tokenizer1 = AutoTokenizer.from_pretrained(model1_name, padding_side="left")

# model0: the Qwen2.5-Math checkpoint (presumably the base that model1 was
# distilled onto — confirm), loaded the same way.
model0_name = "Qwen/Qwen2.5-Math-1.5B"
model0 = AutoModelForCausalLM.from_pretrained(model0_name, torch_dtype="auto")
model0 = model0.to(device)
tokenizer0 = AutoTokenizer.from_pretrained(model0_name, padding_side="left")

def hf_call_batch(
    model,
    tokenizer,
    sys_prompts: List[str],
    user_prompts: List[str],
    generation_prompt: str = "<｜Assistant｜>",
    batch_size: int = 200,
    max_length: int = 1000,
    temp: float = 0.7,
    verbose: bool = True,
):
    """Sample one completion per (system prompt, user prompt) pair, in batches.

    Args:
        model: Causal LM exposing ``generate``; inputs are moved to its device.
        tokenizer: Tokenizer paired with ``model``. It is used for the chat
            template, for tokenization and for decoding.
        sys_prompts: System messages, one per conversation.
        user_prompts: User messages, aligned index-by-index with ``sys_prompts``.
        generation_prompt: Currently unused; kept so existing callers that pass
            it keep working.
        batch_size: Number of conversations generated per ``generate`` call.
        max_length: Passed to ``generate`` as ``max_new_tokens``.
        temp: Sampling temperature (sampling is always enabled).
        verbose: When True, print the first formatted prompt of every batch.

    Returns:
        A list of decoded completions (prompt tokens stripped), in input order.

    Raises:
        AssertionError: if the two prompt lists differ in length.
        ValueError: if ``batch_size`` is not a positive integer.
    """
    assert len(sys_prompts) == len(user_prompts)
    if batch_size < 1:
        # A negative step would make the range below empty and silently return [].
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Send inputs to wherever the model actually lives instead of relying on a
    # notebook-level `device` variable.
    model_device = getattr(model, "device", None)
    if model_device is None:
        model_device = next(model.parameters()).device

    responses = []

    for start in range(0, len(sys_prompts), batch_size):
        stop = start + batch_size
        batch_messages = [
            [
                {"role": "system", "content": sys_prompt},
                {"role": "user", "content": user_prompt},
            ]
            for sys_prompt, user_prompt in zip(
                sys_prompts[start:stop], user_prompts[start:stop]
            )
        ]

        # Use the tokenizer that was passed in, so the chat template always
        # matches the model being queried.
        batch_texts = [
            tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            for messages in batch_messages
        ]

        if verbose:
            print(batch_texts[0])

        # Tokenize all prompts in the batch; padding makes them the same length.
        model_inputs = tokenizer(
            batch_texts, return_tensors="pt", padding=True, truncation=True
        ).to(model_device)

        # Generate responses autoregressively
        generated_ids = model.generate(
            model_inputs.input_ids,
            attention_mask=model_inputs.attention_mask,
            max_new_tokens=max_length,
            temperature=temp,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            num_return_sequences=1,
        )

        # Every row of the padded input has the same width, so the newly
        # generated tokens are everything after that many columns.
        prompt_width = model_inputs.input_ids.shape[1]
        new_token_ids = generated_ids[:, prompt_width:]

        # Decode generated tokens to text
        batch_responses = tokenizer.batch_decode(
            new_token_ids, skip_special_tokens=True
        )
        responses.extend(batch_responses)

    return responses

cuda


In [5]:
# Smoke test: ask model1 a single arithmetic question and show the raw completion list.
demo_responses = hf_call_batch(
    model=model1,
    tokenizer=tokenizer1,
    sys_prompts=["You are a helpful assistant."],
    user_prompts=["What is 1+1? "],
    max_length=1000,
)
print(demo_responses)

<｜begin▁of▁sentence｜>You are a helpful assistant.<｜User｜>What is 1+1? <｜Assistant｜><think>

["Okay, so I need to figure out what 1 plus 1 is. Hmm, that seems pretty straightforward, but I guess I should think about it carefully to make sure I'm not missing anything. Let me start by recalling what I know about addition. Addition is combining two numbers to get a total. So, in this case, I have two numbers: 1 and 1.\n\nI know that 1 is a unit, representing a single object. When I add another 1, I'm essentially combining another single object with the first one. So, if I have one apple and then get another apple, I now have two apples. That makes sense.\n\nBut wait, maybe I should consider different contexts. For example, what if I'm counting on my fingers? If I have one finger up and then put another finger up, I can see that there are two fingers up. That's another way to visualize addition.\n\nI also remember that in mathematics, addition is commutative, which means the order of the nu

In [17]:
# Total element count across every parameter tensor of model0.
sum(weight.numel() for weight in model0.parameters())

1543714304

In [None]:
# Compare the first decoder layer's MLP down-projection between the two models.
# Read `.weight` directly rather than looping over parameters() and keeping the
# last tensor seen: model0's printout shows down_proj has bias=False, so the
# weight is the only parameter, and a direct read cannot pick up a bias by mistake.
# NOTE(review): this needs model0/model1 to be the Hugging Face models (with a
# `.model.layers` attribute), i.e. it must run before they are rebound later on.
k1 = model1.model.layers[0].mlp.down_proj.weight
k0 = model0.model.layers[0].mlp.down_proj.weight

# Element-wise weight delta between the two checkpoints.
k_diff = k1 - k0

In [52]:
# Inspect the layer-0 down_proj weight difference (k1 - k0).
k_diff

tensor([[ 0.0045, -0.0094,  0.0013,  ..., -0.0024,  0.0108, -0.0087],
        [-0.0068, -0.0045,  0.0026,  ...,  0.0046,  0.0015,  0.0040],
        [-0.0031,  0.0060,  0.0005,  ..., -0.0022,  0.0020,  0.0056],
        ...,
        [-0.0118,  0.0061,  0.0002,  ..., -0.0060, -0.0023,  0.0021],
        [-0.0054,  0.0090,  0.0027,  ..., -0.0002,  0.0015, -0.0001],
        [ 0.0022,  0.0005,  0.0039,  ...,  0.0032,  0.0010,  0.0014]],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<SubBackward0>)

In [26]:
# Display the module tree of whatever `model0` is currently bound to.
model0

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1536,), eps=1e-06)
    (rotary_emb): Qw

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import einops
from fancy_einsum import einsum
import tqdm.auto as tqdm
from torch.utils.data import DataLoader, Dataset

from jaxtyping import Float
from functools import partial

import transformer_lens
import transformer_lens.utils as utils
from transformer_lens.hook_points import (
    HookPoint,
)  # Hooking utilities
from transformer_lens import HookedTransformer, FactoredMatrix

In [None]:
# Load the base and instruct Qwen2.5-1.5B checkpoints as HookedTransformers.
# NOTE(review): this rebinds `model0` / `model1`, which earlier cells bound to
# different Hugging Face models — cells are sensitive to execution order.
base_checkpoint = "Qwen/Qwen2.5-1.5B"
instruct_checkpoint = "Qwen/Qwen2.5-1.5B-Instruct"
model0 = HookedTransformer.from_pretrained(base_checkpoint, device=device)
model1 = HookedTransformer.from_pretrained(instruct_checkpoint, device=device)
# Both probes below tokenize with the base model's tokenizer.
tokenizer = model0.tokenizer



Loaded pretrained model Qwen/Qwen2.5-1.5B into HookedTransformer




Loaded pretrained model Qwen/Qwen2.5-1.5B-Instruct into HookedTransformer


In [None]:
# Some tokenizers (like GPT-2’s) ship without a pad token; fall back to the EOS id in that case.
pad_token_id = tokenizer.pad_token_id
if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id

########################################################################
# Define a dummy text classification dataset.
########################################################################

class DummyTextDataset(Dataset):
    """Toy text-classification dataset that tokenizes each example on access.

    Each item is a ``(token_ids, label)`` pair, where ``token_ids`` is the
    tokenizer's ``input_ids`` with the leading batch dimension squeezed away.
    """

    def __init__(self, texts, labels, tokenizer, max_length=64):
        """Store the raw texts, their labels, the tokenizer and the truncation length."""
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for example ``idx``."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        # Tokenize with truncation; the tokenizer returns a batch of one.
        encoded = self.tokenizer(
            sample_text,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        token_ids = encoded["input_ids"].squeeze(0)
        return token_ids, sample_label

# Example data (modify with your own dataset).
# Each entry pairs a sentence with its class: 1 for positive, 0 for negative.
labelled_examples = [
    ("This is a positive example.", 1),
    ("This is a negative example.", 0),
    ("Yet another positive text.", 1),
    ("And one more negative instance.", 0),
]
texts = [sentence for sentence, _ in labelled_examples]
labels = [target for _, target in labelled_examples]

dataset = DummyTextDataset(texts, labels, tokenizer)
# Collate function that pads the token sequences of a batch to a common length.
def collate_fn(batch):
    """Turn a list of ``(token_ids, label)`` pairs into two batched tensors.

    Shorter token sequences are filled with ``pad_token_id`` up to the longest
    sequence in the batch; labels are stacked into a single tensor.
    """
    token_seqs, batch_labels = zip(*batch)
    padded_tokens = torch.nn.utils.rnn.pad_sequence(
        token_seqs, batch_first=True, padding_value=pad_token_id
    )
    return padded_tokens, torch.tensor(batch_labels)


# Shuffled mini-batches of two examples each.
dataloader = DataLoader(
    dataset,
    collate_fn=collate_fn,
    batch_size=2,
    shuffle=True,
)

########################################################################
# Define a function to extract the last-layer activation.
#
# In this example we hook the last block’s “resid_post” (i.e. the output
# of the block, before the final layer norm and head) and then select the
# representation corresponding to the final (non-pad) token in each sequence.
########################################################################

def get_last_layer_activation(tokens, model=None, pad_id=None):
    """Return the final block's residual stream at each sequence's last non-pad token.

    Args:
        tokens: Integer tensor of token ids, indexed as (batch, seq_len).
        model: Model exposing ``cfg.n_layers`` and ``run_with_hooks``. Defaults
            to the notebook-level ``model0``.
        pad_id: Token id treated as padding. Defaults to the notebook-level
            ``pad_token_id``.

    Returns:
        A detached tensor with one row per sequence, taken from
        ``blocks.{n_layers - 1}.hook_resid_post`` at index
        ``(number of non-pad tokens) - 1``.
    """
    if model is None:
        model = model0
    if pad_id is None:
        pad_id = pad_token_id

    # Filled in by the hook during the forward pass.
    captured = {}

    def hook_fn(res, hook):
        # Save a detached copy of the activation; returning None leaves it unchanged.
        captured["last_layer"] = res.detach()

    # Hook the residual stream after the last block (blocks are 0-indexed).
    hook_name = f"blocks.{model.cfg.n_layers - 1}.hook_resid_post"

    # run_with_hooks takes its hooks as a list of (name, function) pairs via
    # `fwd_hooks`; any unknown keyword is forwarded to the model's forward().
    # Only the hooked activation is needed, so skip the logits/loss
    # (return_type=None) and skip building an autograd graph.
    with torch.no_grad():
        model.run_with_hooks(
            tokens, fwd_hooks=[(hook_name, hook_fn)], return_type=None
        )

    acts = captured["last_layer"]

    # Count non-pad tokens per row; the last real token sits at count - 1.
    # NOTE(review): this assumes padding is on the right and that `pad_id`
    # never occurs inside the real text (it can when pad falls back to EOS).
    lengths = tokens.ne(pad_id).sum(dim=1)
    # Build both index tensors on the activation's device.
    last_positions = (lengths - 1).to(acts.device)
    batch_index = torch.arange(acts.shape[0], device=acts.device)

    return acts[batch_index, last_positions]

########################################################################
# Define the linear probe model.
#
# This is a simple linear classifier that maps from the transformer's 
# hidden dimension to the number of classes.
########################################################################

num_classes = 2
d_model = model0.cfg.d_model  # hidden dimension of the transformer

# Seed torch so the probe's initial weights and the DataLoader's shuffle order
# are the same on every run of this cell.
PROBE_SEED = 0
torch.manual_seed(PROBE_SEED)

# Put the probe on the same device as the transformer so its activations can
# be fed to the probe directly.
probe_device = next(model0.parameters()).device
probe = nn.Linear(d_model, num_classes).to(probe_device)

# Set up an optimizer and a loss function (cross-entropy for classification).
probe_optimizer = optim.Adam(probe.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

########################################################################
# Training loop for the linear probe.
########################################################################

num_epochs = 10
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for tokens, labels in dataloader:
        probe_optimizer.zero_grad()
        # Get the last-layer representations from the model: (batch, d_model).
        reps = get_last_layer_activation(tokens)
        # Match the probe's device and dtype; labels come off the DataLoader on CPU.
        reps = reps.to(device=probe_device, dtype=probe.weight.dtype)
        labels = labels.to(probe_device)
        # Forward pass through the probe: (batch, num_classes).
        logits = probe(reps)
        loss = criterion(logits, labels)
        loss.backward()
        probe_optimizer.step()
        epoch_loss += loss.item()
    # Report the mean per-batch loss for this epoch.
    print(f"Epoch {epoch+1}/{num_epochs} -- Loss: {epoch_loss/len(dataloader):.4f}")


In [None]:

# ---------------------------
# 1. Load the pretrained model
# ---------------------------
# NOTE(review): this rebinds `tokenizer` and `pad_token_id` from earlier cells.
model = HookedTransformer.from_pretrained("gpt2-small")
tokenizer = model.tokenizer
# Some tokenizers (like GPT-2’s) do not have a pad token, so we use the EOS token.
pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id

# Put the model in training mode. This only switches mode-dependent layers
# (e.g. dropout); gradients are tracked in either mode as long as we are not
# inside torch.no_grad().
model.train()

# ---------------------------
# 2. Define a linear projection to output a scalar.
# ---------------------------
d_model = model.cfg.d_model  # hidden dimension of the transformer
# Keep the projection on the model's device so it can consume the hooked activation.
model_device = next(model.parameters()).device
projection = nn.Linear(d_model, 1).to(model_device)  # maps from d_model to a single real number

# ---------------------------
# 3. Prepare a single datapoint.
# ---------------------------
text = "This is an example datapoint."
# Tokenize the text. `tokens` has shape (1, seq_len); the attention mask marks
# which positions hold real tokens.
encoded = tokenizer(text, return_tensors="pt")
tokens = encoded["input_ids"]
attention_mask = encoded["attention_mask"]

# ---------------------------
# 4. Extract the last-layer residual stream via a hook.
# ---------------------------
# We'll store the activation in a dictionary.
activation = {}

def hook_fn(res, hook):
    # Save the activation (we don't detach here because we want gradients to flow)
    activation["last_layer"] = res

# Identify the last transformer block (0-indexed).
last_layer_idx = model.cfg.n_layers - 1
hook_name = f"blocks.{last_layer_idx}.hook_resid_post"

# Run the model with the hook attached.
# run_with_hooks takes hooks as a list of (name, function) pairs via
# `fwd_hooks`; any other keyword is forwarded to the model's forward pass.
# return_type=None (the Python None, not the string "none") skips the logits,
# since only the hooked activation is needed.
_ = model.run_with_hooks(tokens, fwd_hooks=[(hook_name, hook_fn)], return_type=None)

# ---------------------------
# 5. Extract the representation for the last real token.
# ---------------------------
# activation["last_layer"] has shape (batch, seq_len, d_model)
# Count real tokens from the attention mask rather than by comparing ids with
# pad_token_id: the pad id falls back to EOS above, so any EOS/BOS id inside
# the sequence would otherwise be miscounted as padding.
lengths = attention_mask.sum(dim=1)  # shape: (batch,)
last_layer_acts = activation["last_layer"]
# For each example, pick the representation of the last real token; build the
# index tensors on the activation's device.
batch_index = torch.arange(last_layer_acts.shape[0], device=last_layer_acts.device)
last_positions = (lengths - 1).to(last_layer_acts.device)
last_token_rep = last_layer_acts[batch_index, last_positions]
# last_token_rep now has shape (batch, d_model)

# ---------------------------
# 6. Apply the linear projection to get a scalar output.
# ---------------------------
# The projection outputs shape (batch, 1). We squeeze it to get a scalar per datapoint.
scalar_output = projection(last_token_rep).squeeze()
# If batch size is 1, scalar_output is a single scalar.

print("Scalar output:", scalar_output.item())

# ---------------------------
# 7. Backward pass: compute gradients w.r.t. model and projection parameters.
# ---------------------------
# Compute gradients of the scalar output with respect to all parameters.
scalar_output.backward()

# Cache the gradients for the model's parameters. Parameters that did not
# contribute to the scalar keep grad=None and are skipped.
model_grads = {
    name: param.grad.clone()
    for name, param in model.named_parameters()
    if param.grad is not None
}

# Similarly, cache the gradients for the projection's parameters.
projection_grads = {
    name: param.grad.clone()
    for name, param in projection.named_parameters()
    if param.grad is not None
}

# ---------------------------
# 8. (Optional) Inspect the gradient norms.
# ---------------------------
print("Model gradients (norms):")
for name, grad in model_grads.items():
    print(f"  {name}: {grad.norm().item():.4f}")


In [25]:
# Sanity check: compute a model's loss on a short passage of text.
# NOTE(review): the passage describes GPT-2 Small, but the loss is computed with
# whatever `model0` is bound to when this cell runs — confirm that is intended.
model_description_text = """## Loading Models

HookedTransformer comes loaded with >40 open source GPT-style models. You can load any of them in with `HookedTransformer.from_pretrained(MODEL_NAME)`. See my explainer for documentation of all supported models, and this table for hyper-parameters and the name used to load them. Each model is loaded into the consistent HookedTransformer architecture, designed to be clean, consistent and interpretability-friendly.

For this demo notebook we'll look at GPT-2 Small, an 80M parameter model. To try the model the model out, let's find the loss on this paragraph!"""
# Passing raw text with return_type="loss" asks the model for a loss on that text.
loss = model0(model_description_text, return_type="loss")
print("Model loss:", loss)

Model loss: tensor(3.2925, device='cuda:0', grad_fn=<DivBackward0>)
