In [3]:
import os
import sys
import plotly.express as px
import torch as t
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path
import numpy as np
import einops
from jaxtyping import Int, Float
from typing import List, Optional, Tuple
import functools
from tqdm import tqdm
from IPython.display import display
import webbrowser
from transformer_lens.hook_points import HookPoint
from transformer_lens import utils, HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache
import circuitsvis as cv
import openai
from dotenv import load_dotenv
import random

load_dotenv()

# Set API Keys
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
assert OPENAI_API_KEY, "OPENAI_API_KEY environment variable is missing from .env"
openai.api_key = OPENAI_API_KEY

content = "You are a helpful mechanistic interpretability researcher who is an expert in analyzing attention patterns"
# response = openai.ChatCompletion.create(model=self.model, messages=[{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": prompt}], max_tokens=100, tools=tools) # tools=tools
#         response_message = response['choices'][0]['message']

In [4]:
openai_model = "gpt-3.5-turbo-16k" # "gpt-3.5-turbo"

In [44]:
from datasets import load_dataset
dataset = load_dataset("stas/openwebtext-10k", split="train", trust_remote_code=True)

Downloading data:   0%|          | 0.00/30.3M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
# Saves computation time, since we don't need it for the contents of this notebook
t.set_grad_enabled(False)

device = t.device("cuda" if t.cuda.is_available() else "mps")

In [48]:
text = dataset['text']

In [6]:
cfg = HookedTransformerConfig(
    d_model=768,
    d_head=64,
    n_heads=12,
    n_layers=2,
    n_ctx=2048,
    d_vocab=50278,
    attention_dir="causal",
    attn_only=True, # defaults to False
    tokenizer_name="EleutherAI/gpt-neox-20b", 
    seed=398,
    use_attn_result=True,
    normalization_type=None, # defaults to "LN", i.e. layernorm with weights & biases
    positional_embedding_type="shortformer"
)

weights_dir = "test-heads/attn_only_2L_half.pth"

model = HookedTransformer(cfg)
pretrained_weights = t.load(weights_dir, map_location=device)
model.load_state_dict(pretrained_weights)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<All keys matched successfully>

In [7]:
# model.tokenizer(text, return_tensors="pt")["input_ids"].shape
random_sentences = ["From around 1242, he was co-ruler with his father, and his relationship with some prominent aristocrats grew tense.", 
                    "Lack of approval for a congressional delegation resulted in the ceremony being delayed from the scheduled time on 14 January to the very early morning of 15 January", 
                    "He came to Sweden with Cardinal Nicholas Breakspeare in 1153 and was most likely designated to be the new Archbishop of Uppsala, but the independent church province of Sweden could only be established in 1164 after the civil war, and Henry would have been sent to organize the Church in Finland, where Christians had already existed for two centuries.", 
                    "Bright vixens jump over the moon; dozy fowl quack at the shimmering lake.", 
                    "Sphinx of black quartz, judge my vow as I stand in the heart of the Egyptian desert."]
input_ids = model.tokenizer(random_sentences, padding=True, return_tensors="pt")["input_ids"]
input_ids.shape


torch.Size([5, 70])

In [8]:
text = "We think that powerful, significantly superhuman machine intelligence is more likely than not to be created this century. If current machine learning techniques were scaled up to this level, we think they would by default produce systems that are deceptive or manipulative, and that no solid plans are known for how to avoid this."

logits, cache = model.run_with_cache(text, remove_batch_dim=True)

str_tokens = model.to_str_tokens(text)
for layer in range(model.cfg.n_layers):
    attention_pattern = cache["pattern", layer]
    display(cv.attention.attention_patterns(tokens=str_tokens, attention=attention_pattern))

In [9]:
text = "We think that powerful, significantly superhuman machine intelligence is more likely than not to be created this century. If current machine learning techniques were scaled up to this level, we think they would by default produce systems that are deceptive or manipulative, and that no solid plans are known for how to avoid this."

logits, cache = model.run_with_cache(input_ids)
random_idx = 0
new_text = random_sentences[random_idx]

str_tokens = model.to_str_tokens(new_text)
for layer in range(model.cfg.n_layers):
    attention_pattern = cache["pattern", 0][random_idx, ...]
    display(cv.attention.attention_patterns(tokens=str_tokens, attention=attention_pattern))

In [None]:
def gini_coefficient(x):
    x = x.flatten() #all values are treated equally, arrays must be 1d
    n = x.shape[0]
    x = x.sort()[0] #values must be sorted
    index = t.arange(1, n+1, device=device) #index per array element
    return (t.sum((2 * index - n  - 1) * x)) / (n * t.sum(x)) #Gini coefficient 

In [10]:
def get_random_row_idx(end_idx: int) -> int:
    min_idx = 10 if end_idx > 10 else 1
    max_idx = end_idx
    return random.randint(min_idx, max_idx)

In [21]:
# NEWEST
prompt_prefix = """
Here is a row by row break down of which tokens were attended to for each individual generation step for an autoregressive transformer.
Each row is labeled with its index and is of the format for each row: `\{text\}\\n\{{token} {multiple} for token, multiple in above_avg_tokens}`.
Notably, the tokens with above average attention are sorted by their score's multiple relative to the average attention score for that row,
and this multiple is included next to it, i.e. `tokenX 2, tokenY 1.5` etc.
"""

prompt_suffix = """
Please generate a response in the format `row \{idx\}: \{your generated description of this row's pattern of attention here\}` for each row.
Be as specific as possible, making sure to consider all possible reasons certain tokens may have been attended to more than others, including 
the position of the tokens, what the tokens themselves represent, the attention scores, and how these all interact.
Limit your description to 1-3 sentences per row.
"""

num_row_samples = 1

def get_multiple(score, avg_score):
    return round((score / avg_score).item(), 1)

def get_attention_pattern_prompt_for_rows(cache: ActivationCache, layer: int, head: int, tokens: List[List[str]], pad_token: str) -> str:
    attention_pattern = cache["pattern", layer][:, head, ...]
    n_ctx, batch_size = len(tokens[0]), len(tokens)
    assert attention_pattern.shape == (batch_size, n_ctx, n_ctx), f"The cached attention pattern shape {attention_pattern.shape} != tokens shape {(batch_size, n_ctx, n_ctx)}"
    
    # Randomly sample up to 5 batch indices
    sampled_batch_indices = random.sample(range(batch_size), min(5, batch_size))
    
    # Generate rows of string for each sampled index
    rows = []
    for sample_idx in sampled_batch_indices:  # Iterate over sampled batches
        end_idx = n_ctx
        if pad_token in tokens[sample_idx][1:]:
            end_idx = tokens[sample_idx][1:].index(pad_token)
        idx = get_random_row_idx(end_idx)
        # Only consider tokens that the current token could attend to (itself and previous tokens)
        num_tokens = idx + 1
        relevant_tokens = tokens[sample_idx][:num_tokens]  # Adjust for batch index
        text = "".join(relevant_tokens)
        relevant_scores = attention_pattern[sample_idx, idx, :num_tokens]  # Adjust for batch index
        avg_score = 1 / num_tokens
        
        # Calculate summary statistics
        above_avg_tokens = [(token, get_multiple(score, avg_score)) for token, score in zip(relevant_tokens, relevant_scores) if score > avg_score]
        above_avg_tokens.sort(key=lambda x: x[1], reverse=True)
        above_avg_tokens = ", ".join([f"{token} {multiple}" for token, multiple in above_avg_tokens])
        
        row = f"{text}\n{above_avg_tokens}\n"
        
        rows.append(row)
    
    # Combine the sampled rows into a single string with line breaks
    rows_str = "\n".join(rows)
    
    # Combine the prefix, rows, and suffix into the final prompt
    prompt = f"{prompt_prefix}\n{rows_str}\n{prompt_suffix}"
    
    return prompt

In [70]:
# Select random sentences
selected_sentences = random.sample(text, 10)

input_ids = model.tokenizer(selected_sentences, padding=True, return_tensors="pt")["input_ids"]

max_length = 40
if input_ids.shape[1] > max_length:
    input_ids = input_ids[:, :max_length]

decoded = model.tokenizer.batch_decode(input_ids)
str_tokens = model.to_str_tokens(decoded, prepend_bos=False)

In [71]:
_, new_cache = model.run_with_cache(input_ids)

In [72]:
prompt = get_attention_pattern_prompt_for_rows(new_cache, 0, 2, str_tokens, model.tokenizer.pad_token)
print(prompt)


Here is a row by row break down of which tokens were attended to for each individual generation step for an autoregressive transformer.
Each row is labeled with its index and is of the format for each row: `\{text\}\n\{{token} {multiple} for token, multiple in above_avg_tokens}`.
Notably, the tokens with above average attention are sorted by their score's multiple relative to the average attention score for that row,
and this multiple is included next to it, i.e. `tokenX 2, tokenY 1.5` etc.

Shannon Dickson couldn’t get the words out of his mouth. On a Saturday morning, the retired engineer was at home in Sedona, Ariz.,
Sh 20.1,  Ariz 3.1,  Saturday 2.9,  Sed 1.6,  engineer 1.3,  retired 1.2,  morning 1.0

Three people from Blount County are facing felony reckless endangerment charges after deputies say they forced a minor to inject methamphetamine.

Deputies
Three 26.1

Robbie Ray was a league winner in 2017. According
Rob 4.2, . 1.7,  According 1.7,  in 1.1

Friend can suggest an in

In [32]:
# num_samples = 5
# random_indices = random.sample(range(len(sentences)), num_samples)
# random_sentences = [sentences[i] for i in random_indices]
# random_sentences_str_toks = [model.to_str_tokens(sentence) for sentence in random_sentences]
# print(random_sentences)
# print(random_sentences_str_toks)


# selected_sentences = random.sample(list(sentences), 5)
# sentences
# input_ids = model.tokenizer(selected_sentences, padding=True, return_tensors="pt")["input_ids"]
# print(input_ids)
# str_tokens = [model.to_str_tokens(sentence) for sentence in input_ids]
# prompt = get_attention_pattern_prompt_for_rows(cache, 0, 0, str_tokens, model.tokenizer.pad_token)
# print(prompt)


[['I', 'said', '"', 'What', 'for', '?"\''], ["'", 'We', 'called', 'him', 'Tortoise', 'because', 'he', 'taught', 'us', ",'", 'said', 'the', 'Mock', 'Turtle', 'angrily', ':', "'", 'really', 'you', 'are', 'very', 'dull', "!'"], ['cried', 'the', 'Gryphon', ',', 'and', ',', 'taking', 'Alice', 'by', 'the', 'hand', ',', 'it', 'hurried', 'off', ',', 'without', 'waiting', 'for', 'the', 'end', 'of', 'the', 'song', '.'], ["'", 'I', "'", 'm', 'glad', 'they', "'", 've', 'begun', 'asking', 'riddles', '.--', 'I', 'believe', 'I', 'can', 'guess', 'that', ",'", 'she', 'added', 'aloud', '.'], ["'", 'And', 'ever', 'since', 'that', ",'", 'the', 'Hatter', 'went', 'on', 'in', 'a', 'mournful', 'tone', ',', "'", 'he', 'won', "'", 't', 'do', 'a', 'thing', 'I', 'ask', '!']]
[[['<|endoftext|>', 'I'], ['<|endoftext|>', 'said'], ['<|endoftext|>', '"'], ['<|endoftext|>', 'What'], ['<|endoftext|>', 'for'], ['<|endoftext|>', '?"', "'"]], [['<|endoftext|>', "'"], ['<|endoftext|>', 'We'], ['<|endoftext|>', 'called'], ['

In [None]:
input_ids = model.tokenizer(random_sentences, padding=True, return_tensors="pt")["input_ids"]
str_tokens = [model.to_str_tokens(sentence) for sentence in input_ids]
prompt = get_attention_pattern_prompt_for_rows(cache, 0, 0, str_tokens, model.tokenizer.pad_token)
print(prompt)

In [113]:
input_ids = model.tokenizer(random_sentences, padding=True, return_tensors="pt")["input_ids"]
tokenized_sentences = [model.to_str_tokens(sentence) for sentence in input_ids]


[['From',
  ' around',
  ' 12',
  '42',
  ',',
  ' he',
  ' was',
  ' co',
  '-',
  'r',
  'uler',
  ' with',
  ' his',
  ' father',
  ',',
  ' and',
  ' his',
  ' relationship',
  ' with',
  ' some',
  ' prominent',
  ' arist',
  'ocrats',
  ' grew',
  ' tense',
  '.',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|endoftext|>',
  '<|en

In [22]:
content = "You are a helpful mechanistic interpretability researcher who is an expert in analyzing attention patterns"
response = openai.ChatCompletion.create(model=openai_model, messages=[{"role": "system", "content": content}, {"role": "user", "content": prompt}])
response_message = response['choices'][0]['message']
response_content = response_message['content']
formatted_content = response_content.replace("\\n", "\n")
print(formatted_content)

row 59: The attention is spread across multiple tokens, mostly focusing on the words discussing the potential creation of powerful machine intelligence and the consequences of scaling up current machine learning techniques. The highest attention scores are observed for tokens related to the idea that systems would be deceptive or manipulative without solid plans to avoid such behavior.

row 25: Attention spans across various tokens, particularly focusing on the discussion of the likelihood of powerful machine intelligence being created in this century. Attention weights are highest for tokens related to the possibility of current machine learning techniques leading to this outcome.

row 27: Attention mostly concentrates on tokens related to the discussion of powerful machine intelligence being created and the assessment of the likelihood of this occurrence. The highest attention scores are observed for tokens discussing the consequences of scaling up current machine learning techniques

In [58]:
a = model.tokenizer("<|endoftext|>(3.78x),  than(3.46x),  to(2.08x),  be(1.85x),  this(1.66x),  not(1.27x),  is(1.13x)")
b = a["input_ids"]
print(len(b), b)

55 [0, 9, 20, 15, 3141, 89, 582, 50276, 14644, 9, 20, 15, 2950, 89, 582, 50276, 936, 9, 19, 15, 2904, 89, 582, 50276, 1257, 9, 18, 15, 2227, 89, 582, 50276, 2520, 9, 18, 15, 2526, 89, 582, 50276, 1439, 9, 18, 15, 1630, 89, 582, 50276, 261, 9, 18, 15, 1012, 89, 10]


torch.Size([62, 62])