<a href="https://colab.research.google.com/github/gg2001/transformer-circuits/blob/master/puzzles/1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [Mech Interp Puzzle 1: Suspiciously Similar Embeddings in GPT-Neo](https://www.alignmentforum.org/posts/eLNo7b56kQQerCzp2/mech-interp-puzzle-1-suspiciously-similar-embeddings-in-gpt)

In [1]:
%pip install transformer_lens plotly nbformat

Collecting transformer_lens
  Downloading transformer_lens-2.7.0-py3-none-any.whl.metadata (12 kB)
Collecting beartype<0.15.0,>=0.14.1 (from transformer_lens)
  Downloading beartype-0.14.1-py3-none-any.whl.metadata (28 kB)
Collecting better-abc<0.0.4,>=0.0.3 (from transformer_lens)
  Downloading better_abc-0.0.3-py3-none-any.whl.metadata (1.4 kB)
Collecting datasets>=2.7.1 (from transformer_lens)
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting fancy-einsum>=0.0.3 (from transformer_lens)
  Downloading fancy_einsum-0.0.3-py3-none-any.whl.metadata (1.2 kB)
Collecting jaxtyping>=0.2.11 (from transformer_lens)
  Downloading jaxtyping-0.2.34-py3-none-any.whl.metadata (6.4 kB)
Collecting wandb>=0.13.5 (from transformer_lens)
  Downloading wandb-0.18.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=2.7.1->transformer_lens)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxha

In [40]:
from tqdm import tqdm
from transformer_lens import HookedTransformer
import plotly.express as px
import torch
import heapq

In [3]:
model = HookedTransformer.from_pretrained("gpt-neo-small")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Loaded pretrained model gpt-neo-small into HookedTransformer


In [15]:
W_E = model.W_E
W_E.requires_grad = False

In [16]:
# subsample = torch.randperm(model.cfg.d_vocab)[:5000].to(model.cfg.device)
# W_E = model.W_E[subsample]  # Take a random subset of 5,000 for memory reasons

In [17]:
W_E_normed = W_E / W_E.norm(dim=-1, keepdim=True) # [d_vocab, d_model]
cosine_sims = W_E_normed @ W_E_normed.T # [d_vocab, d_vocab]

In [33]:
cosine_sims[0]

tensor([1.0000, 0.9205, 0.8968,  ..., 0.9207, 0.8843, 0.9190], device='cuda:0')

In [36]:
top_k = 1000
heap = []

In [None]:
num_tokens = W_E_normed.size(0)

for token_0 in tqdm(range(num_tokens)):
    # To leverage vectorization for better performance, you can compute similarities in batches
    # However, since memory efficiency is prioritized, we'll stick to the nested loop
    token_cosine_sims = (W_E_normed[token_0] @ W_E_normed[token_0 + 1:].T)
    top_cosine_sims = torch.topk(token_cosine_sims, top_k)
    for cosine_sim, token_1 in zip(top_cosine_sims.values, top_cosine_sims.indices):
        # Compute cosine similarity (dot product since vectors are normalized)
        cosine_sim = (W_E_normed[token_0] @ W_E_normed[token_1]).item()

        # If heap is not full, push the current similarity
        if len(heap) < top_k:
            heapq.heappush(heap, (cosine_sim, token_0, token_1))
        else:
            # If current similarity is greater than the smallest in the heap, replace it
            if cosine_sim > heap[0][0]:
                heapq.heappushpop(heap, (cosine_sim, token_0, token_1))

  1%|          | 616/50257 [00:34<46:09, 17.92it/s]

In [None]:
cosine_sims_factored = cosine_sims
cosine_sims = cosine_sims_factored.AB

In [None]:
# px.histogram(
#    cosine_sims.flatten().detach().cpu().numpy(),
#    title="Pairwise cosine sims of embedding",
#)

In [None]:
# Step 2: Flatten the matrix and get top 100 similarities
top_k = 100
top_vals, top_indices = torch.topk(cosine_sims_top.view(-1), top_k)

# Step 3: Map the flat indices back to 2D indices
top_token1 = top_indices // cosine_sims.size(1)
top_token2 = top_indices % cosine_sims.size(1)

# Step 4: Prepare the top 100 similarity list
top_pairs = [
    {
        'token_0': model.to_string([int(t0.item())]),
        'token1': model.to_string([int(t1.item())]),
        'similarity': float(sim.item())
    }
    for t0, t1, sim in zip(top_token1, top_token2, top_vals)
]

# Step 5: Exclude the diagonal by setting it to +inf for bottom similarities
cosine_sims_bottom = cosine_sims.masked_fill(mask, float('inf'))

# Step 6: Flatten the matrix and get bottom 100 similarities
bottom_vals, bottom_indices = torch.topk(cosine_sims_bottom.view(-1), top_k, largest=False)

# Step 7: Map the flat indices back to 2D indices
bottom_token1 = bottom_indices // cosine_sims.size(1)
bottom_token2 = bottom_indices % cosine_sims.size(1)

# Step 8: Prepare the bottom 100 similarity list
bottom_pairs = [
    {
        'token_0': model.to_string([int(t0.item())]),
        'token1': model.to_string([int(t1.item())]),
        'similarity': float(sim.item())
    }
    for t0, t1, sim in zip(bottom_token1, bottom_token2, bottom_vals)
]

# Optional: Display the results
print("Top 100 Similarities:")
for pair in top_pairs:
    print(pair)

print("\nBottom 100 Similarities:")
for pair in bottom_pairs:
    print(pair)

OutOfMemoryError: CUDA out of memory. Tried to allocate 9.41 GiB. GPU 0 has a total capacity of 39.56 GiB of which 594.81 MiB is free. Process 75239 has 38.97 GiB memory in use. Of the allocated memory 31.40 GiB is allocated by PyTorch, and 7.09 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)