# Setup

In [1]:
import plotly.io as pio
try:
    import google.colab
    print("Running as a Colab notebook")
    pio.renderers.default = "colab"
    %pip install transformer-lens fancy-einsum
    %pip install -U kaleido # kaleido only works if you restart the runtime. Required to write figures to disk (final cell)
except:
    print("Running as a Jupyter notebook")
    pio.renderers.default = "vscode"
    from IPython import get_ipython
    ipython = get_ipython()

Running as a Jupyter notebook


In [2]:
import torch
from fancy_einsum import einsum
from transformer_lens import HookedTransformer, HookedTransformerConfig, utils, ActivationCache
from torchtyping import TensorType as TT
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import einops
from typing import List, Union, Optional
from functools import partial
import pandas as pd
from pathlib import Path
import urllib.request
from bs4 import BeautifulSoup
from tqdm import tqdm
from datasets import load_dataset
import os

# Disable tokenizer parallelism (background: https://stackoverflow.com/q/62691279)
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

# No gradients are needed anywhere in this notebook, so turn autograd off globally.
torch.set_grad_enabled(False)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cpu')

In [3]:
!pip install circuitsvis
import circuitsvis as cv


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
# Force the "vscode" plotly renderer for every figure below; this overrides
# whichever renderer the setup cell selected.
pio.renderers.default='vscode'

def imshow(tensor, renderer=None, **kwargs):
    """Display `tensor` as a heatmap on a red/blue diverging scale centred at zero.

    Args:
        tensor: values to plot; converted with `utils.to_numpy` first.
        renderer: plotly renderer name passed to `Figure.show` (None = default renderer).
        **kwargs: forwarded unchanged to `px.imshow` (e.g. `title`, `labels`).
    """
    heatmap = px.imshow(
        utils.to_numpy(tensor),
        color_continuous_midpoint=0.0,
        color_continuous_scale="RdBu",
        **kwargs,
    )
    heatmap.show(renderer)

def line(tensor, renderer=None, **kwargs):
    """Display `tensor` as a line plot, using its values as the y-coordinates.

    Args:
        tensor: values to plot; converted with `utils.to_numpy` first.
        renderer: plotly renderer name passed to `Figure.show` (None = default renderer).
        **kwargs: forwarded unchanged to `px.line`.
    """
    y_values = utils.to_numpy(tensor)
    line_fig = px.line(y=y_values, **kwargs)
    line_fig.show(renderer)

def scatter(x, y, xaxis="", yaxis="", caxis="", renderer=None, **kwargs):
    """Display a scatter plot of `y` against `x`.

    Args:
        x, y: point coordinates; each is converted with `utils.to_numpy` first.
        xaxis, yaxis, caxis: display labels for the x axis, y axis and colour scale.
        renderer: plotly renderer name passed to `Figure.show` (None = default renderer).
        **kwargs: forwarded unchanged to `px.scatter`.
    """
    x_values = utils.to_numpy(x)
    y_values = utils.to_numpy(y)
    axis_labels = dict(x=xaxis, y=yaxis, color=caxis)
    scatter_fig = px.scatter(y=y_values, x=x_values, labels=axis_labels, **kwargs)
    scatter_fig.show(renderer)

In [5]:
# Load pretrained GPT-2 Large as a HookedTransformer on the selected device, with
# TransformerLens's weight-processing options (unembed / writing-weight centering,
# LayerNorm folding, attention-matrix refactoring) all switched on.
# NOTE(review): these options rewrite the stored weights; the analysis below reads
# `model.W_in` directly, so it sees the processed weights - confirm that is intended.
model = HookedTransformer.from_pretrained(
    "gpt2-large",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    refactor_factored_attn_matrices=True,
    device=device,
)

Using pad_token, but it is not set yet.


Loaded pretrained model gpt2-large into HookedTransformer


# Neuron Exploration

In [148]:
# neuron = (34, 4954) # number neuron
# prompts = [
#     "When I was writing my first book, I was going from eleven at night till",
#     "The most merciful thing that the large family does to one of its infant members is to kill it. The same factors which create the terrible infant mortality rate, and which swell the death rate of children between the ages of one and",
#     "The federal government and plaintiffs including the North Carolina NAACP and the League of Women Voters argued during a hearing last month that three important parts of the law—a reduction in early voting from seventeen to",
#     "Serial killers, they concluded, fall into one of",
#     "If no candidate had an outright majority—and the framers assumed that this would be normal once George Washington had passed from the scene—the House would choose among the top five, later reduced to the top",
#     "If you listen to most so-called survival experts, they often suggest that your BOB should contain enough supplies to last for at least seventy-",
#     "The notion that our outlook on life is connected to our well-being is not a new one. In the nineteen-sixties, the University of Connecticut psychologist Julian Rotter proposed that we could view external events in one of",
# ]


# neuron = (31, 894) # an-neuron
# # These prompts don't actually give "an" as a top prediction, probably because of the shorter context windows
# prompts = [
#     "The pet peeve of Steresoscopic 3D film-making, unless done correctly. (Bokeh) This technique is worth exploring in",
#     "Parents often disagree about homework, chores, and technology usage, and that's okay. However, each parent should have",
#     "But I think a lot of teams haven't realized when to pick him and how to play him just yet, so maybe we will have",
#     "The first part of the talk revisited some of the things Jon talked about on Friday night, with",
#     "However, as far local LAN tournaments, there was the first ChiCraft this past winter, and we'll certainly be doing another one over the summer (June/July-ish?). There isn't"
# ]

# "issue" neuron, given as (layer, neuron index within that layer's MLP).
# NOTE(review): the name suggests it relates to the token " issue" - confirm.
neuron = (35, 4518)
# Text prefixes on which the neuron is studied; the attribution below is measured
# at the final token of each prompt.
prompts = [
    "You can also invest in bonds from foreign governments. Many governments",
    "Squanchy, Birdperson, and Rick were once freedom fighters against the Galactic Federation's oppression; their \"crimes\" led the Galactic Federation to",
    "The disadvantage is that the one holding the reserves would be paying a high opportunity cost. Why would you put so much money aside? There needs to be some gain opportunity for private individuals to voluntary",
    "As of March 2008, this had declined by 24%, to 53 months. 71% of this debt is due in less than 5 years; 39% is due in less than a year.In the Clinton/Rubin era, the Treasury stopped",
]


# Tokenize all prompts as one batch (BOS prepended) and run a single forward pass,
# keeping the activation cache for the attribution analysis.
tokens = model.to_tokens(prompts, prepend_bos=True).to(device=device)
original_logits, cache = model.run_with_cache(tokens)

In [169]:
def get_head_attribution(cache, tokens, neuron, include_neuron_layer=False):
    """Attribute a neuron's input at each prompt's final token to individual attention heads.

    For every prompt, the output of each attention head at the last non-padding
    position is dotted with the neuron's input weights `model.W_in[layer, :, index]`.
    Reads the notebook-level `model`.

    Args:
        cache: activation cache from `model.run_with_cache(tokens)`.
        tokens: token ids of shape [batch, pos]. The prompts must have been tokenized
            with `prepend_bos=True` and padded on the right: the last-token index is
            derived by counting non-pad tokens, which equals that index only when
            exactly one BOS token precedes the prompt.
        neuron: `(layer, index)` tuple identifying the MLP neuron.
        include_neuron_layer: when False (default, the original behaviour) only heads
            from layers strictly below `layer` are returned. When True the heads of
            layer `layer` itself are kept as well.
            NOTE(review): in a sequential attention-then-MLP block the neuron also
            reads its own layer's attention output, so True may be the more complete
            choice - confirm for the model in use.

    Returns:
        Tensor of shape [batch, head], where the head axis is the flattened
        (layer, head) axis of `cache.stack_head_results()` truncated to the kept layers.
    """
    # GPT-2 uses the same id for BOS and padding, so one comparison covers both.
    pad_token = 50256
    # Number of real (non-BOS, non-pad) tokens per prompt. Because position 0 holds
    # the BOS token, this count is also the index of the last real token.
    last_token_pos = (tokens != pad_token).sum(dim=-1)

    # [head, batch, pos, d_model] -> [head, batch, d_model], picking a different
    # position for every prompt since right-padding makes the prompt ends differ.
    head_output = cache.stack_head_results()
    batch_index = torch.arange(tokens.shape[0], device=head_output.device)
    head_output_last_seq = head_output[:, batch_index, last_token_pos.to(head_output.device)]

    # Dot product of the neuron's input direction with each head's output.
    layer, no = neuron
    neuron_w_in = model.W_in[layer, :, no]
    head_attribution = einsum("head batch weight, weight->batch head", head_output_last_seq, neuron_w_in)

    # Keep only heads that come before the neuron (optionally including its own layer).
    n_layers_kept = layer + 1 if include_neuron_layer else layer
    head_attribution = head_attribution[:, :n_layers_kept * model.cfg.n_heads]
    return head_attribution # Shape [batch, head]

# Per-prompt attribution of the neuron to every earlier head, then the indices
# (into the flattened layer*head axis) of the three largest entries per prompt.
head_attribution = get_head_attribution(cache, tokens, neuron)
top_heads = torch.topk(head_attribution, k=3, dim=-1).indices
print(top_heads)

torch.Size([4, 3])
tensor([[626, 671, 682],
        [671, 682, 325],
        [671, 682, 688],
        [605, 671, 682]])


In [168]:
print(len(prompts))

# Unflatten the head axis into (layer, head) so each prompt becomes a 2-D heatmap.
head_attribution_lh = einops.rearrange(
    head_attribution,
    "batch (layer head) -> batch layer head",
    head=model.cfg.n_heads,
)

# One heatmap per prompt, titled with the final 50 characters of that prompt.
for prompt_index, prompt_text in enumerate(prompts):
    imshow(head_attribution_lh[prompt_index], title=prompt_text[-50:])

4


In [131]:
# `load_dataset` is already imported in the imports cell at the top of the notebook,
# so it is not re-imported here. Loads the "train" split of NeelNanda/pile-10k.
dataset = load_dataset("NeelNanda/pile-10k", split="train")

Using custom data configuration NeelNanda--pile-10k-72f566e9f7c464ab
Found cached dataset parquet (/Users/clementneo/.cache/huggingface/datasets/NeelNanda___parquet/NeelNanda--pile-10k-72f566e9f7c464ab/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [175]:
# Take one document from the dataset and tokenize it with a leading BOS token.
# Note: this rebinds `tokens`, replacing the prompt batch from the earlier cell.
text = dataset[50]["text"]
tokens = model.to_tokens(text, prepend_bos=True).to(device)
print(tokens.shape)
print(text)

torch.Size([1, 959])
At our best, we motivate ourselves every day to get dressed and go to work or school. Although there are larger incentives at work, it's our own volition that powers us through our innumerable daily tasks.

If we could learn to control the motivational centers of our brains that drive volition, would it lead us toward healthier, more productive lives? Using a new brain imaging strategy, Duke University scientists have now taken a first step in understanding how to manipulate specific neural circuits using thoughts and imagery.

The technique, which is described in the March 16 issue of the journal Neuron, is part of a larger approach called 'neurofeedback,' which gives participants a dynamic readout of brain activity, in this case from a brain area critical for motivation.

"These methods show a direct route for manipulating brain networks centrally involved in healthy brain function and daily behavior," said the study's senior investigator R. Alison Adcock, an ass

In [170]:
# Tokenize the document selected on the first line and run the model on it.
# The tokenizer must receive `text`: `prompt` is not defined anywhere in the
# notebook, so it would raise NameError on a fresh kernel (or silently reuse a
# stale variable), while the freshly loaded `text` would go unused.
text = dataset[10]['text']
tokens = model.to_tokens(text, prepend_bos=True,).to(device=device)
original_logits, cache = model.run_with_cache(tokens)

KeyboardInterrupt: 

In [146]:
# Decompose the cached residual stream into its components. With return_labels=True
# the result is indexable, and element 1 holds the component labels
# ('embed', 'pos_embed', '0_attn_out', '0_mlp_out', ...), which are printed here.
x = cache.decompose_resid(return_labels=True)
print(x[1])

['embed', 'pos_embed', '0_attn_out', '0_mlp_out', '1_attn_out', '1_mlp_out', '2_attn_out', '2_mlp_out', '3_attn_out', '3_mlp_out', '4_attn_out', '4_mlp_out', '5_attn_out', '5_mlp_out', '6_attn_out', '6_mlp_out', '7_attn_out', '7_mlp_out', '8_attn_out', '8_mlp_out', '9_attn_out', '9_mlp_out', '10_attn_out', '10_mlp_out', '11_attn_out', '11_mlp_out', '12_attn_out', '12_mlp_out', '13_attn_out', '13_mlp_out', '14_attn_out', '14_mlp_out', '15_attn_out', '15_mlp_out', '16_attn_out', '16_mlp_out', '17_attn_out', '17_mlp_out', '18_attn_out', '18_mlp_out', '19_attn_out', '19_mlp_out', '20_attn_out', '20_mlp_out', '21_attn_out', '21_mlp_out', '22_attn_out', '22_mlp_out', '23_attn_out', '23_mlp_out', '24_attn_out', '24_mlp_out', '25_attn_out', '25_mlp_out', '26_attn_out', '26_mlp_out', '27_attn_out', '27_mlp_out', '28_attn_out', '28_mlp_out', '29_attn_out', '29_mlp_out', '30_attn_out', '30_mlp_out', '31_attn_out', '31_mlp_out', '32_attn_out', '32_mlp_out', '33_attn_out', '33_mlp_out', '34_attn_ou