# Set Up

In [1]:
%pip install git+https://github.com/neelnanda-io/TransformerLens.git
%pip install circuitsvis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/neelnanda-io/TransformerLens.git
  Cloning https://github.com/neelnanda-io/TransformerLens.git to /tmp/pip-req-build-etccqia5
  Running command git clone --filter=blob:none --quiet https://github.com/neelnanda-io/TransformerLens.git /tmp/pip-req-build-etccqia5
  Resolved https://github.com/neelnanda-io/TransformerLens.git to commit eda6e4acdbd61211a431e5ced0794dd5b009c628
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting jaxtyping>=0.2.11
  Downloading jaxtyping-0.2.15-py3-none-any.whl (20 kB)
Collecting fancy-einsum>=0.0.3
  Downloading fancy_einsum-0.0.3-py3-none-any.whl (6.2 kB)
Collecting transformers>=4.25.1
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch as th
from circuitsvis.activations import text_neuron_activations
from jaxtyping import Float, Int
from transformer_lens.hook_points import (
    HookedRootModule,
    HookPoint,
)
from einops import rearrange
class NeuronTextSimplifier:
    """Helpers for probing a single MLP neuron of a TransformerLens model.

    The object stores a ``(layer, neuron)`` pair and offers:

    * visualisations of the neuron's activation over text,
    * noise / ablation / token-removal experiments,
    * a gradient-based search for prompts that maximise the neuron.

    A forward hook on ``blocks[layer].mlp.hook_post`` stores the most recent
    output of that hook point in ``self._neurons``.  Use
    ``set_layer_and_neuron`` to retarget an existing instance; building a new
    instance on the same model leaves the old instance's hook registered
    unless ``remove_hooks`` is called on it first.
    """

    def __init__(self, model, layer: int, neuron: int) -> None:
        """Freeze ``model`` and attach to ``blocks[layer].mlp.hook_post``.

        Args:
            model: the model to inspect. It must expose ``cfg.device``,
                ``W_E``, ``blocks`` and the TransformerLens helper methods
                used by the other methods of this class.
            layer: index into ``model.blocks``.
            neuron: index along the last axis of the hooked activation.
        """
        self.model = model
        self.device = model.cfg.device
        self.layer = layer
        self.neuron = neuron
        self.model.requires_grad_(False)
        # Token embedding matrix, used to project continuous embeddings back
        # onto vocabulary entries in prompt_optimization.
        self.embed_weights = model.W_E
        self._hook_handle = None
        self._build_model_no_embed()
        self.set_hooks()

    def _build_model_no_embed(self):
        """(Re)build the embedding-free sub-model ``blocks[0..layer]``."""
        # model.blocks is addressed by name rather than by its position in
        # model.children(), which differs between architectures.
        self.model_no_embed = th.nn.Sequential(*self.model.blocks[: self.layer + 1])
        self.model_no_embed.requires_grad_(False)

    def remove_hooks(self):
        """Remove the forward hook registered by this instance, if any."""
        handle = getattr(self, "_hook_handle", None)
        if handle is not None:
            handle.remove()
        self._hook_handle = None

    def set_hooks(self):
        """Register the hook that copies ``hook_post`` output to ``_neurons``.

        Any hook previously registered by this instance is removed first, so
        a stale hook on another layer cannot overwrite ``_neurons``.
        """
        self.remove_hooks()
        self._neurons = th.empty(0)

        def hook(model, input, output):
            self._neurons = output

        hook_point = self.model.blocks[self.layer].mlp.hook_post
        self._hook_handle = hook_point.register_forward_hook(hook)

    def ablate_mlp_neurons(self, tokens, neurons: th.Tensor):
        """Run the model with the given neurons of this layer zeroed.

        Args:
            tokens: model input, forwarded to ``run_with_hooks``.
            neurons: indices along the last axis of ``hook_post`` to zero;
                an empty tensor leaves the activations untouched.

        Returns:
            Whatever ``model.run_with_hooks`` returns for ``tokens``.
        """
        def mlp_ablation_hook(
            value: Float[th.Tensor, "batch pos d_mlp"],
            hook: HookPoint
        ) -> Float[th.Tensor, "batch pos d_mlp"]:
            if neurons.shape[0] == 0:
                return value
            value[:, :, neurons] = 0
            return value

        hook_name = f"blocks.{self.layer}.mlp.hook_post"
        return self.model.run_with_hooks(tokens, fwd_hooks=[(hook_name, mlp_ablation_hook)])

    def add_noise_to_text(self, text, noise_level=1.0):
        """Show how noising each token embedding changes the final activation.

        For every text, one Gaussian noise vector (scaled by ``noise_level``)
        is added to a single position at a time; the value shown for that
        position is the change in this neuron's activation at the last
        position.

        NOTE(review): the perturbed embeddings go through ``embedded_forward``,
        which feeds token embeddings straight into the transformer blocks. No
        positional embedding is added on this path -- confirm that is intended
        for models with learned positional embeddings.

        Args:
            text: a string or a list of strings.
            noise_level: multiplier applied to the standard-normal noise.

        Returns:
            The result of ``text_neuron_activations`` for all texts.
        """
        if isinstance(text, str):
            text = [text]
        text_list = []
        activation_list = []
        for t in text:
            split_text = self.model.to_str_tokens(t, prepend_bos=False)
            tokens = self.model.to_tokens(t, prepend_bos=False)
            embedded_tokens = self.model.embed(tokens)
            _, seq_size, embedding_size = embedded_tokens.shape
            noise = th.randn(1, embedding_size, device=self.device) * noise_level
            original = self.embedded_forward(embedded_tokens)[:, -1, self.neuron]
            changed_activations = th.zeros(seq_size, device=self.device)
            for i in range(seq_size):
                # Perturb a copy so that adding and then subtracting noise
                # cannot accumulate floating point error in the clean input.
                noised = embedded_tokens.clone()
                noised[:, i, :] += noise
                neuron_response = self.embedded_forward(noised)
                changed_activations[i] = neuron_response[:, -1, self.neuron].item()
            changed_activations -= original
            # A "\n" token ends each sample's row, so real newlines inside
            # the text are rendered as the literal "\newline" instead.
            text_list += [x.replace('\n', '\\newline') for x in split_text] + ["\n"]
            activation_list += changed_activations.tolist() + [0.0]
        activation_list = th.tensor(activation_list).reshape(-1, 1, 1)
        return text_neuron_activations(tokens=text_list, activations=activation_list)

    def visualize_logit_diff(self, text, neurons: th.Tensor, setting="true_tokens", verbose=True):
        """Visualise the log-prob change caused by ablating ``neurons``.

        Args:
            text: a string or a list of strings.
            neurons: neuron indices passed to ``ablate_mlp_neurons``.
            setting: ``"true_tokens"`` gathers, at each position, the change
                for the input token at that position; ``"max"`` shows the
                largest change at each position and the token it belongs to.
            verbose: additionally show the original and ablated log-probs as
                two extra rows per text, and print the extreme differences.

        Returns:
            The result of ``text_neuron_activations``.

        Raises:
            ValueError: if ``setting`` is not one of the two supported values.
        """
        if setting not in ("true_tokens", "max"):
            raise ValueError(f"setting must be 'true_tokens' or 'max', got {setting!r}")
        if isinstance(text, str):
            text = [text]
        text_list = []
        logit_list = []
        diff_values = []  # differences only, without padding or verbose rows
        for t in text:
            split_text = self.model.to_str_tokens(t, prepend_bos=False)
            tokens = self.model.to_tokens(t, prepend_bos=False)
            original_logits = self.model(tokens).log_softmax(-1)
            ablated_logits = self.ablate_mlp_neurons(tokens, neurons).log_softmax(-1)
            diff_logits = ablated_logits - original_logits
            if setting == "true_tokens":
                # NOTE(review): position s is gathered with the token at
                # position s (not s+1) -- confirm this is the intended target.
                diff = diff_logits.gather(2, tokens.unsqueeze(2)).reshape(-1)
            else:
                val, ind = diff_logits.max(2)
                diff = val.reshape(-1)
                split_text = self.model.to_str_tokens(ind)
                tokens = ind
            row = [x.replace('\n', '\\newline') for x in split_text] + ["\n"]
            if verbose:
                # Two extra rows per text: original log-probs, then ablated.
                text_list += row + row
                orig = original_logits.gather(2, tokens.unsqueeze(2)).reshape(-1)
                ablated = ablated_logits.gather(2, tokens.unsqueeze(2)).reshape(-1)
                logit_list += orig.tolist() + [0.0]
                logit_list += ablated.tolist() + [0.0]
            text_list += row
            logit_list += diff.tolist() + [0.0]
            diff_values += diff.tolist()
        logit_list = th.tensor(logit_list).reshape(-1, 1, 1)
        if verbose and diff_values:
            # Report extremes of the differences themselves, not of the
            # original/ablated rows or the 0.0 row terminators.
            print(f"Max & Min logit-diff: {max(diff_values):.2f} & {min(diff_values):.2f}")
        return text_neuron_activations(tokens=text_list, activations=logit_list)

    def get_neuron_activation(self, tokens):
        """Return this neuron's activation at every position as a list.

        Only the first batch element of the cached activation is read.
        """
        hook_name = f"blocks.{self.layer}.mlp.hook_post"
        # Cache only the activation that is read below.
        _, cache = self.model.run_with_cache(
            tokens.to(self.model.cfg.device), names_filter=hook_name
        )
        return cache[hook_name][0, :, self.neuron].tolist()

    def text_to_activations_print(self, text):
        """Return ``text`` with `` [activation]`` inserted after every token."""
        token = self.model.to_tokens(text, prepend_bos=False)
        act = self.get_neuron_activation(token)
        act = [f" [{a:.2f}]" for a in act]
        if token.shape[-1] > 1:
            string = self.model.to_str_tokens(token, prepend_bos=False)
        else:
            string = self.model.to_string(token)
        # Interleave token strings (even slots) and activations (odd slots).
        res = [None] * (len(string) + len(act))
        res[::2] = string
        res[1::2] = act
        return "".join(res)

    def text_to_visualize(self, text):
        """Visualise this neuron's activation on every token of ``text``.

        Args:
            text: a string, or a list whose items are strings or token
                tensors.

        Returns:
            The result of ``text_neuron_activations``.
        """
        if isinstance(text, str):
            text = [text]
        text_list = []
        act_list = []
        for t in text:
            # Strings are tokenised here; anything else is taken to be
            # tokens already.
            token = self.model.to_tokens(t, prepend_bos=False) if isinstance(t, str) else t
            split_text = self.model.to_str_tokens(t, prepend_bos=False)
            text_list += [x.replace('\n', '\\newline') for x in split_text] + ["\n"]
            act_list += self.get_neuron_activation(token) + [0.0]
        act_list = th.tensor(act_list).reshape(-1, 1, 1)
        return text_neuron_activations(tokens=text_list, activations=act_list)

    def _drop_least_important_token(self, tokens):
        """Return ``tokens`` (1-D) with one token removed.

        The removed token is the one whose removal leaves the highest
        activation at the last remaining position.
        """
        scores = th.tensor([
            self.get_neuron_activation(th.cat((tokens[:i], tokens[i + 1:])))[-1]
            for i in range(len(tokens))
        ])
        drop = int(scores.argmax())
        return th.cat((tokens[:drop], tokens[drop + 1:]))

    def get_text_and_activations_iteratively(self, text):
        """Greedily shorten ``text`` one token at a time, recording each step.

        Returns:
            ``(text_list, act_list)``: the token strings and activations of
            the full text followed by every shortened version, each row
            terminated by ``"\\n"`` / ``0.0``. ``act_list`` has shape
            ``(-1, 1, 1)``.
        """
        tokens = self.model.to_tokens(text, prepend_bos=False)[0]
        # Real newlines are shown as "\newline" because "\n" ends a row.
        text_list = [x.replace('\n', '\\newline') for x in self.model.to_str_tokens(text, prepend_bos=False)] + ["\n"]
        act_list = self.get_neuron_activation(tokens) + [0.0]
        for _ in range(len(tokens) - 1):
            tokens = self._drop_least_important_token(tokens)
            if tokens.shape[-1] > 1:
                out_text = self.model.to_str_tokens(tokens, prepend_bos=False)
                text_list += [x.replace('\n', '\\newline') for x in out_text] + ["\n"]
            else:
                out_text = self.model.to_string(tokens)
                text_list += [out_text.replace('\n', '\\newline')] + ["\n"]
            act_list += self.get_neuron_activation(tokens) + [0.0]
        act_list = th.tensor(act_list).reshape(-1, 1, 1)
        return text_list, act_list

    def visualize_text_color_iteratively(self, text):
        """Visualise the greedy token-removal sequence for one or more texts.

        Returns the result of ``text_neuron_activations``; for inputs that
        are neither ``str`` nor ``list`` nothing is computed and ``None`` is
        returned.
        """
        if isinstance(text, str):
            text_list, act_list = self.get_text_and_activations_iteratively(text)
            return text_neuron_activations(tokens=text_list, activations=act_list)
        elif isinstance(text, list):
            text_list_final = []
            act_list_final = []
            for sample in text:
                text_list, act_list = self.get_text_and_activations_iteratively(sample)
                text_list_final.append(text_list)
                act_list_final.append(act_list)
            return text_neuron_activations(tokens=text_list_final, activations=act_list_final)

    def simplify_iteratively(self, text):
        """Print ``text`` and each greedily shortened version with activations.

        At every step the token whose removal leaves the highest activation
        at the last position is dropped, until one token remains.
        """
        tokens = self.model.to_tokens(text, prepend_bos=False)[0]
        # Show the starting point too.
        print(self.text_to_activations_print(self.model.to_string(tokens)))
        for _ in range(len(tokens) - 1):
            tokens = self._drop_least_important_token(tokens)
            out_text = self.model.to_string(tokens)
            print(self.text_to_activations_print(out_text))
        return

    # Assign neuron and layer
    def set_layer_and_neuron(self, layer, neuron):
        """Retarget this instance to another ``(layer, neuron)`` pair.

        Moves the forward hook to the new layer and rebuilds the
        embedding-free sub-model so that ``embedded_forward`` runs through
        the new layer.
        """
        self.layer = layer
        self.neuron = neuron
        self._build_model_no_embed()
        self.set_hooks()

    def embedded_forward(self, embedded_x):
        """Run ``blocks[0..layer]`` on embeddings; return the hooked output."""
        self.model_no_embed(embedded_x)
        return self._neurons

    def forward(self, x):
        """Run the full model on ``x``; return the hooked output."""
        self.model(x)
        return self._neurons

    def prompt_optimization(
            self,
            diverse_outputs_num=10,
            iteration_cap_until_convergence = 30,
            init_text = None,
            seq_size = 4,
            insert_words_and_pos = None, #List of words and positions to insert [word, pos]
            neuron_loss_scalar = 1,
            diversity_loss_scalar = 1,
        ):
        """Search for prompts that maximise this neuron at the last position.

        A continuous prompt embedding is optimised with AdamW. On every step
        it is projected onto the nearest vocabulary embeddings by cosine
        similarity, the loss is evaluated on the projected embeddings, and
        the resulting gradient is applied to the continuous embedding.
        A cosine-similarity penalty against the embeddings stored for earlier
        runs encourages later runs to differ.

        Args:
            diverse_outputs_num: number of independent optimisation runs.
            iteration_cap_until_convergence: a run stops after this many
                consecutive steps without a new best activation (must be >= 1).
            init_text: optional starting text for the first run; it also
                fixes the sequence length. Later runs start from random
                tokens.
            seq_size: sequence length used when ``init_text`` is ``None``.
            insert_words_and_pos: optional ``[text, pos]``; the embedding of
                ``text`` is inserted, unoptimised, at ``pos``
                (``-1`` appends it at the end).
            neuron_loss_scalar: weight of the activation term.
            diversity_loss_scalar: weight of the diversity term.

        Returns:
            A list with one entry per run: the best prompt found, or ``None``
            if the activation never exceeded 0.

        Raises:
            ValueError: if ``iteration_cap_until_convergence`` is below 1.
        """
        if iteration_cap_until_convergence < 1:
            raise ValueError("iteration_cap_until_convergence must be at least 1")
        _, _, embed_size = self.model.W_out.shape
        vocab_size = self.model.W_E.shape[0]
        largest_prompts = [None]*diverse_outputs_num
        # dim=1 compares one embedding with every row of the embedding matrix;
        # dim=2 compares whole (seq, d_model) prompts in one call.
        cos_dim_1 = th.nn.CosineSimilarity(dim=1)
        cos_dim_2 = th.nn.CosineSimilarity(dim=2)
        total_iterations = 0

        def nearest_token_ids(embeds):
            # Id of the most cosine-similar vocabulary embedding per position.
            return th.stack([
                cos_dim_1(self.embed_weights, embeds[0, i, :]).argmax()
                for i in range(embeds.shape[1])
            ]).unsqueeze(0)

        init_tokens = None
        if init_text is not None:
            init_tokens = self.model.to_tokens(init_text, prepend_bos=False)
            seq_size = init_tokens.shape[-1]
        diverse_outputs = th.zeros(diverse_outputs_num, seq_size, embed_size, device=self.device)
        for d_ind in range(diverse_outputs_num):
            print(f"Starting diverse output {d_ind}")
            if init_tokens is None or d_ind > 0:
                # Only the first run may start from init_text; every other
                # run starts from random tokens of the same length.
                init_tokens = th.randint(0, vocab_size, (1, seq_size), device=self.device)
            prompt_embeds = self.model.embed(init_tokens).detach().clone().to(self.device)
            prompt_embeds.requires_grad_(True)

            optim = th.optim.AdamW([prompt_embeds], lr=.8, weight_decay=0.01)
            largest_activation = 0
            largest_prompt = None

            iterations_since_last_improvement = 0
            while iterations_since_last_improvement < iteration_cap_until_convergence:
                # First, project into the embedding matrix
                with th.no_grad():
                    projected_index = nearest_token_ids(prompt_embeds)
                    projected_embeds = self.model.embed(projected_index)

                # A fresh leaf tensor holding the projected embeddings, so a
                # gradient with respect to the discrete prompt can be taken.
                tmp_embeds = projected_embeds.detach().clone().requires_grad_(True)

                if insert_words_and_pos is not None:
                    text = insert_words_and_pos[0]
                    pos = insert_words_and_pos[1]
                    if pos == -1:
                        pos = seq_size
                    token = self.model.to_tokens(text, prepend_bos=False)
                    token_embeds = self.model.embed(token)
                    wrapped_embeds = th.cat([tmp_embeds[0, :pos], token_embeds[0], tmp_embeds[0, pos:]], dim=0).unsqueeze(0)
                    if total_iterations == 0:
                        projected_index = nearest_token_ids(wrapped_embeds)
                        print(f"Inserting {text} at pos {pos}: {self.model.to_str_tokens(projected_index, prepend_bos=False)}")
                else:
                    wrapped_embeds = tmp_embeds

                # Then, calculate neuron_output
                neuron_output = self.embedded_forward(wrapped_embeds)[0, :, self.neuron]
                if d_ind > 0:
                    diversity_loss = cos_dim_2(tmp_embeds[0], diverse_outputs[:d_ind])
                else:
                    diversity_loss = th.zeros(1, device=self.device)

                loss = neuron_loss_scalar*-neuron_output[-1] + diversity_loss_scalar*diversity_loss.mean()

                # Save the highest activation as a plain float so no autograd
                # graph is kept alive between iterations.
                current_activation = neuron_output[-1].item()
                if current_activation > largest_activation:
                    iterations_since_last_improvement = 0
                    largest_activation = current_activation
                    projected_index = nearest_token_ids(wrapped_embeds)
                    largest_prompt = self.model.to_string(projected_index)
                    largest_prompts[d_ind] = largest_prompt
                    print(f"New largest activation: {largest_activation} | {largest_prompt}")

                # Transfer the gradient to the continuous embedding space
                prompt_embeds.grad, = th.autograd.grad(loss, [tmp_embeds])

                optim.step()
                optim.zero_grad()
                total_iterations += 1
                iterations_since_last_improvement += 1
            # Keep this run's final projected prompt for the diversity term.
            diverse_outputs[d_ind] = tmp_embeds.data[0, ...]
        return largest_prompts


# Import Dataset

In [3]:
# Import Transformer Lens, and load pythia models
from transformer_lens import HookedTransformer
import torch as th
from torch import nn
import numpy as np 
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from tqdm import tqdm
from einops import rearrange
device = "cuda" if th.cuda.is_available() else "cpu"

MODEL_NAME_LIST = [
    # "EleutherAI/pythia-70m-deduped", 
    # "EleutherAI/pythia-160m-deduped", 
    # "EleutherAI/pythia-410m-deduped", 
    # "gpt2", 
    # "gpt2-medium",
    # "solu-1l",
    # "solu-2l",
    # "solu-3l",
    # "solu-4l",
    "gelu-2l"
]
model_name = MODEL_NAME_LIST[0]
layer = 1 # Layer 1 is actually the 2nd layer because 0-indexing

model = HookedTransformer.from_pretrained(model_name, device=device)

# Only get the first twenty tokens of every datapoint
Token_amount = 20

# Load the training set from pile-10k, tokenize it, drop documents that are
# too short, and truncate the rest to exactly Token_amount tokens.
d = load_dataset("NeelNanda/pile-10k", split="train").map(
    lambda x: model.tokenizer(x['text']),
    batched=True,
).filter(
    lambda x: len(x['input_ids']) > Token_amount
).map(
    lambda x: {'input_ids': x['input_ids'][:Token_amount]}
)
neurons = model.W_in.shape[-1]
datapoints = d.num_rows
batch_size = 64

# One row per (datapoint, position) pair, one column per MLP neuron.
neuron_activations = th.zeros((datapoints*Token_amount, neurons))

# Cache only the activation that is stored below, not every hook in the model.
hook_name = f"blocks.{layer}.mlp.hook_post"
rows_per_batch = batch_size*Token_amount

with th.no_grad(), d.formatted_as("pt"):
    dl = DataLoader(d["input_ids"], batch_size=batch_size)
    for i, batch in enumerate(tqdm(dl)):
        _, cache = model.run_with_cache(batch.to(device), names_filter=hook_name)
        batch_acts = rearrange(cache[hook_name], "b s n -> (b s) n")
        # The final batch may be smaller, so size the slice from the data.
        start = i*rows_per_batch
        neuron_activations[start:start + batch_acts.shape[0], :] = batch_acts

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

Downloading model_final.pth:   0%|          | 0.00/227M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.04M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

Loaded pretrained model gelu-2l into HookedTransformer


Downloading metadata:   0%|          | 0.00/921 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/373 [00:00<?, ?B/s]

Downloading and preparing dataset None/None (download: 31.72 MiB, generated: 58.43 MiB, post-processed: Unknown size, total: 90.15 MiB) to /root/.cache/huggingface/datasets/NeelNanda___parquet/NeelNanda--pile-10k-72f566e9f7c464ab/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/33.3M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/NeelNanda___parquet/NeelNanda--pile-10k-72f566e9f7c464ab/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9963 [00:00<?, ? examples/s]

100%|██████████| 156/156 [00:04<00:00, 35.79it/s]


# Visualize Top Examples

Try looking at several neurons for about 5 minutes at most. Some will be much more interpretable, and others won't.

In [4]:
# Pick our specific neuron
neuron = 0
# Top k datapoint examples
k = 10
simplifier = NeuronTextSimplifier(model, layer, neuron)

# Highest activations of this neuron over every (datapoint, position) row.
values, indices = neuron_activations[:, neuron].topk(k)

# Turn each flat row index back into (datapoint index, token position).
max_datapoints = [np.unravel_index(i, (datapoints, Token_amount)) for i in indices]

text_list, full_text = [], []
for md, s_ind in max_datapoints:
    input_ids = d[int(md)]["input_ids"]
    # The whole (truncated) datapoint ...
    full_text.append(model.tokenizer.decode(th.tensor(input_ids)))
    # ... and only the prefix ending at the max-activating token.
    text_list.append(model.tokenizer.decode(input_ids[:int(s_ind) + 1]))

Let's visualize the whole text's activation on that neuron.
Blue is positive (also has a hovertip if you put your mouse over) and red is negative, which you can interpret as "no activation" as opposed to "opposite". 

NOTE: "Layer" & "Neuron" are fake UI elements. My code is a hack.

In [5]:
simplifier.text_to_visualize(full_text)

What if we add Gaussian noise to each token and see the effect on the last token? For a 20-token text, we add noise to the first token, run the model, and save the last token's activation; then we repeat with noise on the second token only (not the first), and so on. Below, red means that adding noise to that token caused a decrease in activation, which is some evidence that the token is important.

In [6]:
simplifier.add_noise_to_text(text_list, noise_level=1.0)

Remove each token that has the least effect on the last token's neuron activation. Should see something similar to the noise i.e. the tokens that have the most effect when noised should be the last tokens to be removed below. 

NOTE: change "Samples per page" to a larger number to see more. 

In [7]:
simplifier.visualize_text_color_iteratively(text_list)

Now let's add our own text to test a few hypotheses. Add/remove words. Replace words with similar words or opposites.

Note: Some neurons perform multiple different functions, so your hypothesis might be {"Harry Potter character names" OR "Repeated words" OR "these three punctuation marks after closing quotation marks"}. This can be teased apart later when we see cross-neuron comparison (maybe this neuron and another do the Harry Potter characters?). Another source of info is the logit attribution part (see below)

In [8]:
text_list = [
    "1 2 3 4 5 6",
    " 1 2 3 4 5 6",
    "bacon & eggs",
    " bacon & eggs",
]
simplifier.text_to_visualize(text_list)

# Logit Attribution

We want the row or column of MLP_out that is size d_model. Look through model.cfg & the shape of the model to find it. Multiply that row by the unembedding matrix

In [9]:
# Model config is useful to look at for model shape info
model.cfg

HookedTransformerConfig:
{'act_fn': 'gelu',
 'attention_dir': 'causal',
 'attn_only': False,
 'attn_types': None,
 'checkpoint_index': None,
 'checkpoint_label_type': None,
 'checkpoint_value': None,
 'd_head': 64,
 'd_mlp': 2048,
 'd_model': 512,
 'd_vocab': 48262,
 'd_vocab_out': 48262,
 'device': 'cuda',
 'eps': 1e-05,
 'final_rms': False,
 'from_checkpoint': False,
 'gated_mlp': False,
 'init_mode': 'gpt2',
 'init_weights': False,
 'initializer_range': 0.035355339059327376,
 'model_name': 'GELU_2L512W_C4_Code',
 'n_ctx': 1024,
 'n_heads': 8,
 'n_layers': 2,
 'n_params': 6291456,
 'normalization_type': 'LNPre',
 'original_architecture': 'neel',
 'parallel_attn_mlp': False,
 'positional_embedding_type': 'standard',
 'rotary_dim': None,
 'scale_attn_by_inverse_layer_idx': False,
 'seed': None,
 'tokenizer_name': 'NeelNanda/gpt-neox-tokenizer-digits',
 'use_attn_result': False,
 'use_attn_scale': True,
 'use_hook_tokens': False,
 'use_local_attn': False,
 'use_split_qkv_input': False,


In [10]:
# Model has several easy ways to access the weights of the model, such as model.W_in, model.QK. We care about model.W_out, the second part of the MLP
# The shape is [layer, d_mlp, d_model]
model.W_out.shape

torch.Size([2, 2048, 512])

In [11]:
# Unembedding matrix
model.W_U.shape

torch.Size([512, 48262])

In [12]:
# Use the same layer whose activations were collected and visualised above,
# rather than always the last layer, so the neuron indices below refer to the
# same neurons as NeuronTextSimplifier(model, layer, neuron).
x = th.einsum('ij,jk->ik', model.W_out[layer], model.W_U)
x.shape
# x is the direct contribution of each neuron for all logits

torch.Size([2048, 48262])

In [13]:
# TODO: check whether W_out should also be multiplied by the final layer norm (or anything else) before the unembedding.
# To a first approximation, we multiply each row of W_out by all of W_U.

### look at top neurons for a token

In [14]:
# For each of the first ten vocabulary ids, list the ten neurons whose output
# weights contribute most directly to that token's logit.
for token in range(10):
    print(f"token {model.to_string(token)}\n")
    # Column `token` of x holds every neuron's direct contribution to it.
    top_k_values, top_k_neurons = x[:, token].topk(10)
    for rank in range(len(top_k_neurons)):
        neuron, value = top_k_neurons[rank], top_k_values[rank]
        print(f'\tneuron {neuron}: {value:.3f}')
    print('\n')

token <|EOS|>

	neuron 1429: 2.349
	neuron 1890: 1.735
	neuron 1430: 1.558
	neuron 1888: 1.494
	neuron 160: 1.419
	neuron 25: 1.261
	neuron 916: 1.232
	neuron 1379: 1.001
	neuron 208: 0.993
	neuron 436: 0.965


token <|BOS|>

	neuron 154: 2.002
	neuron 1938: 0.514
	neuron 1274: 0.489
	neuron 889: 0.441
	neuron 683: 0.426
	neuron 254: 0.418
	neuron 835: 0.410
	neuron 1866: 0.403
	neuron 1780: 0.370
	neuron 1810: 0.354


token <|PAD|>

	neuron 154: 1.991
	neuron 1938: 0.508
	neuron 1274: 0.490
	neuron 889: 0.444
	neuron 683: 0.427
	neuron 835: 0.418
	neuron 254: 0.416
	neuron 1866: 0.388
	neuron 1780: 0.369
	neuron 1810: 0.358


token !

	neuron 160: 2.333
	neuron 1379: 2.311
	neuron 1146: 1.486
	neuron 755: 1.123
	neuron 1434: 1.118
	neuron 1183: 1.075
	neuron 156: 1.062
	neuron 1811: 1.056
	neuron 1353: 1.034
	neuron 222: 1.014


token "

	neuron 1071: 2.331
	neuron 1379: 2.055
	neuron 1255: 1.398
	neuron 160: 1.295
	neuron 381: 1.167
	neuron 472: 1.145
	neuron 524: 1.090
	neuron 61: 1

### look at top tokens for a neuron

In [15]:
# For each of the first ten neurons, list the ten tokens whose logits receive
# the largest direct contribution from that neuron.
for neuron in range(10):
    print(f"neuron {neuron}\n")
    # Row `neuron` of x holds its direct contribution to every token.
    top_k_values, top_k_tokens = x[neuron].topk(10)
    for rank in range(len(top_k_tokens)):
        token, value = top_k_tokens[rank], top_k_values[rank]
        print(f'\ttoken {model.to_string(token)}: {value:.3f}')
    print('\n')

neuron 0

	token ishing: 1.178
	token bes: 1.100
	token posed: 1.023
	token ères: 0.958
	token coma: 0.951
	token rays: 0.936
	token ished: 0.935
	token ons: 0.925
	token aned: 0.918
	token icity: 0.915


neuron 1

	token edu: 0.977
	token ther: 0.965
	token �: 0.937
	token iest: 0.883
	token  there: 0.862
	token �: 0.851
	token ход: 0.826
	token usta: 0.819
	token ч: 0.801
	token henyl: 0.797


neuron 2

	token uses: 0.996
	token iana: 0.988
	token ians: 0.941
	token us: 0.933
	token 니다: 0.911
	token aga: 0.900
	token inafter: 0.896
	token izon: 0.895
	token mania: 0.895
	token ician: 0.886


neuron 3

	token asus: 1.034
	token rolog: 1.033
	token ró: 1.025
	token Tell: 0.991
	token ermine: 0.984
	token ime: 0.984
	token rile: 0.976
	token ras: 0.951
	token iana: 0.931
	token rano: 0.923


neuron 4

	token 9: 0.918
	token 限: 0.823
	token ordin: 0.808
	token rah: 0.796
	token KES: 0.778
	token  mine: 0.777
	token bies: 0.760
	token  probability: 0.756
	token  it: 0.740
	token OSE: 0.72

### take a look at the top neurons for a token and then look at their top tokens

In [16]:
token = 25886
print(f"token {model.to_string(token)}\n")
top_k_values, top_k_neurons = th.topk(x.T[token], k=10) # x.T[token] is the direct contribution of each neuron for the token
for neuron, value in zip(top_k_neurons, top_k_values):
    print(f'\tneuron {neuron}: {value:.3f}\n')
    # Separate names for the inner ranking, so the loop no longer rebinds
    # `token`, `value` and `top_k_values` while the outer loop is using them.
    neuron_top_values, neuron_top_tokens = th.topk(x[neuron], k=10) # x[neuron] is the direct contribution of the neuron to each token
    for top_token, top_value in zip(neuron_top_tokens, neuron_top_values):
        print(f'\t\ttoken {model.to_string(top_token)}: {top_value:.3f}')
    print('\n')

token Monday

	neuron 472: 1.335

		token disambiguation: 1.834
		token resistance: 1.765
		token ©: 1.711
		token LICENSE: 1.707
		token Unable: 1.681
		token Female: 1.671
		token BMI: 1.670
		token Washington: 1.666
		token Looks: 1.665
		token Prince: 1.658


	neuron 154: 1.242

		token MOESM: 3.614
		token  že: 3.296
		token gebras: 3.265
		token  että: 3.179
		token ycin: 3.101
		token ~).: 3.079
		token  surjective: 3.058
		token  Eqs: 3.049
		token chaft: 3.020
		token  ktor: 3.017


	neuron 1381: 1.018

		token  than: 1.583
		token  past: 1.509
		token  September: 1.361
		token  January: 1.358
		token  August: 1.354
		token  December: 1.353
		token  March: 1.353
		token  April: 1.329
		token  June: 1.322
		token  February: 1.321


	neuron 215: 0.953

		token Blo: 1.520
		token University: 1.507
		token Whe: 1.503
		token Well: 1.479
		token Related: 1.477
		token Risk: 1.465
		token Ell: 1.449
		token Too: 1.442
		token Yes: 1.441
		token Va: 1.440


	neuron 429: 0.927

		toke

Look at a specific neuron, its top tokens and top activating examples, and see if it makes sense.

Like maybe the token is " 5" and the example is " 1 2 3 4". We can guess it's encoding the information about the sequence. So you can remove parts of the context to "bob apple 4" and see if the neuron is still activating.

Check the "Visualize Top Examples" section to get top activating examples for a specific neuron.

In [34]:
neuron = 298

looks at top tokens for this neuron

In [35]:
# Ten tokens whose logits receive the largest direct contribution from the
# selected neuron (row `neuron` of x).
top_k_values, top_k_tokens = x[neuron].topk(10)
for rank in range(len(top_k_tokens)):
    token, value = top_k_tokens[rank], top_k_values[rank]
    print(f'\ttoken {model.to_string(token)}: {value:.3f}')

	token  Sept: 1.539
	token  Feb: 1.506
	token  April: 1.480
	token  July: 1.464
	token  June: 1.427
	token  May: 1.424
	token  December: 1.406
	token  Oct: 1.395
	token  October: 1.392
	token  November: 1.389


look at top activating examples

In [36]:
# Number of top-activating examples to collect for the selected neuron.
k = 10
simplifier = NeuronTextSimplifier(model, layer, neuron)

# Highest activations of this neuron over every (datapoint, position) row.
values, indices = neuron_activations[:, neuron].topk(k)

# Turn each flat row index back into (datapoint index, token position).
max_datapoints = [np.unravel_index(i, (datapoints, Token_amount)) for i in indices]

text_list, full_text = [], []
for md, s_ind in max_datapoints:
    input_ids = d[int(md)]["input_ids"]
    # The whole (truncated) datapoint ...
    full_text.append(model.tokenizer.decode(th.tensor(input_ids)))
    # ... and only the prefix ending at the max-activating token.
    text_list.append(model.tokenizer.decode(input_ids[:int(s_ind) + 1]))

In [37]:
simplifier.text_to_visualize(full_text)