# Automated Verification of Concept Neurons

Aim: automate the identification of potential neurons which handle specific semantic features, using a max activating dataset and word embeddings. Use Neuroscope and fasttext.

### Setup


In [3]:
%pip install git+https://github.com/neelnanda-io/TransformerLens.git

Collecting git+https://github.com/neelnanda-io/TransformerLens.git
  Cloning https://github.com/neelnanda-io/TransformerLens.git to /private/var/folders/p_/zl8q73md4vgb11fc063_516h0000gn/T/pip-req-build-lxp5mll_
  Running command git clone --filter=blob:none --quiet https://github.com/neelnanda-io/TransformerLens.git /private/var/folders/p_/zl8q73md4vgb11fc063_516h0000gn/T/pip-req-build-lxp5mll_
  Resolved https://github.com/neelnanda-io/TransformerLens.git to commit e6b1087d69e8b8226c79211cf0f2b610fb0b2c33
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting fancy-einsum<0.0.4,>=0.0.3
  Downloading fancy_einsum-0.0.3-py3-none-any.whl (6.2 kB)
Collecting datasets<3.0.0,>=2.7.1
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:

In [15]:
%pip install openai
%pip install requests
%pip install numpy
%pip install pandas
%pip install bs4


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Collecting numpy
  Downloading numpy-1.24.2-cp310-cp310-macosx_10_9_x86_64.whl (19.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m674.9 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
Successfully installed numpy-1.24.2

[1m[[0m[34;49mnotice[0m[1

In [1]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [4]:
from transformer_lens import HookedTransformer
from transformer_lens.utils import to_numpy

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import json
import urllib.request
import re
import openai

### Parameters

Change the layer and neuron number.
Model selected: SoLu 8L Pile

8 layers: reasonably sized model which we can examine middle layers for abstract concept neurons (see SoLu paper).

In [None]:
model_name = "solu-8l-pile"
solu_model = HookedTransformer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/767 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/828M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/156 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/457k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Loaded pretrained model solu-8l-pile into HookedTransformer


### Scrape top 20 max activating token phrases from Neuroscope

Get max activating tokens from dataset for each neuron in a layer. Take the surrounding five tokens for context as well.

In [None]:
# Return the URL for Neuroscope's model neuron index
def get_neuron_url(model_name, layer, number):
  # model_index = 'https://neuroscope.io/solu-8l-pile/'
  model_index = 'https://neuroscope.io/'
  return model_index + model_name + "/" + str(layer) + "/" + str(number) + ".html"

# Scrape the Neuroscope website for the chosen neuron
def scrape_neuroscope(neuron_url):
  neuron_request = urllib.request.urlopen(neuron_url).read()
  return BeautifulSoup(neuron_request)

In [None]:
# Extract the list of tokens, list of activations and the max activation.
def clean_scraped_data(dataset):
  data = dataset.text
  tokens_match = re.search(r"\{\"tokens\": \[(.*?)\], \"values", data)
  acts_match = re.search(r"\"values\": \[(.*?)\]", data)

  if tokens_match and acts_match:
    # Convert into clean list of strings
    token_list = tokens_match.group(1).replace("\"",'').split(', ')[:-1]
    act_list = acts_match.group(1).replace("\"",'').split(',')[:-1]
    # Identify maximum activation (normalised)
    act_floats = [float(x) for x in act_list]
    maximum = max(act_floats)
    return token_list, act_list, maximum

In [None]:
# Return a list of max activating tokens and their activations for a neuron.
def scrape_neuron_max_activations(model_name, layer, number):
  """
  layer: layer of model that the neuron is in
  number: index of the neuron in the layer
  """
  neuron_url = get_neuron_url(model_name, layer, number)
  scraped = scrape_neuroscope(neuron_url)

  max_tokens = []
  max_phrases = []
  max_acts = []

  count = 0
  for dataset in scraped.find_all('script', type='module'):
    count += 1
    # Ignore full text scraped. We only want the max activating sentences.
    if (count % 2 == 0): 
      continue
    # Extract tokens and activations.
    token_list, act_list, maximum = clean_scraped_data(dataset)
    if token_list is None or act_list is None:
      print("Could not retrieve tokens and activations.")
      break
    # Return the max act token, its activation and its surrounding phrase
    index = 0
    for tok, act in zip(token_list, act_list):
      if (float(act) == maximum):
        before = max(index - 3, 0)
        after = min(index + 3, len(token_list) - 1)
        max_tokens.append(tok)
        max_acts.append(maximum)
        max_phrases.append(' '.join(token_list[before : after]))
      index += 1

  return (max_tokens, max_acts, max_phrases)

### Compare distances between MATs using word embeddings

Compare the distances between maximum activating tokens using word2vec. If the tokens seem to be neighbours (distance is within a certain range), flag as potential context neuron.

Alternatively: would it be better to compare the distances between phrases using doc2vec?

In [None]:
import gensim.downloader as api
from gensim.models.fasttext import FastText

try:
  model = FastText.load("fasttext.model")
except:
  corpus = api.load('text8')
  model = FastText(corpus)
  model.save("fasttext.model")



In [None]:
# Calculate similarity score among words which activate neuron the most
def calculate_similarity(activation_tokens):
  similarities = []
  for i in range(len(activation_tokens)):
    word = activation_tokens[i]
    try:
      for j in range(i, len(activation_tokens)):
        word_b = activation_tokens[j]
        similarities.append(model.wv.similarity(word, word_b))
    except:
      continue
  if len(similarities) == 0:
    return 0
  return sum(similarities) / len(similarities)

### Setup code for getting activation of neuron given custom text.

In [None]:
# Hacky way to get out state from a single hook (Neuroscope documentation).
def get_neuron_acts(text, layer, neuron_index):
    cache = {}

    def caching_hook(act, hook):
        cache["activation"] = act[0, :, neuron_index]

    solu_model.run_with_hooks(
        text, fwd_hooks=[(f"blocks.{layer}.mlp.hook_post", caching_hook)]
    )
    return to_numpy(cache["activation"])

In [None]:
def get_max_activations(text, layer, neuron_index):
    """
    text: The text to visualize
    layer: The layer index
    neuron_index: The neuron index

    Returns the token with the highest activation in the text, and its activation
    """
    if layer is None:
        return "Please select a Layer"
    if neuron_index is None:
        return "Please select a Neuron"
        
    acts = get_neuron_acts(text, layer, neuron_index)
    act_max = acts.max()
    
    # Convert the text to a list of tokens
    str_tokens = solu_model.to_str_tokens(text)

    # Print the max act token and its surrounding phrase
    for tok, act in zip(str_tokens, acts):
      if (act == act_max):
        return (tok, act)
        

### Falsify hypotheses

Falsify hypothesis: substitute words in dataset example with similar neighbours (closest words using word2vec). 
Measure the amount of activation and compare with original. Do some words increase the activation? Do some words decrease activation?
Create a diverse range of examples that are more or less specific and run these through the model.

In [None]:
def substitute_similar(activation_tokens, cur_acts, index, neuron_layer):
  higher_toks = []
  higher_acts = []
  for i in range(len(activation_tokens)):
    token = activation_tokens[i]
    phrase = activation_tokens[i]
    cur_act = cur_acts[i]
    # Get the top 5 most similar tokens and check activations
    most_similar = model.wv.most_similar(token, topn=5)
    similar_tokens = [ls[0] for ls in most_similar]
    for sub in similar_tokens:
      new_phrase = phrase.replace(token, sub)
      new_max_activations = get_max_activations(
          new_phrase, 
          neuron_layer, 
          index)
      if new_max_activations == None:
        break
      tok, act = new_max_activations
      # Save tokens which do not decrease activation
      if tok == sub and act >= cur_act:
        higher_toks.append(sub)
        higher_acts.append(act)

  new_activation_tokens = activation_tokens + higher_toks
  new_activation_scores = cur_acts + higher_acts
  # We should have number of tokens checked = (5 + 1) * number of initial tokens
  tokens_checked = 6 * len(activation_tokens)

  avg_score = sum(new_activation_scores) / tokens_checked
  
  return (new_activation_tokens, avg_score)

### Asking for a summary

Connect to GPT-3 API to try and summarise what this neuron might be about.

In [None]:
API_KEY="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" # Open license from alignment jam
openai.api_key = API_KEY

# Returns a GPT-3 generated summary of potential content.
def ask_summary(keywords_list):
  prompt_summary = "What do all these words have in common? Words: " + ' '.join(keywords_list) + ". Common:"
  # print(prompt_summary)
  response = openai.Completion.create(engine="text-davinci-003", prompt=prompt_summary, max_tokens=50)
  summary = response["choices"][0]["text"]
  return summary.strip()

# Iterating through all neurons in a middle layer.

We select the middle layer of a SoLu 8L model (hypothesised to detect abstract concepts) and iterate through all neurons, attempting to identify feature neurons.

In [None]:
def summarise_neuron(index, neuron_layer, model_name):
  tokens, acts, _ = scrape_neuron_max_activations(model_name, neuron_layer, index)
  cur_similarity = calculate_similarity(tokens)

  if cur_similarity < 0.6:
    return
  
  new_avg_act = 0
  try:
    # Test and get the new average activation score: should be higher due to testing more tokens, averaged over fixed number of checked neurons
    new_tokens, new_avg_act = substitute_similar(
        tokens,
        acts, 
        index,
        neuron_layer)
  finally:
    tokens_checked = 6 * len(tokens)
    cur_avg_act = sum(acts) / tokens_checked

    if new_avg_act >= cur_avg_act:
      description = ask_summary(new_tokens)
      return new_avg_act, description, new_tokens
    else:
      description = ask_summary(tokens)
      return cur_avg_act, description, tokens
    
# Iteration
def identify_neurons(neuron_layer, max_index, model_name):
  act_desc = []

  for index in range(max_index):
    summary = summarise_neuron(index, neuron_layer, model_name)
    if summary is None:
      continue
    avg_act, description, new_tokens = summary
    act_desc.append((avg_act, description, new_tokens, index))

  return act_desc

In [None]:
def analyse_model_layer(layer, model_name, num_neurons):
  act_desc = identify_neurons(layer, num_neurons, model_name)

  if len(act_desc) == 0:
    return

  sorted_by_act = sorted(act_desc, key=lambda tup: tup[1], reverse=True)
  max_activation = max([act for act, _, _, _ in sorted_by_act])
  print("Max activation: ", max_activation)

  for act, desc, new_tokens, index in sorted_by_act:
    norm_act = act / max_activation
    if norm_act >= 0.3:
      print("Neuron " + str(index) + ": activation " + str(norm_act) + ", " + desc)
      print("Tokens: ", new_tokens)

In [None]:
analyse_model_layer(6, "solu-8l-pile", 200)

Max activation:  1.3394641231725055
Neuron 159: activation 0.6870076078376094, They are all the word "poly" or the letter "P".
Tokens:  [' poly', ' P', ' P', ' poly', ' poly', 'poly', ' poly', ' poly', ' poly', ' poly', ' P', ' poly', ' poly', 'poly', ' poly', 'poly', 'poly', ' poly', 'poly', ' poly', 'poly', 'poly', 'poly', 'poly', 'poly', 'poly', 'poly', 'poly', 'poly', 'poly', 'poly', 'poly']
Neuron 52: activation 0.3743668889607776, They are all the letter "k".
Tokens:  [' k', ' k', ' k', ' k', ' k', ' k', ' k', ' k', ' k', ' k', ' k', ' k', ' k', ' k', ' k', ' k', ' k', ' k', ' k', ' k']
Neuron 119: activation 0.3864337295131903, They all have the same number of symbols/characters.
Tokens:  ['=\\#', '=\\#', '=\\#', '#', '=\\#', '#', '#', '=\\#', '#', '#', '=\\#', '=\\#', '=\\#', '=\\#', '=\\#', '#', '=\\#', '=\\#', '=\\#', '=\\#']
Neuron 4: activation 1.0, The words all contain the letter U and the letter B.
Tokens:  ['ub', 'ub', 'ub', 'ub', 'ub', 'ub', 'ub', 'ub', 'ub', 'ub', 'ub

In [None]:
analyse_model_layer(10, "gpt2-small", 200)

Max activation:  2.0868128333333327
Neuron 31: activation 0.3129695028551116, is
Tokens:  [' is', ' is', ' is', ' are', ' is', ' is', ' is', ' are', ' is', ' are', ' be', ' is', ' is', ' is', 'shirts', ' is', ' is', ' is', ' is', ' is']
Neuron 174: activation 0.33425935339832824, They are both four-letter words.
Tokens:  [' fourth', '-', '-', '-', '-', '-', '-', '-', '-', ' will', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-']
Neuron 172: activation 0.7352301280493371, They are all the word "each" repeated.
Tokens:  [' each', ' each', ' each', ' each', ' each', ' each', ' each', ' each', ' each', ' each', ' each', ' each', ' each', ' each', ' each', ' each', ' each', ' each', ' each', ' each']
Neuron 20: activation 0.380857431152786, They are all separated by commas.
Tokens:  [',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ' question', ',', ',', ',', ',', ',', ',', ',', ',']
Neuron 13: activation 0.7502715664405938, They are all punctuation marks.
Tokens:  ['\\u2022', ',', '