## Make labels using an LLM for each feature in a model
For now, use gpt-4o-mini with a template derived from https://arxiv.org/pdf/2408.00657v1

We want to provide maximum-activating examples, as well as "control" samples that don't activate the feature. Instead of using random samples, let's try to pull control samples from similar features.
TODO: double check my similarity search because the similar features don't seem to give similar samples at all.

In [6]:
import pandas as pd
from latentsae.sae import Sae
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [3]:
# Identifier of the SAE variant to load; also interpolated into the
# data file name read below (data/top10_{sae}.parquet).
sae = "64_32"
# Load the SAE weights for that variant from the named hub repository.
model = Sae.load_from_hub("enjalot/sae-nomic-text-v1.5-FineWeb-edu-100BT", sae)
# Run name used in the labels parquet path; the latent count is expressed in
# thousands and rounded to the nearest integer (e.g. 24576 latents -> "25k").
name = f"NOMIC_FWEDU_{round(model.num_latents/1000)}k"

Fetching 2 files: 100%|██████████| 2/2 [00:00<00:00, 20020.54it/s]


In [4]:
features = range(model.num_latents)

In [5]:
top10_df = pd.read_parquet(f"data/top10_{sae}.parquet")

In [46]:
# Decoder weights as a single 2-D NumPy array, indexed by feature on axis 0.
# NOTE(review): assumes W_dec is laid out as (num_latents, embedding_dim) — confirm.
weight_matrix = model.W_dec.detach().cpu().numpy()
# Kept as a list of per-feature row vectors (views into weight_matrix) so any
# code that indexes `weights[i]` keeps working.
weights = list(weight_matrix)


def get_similar_features(feature, k=5):
    """Return the indices of the `k` features most similar to `feature`.

    Similarity is the cosine similarity between decoder weight rows.

    Args:
        feature: Integer index of the query feature (row of the decoder).
        k: Number of neighbours to return. Defaults to 5.

    Returns:
        A list of up to `k` feature indices, most similar first. The query
        feature itself is never included.
    """
    query = weights[feature].reshape(1, -1)
    # Pass the pre-built matrix so the row list is not re-stacked on every call.
    similarities = cosine_similarity(query, weight_matrix)[0]
    ranked = np.argsort(similarities)[::-1]
    # Remove the query by index instead of assuming it ranks first: an
    # all-zero or duplicated decoder row would break that assumption and
    # silently drop a real neighbour while returning the query itself.
    ranked = ranked[ranked != feature]
    return ranked[:k].tolist()

In [133]:
# get_similar_features(5)

In [51]:
def get_samples(feature, n_control=10):
    """Collect top-activating and control text samples for `feature`.

    Control samples are drawn at random from the top-activating rows of the
    features most similar to `feature`, so they are near misses rather than
    unrelated text.

    Args:
        feature: Integer feature index to look up in `top10_df`.
        n_control: Maximum number of control samples to draw. Defaults to 10.

    Returns:
        A dict with keys:
          "top": list of chunk texts stored for `feature` in `top10_df`.
          "control": list of up to `n_control` chunk texts from similar features.
          "similar_features": list of the similar feature indices used.
    """
    # Top-activating chunks recorded for this feature.
    top = top10_df[top10_df.feature == feature]["chunk_text"].tolist()

    # Candidate control rows: top-activating chunks of the nearest features.
    similar_features = get_similar_features(feature)
    similar = top10_df[top10_df.feature.isin(similar_features)]
    # NOTE(review): this compares `top_indices` against the feature id; confirm
    # that column holds feature ids and not dataset row indices.
    similar = similar[~similar.top_indices.isin([feature])]
    # A chunk that is also a top example for `feature` is not a valid control.
    similar = similar[~similar.chunk_text.isin(top)]

    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (without replacement), so cap the request at the pool size.
    n_draw = min(n_control, len(similar))
    control = similar.sample(n_draw)["chunk_text"].tolist()
    return {"top": top, "control": control, "similar_features": similar_features}

In [52]:
f0 = get_samples(5)

In [132]:
# f0["top"]

In [131]:
# f0["control"]

In [130]:
# f0["similar_features"]

In [56]:
# System prompt sent with every labelling request (the notebook header says
# the template is derived from the linked arXiv paper).
# NOTE(review): the `f` prefix is unnecessary — the string has no placeholders.
# NOTE(review): "all of the text comes from subject" looks like an unfilled
#   template slot — confirm the intended wording.
# NOTE(review): INPUT DESCRIPTION item 1 promises an activation number per
#   example, but get_sample_prompt() only sends the sample text.
# NOTE(review): step 4 says "Based on step 4"; presumably "step 3" was meant.
# NOTE(review): the final two lines ("Do not explain yourself ...") appear to
#   conflict with the reasoning requested in steps 1-3 — confirm which output
#   format is wanted. The text is left unchanged here so that new labels stay
#   comparable with labels already saved.
system_prompt = f"""You are a meticulous researcher conducting an important investigation into a certain
neuron in a language model. Your task is to figure out what sort of behaviour this neuron is responsible for - namely, on what general concepts, features,
themes, methodologies or topics does this neuron fire? Here's how you'll complete the task:

INPUT DESCRIPTION: You will be given two inputs: 1) Max Activating Examples and 2)
Zero Activating Examples.
1. You will be given several examples of text that activate the neuron, along with a
number being how much it was activated. This means there is some feature, theme,
methodology, topic or concept in this text that 'excites' this neuron.
2. You will also be given several examples of text that don't activate the neuron. This
means the feature, topic or concept is not present in these texts.

OUTPUT DESCRIPTION: Given the inputs provided, complete the following tasks.
1. Based on the MAX ACTIVATING EXAMPLES provided, write down potential topics,
concepts, themes, methodologies and features that they share in common. These
will need to be specific - remember, all of the text comes from subject, so these
need to be highly specific subject concepts. You may need to look at different
levels of granularity (i.e. subsets of a more general topic). List as many as you can
think of. Give higher weight to concepts more present/prominent in examples with
higher activations.
2. Based on the zero activating examples, rule out any of the topics/concepts/features
listed above that are in the zero-activating examples. Systematically go through your
list above.
3. Based on the above two steps, perform a thorough analysis of which feature, concept
or topic, at what level of granularity, is likely to activate this neuron. Use Occam's
razor, as long as it fits the provided evidence. Be highly rational and analytical here.
4. Based on step 4, summarise this concept in 1-8 words, in the form FINAL:
<explanation>. Do NOT return anything after these 1-8 words.

You should choose a label that best summarizes the theme of the list so that someone browsing the labels will have a good idea of what is in the list. 
Do not use punctuation, Do not explain yourself, respond with only a few words that summarize the list."""

In [60]:
def get_sample_prompt(feature):
    """Build the user message for `feature`.

    Fetches samples via `get_samples` and wraps each top-activating and
    control text in <Sample> tags inside the prompt template.

    Args:
        feature: Integer feature index.

    Returns:
        The formatted prompt string.
    """
    samples = get_samples(feature)
    # Render each group of texts as a run of <Sample> elements with no separator.
    top_block = "".join([f"<Sample>{text}</Sample>" for text in samples["top"]])
    control_block = "".join([f"<Sample>{text}</Sample>" for text in samples["control"]])
    return f"""Here is a list of maximum activating samples for the neuron:
  <Samples>
  {top_block}
  </Samples>
  Here is a list of zero activating samples:
  <ControlSamples>
  {control_block}
  </ControlSamples>
  Please summarise this concept in 1-8 words, in the form FINAL:
<explanation>. Do NOT return anything after these 1-8 words
  """

In [61]:
feature = 0
messages=[
    {"role":"system", "content": system_prompt}, 
    {"role":"user", "content": get_sample_prompt(feature)}
]

In [129]:
# messages

In [82]:
from openai import AsyncOpenAI
import os
import dotenv
dotenv.load_dotenv()


True

In [98]:
# Async OpenAI client; the API key is read from the OPENAI_API_KEY environment variable.
client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])


async def chat(messages):
    """Send `messages` to gpt-4o-mini and return the first choice's text.

    Args:
        messages: List of chat message dicts with "role" and "content" keys.

    Returns:
        The `content` of the first returned choice's message.
    """
    completion = await client.chat.completions.create(
        messages=messages,
        model="gpt-4o-mini",
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [101]:
label = await chat(messages)

In [102]:
label

'FINAL: properties and applications of gelatin'

In [76]:
features

range(0, 24576)

In [79]:
import asyncio
import time
from tqdm import tqdm
import nest_asyncio

In [120]:
# Labels saved by previous runs; used to resume where the last run stopped.
labels_df = pd.read_parquet(f"data/labels-{name}.parquet")
# Build the set of already-labelled feature ids once so each membership test
# is O(1) instead of a linear scan of the NumPy column per feature.
labeled_features = set(labels_df["feature"].tolist())
features_to_label = [feature for feature in features if feature not in labeled_features]

In [128]:
# print("features to label", len(features_to_label), "done with", len(labels_df))
# print(features_to_label)

In [126]:
# Patch asyncio so asyncio.run() can be called from this notebook cell.
# NOTE(review): presumably needed because the notebook kernel already runs an event loop — confirm.
nest_asyncio.apply()
# Threshold used by main() for its rate-limit pause check.
# NOTE(review): presumably the API's requests-per-minute limit — confirm against the account tier.
limit = 1000
# Number of features sent concurrently in one asyncio.gather() call.
batch_size = 10

async def chat_with_feature(feature):
    """Request a label for a single feature.

    Args:
        feature: Integer feature index.

    Returns:
        A dict {"feature": feature, "label": <text returned by chat()>}.
    """
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": get_sample_prompt(feature)}
    label = await chat([system_message, user_message])
    return {"feature": feature, "label": label}

async def process_batch(features_batch):
    """Label every feature in `features_batch` concurrently.

    Args:
        features_batch: Sequence of feature indices.

    Returns:
        List of result dicts from `chat_with_feature`, in input order.
    """
    pending = (chat_with_feature(feature) for feature in features_batch)
    return await asyncio.gather(*pending)

async def main(features, labels_df):
    """Label `features` in batches, checkpointing results to parquet.

    Features are processed `batch_size` at a time. Unsaved results are
    appended to `labels_df` and written to data/labels-{name}.parquet once at
    least 100 have accumulated, and again at the end for any remainder.
    After every `limit` processed features the loop pauses for 60 seconds.

    Args:
        features: Sequence of feature indices still to be labelled.
        labels_df: DataFrame of labels saved so far (columns "feature", "label").

    Returns:
        The updated labels DataFrame, so the caller can rebind its own copy
        instead of holding a stale frame.
    """
    save_every = 100
    out_path = f"data/labels-{name}.parquet"
    unsaved = []
    # Counted separately from `unsaved`: that list is emptied on every save,
    # so its length cannot be used to track progress toward the rate limit.
    since_pause = 0

    for start in tqdm(range(0, len(features), batch_size), desc="Processing features", leave=True):
        batch = features[start:start + batch_size]
        batch_results = await process_batch(batch)
        unsaved.extend(batch_results)
        since_pause += len(batch_results)

        # Checkpoint. `>=` rather than `% 100 == 0` so this still triggers
        # when batch_size does not divide 100.
        if len(unsaved) >= save_every:
            labels_df = pd.concat([labels_df, pd.DataFrame(unsaved)])
            labels_df.to_parquet(out_path)
            unsaved = []

        # Respect the rate limit: pause once per `limit` processed features,
        # but not after the final batch, where there is nothing left to send.
        is_last_batch = start + batch_size >= len(features)
        if since_pause >= limit and not is_last_batch:
            await asyncio.sleep(60)
            since_pause = 0

    # Save any remaining results.
    if unsaved:
        labels_df = pd.concat([labels_df, pd.DataFrame(unsaved)])
        labels_df.to_parquet(out_path)
    return labels_df

asyncio.run(main(features_to_label, labels_df))


Processing features:   0%|          | 0/2438 [00:00<?, ?it/s]

Processing features:  61%|██████    | 1476/2438 [1:01:36<28:37,  1.79s/it]Task exception was never retrieved
future: <Task finished name='Task-2826' coro=<main() done, defined at /var/folders/sx/rrvr6l_d5x1_g46jxlx5ypfc0000gn/T/ipykernel_38579/3263936852.py:17> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "/Users/enjalot/code/latent-taxonomy/venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/sx/rrvr6l_d5x1_g46jxlx5ypfc0000gn/T/ipykernel_38579/3263936852.py", line 37, in <module>
    asyncio.run(main(features))
  File "/Users/enjalot/code/latent-taxonomy/venv/lib/python3.12/site-packages/nest_asyncio.py", line 35, in run
    loop.run_until_complete(task)
  File "/Users/enjalot/code/latent-taxonomy/venv/lib/python3.12/site-packages/nest_asyncio.py", line 92, in run_until_complete
    self._run_once()
  File "/Users/enjalot/code/latent-taxonomy/ve