In [5]:
%load_ext autoreload
%autoreload 2

import os
import joblib
import numpy as np
import pandas as pd
from neuro import config
import neuro.agent
import neuro.decoding
from sklearn.model_selection import train_test_split
import imodelsx.llm
from tqdm import tqdm

INTERPRETER_MODEL = "gpt-4.1"
# ANNOTATOR_MODEL = 'meta-llama/Meta-Llama-3-8B-Instruct' # not actually used
EMBEDDER = "nomic-ai/modernbert-embed-base" # Huggingface model, will run locally
TASK_SPECIFIC_INSTRUCTIONS = """All of the texts are from a narrative story.
Features should describe a specific aspect of the text. For example:
- "The input mentions a location"
- "The input mentions time"
""".strip()

# paths
os.makedirs(config.HYPOTHESAES_RESULTS_DIR, exist_ok=True)
os.environ['EMB_CACHE_DIR'] = os.path.join(config.HYPOTHESAES_RESULTS_DIR, 'emb_cache')
# CACHE_NAME = f"yelp_quickstart_{EMBEDDER.replace('/', '___')}"
CACHE_NAME = f"neuro_quickstart_{EMBEDDER.replace('/', '___')}"
checkpoint_dir = os.path.join(config.HYPOTHESAES_RESULTS_DIR, "checkpoints", CACHE_NAME)


# import these after
from neuro.baselines.hypothesaes.quickstart import train_sae, interpret_sae, generate_hypotheses, evaluate_hypotheses
from neuro.baselines.hypothesaes.embedding import get_local_embeddings
neuro.baselines.hypothesaes.embedding.CACHE_DIR = os.environ['EMB_CACHE_DIR']

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


**Load data**

The dataset we're using here is a subset of 20K Yelp reviews, with 2K reviews used for validation (during SAE training). 

The target variable is the `stars` column, which is a rating between 1 and 5. We treat this as a regression task.

There are also 2K reviews used for holdout evaluation, which we'll use at the end of the notebook.

In [None]:
# Example usage:
df_train, texts_train = neuro.decoding.get_shared_data_for_subjects(
    ['UTS01', 'UTS02', 'UTS03'],
    train_or_test='train',
    concatenate_running_texts_frac=0.5
)
texts_train = [x.strip() for x in texts_train if isinstance(x, str)]  # Ensure all entries are strings
# remove empty indexes
idxs_empty = [i for i, text in enumerate(texts_train) if not text]
texts_train = [text for i, text in enumerate(texts_train) if i not in idxs_empty]
df_train = df_train.reset_index(drop=True).drop(index=idxs_empty)
df_train, df_val, texts_train, texts_val = train_test_split(df_train, texts_train, test_size=0.2, random_state=42)
assert len(texts_train) == len(df_train)
labels_train_multitarget = df_train.values

**Compute text embeddings for your dataset**

We'll compute text embeddings for a training set, and optionally a validation set. The validation embeddings are used for SAE eval and early-stopping during training.

Embeddings will be stored in the `emb_cache` directory (or `os.environ["EMB_CACHE_DIR"]` if you set it) using the `cache_name` parameter, so you only need to compute embeddings once.

You can use OpenAI or a local model.

Local models will run much faster on GPU. The default local model is `nomic-ai/modernbert-embed-base`. You can use any sentence-transformers model, but please read the model's docs; you may need to edit `get_local_embeddings`.

In [None]:
text2embedding = get_local_embeddings(texts_train + texts_val, model=EMBEDDER, batch_size=128, cache_name=CACHE_NAME)
# embeddings = np.stack([text2embedding[text] for text in texts_train])

train_embeddings = np.stack([text2embedding[text] for text in texts_train])
val_embeddings = np.stack([text2embedding[text] for text in texts_val])
joblib.dump((train_embeddings, val_embeddings), os.path.join(config.HYPOTHESAES_RESULTS_DIR, f"{CACHE_NAME}_embeddings.pkl"))

In [None]:
train_embeddings, val_embeddings = joblib.load(os.path.join(config.HYPOTHESAES_RESULTS_DIR, f"{CACHE_NAME}_embeddings.pkl"))

**Train SAE** 

We will train a Matryoshka SAE with $M=256$, $k=8$, and $\text{prefix\_lengths} = [32, 256]$.  

With the Matryoshka loss, the SAE will learn to reconstruct the input from (1) just the first 32 neurons, and (2) all 256 neurons.  
This will produce 32 coarse-grained features, and 224 finer-grained features.  

See the README for more details about selecting SAE hyperparameters. 

In [None]:
sae = train_sae(embeddings=train_embeddings, val_embeddings=val_embeddings,
                M=512, K=16, matryoshka_prefix_lengths=[64, 128, 256, 512], 
                checkpoint_dir=checkpoint_dir, n_epochs=500,)
joblib.dump(sae, os.path.join(checkpoint_dir, "sae.pkl"))

In [None]:
sae = joblib.load(os.path.join(checkpoint_dir, "sae.pkl"))

**Generate hypotheses**

Generate hypotheses which are predictive of the target variable.

The `selection_method` parameter defines how we compute neuron predictiveness (see `src/select_neurons.py` for more details):
- "separation_score": E[target | top-activating examples] - E[target | zero-activating examples]
- "correlation": pearson(neuron activations, target variable)
- "lasso": select N nonzero features with an L1 regularized model

This cell outputs a dataframe with the following columns:
- `neuron_idx`: The index of the neuron in the SAE (if you're using multiple SAEs, this will be a global index across all of them).
- `source_sae`: The SAE that the neuron was selected from.
- `target_{selection_method}`: The predictiveness of the neuron for the target variable, using the selected `selection_method`.
- `interpretation`: The natural language interpretation of the neuron.
- `interp_fidelity_score`: The F1 fidelity score for how well the neuron's interpretation actually corresponds to its activation pattern.

In [None]:
results = generate_hypotheses(
    texts=texts_train,
    labels=labels_train_multitarget,
    embeddings=train_embeddings,
    sae=sae,
    cache_name=CACHE_NAME,
    classification=False,

    # hyperparams
    selection_method = "correlation_multi_target",
    n_selected_neurons=384,
    n_candidate_interpretations=1,
    task_specific_instructions=TASK_SPECIFIC_INSTRUCTIONS,
    interpreter_model=INTERPRETER_MODEL,

    # provide some scoring of the hypotheses using this many examples
    n_scoring_examples=0,
    # n_scoring_examples=100,
    # annotator_model=ANNOTATOR_MODEL,
    # n_workers_annotation=N_WORKERS_ANNOTATION, # Please lower this parameter if you are running into OpenAI API rate limits
)
os.makedirs(os.path.join(config.HYPOTHESAES_RESULTS_DIR, 'interpretations'), exist_ok=True)
joblib.dump(results, os.path.join(config.HYPOTHESAES_RESULTS_DIR, 'interpretations', "results_questions.pkl"))

### Save and postprocess results

In [None]:
results = joblib.load(os.path.join(config.HYPOTHESAES_RESULTS_DIR, 'interpretations', "results_questions.pkl"))
results['interpretation'].fillna('The input is short.', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  results['interpretation'].fillna('The input is short.', inplace=True)


In [None]:
print("\nMost predictive features:")
pd.set_option('display.max_colwidth', None)
display(results.sort_values(by=f"target_correlation_multi_target", ascending=False))
pd.reset_option('display.max_colwidth')

In [None]:
lm = imodelsx.llm.get_llm('gpt-4.1')
qs_processed = []
for i in tqdm(range(0, len(results['interpretation'].values.tolist()), 20)):
    print(f"Rewording questions {i} to {i + 20}")
    batch_qs = results['interpretation'].values.tolist()[i:i + 20]
    batch_qs_revised = neuro.agent.revise_invalid_questions_by_rewording(batch_qs, lm)
    
    for q in batch_qs_revised:
        assert q.startswith('Does the input') and q.endswith('?'), f"Question '{q}' is not properly formatted."
    qs_processed.extend(batch_qs_revised)
# qs = neuro.agent.revise_invalid_questions_by_rewording(results['interpretation'].values.tolist(), lm)

In [None]:
results['questions_processed'] = qs_processed
joblib.dump(results, os.path.join(config.HYPOTHESAES_RESULTS_DIR, 'interpretations', "results_questions.pkl"))

In [None]:
with pd.option_context('display.max_colwidth', None):
    display(results['questions_processed'].head(40))

In [None]:
questions_list = joblib.load(os.path.join(config.HYPOTHESAES_RESULTS_DIR, 'interpretations', "results_questions.pkl"))['questions_processed'].values.tolist()

In [None]:
questions

['Does the input consist of a single pronoun word?',
 "Does the input contain reported or direct speech, indicated by words like 'said', 'say', or 'says'?",
 "Does the input contain one or more numbers expressed as words (e.g., 'fifty', 'seventy', 'thousand') or numerals, often in the context of quantities, measurements, or ranges?",
 'Does the input mention a place such as a town, neighborhood, city, or geographic location?',
 "Does the input explicitly mention an age or a specific number of years in reference to a person's life or the passage of time?",
 'Does the input discuss abstract concepts, such as viewpoints, frameworks, understanding, arguments, or cultural ideas?',
 'Does the input mention a specific time or time of day (e.g., hours, days of the week, or clock times)?',
 'Does the input describe physical appearance, specifically mentioning hair color or skin tone?',
 'Does the input describe someone screaming, yelling, or crying out in distress?',
 'Does the input mention pu