In [None]:
import json
import os
from pathlib import Path

import torch
from circuit_tracer import ReplacementModel
from datasets import load_dataset

from weight_lens.input_invariant_feature_description import *
from weight_lens.model_utils import *
from weight_lens.utils import *

# Pick the best available backend, in order of preference: CUDA, then MPS, then CPU.
# `torch.backends.mps` is looked up defensively because it may be missing from the torch build.
mps_backend = getattr(torch.backends, "mps", None)

if torch.cuda.is_available():
    device_name = "cuda"
elif mps_backend is not None and mps_backend.is_available():
    device_name = "mps"
else:
    device_name = "cpu"

device = torch.device(device_name)
print(f"Using device: {device}")

# Model Loading

### gemma-2-2b

In [None]:
# Load the model together with its transcoders, the same way the circuit_tracer package does
model_name = "google/gemma-2-2b"
transcoder_name = "gemma"
model = ReplacementModel.from_pretrained(
    model_name,
    transcoder_name,
    dtype=torch.bfloat16,
    device=device,
)

### gpt2 

In [None]:
# IMPORTANT: these transcoders depend on the transcoder_circuits package; specifically,
# sae_training/config.py must be present in the working directory:
# https://github.com/jacobdunefsky/transcoder_circuits/blob/master/sae_training/config.py

# GPT-2 is handled slightly differently: only layer-0 contributing features, and the tokens
# derived from them, are checked. Analyzing all previous layers brings in a lot of noise.
model_name = "gpt2"
gpt2_config_path = "configs/gpt2-transcoders.yaml"
model = load_replacement_model_from_yaml(gpt2_config_path).to(device)

### Llama-3.2-1B

In [None]:
# The line below would load the whole model with all of its transcoders; since they are very large, only a subset of layers is loaded in the next cell instead
# model = ReplacementModel.from_pretrained("meta-llama/Llama-3.2-1B", "llama", device=torch.device("cpu"))

In [None]:
# Load only a partial transcoder set: just the layers listed in the config
transcoders = load_partial_transcoder_set("configs/llama-relu.yaml", device=device)

# Replace two ReplacementModel methods with variants that work with partial transcoders
ReplacementModel._get_activation_caching_hooks = get_partial_activation_caching_hooks
ReplacementModel._configure_replacement_model = configure_partial_replacement

# Build the model on top of the partially loaded transcoders
model_name = "meta-llama/Llama-3.2-1B"
model = ReplacementModel.from_pretrained_and_transcoders(
    model_name,
    transcoders.transcoders,
)

# Loading results from HuggingFace (if necessary) and saving them locally

In [None]:
# Pre-computed results can be downloaded so that they can be visualized right away.
# Running the analysis yourself is also possible, but it is layerwise
# (i.e. all contributing features from previous layers must already be analyzed),
# so later layers take more time.
#
# Files that already exist locally are not downloaded again, which keeps
# re-running this cell (or the whole notebook) cheap.

model_name = "gemma-2-2b" # "gpt2" or "Llama-3.2-1B"
save_dir = f"results/{model_name}"
os.makedirs(save_dir, exist_ok=True)

# The layer count is taken from the loaded model, so `model` must already be loaded
# and should be the model that `model_name` refers to.
num_layers = model.cfg.n_layers

dataset_repo = f"egolimblevskaia/weightlens-{model_name}-transcoder-descriptions"

# NOTE(review): the download starts at layer 3 — presumably earlier layers are not
# published in the dataset repo; confirm.
for layer in range(3, num_layers):
    file_name = f"feature_analysis_layer_{layer}.json"
    save_path = os.path.join(save_dir, file_name)

    # Skip layers that were fetched by a previous run.
    if os.path.exists(save_path):
        print(f"Skipping {file_name}: already present at {save_path}")
        continue

    ds = load_dataset(dataset_repo, data_files=file_name)
    layer_results = ds["train"].to_list()

    # Write to a temporary file and rename it afterwards, so an interrupted run
    # never leaves a truncated JSON file that the check above would treat as complete.
    tmp_path = f"{save_path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(layer_results, f, ensure_ascii=False, indent=2)
    os.replace(tmp_path, save_path)

    print(f"Saved {file_name} to {save_path}")

In [49]:
# Load results that were generated earlier. Features that have not been validated
# are shown without a description.

results_dir = f"results/{model_name}/"
feature = Feature(20, 0, 8684)
result = load_feature_analysis(feature, model, save_dir=Path(results_dir))
print_feature_analysis(result, save_dir=results_dir, model=model)


[✓] Feature analysis for L20 F8684:

📝 Description:  ALL   |    All   |    all   |    tout   |   All   |   all
No directly influencing tokens found for this feature.

📤 Output logits:
 + Top positive tokens:  the   |    of   |    those   |    your   |    our   |    these
 - Top negative tokens: None

🧩 Top contributing features:
 → From L3 F15756 | Contribution: +1.9844 | Relative: 0.00003257%
   └─ Description:  tout    |   yscy
 → From L4 F2933 | Contribution: +1.5156 | Relative: 0.00002487%
   └─ Description: dik
 → From L4 F8718 | Contribution: +1.3750 | Relative: 0.00002257%
   └─ Description:  Mall    |    Zul
 → From L14 F8405 | Contribution: +1.2344 | Relative: 0.00002026%
   └─ Description: all
 → From L8 F2251 | Contribution: +1.1641 | Relative: 0.00001910%
   └─ Description: yscy
 → From L18 F8684 | Contribution: +1.1328 | Relative: 0.00001859%
   └─ Description:  ALL    |    All    |    Mall    |    all    |    tout    |   All    |   all
 → From L16 F4888 | Contribution: +

# Analyzing Transcoders

In [5]:
# If a result has not been calculated yet, the feature can be analyzed and visualized on its own.
# Important: for later layers this takes significantly more time, since the process also analyzes all contributing features.
feature = Feature(4, 0, 7671)

result = analyze_feature(model, feature)
print_feature_analysis(result, model=model)

# All features can be analyzed layer by layer by running the following function.
# NOTE(review): the call below passes save_dir="results/gemma_2_2b/", while the cells that load
# results use f"results/{model_name}/" (i.e. "results/gemma-2-2b/") — confirm which directory is intended.

# analyze_all_features(model, save_dir="results/gemma_2_2b/")


[✓] Feature analysis for L4 F7671:

📝 Description:  OFFICER   |    OFFICERS   |    Officer   |    Officers   |    officer   |    officers   |   Officer   |   Officers   |   officer   |   officers
🔍 Top contributing embedding tokens:
 - Token ID 10971 |             officers | Activation: 10.8125 | Contribution:   77.000 | Rel: 0.00352441%
 - Token ID 11053 |              officer | Activation: 12.6875 | Contribution:   74.500 | Rel: 0.00340999%
 - Token ID 40159 |             Officers | Activation: 7.5000 | Contribution:   70.500 | Rel: 0.00322690%
 - Token ID 15860 |              Officer | Activation: 8.1875 | Contribution:   69.000 | Rel: 0.00315824%
 - Token ID 101672 |             Officers | Activation: 2.8750 | Contribution:   66.500 | Rel: 0.00304381%
 - Token ID 153897 |              OFFICER | Activation: 9.5625 | Contribution:   63.000 | Rel: 0.00288361%
 - Token ID 98452 |              Officer | Activation: 5.8125 | Contribution:   62.000 | Rel: 0.00283784%
 - Token ID 146209 |

In [4]:
# The z-score thresholds can be changed to make the analysis more or less sensitive,
# at the risk of bringing more noise into the final result.

thresholds = {"threshold_tokens": 3, "threshold_features": 3}
result = analyze_feature(model, feature, **thresholds)
print_feature_analysis(result, model=model)


[✓] Feature analysis for L4 F7671:

📝 Description:  OFFICER   |    OFFICERS   |    Officer   |    Officers   |    officer   |    officers   |   Officer   |   Officers   |   createCanvas   |   officer   |   officers
🔍 Top contributing embedding tokens:
 - Token ID 10971 |             officers | Activation: 10.8125 | Contribution:   77.000 | Rel: 0.00352441%
 - Token ID 11053 |              officer | Activation: 12.6875 | Contribution:   74.500 | Rel: 0.00340999%
 - Token ID 40159 |             Officers | Activation: 7.5000 | Contribution:   70.500 | Rel: 0.00322690%
 - Token ID 15860 |              Officer | Activation: 8.1875 | Contribution:   69.000 | Rel: 0.00315824%
 - Token ID 101672 |             Officers | Activation: 2.8750 | Contribution:   66.500 | Rel: 0.00304381%
 - Token ID 153897 |              OFFICER | Activation: 9.5625 | Contribution:   63.000 | Rel: 0.00288361%
 - Token ID 98452 |              Officer | Activation: 5.8125 | Contribution:   62.000 | Rel: 0.00283784%
 

# Descriptions Postprocessing

In [None]:
from weight_lens.postprocessing import unique_tokens, system_prompt

In [14]:
# Option 1: lemmatization.
# The results often contain several variants of the same word, so this technique
# can be used to postprocess them.

feature = Feature(4, 0, 7671)
result = analyze_feature(model, feature)

description_lemmas = unique_tokens(result['description'])
promoted_lemmas = unique_tokens(result['output_logits']['top_positive_tokens'])
description_lemmas, promoted_lemmas

(['officer'], ['headquarters'])

In [4]:
# Option 2: postprocessing with an explainer LLM that generates the description.
# This is the system prompt that is used.

print(system_prompt)


            We're studying neurons in a neural network. Each neuron has certain inputs that activate it and outputs that it leads to. You will receive three pieces of information about a neuron: 

            1. The top important tokens.
            2. The top tokens it promotes in the output. 
            3. The tokens it suppresses in the output.

            These will be separated into three sections [Important Tokens] and [Text Promoted] and [Text Suppressed]. All three are a combination of tokens. You can infer the most likely output or function of the neuron based on these tokens. The tokens, specially [Text Promoted] and [Text Suppressed] may include noise, such as unrelated terms, symbols, or programming jargon. If these are not coherent, you may ignore them and do not include them in your response. If the [Important Tokens] are not combining to form a common theme, you may simply combine the words in the [Important Tokens] to form a single concept.

            Focus on iden