# Examples of how to build concept-based explanations

Load the model and list its modules to find the point at which to split it.

In [1]:
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Single source of truth for the checkpoint, so the model and its tokenizer
# cannot drift apart when the notebook is pointed at another model.
MODEL_ID = "EuroBERT/EuroBERT-210m"

# NOTE: trust_remote_code=True runs Python code shipped in the model
# repository; only keep it for checkpoints you trust.
model = AutoModelForMaskedLM.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Dotted path of the module at which the model will be split; pick it from
# the module tree printed below.
split_point = "model.layers.10.mlp"

print(list(model.named_children()))

[('model', EuroBertModel(
  (embed_tokens): Embedding(128256, 768, padding_idx=128001)
  (layers): ModuleList(
    (0-11): 12 x EuroBertDecoderLayer(
      (self_attn): EuroBertAttention(
        (q_proj): Linear(in_features=768, out_features=768, bias=False)
        (k_proj): Linear(in_features=768, out_features=768, bias=False)
        (v_proj): Linear(in_features=768, out_features=768, bias=False)
        (o_proj): Linear(in_features=768, out_features=768, bias=False)
      )
      (mlp): EuroBertMLP(
        (gate_proj): Linear(in_features=768, out_features=3072, bias=False)
        (up_proj): Linear(in_features=768, out_features=3072, bias=False)
        (down_proj): Linear(in_features=3072, out_features=768, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): EuroBertRMSNorm((768,), eps=1e-05)
      (post_attention_layernorm): EuroBertRMSNorm((768,), eps=1e-05)
    )
  )
  (norm): EuroBertRMSNorm((768,), eps=1e-05)
  (rotary_emb): EuroBertRotaryEmbedding()
)), (

### Split the model using the `ModelWithSplitPoints` class

In [2]:
import torch
from interpreto import ModelWithSplitPoints

# Prefer the GPU when one is visible, but fall back to the CPU so the notebook
# still runs end-to-end on machines without CUDA instead of failing here.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Wrap the loaded model together with its tokenizer and the chosen split point.
splitted_model = ModelWithSplitPoints(
    model_or_repo_id=model,
    tokenizer=tokenizer,
    split_points=split_point,
    device_map=device,
    batch_size=64,
)

### Load the dataset and compute activations

In [3]:
from datasets import load_dataset

# Keep only the raw review texts of the training split.
train_split = load_dataset("cornell-movie-review-data/rotten_tomatoes")["train"]
rotten_tomatoes = train_split["text"]

# NOTE(review): the WORD strategy presumably yields one activation per word
# rather than per token - confirm against the interpreto documentation.
word_strategy = ModelWithSplitPoints.activation_strategies.WORD
activations = splitted_model.get_activations(rotten_tomatoes, select_strategy=word_strategy)

# Shape of the activations recorded at the split point.
split_activations = activations[split_point]
print(split_activations.shape)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


torch.Size([187890, 768])


### Create and fit the concept explainer

In [9]:
from interpreto.concepts import ICAConcepts

# Number of concepts the explainer is asked to learn.
nb_concepts = 50

# NOTE(review): no random seed is set here; if the explainer is stochastic,
# the concepts may differ between runs - confirm whether a seed can be passed.
concept_explainer = ICAConcepts(splitted_model, nb_concepts=nb_concepts)

# Fit the explainer on the activations computed earlier.
concept_explainer.fit(activations)

### Interpret the concepts

In [10]:
from interpreto.concepts.interpretations import TopKInputs

# Interpret all concepts with TopKInputs (k=10, word granularity), using the
# precomputed activations as the ranking source.
latent_source = TopKInputs.sources.LATENT_ACTIVATIONS
word_granularity = TopKInputs.granularities.WORD

interpretations = concept_explainer.interpret(
    TopKInputs,
    inputs=rotten_tomatoes,
    latent_activations=activations,
    source=latent_source,
    granularity=word_granularity,
    concepts_indices="all",
    k=10,
)

In [11]:
# Print, for each concept, the keys of its interpretation (or 'None' when the
# interpretation is missing).
for concept_id, words_importance in interpretations.items():
    if words_importance is None:
        top_words = "None"
    else:
        top_words = list(words_importance.keys())
    print(f"Concept {concept_id}: {top_words}")

Concept 0: [' one', ' only', ' few', ' little', ' small', ' just', ' bit', ' times', ' least', ' seldom']
Concept 1: [' involved', ' town', ' surroundings', ' engaged', ' over', ' alive', ' around', ' going', ' on', ' underway']
Concept 2: [' probably', ' us', ' me', ' imagine', ' seems', ' be', ' one', ' maybe', ' generally', ' perhaps']
Concept 3: ['.', ',', ' )', ' *', ' -', ' t', ' (', '?', ' s', '+']
Concept 4: [' inhabit', ' below', ' expressed', ' beneath', ' reflect', ' haunting', ' live', ' indulge', ' deferred', ' distancing']
Concept 5: [' for', 'for', '-for', ' on', ' para', ' every', ' than', ' everyone', ' strongly', ' constantly']
Concept 6: [' makes', ' make', ' made', ' lifts', ' leaves', ' render', ' making', ' sent', ' leave', ' keep']
Concept 7: [' deal', ' sustain', ' minutes', ' once', ' day', '-minute', ' champion', ' off', ' help', ' heavily']
Concept 8: ['.', '!', ',', '?', ' )', "'", " '", ' "', ' ;', '-']
Concept 9: [' surprising', ' breakthrough', ' believe'