# Interpreto attribution tutorial

This notebook contains examples of what you can do with the attribution module of Interpreto.

The first part focuses on classification, while the second explores generation.

*author: Fanny Jourdan & Antonin Poché*

### Available methods

All methods will have at list one example in this notebook.

Inference based methods:
- Occlusion
- LIME
- KernelSHAP
- Sobol


Gradients based methods:
- Saliency
- Integrated Gradients
- SmoothGrad



### Imports

In [1]:
import sys

sys.path.append("../..")

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer

from interpreto import (
    AttributionVisualization,
    Granularity,
    IntegratedGradients,
    KernelShap,
    Lime,
    Occlusion,
    Saliency,
    SmoothGrad,
    Sobol,
)
from interpreto.attributions import InferenceModes
from interpreto.commons.granularity import GranularityAggregationStrategy

In [None]:
# conftest.py  (ou en début de fichier de test)
import types
import pytest


@pytest.fixture
def dummy_tokenizer():
    """
    Tokenizer minimal :
    - is_fast = True  (pour passer la vérification)
    - all_special_ids vide
    - decode() très simple
    """
    return types.SimpleNamespace(
        is_fast=True,
        all_special_ids=[],
        decode=lambda ids, skip_special_tokens=False: " ".join(f"tok{tid}" for tid in ids),
    )


In [14]:
import pytest
from transformers.tokenization_utils_base import BatchEncoding


def test_aggregate_score_for_gradient_method_alltokens_granularity_manual_ids(simple_text, real_bert_tokenizer):
    """Test score aggregation for ALL_TOKENS"""

    tokenizer = real_bert_tokenizer
    tokens = tokenizer(simple_text, return_tensors="pt", return_offsets_mapping=True)
    input_ids = tokens["input_ids"]
    seq_len = input_ids.shape[1]

    # Fake scores = ascending from 0 to seq_len-1
    fake_scores = torch.arange(seq_len).float().unsqueeze(0)

    # ALL_TOKENS → passthrough
    agg_all_tokens = Granularity.aggregate_score_for_gradient_method(
        contribution=fake_scores,
        granularity=Granularity.ALL_TOKENS,
        inputs=tokens,
        tokenizer=tokenizer,
    )
    assert torch.equal(agg_all_tokens, fake_scores)
    print("ALL_TOKENS aggregation passed.")


def test_aggregate_score_for_gradient_method_token_granularity_manual_ids(real_bert_tokenizer):
    """
    Test TOKEN-level aggregation using manually constructed input_ids.

    The first token is a special token (e.g., [CLS]), and the remaining tokens are regular.
    The method should exclude the special token and return the scores for the regular tokens only.
    """

    tokenizer = real_bert_tokenizer
    special_ids = set(tokenizer.all_special_ids)

    # Ensure the tokenizer has special tokens
    if not special_ids:
        pytest.skip("Tokenizer has no special tokens to test with.")

    # Select one special token ID (e.g., [CLS] or [SEP])
    special_token_id = list(special_ids)[0]

    # Select 4 token IDs that are not in the set of special tokens
    non_special_ids = [i for i in range(100, 1000) if i not in special_ids][:4]
    if len(non_special_ids) < 4:
        pytest.skip("Not enough non-special token IDs available.")

    # Construct input_ids tensor: [special_token, regular_token1, ..., regular_token4]
    # Shape: (1, 5)
    input_ids = torch.tensor([[special_token_id] + non_special_ids])
    tokens = BatchEncoding({"input_ids": input_ids})

    # Create dummy scores: 2 scores per token:
    fake_scores = torch.tensor([[10.0, 20.0, 31.1, 41.2, 55.2], [12.0, 21.0, 30.0, 40.0, 50.0]])  # shape (2, 5)

    # Apply TOKEN-level aggregation (no actual reduction since each token is treated individually)
    aggregated = Granularity.aggregate_score_for_gradient_method(
        contribution=fake_scores,
        granularity=Granularity.TOKEN,
        inputs=tokens,
        tokenizer=tokenizer,
    )

    # Expect scores of regular tokens only (i.e., skip the first one)
    expected = fake_scores[:, 1:]

    # Assert the result matches the expected filtered scores
    assert torch.allclose(aggregated, expected, atol=1e-5)
    print("TOKEN-level aggregation passed.")


def test_aggregate_score_for_gradient_method_word_granularity_manual_ids():
    """
    Test WORD-level aggregation using mocked word_ids and manually defined input_ids and scores.

    Checks that all aggregation strategies behave correctly on both 1D and 2D score tensors.
    """

    # Simulated input_ids: arbitrary token IDs
    input_ids = torch.tensor([[101, 102, 103, 104, 105]])  # shape (1, 5)
    batch = BatchEncoding({"input_ids": input_ids})

    # Mock word_ids to define 3 groups: [0,1], [2], [3,4]
    batch.word_ids = lambda batch_index=0: [0, 0, 1, 2, 2]

    # Token groups:
    # Group 0: tokens 0 and 1
    # Group 1: token 2
    # Group 2: tokens 3 and 4

    # Define scores: shape (2, 5)
    scores = torch.tensor([[1.0, 3.0, 5.0, 2.0, 8.0], [-1.0, 4.0, 6.0, -7.0, 2.0]])

    # All aggregation strategies
    strategies = {
        "mean": GranularityAggregationStrategy.MEAN,
        "max": GranularityAggregationStrategy.MAX,
        "min": GranularityAggregationStrategy.MIN,
        "sum": GranularityAggregationStrategy.SUM,
        "signed_max": GranularityAggregationStrategy.SIGNED_MAX,
    }

    # Expected outputs
    expected = {
        "mean": torch.tensor([[(1.0 + 3.0) / 2, 5.0, (2.0 + 8.0) / 2], [(-1.0 + 4.0) / 2, 6.0, (-7.0 + 2.0) / 2]]),
        "max": torch.tensor([[3.0, 5.0, 8.0], [4.0, 6.0, 2.0]]),
        "min": torch.tensor([[1.0, 5.0, 2.0], [-1.0, 6.0, -7.0]]),
        "sum": torch.tensor([[4.0, 5.0, 10.0], [3.0, 6.0, -5.0]]),
        "signed_max": torch.tensor([[3.0, 5.0, 8.0], [4.0, 6.0, -7.0]]),
    }

    for name, strategy in strategies.items():
        aggregated = Granularity.aggregate_score_for_gradient_method(
            contribution=scores,
            granularity=Granularity.WORD,
            granularity_aggregation_strategy=strategy,
            inputs=batch,
            tokenizer=None,
        )
        assert torch.allclose(aggregated, expected[name], atol=1e-5), f"failed for {name}"

    print("WORD-level aggregation passed for all strategies.")


In [None]:
real_bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
simple_text = "word longword verylongword"


test_aggregate_score_for_gradient_method_alltokens_granularity_manual_ids(simple_text, real_bert_tokenizer)

ALL_TOKENS aggregation passed.


In [16]:
test_aggregate_score_for_gradient_method_token_granularity_manual_ids(real_bert_tokenizer)

TOKEN-level aggregation passed.


In [17]:
test_aggregate_score_for_gradient_method_word_granularity_manual_ids()

ValueError: Cannot get indices without a tokenizer if granularity is WORD.Please provide a tokenizer or set granularity to ALL_TOKENS.

## I. Classification task

### I.0 Setup

Loading a BERT model for the IMDB dataset.

In [3]:
model_name_classif = "textattack/bert-base-uncased-imdb"
model_classif = AutoModelForSequenceClassification.from_pretrained(model_name_classif)
tokenizer_classif = AutoTokenizer.from_pretrained(model_name_classif)
dico_name_classes = {0: "negative", 1: "positive"}

### I.1 Minimal example 

In [4]:
sentence = "This is the best movie I have ever seen. The cinematography was uncharacteristically breathtaking."

# Instantiate the Occlusion explainer with the model and tokenizer
explainer = Occlusion(model_classif, tokenizer_classif)

# Compute the attributions on a given sentence
attributions = explainer(sentence)

# Visualize the attributions
AttributionVisualization(attributions[0]).display()
# AttributionVisualization(attributions[0], class_names={0: "Class A", 1: "Class B"}).display()

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


### I.2 Compute explanations on multiple inputs optimally

In [5]:
# Gradient-based methods have the exact same API
explainer = Saliency(model_classif, tokenizer_classif)

# Inputs can also be a list of strings, or even `input_ids`
attributions = explainer(
    [
        "This is the best movie I have ever seen.",
        "I hate this movie.",
        "This movie is super good. I love it.",
    ]
)

# To visualize multiple attributions, simply loop over the list of attributions
for attr in attributions:
    AttributionVisualization(attr, class_names=dico_name_classes).display()

### I.3 Changing the `granularity_level` and `inference_mode`

In [6]:
# Let's modify the default parameters of the Lime explainer
explainer = Lime(
    # ---------------------------------
    # common to all attribution methods
    model_classif,
    tokenizer_classif,
    batch_size=32,  # default is 4
    # study impact on the softmax of the logits, default is the logits
    inference_mode=InferenceModes.SOFTMAX,
    # attribution at the word level, default is the token level
    granularity=Granularity.WORD,
    # ----------------------------------------
    # common to all perturbation-based methods
    n_perturbations=20,
    # ----------------
    # specific to Lime
    # arguments possible value are in classes static attributes, in Enums
    distance_function=Lime.distance_functions.HAMMING,
)

# The `__call__` method is a renaming of the `explain` method
attrs = explainer.explain(model_inputs="Would Interpreto be a good movie name?")

# Let's visualize the attributions
AttributionVisualization(attrs[0], class_names=dico_name_classes).display()

In [7]:
# for gradient-based methods, there is an additional argument for granularity,
# since it is necessary to choose how the token scores calculated from the gradients are to be aggregated.
explainer = Saliency(
    # common to all attribution methods
    model_classif,
    tokenizer_classif,
    # attribution at the word level, default is the all_tokens level
    granularity=Granularity.WORD,
    # aggregation method for the word level attributions (specific on for gradient-based methods)
    granularity_aggregation_strategy=GranularityAggregationStrategy.SIGNED_MAX,
)

# The `__call__` method is a renaming of the `explain` method
attrs = explainer.explain(model_inputs="Would Interpreto be a bad movie name?")

# Let's visualize the attributions
AttributionVisualization(attrs[0], class_names=dico_name_classes).display()

### I.4 Explaining multiple classes at once

Specifying what to explain is the role of the `targets` argument of the `explain` method.

For `n` inputs, there should be `n` elements in the `targets` argument.
But each elements of the `targets` can specify several classes to explain.
Therefore, the `targets` shape should be `(n, t)`, with `t` the number of classes to explain.

When `targets` are not specified, the explainer compute the model's prediction for each input.
Then it explains the model's prediction for each input.

In [8]:
# remember, the BERT trained on IMDB, movie reviews
explainer = KernelShap(model_classif, tokenizer_classif)

# we explain the prediction for both the positive and negative class
attributions = explainer(
    model_inputs=["I do not know if this is the best or the worst movie ever, I am confused."],
    targets=torch.tensor([[0, 1]]),
)

# be careful, we use a new visualization class for multi class attributions
AttributionVisualization(attributions[0], class_names=dico_name_classes).display()

## II. Generation

### II.0 Setup

Let's load a good old GPT2.

In [9]:
model_gen = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer_gen = AutoTokenizer.from_pretrained("gpt2")

### II.1 Minimal example

In [10]:
# the API is the same as classification

explainer = SmoothGrad(
    model_gen,
    tokenizer_gen,
    granularity=Granularity.WORD,
    granularity_aggregation_strategy=GranularityAggregationStrategy.MAX,
)

# if no target is specified, we generate the text on our part
# `generation_kwargs` are optional
attributions = explainer("Roses are red, the sky is", max_length=16)

# there is a third visualization class for generation attributions
AttributionVisualization(attributions[0]).display()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 23.64 GiB of which 1.25 MiB is free. Process 3036171 has 22.16 GiB memory in use. Including non-PyTorch memory, this process has 1.47 GiB memory in use. Of the allocated memory 927.90 MiB is allocated by PyTorch, and 72.10 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## II.2 Explain your own outputs

In [None]:
# naturally gradient-based methods also work
explainer = IntegratedGradients(model_gen, tokenizer_gen)

# you can pass strings as targets and even several samples at once
attributions = explainer(
    model_inputs=["Interpreto can explain", "And even treat"],
    targets=[" the outputs you provide.", " several samples at once."],
)

# for multiple samples, visualization need to be done one by one
for attr in attributions:
    AttributionVisualization(attr).display()

### II.3 Explain from tokenized inputs

In [None]:
# the default granularity ignores the special tokens
# but we can set it to ALL_TOKENS to include them
explainer = Sobol(model_gen, tokenizer_gen, granularity=Granularity.ALL_TOKENS)

tokenized_inputs = tokenizer_gen("Hi there, how are you?", return_tensors="pt")
tokenized_targets = tokenizer_gen("I am fine, thank you!", return_tensors="pt")

# these inputs/targets can be passed just like the others
attributions = explainer(tokenized_inputs, tokenized_targets)

AttributionVisualization(attributions[0]).display()