In [1]:
import matplotlib.colors as mcolors
import torch
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer

from interpreto.attributions.methods import IntegratedGradients, OcclusionExplainer, Saliency, SmoothGrad
from interpreto.commons.granularity import GranularityLevel
from interpreto.visualizations.attributions.classification_highlight import (
    GenerationAttributionVisualization,
    MultiClassAttributionVisualization,
    SingleClassAttributionVisualization,
)

Currents:
- Occlusion
- Saliency
- Integrated Gradients
- SmoothGrad

Soon:
- LIME
- KernelSHAP
- Sobol



# Classification task

In [2]:
model_name = "textattack/bert-base-uncased-imdb"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Occlusion 

In [3]:
explainer = OcclusionExplainer(model=model, batch_size=4, tokenizer=tokenizer, granularity_level=GranularityLevel.WORD)

attribution_outputs = explainer.explain(
    model_inputs=[
        "This is the best movie I have ever seen. The cinematography was uncharacteristically breathtaking."
    ],
    mode="logits",
)

viz = SingleClassAttributionVisualization(
    attribution_output=attribution_outputs[0],
    css=".common-word-style { margin-right: 0.3em }",
)
viz.display()

# mode in {"logits", "softmax", "log_softmax"}

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [4]:
attribution_outputs[0].attributions.shape, len(attribution_outputs[0].elements)

(torch.Size([16]), 16)

In [5]:
explainer = OcclusionExplainer(model=model, batch_size=4, tokenizer=tokenizer, granularity_level=GranularityLevel.WORD)

list_attribution_outputs = explainer.explain(
    model_inputs=[
        "This is the best movie I have ever seen.",
        "I hate this movie.",
        "This movie is super good. I love it.",
    ],
    mode="logits",
)

for attribution_outputs in list_attribution_outputs:
    viz = SingleClassAttributionVisualization(
        attribution_output=attribution_outputs,
        css=".common-word-style { margin-right: 0.3em }",
    )
    viz.display()



In [None]:
explainer = OcclusionExplainer(model=model, batch_size=4, tokenizer=tokenizer, granularity_level=GranularityLevel.WORD)

list_attribution_outputs = explainer.explain(
    model_inputs=[
        "This is the best movie I have ever seen.",
        "I hate this movie.",
        "This movie is super good. I love it.",
    ],
    targets=[torch.tensor([0, 1]), torch.tensor([0, 1]), torch.tensor([0, 1])],
    mode="logits",
)

for attribution_outputs in list_attribution_outputs:
    viz = SingleClassAttributionVisualization(
        attribution_output=attribution_outputs,
        css=".common-word-style { margin-right: 0.3em }",
    )
    viz.display()

In [7]:
explainer = OcclusionExplainer(
    model=model, batch_size=4, tokenizer=tokenizer, granularity_level=GranularityLevel.TOKEN
)

attribution_outputs = explainer.explain(
    model_inputs="This is the best movie I have ever seen.",
    targets=torch.tensor([[0, 1]]),  # (n, t), n=1, t=2
    mode="logits",
)

viz = MultiClassAttributionVisualization(
    attribution_output=attribution_outputs[0],
    class_names=["negative review", "positive review"],
    css=".common-word-style { margin-right: 0.3em }",
)
viz.display()

## Saliency

In [8]:
explainer = Saliency(model=model, batch_size=4, tokenizer=tokenizer)

attribution_outputs = explainer.explain(
    model_inputs=["This is the best movie I have ever seen."],
)

viz = SingleClassAttributionVisualization(
    attribution_output=attribution_outputs[0],
    color=mcolors.to_rgb("red"),
    css=".common-word-style { margin-right: 0.3em }",
)
viz.display()

TypeError: SingleClassAttributionVisualization.__init__() got an unexpected keyword argument 'color'

In [4]:
attribution_outputs[0].attributions.shape[-1], len(attribution_outputs[0].elements)

(12, 12)

## Integrated Gradient

In [5]:
explainer = IntegratedGradients(model=model, tokenizer=tokenizer, batch_size=4, n_interpolations=10)

attribution_outputs = explainer.explain(model_inputs="This is the best movie I have ever seen.")

viz = SingleClassAttributionVisualization(
    attribution_output=attribution_outputs[0],
    color=mcolors.to_rgb("red"),
    css=".common-word-style { margin-right: 0.3em }",
)
viz.display()

In [None]:
mapping = tokenizer(
    "This is the best movie I have ever seen.",
    return_tensors="pt",
    return_offsets_mapping=True,
    return_special_tokens_mask=True,
)

In [13]:
tokenizer.decode(mapping["input_ids"][0], skip_special_tokens=True)

'this is the best movie i have ever seen.'

In [6]:
attribution_outputs[0].attributions.shape, len(attribution_outputs[0].elements)

(torch.Size([14]), 14)

## SmoothGrad

In [26]:
explainer = SmoothGrad(model=model, batch_size=4, tokenizer=tokenizer, n_interpolations=50, noise_level=0.01)

attribution_outputs = explainer.explain(
    model_inputs=["I love this movie"],
)

viz = SingleClassAttributionVisualization(
    attribution_output=attribution_outputs[0],
    color=mcolors.to_rgb("red"),
    css=".common-word-style { margin-right: 0.3em }",
)
viz.display()

# Generation task

In [8]:
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

## Occlusion

In [9]:
explainer = OcclusionExplainer(model=model, batch_size=4, tokenizer=tokenizer, granularity_level=GranularityLevel.WORD)

attribution_outputs = explainer.explain(model_inputs="Hi there, how are you?", generation_kwargs={"max_length": 10})

viz = GenerationAttributionVisualization(
    attribution_output=attribution_outputs[0],
    color=mcolors.to_rgb("orange"),
    highlight_border=False,
    normalize=True,
)
viz.display()

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [11]:
explainer = OcclusionExplainer(
    model=model, batch_size=4, tokenizer=tokenizer, granularity_level=GranularityLevel.ALL_TOKENS
)

attribution_outputs = explainer.explain(
    model_inputs="Hi there, how are you?",
    targets="I am fine, thank you",
    mode="softmax",
    generation_kwargs={"max_length": 10},
)

viz = GenerationAttributionVisualization(
    attribution_output=attribution_outputs[0],
    color=mcolors.to_rgb("orange"),
    highlight_border=False,
    normalize=True,
)
viz.display()

## Integrated Gradients

In [12]:
explainer = IntegratedGradients(model=model, tokenizer=tokenizer, batch_size=4, n_interpolations=10)

list_attribution_outputs = explainer.explain(
    model_inputs=["Hi there, how are you?", "What time is it?"], generation_kwargs={"max_length": 10}
)

for attribution_outputs in list_attribution_outputs:
    viz = GenerationAttributionVisualization(
        attribution_output=attribution_outputs,
        color=mcolors.to_rgb("orange"),
        highlight_border=False,
        normalize=True,
    )
    viz.display()

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [None]:
explainer = IntegratedGradients(model=model, tokenizer=tokenizer, batch_size=4, n_interpolations=10)

list_attribution_outputs = explainer.explain(
    model_inputs=["Hi there, how are you?", "What time is it?"],
    targets=["fine.", "It is 9 pm."],
    generation_kwargs={"max_length": 10},
)

for attribution_outputs in list_attribution_outputs:
    viz = GenerationAttributionVisualization(
        attribution_output=attribution_outputs,
        color=mcolors.to_rgb("orange"),
        highlight_border=False,
        normalize=True,
    )
    viz.display()

In [9]:
list_attribution_outputs[0].attributions.shape, len(list_attribution_outputs[0].elements)

(torch.Size([13, 20]), 20)