In [1]:
!pip install einops
!pip install captum
!pip install shap

Collecting einops
  Obtaining dependency information for einops from https://files.pythonhosted.org/packages/29/0b/2d1c0ebfd092e25935b86509a9a817159212d82aa43d7fb07eca4eeff2c2/einops-0.7.0-py3-none-any.whl.metadata
  Downloading einops-0.7.0-py3-none-any.whl.metadata (13 kB)
Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.7.0
Collecting captum
  Downloading captum-0.6.0-py3-none-any.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: captum
Successfully installed captum-0.6.0


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import random
import torch
import copy

import numpy as np

from tqdm.notebook import tqdm

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig

from captum.attr import IntegratedGradients, LayerIntegratedGradients
from captum.attr import visualization as viz

import shap



In [4]:
torch.manual_seed(12345)
np.random.seed(12345)

In [5]:
device = "cuda"
torch.set_default_device(device)

In [6]:
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True, use_fast=True)
config = AutoConfig.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)

model.eval()
model.zero_grad()

Downloading config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

Downloading configuration_phi.py:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-1_5:
- configuration_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading modeling_phi.py:   0%|          | 0.00/33.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-1_5:
- modeling_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading pytorch_model.bin:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [7]:
test_query = ["How are you today?"]
animal_queries = [
    "What sound does the dog make?\n\nAnswer:",
    "What is a dog?\n\nAnswer:",
    "What sound does the cat make?\n\nAnswer:",
    "What is a cat?\n\nAnswer:",
    "What is a the difference between a cat and a dog?\n\nAnswer:",
]
city_queries = [
    "Where is London?\n\nAnswer:",
    "Where is Paris?\n\nAnswer:",
    "What is the captial of the United Kingdom?\n\nAnswer:",
    "What is the capital of France?\n\nAnswer:",
]

In [8]:
def swap_emb_weights(emb_weights, a_indexes, b_indexes):
    a_weights = emb_weights[a_indexes]
    b_weights = emb_weights[b_indexes]

    swap_weights = emb_weights.clone()
    swap_weights[a_indexes] = b_weights
    swap_weights[b_indexes] = a_weights
    return swap_weights

In [9]:
try:
    emb_weights
except NameError:
    emb_weights = model.get_input_embeddings().weight
try:
    original_generate
except NameError:
    original_generate = model.generate

word_inputs = tokenizer([" dog", " cat"], return_tensors="pt", return_attention_mask=False)
dog_indexes = word_inputs["input_ids"][0]
cat_indexes = word_inputs["input_ids"][1]

swap_weights = swap_emb_weights(emb_weights, dog_indexes, cat_indexes)

## Captum

In [10]:
def llm_captum(model, queries, baseline_query = " ", max_length=20):
    # Define forward wrapper function to extrapolate logits values
    def custom_forward(inputs):
        out = model.forward(inputs)
        logits = out.logits
        return logits.max(1).values
    
    # Create a LayerIntegratedGradients wrapper for the model
    lig = LayerIntegratedGradients(custom_forward, model.get_input_embeddings())
    
    # Define a reference sequence of token ids from a baseline query
    ref_input_ids = None
    if baseline_query:
        ref_input_ids = tokenizer(baseline_query, return_tensors="pt", return_attention_mask=False).input_ids
    
    vis = []
    # Iterate over all queries, saving attributions information for each one
    for query in tqdm(queries):
        # Elaborate full model output while 
        inputs = tokenizer(query, return_tensors="pt", return_attention_mask=False)
        out = model.generate(**inputs, max_length=max_length)
        prompt_n = inputs.input_ids.squeeze().size()[0]
        output_n = out.squeeze().size()[0]
    
        records = []
        # Gather attributions information for each token of the modedl output for a given query
        for i in tqdm(range(prompt_n, output_n)):
            attributions, delta = lig.attribute(
                inputs=out[:i],
                baselines=ref_input_ids,
                return_convergence_delta=True,
                target=out[0][i]
            )
            records.append(viz.VisualizationDataRecord(
                attributions.sum(dim=-1).squeeze(0),
                torch.as_tensor(0),
                tokenizer.decode(out[0][i]),
                0,
                0,
                attributions.sum(),       
                tokenizer.convert_ids_to_tokens(out[0][:i].tolist()),
                delta
            ))
        vis.append(records)
    
    return vis

### Testing different baselines

In [11]:
model.get_input_embeddings().weight = torch.nn.Parameter(emb_weights)
# Whitespace
vis = llm_captum(model, [animal_queries[0]], max_length=20)
# No baseline
vis += llm_captum(model, [animal_queries[0]], None, max_length=20)
# \n
vis += llm_captum(model, [animal_queries[0]], "\n", max_length=20)
# \n\n
vis += llm_captum(model, [animal_queries[0]], "\n\n", max_length=20)
# end of text
vis += llm_captum(model, [animal_queries[0]], "<|endoftext|>", max_length=20)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

In [12]:
for v in vis:
    _ = viz.visualize_text(v)

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,The (0.00),0.0,2.22,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer :
,,,,
0.0,dog (0.00),0.0,30.2,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe
,,,,
0.0,bar (0.00),0.0,18.87,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġdog
,,,,
0.0,ks (0.00),0.0,16.94,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġdog Ġbar
,,,,
0.0,. (0.00),0.0,3.53,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġdog Ġbar ks
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,The (0.00),0.0,10.15,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer :
,,,,
0.0,dog (0.00),0.0,14.79,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe
,,,,
0.0,bar (0.00),0.0,15.85,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġdog
,,,,
0.0,ks (0.00),0.0,11.33,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġdog Ġbar
,,,,
0.0,. (0.00),0.0,31.13,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġdog Ġbar ks
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,The (0.00),0.0,4.08,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer :
,,,,
0.0,dog (0.00),0.0,36.31,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe
,,,,
0.0,bar (0.00),0.0,12.41,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġdog
,,,,
0.0,ks (0.00),0.0,19.73,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġdog Ġbar
,,,,
0.0,. (0.00),0.0,-21.09,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġdog Ġbar ks
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,The (0.00),0.0,71.36,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer :
,,,,
0.0,dog (0.00),0.0,3.22,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe
,,,,
0.0,bar (0.00),0.0,-36.66,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġdog
,,,,
0.0,ks (0.00),0.0,-3.15,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġdog Ġbar
,,,,
0.0,. (0.00),0.0,0.41,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġdog Ġbar ks
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,The (0.00),0.0,13.3,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer :
,,,,
0.0,dog (0.00),0.0,21.47,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe
,,,,
0.0,bar (0.00),0.0,14.13,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġdog
,,,,
0.0,ks (0.00),0.0,6.91,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġdog Ġbar
,,,,
0.0,. (0.00),0.0,-3.18,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġdog Ġbar ks
,,,,


### Testing animal queries

In [13]:
# Without swap
model.get_input_embeddings().weight = torch.nn.Parameter(emb_weights)
vis = llm_captum(model, animal_queries, max_length=20)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

In [14]:
for v in vis:
    _ = viz.visualize_text(v)

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,The (0.00),0.0,2.22,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer :
,,,,
0.0,dog (0.00),0.0,30.2,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe
,,,,
0.0,bar (0.00),0.0,18.87,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġdog
,,,,
0.0,ks (0.00),0.0,16.94,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġdog Ġbar
,,,,
0.0,. (0.00),0.0,3.53,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġdog Ġbar ks
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,A (0.00),0.0,20.25,What Ġis Ġa Ġdog ? Ċ Ċ Answer :
,,,,
0.0,dog (0.00),0.0,22.65,What Ġis Ġa Ġdog ? Ċ Ċ Answer : ĠA
,,,,
0.0,is (0.00),0.0,62.14,What Ġis Ġa Ġdog ? Ċ Ċ Answer : ĠA Ġdog
,,,,
0.0,a (0.00),0.0,17.98,What Ġis Ġa Ġdog ? Ċ Ċ Answer : ĠA Ġdog Ġis
,,,,
0.0,furry (0.00),0.0,18.81,What Ġis Ġa Ġdog ? Ċ Ċ Answer : ĠA Ġdog Ġis Ġa
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,The (0.00),0.0,-10.19,What Ġsound Ġdoes Ġthe Ġcat Ġmake ? Ċ Ċ Answer :
,,,,
0.0,cat (0.00),0.0,15.51,What Ġsound Ġdoes Ġthe Ġcat Ġmake ? Ċ Ċ Answer : ĠThe
,,,,
0.0,makes (0.00),0.0,16.7,What Ġsound Ġdoes Ġthe Ġcat Ġmake ? Ċ Ċ Answer : ĠThe Ġcat
,,,,
0.0,a (0.00),0.0,2.23,What Ġsound Ġdoes Ġthe Ġcat Ġmake ? Ċ Ċ Answer : ĠThe Ġcat Ġmakes
,,,,
0.0,me (0.00),0.0,15.37,What Ġsound Ġdoes Ġthe Ġcat Ġmake ? Ċ Ċ Answer : ĠThe Ġcat Ġmakes Ġa
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,A (0.00),0.0,20.34,What Ġis Ġa Ġcat ? Ċ Ċ Answer :
,,,,
0.0,cat (0.00),0.0,22.08,What Ġis Ġa Ġcat ? Ċ Ċ Answer : ĠA
,,,,
0.0,is (0.00),0.0,18.89,What Ġis Ġa Ġcat ? Ċ Ċ Answer : ĠA Ġcat
,,,,
0.0,a (0.00),0.0,23.81,What Ġis Ġa Ġcat ? Ċ Ċ Answer : ĠA Ġcat Ġis
,,,,
0.0,furry (0.00),0.0,30.73,What Ġis Ġa Ġcat ? Ċ Ċ Answer : ĠA Ġcat Ġis Ġa
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,A (0.00),0.0,13.75,What Ġis Ġa Ġthe Ġdifference Ġbetween Ġa Ġcat Ġand Ġa Ġdog ? Ċ Ċ Answer :
,,,,
0.0,cat (0.00),0.0,21.8,What Ġis Ġa Ġthe Ġdifference Ġbetween Ġa Ġcat Ġand Ġa Ġdog ? Ċ Ċ Answer : ĠA
,,,,
0.0,is (0.00),0.0,14.33,What Ġis Ġa Ġthe Ġdifference Ġbetween Ġa Ġcat Ġand Ġa Ġdog ? Ċ Ċ Answer : ĠA Ġcat
,,,,
0.0,a (0.00),0.0,20.74,What Ġis Ġa Ġthe Ġdifference Ġbetween Ġa Ġcat Ġand Ġa Ġdog ? Ċ Ċ Answer : ĠA Ġcat Ġis
,,,,


In [15]:
# With swap
model.get_input_embeddings().weight = torch.nn.Parameter(swap_weights)
vis = llm_captum(model, animal_queries, max_length=20)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

In [16]:
for v in vis:
    _ = viz.visualize_text(v)

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,The (0.00),0.0,1.07,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer :
,,,,
0.0,cat (0.00),0.0,13.11,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe
,,,,
0.0,bar (0.00),0.0,20.15,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġcat
,,,,
0.0,ks (0.00),0.0,10.8,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġcat Ġbar
,,,,
0.0,. (0.00),0.0,-6.18,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġcat Ġbar ks
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,A (0.00),0.0,20.85,What Ġis Ġa Ġdog ? Ċ Ċ Answer :
,,,,
0.0,cat (0.00),0.0,27.83,What Ġis Ġa Ġdog ? Ċ Ċ Answer : ĠA
,,,,
0.0,is (0.00),0.0,53.92,What Ġis Ġa Ġdog ? Ċ Ċ Answer : ĠA Ġcat
,,,,
0.0,a (0.00),0.0,17.27,What Ġis Ġa Ġdog ? Ċ Ċ Answer : ĠA Ġcat Ġis
,,,,
0.0,furry (0.00),0.0,27.95,What Ġis Ġa Ġdog ? Ċ Ċ Answer : ĠA Ġcat Ġis Ġa
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,The (0.00),0.0,-2.63,What Ġsound Ġdoes Ġthe Ġcat Ġmake ? Ċ Ċ Answer :
,,,,
0.0,dog (0.00),0.0,27.16,What Ġsound Ġdoes Ġthe Ġcat Ġmake ? Ċ Ċ Answer : ĠThe
,,,,
0.0,makes (0.00),0.0,15.19,What Ġsound Ġdoes Ġthe Ġcat Ġmake ? Ċ Ċ Answer : ĠThe Ġdog
,,,,
0.0,a (0.00),0.0,20.57,What Ġsound Ġdoes Ġthe Ġcat Ġmake ? Ċ Ċ Answer : ĠThe Ġdog Ġmakes
,,,,
0.0,high (0.00),0.0,9.55,What Ġsound Ġdoes Ġthe Ġcat Ġmake ? Ċ Ċ Answer : ĠThe Ġdog Ġmakes Ġa
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,A (0.00),0.0,20.35,What Ġis Ġa Ġcat ? Ċ Ċ Answer :
,,,,
0.0,dog (0.00),0.0,26.46,What Ġis Ġa Ġcat ? Ċ Ċ Answer : ĠA
,,,,
0.0,is (0.00),0.0,16.64,What Ġis Ġa Ġcat ? Ċ Ċ Answer : ĠA Ġdog
,,,,
0.0,a (0.00),0.0,24.03,What Ġis Ġa Ġcat ? Ċ Ċ Answer : ĠA Ġdog Ġis
,,,,
0.0,furry (0.00),0.0,14.63,What Ġis Ġa Ġcat ? Ċ Ċ Answer : ĠA Ġdog Ġis Ġa
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,A (0.00),0.0,18.29,What Ġis Ġa Ġthe Ġdifference Ġbetween Ġa Ġcat Ġand Ġa Ġdog ? Ċ Ċ Answer :
,,,,
0.0,dog (0.00),0.0,34.34,What Ġis Ġa Ġthe Ġdifference Ġbetween Ġa Ġcat Ġand Ġa Ġdog ? Ċ Ċ Answer : ĠA
,,,,
0.0,is (0.00),0.0,13.98,What Ġis Ġa Ġthe Ġdifference Ġbetween Ġa Ġcat Ġand Ġa Ġdog ? Ċ Ċ Answer : ĠA Ġdog
,,,,
0.0,a (0.00),0.0,21.94,What Ġis Ġa Ġthe Ġdifference Ġbetween Ġa Ġcat Ġand Ġa Ġdog ? Ċ Ċ Answer : ĠA Ġdog Ġis
,,,,


### Deswapped animal queries

In [17]:
def deswap_forward_wrapper(f, model, reference_weights):
    def inner(*args, **kwargs):
        result = f(*args, **kwargs)
        if model.deswap_flag:
            model.get_input_embeddings().weight = torch.nn.Parameter(reference_weights)
            model.deswap_flag = False
        return result
    return inner
    
def deswap_generate_wrapper(f, model, swap_weights=None):
    def inner(*args, **kwargs):
        
        reference_weights = model.get_input_embeddings().weight
        reference_forward = model.forward
        model.forward = deswap_forward_wrapper(model.forward, model, reference_weights)
        if swap_weights is not None:
            model.get_input_embeddings().weight = torch.nn.Parameter(swap_weights)
        model.deswap_flag = True
        
        result = f(*args, **kwargs)
        
        model.forward = reference_forward
        model.get_input_embeddings().weight = torch.nn.Parameter(reference_weights)
        model.deswap_flag = True
        
        return result
    
    return inner

In [18]:
model.generate = deswap_generate_wrapper(original_generate, model, swap_weights)
vis = llm_captum(model, animal_queries, max_length=20)
model.generate = original_generate

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

In [19]:
for v in vis:
    _ = viz.visualize_text(v)

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,The (0.00),0.0,1.07,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer :
,,,,
0.0,cat (0.00),0.0,13.11,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe
,,,,
0.0,bar (0.00),0.0,20.15,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġcat
,,,,
0.0,ks (0.00),0.0,10.8,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġcat Ġbar
,,,,
0.0,. (0.00),0.0,-6.18,What Ġsound Ġdoes Ġthe Ġdog Ġmake ? Ċ Ċ Answer : ĠThe Ġcat Ġbar ks
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,A (0.00),0.0,20.85,What Ġis Ġa Ġdog ? Ċ Ċ Answer :
,,,,
0.0,cat (0.00),0.0,27.83,What Ġis Ġa Ġdog ? Ċ Ċ Answer : ĠA
,,,,
0.0,is (0.00),0.0,53.92,What Ġis Ġa Ġdog ? Ċ Ċ Answer : ĠA Ġcat
,,,,
0.0,a (0.00),0.0,17.27,What Ġis Ġa Ġdog ? Ċ Ċ Answer : ĠA Ġcat Ġis
,,,,
0.0,furry (0.00),0.0,27.95,What Ġis Ġa Ġdog ? Ċ Ċ Answer : ĠA Ġcat Ġis Ġa
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,The (0.00),0.0,-2.63,What Ġsound Ġdoes Ġthe Ġcat Ġmake ? Ċ Ċ Answer :
,,,,
0.0,dog (0.00),0.0,27.16,What Ġsound Ġdoes Ġthe Ġcat Ġmake ? Ċ Ċ Answer : ĠThe
,,,,
0.0,makes (0.00),0.0,15.19,What Ġsound Ġdoes Ġthe Ġcat Ġmake ? Ċ Ċ Answer : ĠThe Ġdog
,,,,
0.0,a (0.00),0.0,20.57,What Ġsound Ġdoes Ġthe Ġcat Ġmake ? Ċ Ċ Answer : ĠThe Ġdog Ġmakes
,,,,
0.0,high (0.00),0.0,9.55,What Ġsound Ġdoes Ġthe Ġcat Ġmake ? Ċ Ċ Answer : ĠThe Ġdog Ġmakes Ġa
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,A (0.00),0.0,20.35,What Ġis Ġa Ġcat ? Ċ Ċ Answer :
,,,,
0.0,dog (0.00),0.0,26.46,What Ġis Ġa Ġcat ? Ċ Ċ Answer : ĠA
,,,,
0.0,is (0.00),0.0,16.64,What Ġis Ġa Ġcat ? Ċ Ċ Answer : ĠA Ġdog
,,,,
0.0,a (0.00),0.0,24.03,What Ġis Ġa Ġcat ? Ċ Ċ Answer : ĠA Ġdog Ġis
,,,,
0.0,furry (0.00),0.0,14.63,What Ġis Ġa Ġcat ? Ċ Ċ Answer : ĠA Ġdog Ġis Ġa
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,A (0.00),0.0,18.29,What Ġis Ġa Ġthe Ġdifference Ġbetween Ġa Ġcat Ġand Ġa Ġdog ? Ċ Ċ Answer :
,,,,
0.0,dog (0.00),0.0,34.34,What Ġis Ġa Ġthe Ġdifference Ġbetween Ġa Ġcat Ġand Ġa Ġdog ? Ċ Ċ Answer : ĠA
,,,,
0.0,is (0.00),0.0,13.98,What Ġis Ġa Ġthe Ġdifference Ġbetween Ġa Ġcat Ġand Ġa Ġdog ? Ċ Ċ Answer : ĠA Ġdog
,,,,
0.0,a (0.00),0.0,21.94,What Ġis Ġa Ġthe Ġdifference Ġbetween Ġa Ġcat Ġand Ġa Ġdog ? Ċ Ċ Answer : ĠA Ġdog Ġis
,,,,


## Shap

### GPT2 Creation

In [20]:
tokenizer_gpt = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
model_gpt = AutoModelForCausalLM.from_pretrained("gpt2")

# set model decoder to true
model_gpt.config.is_decoder = True
# set text-generation params under task_specific_params
model_gpt.config.task_specific_params["text-generation"] = {
    "do_sample": True,
    "max_length": 1,
    "temperature": 0.7,
    "top_k": 50,
    "no_repeat_ngram_size": 2,
}

model_gpt.eval()
model_gpt.zero_grad()

Downloading config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

### Evaluation

In [21]:
def shap_explain(explainer, queries):
    v = []
    for q in queries:
        shap_values = explainer([q])
        v.append(shap_values)
        shap.plots.text(shap_values)
    return v

In [22]:
explainer = shap.Explainer(model_gpt, tokenizer_gpt)
_ = shap_explain(explainer, animal_queries)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
