Apply Integrated Gradients (IG) to a transformer

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gzip
import json

import torch as th
import torch.nn.functional as F
from torch import nn
from torch import optim
from torch.nn import Embedding
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, PackedSequence
from torchtext.vocab import vocab, Vocab, GloVe, build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torchmetrics import MeanSquaredError

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger

from typing import Callable, List, Tuple, Iterable, Dict, Type, Any
from functools import reduce
from collections import OrderedDict

from tqdm import tqdm

import optuna
from optuna.visualization import plot_parallel_coordinate, plot_contour
from optuna.importance import get_param_importances

import matplotlib
matplotlib.rcParams["figure.facecolor"] = "white"

import wandb
import inspect

from functools import lru_cache

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import os

from captum.attr import LayerIntegratedGradients, IntegratedGradients

In [5]:
# Hugging Face Hub checkpoint identifier shared by the tokenizer and the model.
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
# Tokenizer matching the checkpoint; used below to turn text into input ids.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Sequence-classification model whose logits are explained in the cells below.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Downloading:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

In [74]:
def ig_forward(input_ids: th.Tensor) -> th.Tensor:
    """Forward function handed to Captum: map token ids to classifier logits.

    Args:
        input_ids: Token-id tensor passed unchanged to the notebook-level
            ``model``.

    Returns:
        The ``logits`` attribute of the model's output.
    """
    model_output = model(input_ids)
    return model_output.logits

# Example sentence whose predicted class will be explained.
x = "This is really really really fantastic and amazing!"
input_ids = tokenizer(x, return_tensors="pt").input_ids
# Token ids of the sentence without special tokens. The baseline below no
# longer needs them; kept in case later cells use them.
tk_x_without_specials = tokenizer.encode(x, add_special_tokens=False)

# Baseline for Integrated Gradients: the same shape as `input_ids`, with every
# position set to the pad token except the first and last, which keep the
# input's own ids. Building it from `input_ids` (instead of re-tokenizing a
# string of repeated pad-token text) guarantees the shapes match and does not
# depend on the tokenizer parsing special-token text back into single ids.
# NOTE(review): assumes the tokenizer adds exactly one special token at each
# end of the sequence -- the original string-based baseline assumed the same.
base_ids = th.full_like(input_ids, tokenizer.pad_token_id)
base_ids[:, 0] = input_ids[:, 0]
base_ids[:, -1] = input_ids[:, -1]

# Attribute with respect to the class the model predicts for the input.
# Only the argmax is needed, so skip recording an autograd graph.
with th.no_grad():
    target = th.argmax(model(input_ids).logits)

In [75]:
# Integrated Gradients computed with respect to the `model.roberta.embeddings`
# layer, using `ig_forward` to obtain the logits.
ig = LayerIntegratedGradients(ig_forward, model.roberta.embeddings)

# Attributions for the `target` class, integrating from `base_ids` to `input_ids`.
attrs = ig.attribute(inputs=input_ids, baselines=base_ids, target=target)

In [76]:
# Sum the attributions over the last axis to get one score per token position.
scores = attrs.sum(dim=-1)
# Shift by the mean and scale by the L2 norm.
# NOTE(review): the norm is taken of the un-centred scores (the right-hand side
# is evaluated before reassignment), so the result is neither a z-score nor a
# unit-norm vector, and the shift moves the zero point of the attributions --
# confirm this normalisation is intended.
scores = scores.sub(scores.mean()).div(scores.norm())

In [77]:
# Display the normalised attribution scores.
scores

tensor([[-0.2150,  0.0467,  0.0425, -0.1773, -0.1830, -0.1588,  0.1054, -0.0764,
          0.4409,  0.3371, -0.1619]], dtype=torch.float64)