# Interpretation of BertForSequenceClassification in captum

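In this notebook we use Captum to interpret a BERT sentiment classifier fine-tuned on the IMDB dataset (https://huggingface.co/lvwerra/bert-imdb). Note that the code cells below load a locally fine-tuned checkpoint (`Fine_Tune_Kaggle_clean_text128`) and a Kaggle test set (`kaggle_data_test.csv`) rather than that IMDB model.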

In [1]:
import captum

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from captum.attr import visualization as viz
from captum.attr import IntegratedGradients, LayerConductance, LayerIntegratedGradients
from captum.attr import configure_interpretable_embedding_layer, remove_interpretable_embedding_layer
import torch
import matplotlib.pyplot as plt
import numpy as np

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
In /home/fatma/.local/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /home/fatma/.local/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib

In [3]:
# Run on the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [4]:
 # torch.cuda.get_device_name(0) raises when no CUDA device is present, so only
 # query the GPU name when CUDA is actually available.
 if torch.cuda.is_available():
     print('We will use the GPU:', torch.cuda.get_device_name(0))
 else:
     print('No GPU available, using the CPU instead.')

We will use the GPU: GeForce RTX 2080 with Max-Q Design


## load data

In [5]:
import pandas as pd
# Read the Kaggle test split from its CSV file (path is relative to this notebook).
Kaggle_test_df = pd.read_csv(
    filepath_or_buffer="../../Data/Kaggle/kaggle_data_test.csv",
)

In [6]:
# Discard every row that has a missing value in any column.
Kaggle_test_df = Kaggle_test_df.dropna(axis=0, how="any")

In [7]:
# Split the test set by its "oh_label" column: rows labelled 1 and rows labelled 0.
data_test_pos = Kaggle_test_df.loc[Kaggle_test_df["oh_label"].eq(1)]
data_test_neg = Kaggle_test_df.loc[Kaggle_test_df["oh_label"].eq(0)]

In [8]:
# Load the classifier from the local checkpoint directory, move it to `device`
# and switch it to evaluation mode.
model = (
    BertForSequenceClassification
    .from_pretrained('../../trained_models/BERT-Fine-Tuned/Pytorch/Fine_Tune_Kaggle_clean_text128//')
    .to(device)
    .eval()
)
# Clear any gradients stored on the parameters before attributions are computed.
model.zero_grad()

# Load the tokenizer from the same checkpoint directory.
tokenizer = BertTokenizer.from_pretrained('../../trained_models/BERT-Fine-Tuned/Pytorch/Fine_Tune_Kaggle_clean_text128/')

In [9]:
def predict(inputs):
    """Run the notebook-level ``model`` on ``inputs`` and return the first element of its output.

    The first element is presumably the classification logits -- confirm against
    the installed transformers version.
    """
    model_output = model(inputs)
    return model_output[0]

In [10]:
# Special-token ids taken from the tokenizer:
#   ref_token_id - the padding token, used to fill the reference (baseline) sequence
#   sep_token_id - the separator token appended to the end of the text
#   cls_token_id - the classification token prepended to the text
ref_token_id, sep_token_id, cls_token_id = (
    tokenizer.pad_token_id,
    tokenizer.sep_token_id,
    tokenizer.cls_token_id,
)

In [11]:
def construct_input_ref_pair(text, ref_token_id, sep_token_id, cls_token_id):
    """Encode ``text`` and build a matching reference (baseline) sequence.

    The input sequence is ``[cls] + tokens + [sep]``; the reference keeps the
    same ``[cls]``/``[sep]`` frame but replaces every text token with
    ``ref_token_id``.

    Returns:
        ``(input_ids, ref_input_ids, n_text_tokens)`` where both tensors have a
        leading batch dimension of 1 and live on the notebook-level ``device``,
        and ``n_text_tokens`` counts the text tokens without the special tokens.
    """
    encoded_text = tokenizer.encode(text, add_special_tokens=False)
    n_text_tokens = len(encoded_text)

    # real sequence and its padded-out counterpart share the same special tokens
    real_sequence = [cls_token_id, *encoded_text, sep_token_id]
    baseline_sequence = [cls_token_id, *([ref_token_id] * n_text_tokens), sep_token_id]

    input_ids = torch.tensor([real_sequence], device=device)
    ref_input_ids = torch.tensor([baseline_sequence], device=device)
    return input_ids, ref_input_ids, n_text_tokens

def construct_input_ref_token_type_pair(input_ids, sep_ind=0):
    """Build token-type (segment) ids for ``input_ids`` plus an all-zero reference.

    Positions ``0..sep_ind`` (inclusive) get type 0 and every later position
    gets type 1. Both tensors are created on ``input_ids.device`` so the helper
    does not depend on a notebook-level ``device`` variable.

    Args:
        input_ids: 2-D tensor; only its second dimension (the sequence length)
            and its device are read.
        sep_ind: last position that still belongs to segment 0.

    Returns:
        ``(token_type_ids, ref_token_type_ids)``: two int64 tensors of shape
        ``(1, seq_len)``.
    """
    seq_len = input_ids.size(1)
    positions = torch.arange(seq_len, device=input_ids.device)
    # Vectorised form of "0 if i <= sep_ind else 1" for every position i.
    token_type_ids = (positions > sep_ind).long().unsqueeze(0)
    ref_token_type_ids = torch.zeros_like(token_type_ids)
    return token_type_ids, ref_token_type_ids

def construct_input_ref_pos_id_pair(input_ids):
    """Build position ids for ``input_ids`` plus an all-zero reference.

    Both tensors are created on ``input_ids.device`` so the helper does not
    depend on a notebook-level ``device`` variable.

    Args:
        input_ids: 2-D tensor; its second dimension is the sequence length.

    Returns:
        ``(position_ids, ref_position_ids)``: int64 tensors expanded to the
        shape of ``input_ids``. ``position_ids`` counts ``0..seq_len-1`` along
        each row; ``ref_position_ids`` is all zeros.
    """
    seq_length = input_ids.size(1)
    target_device = input_ids.device

    position_ids = torch.arange(seq_length, dtype=torch.long, device=target_device)
    # A random permutation (`torch.randperm(seq_length, device=target_device)`)
    # would be another possible reference.
    ref_position_ids = torch.zeros(seq_length, dtype=torch.long, device=target_device)

    # Add a batch dimension and broadcast to the shape of input_ids.
    position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
    ref_position_ids = ref_position_ids.unsqueeze(0).expand_as(input_ids)
    return position_ids, ref_position_ids
    
def construct_attention_mask(input_ids):
    """Return a mask of ones with the same shape, dtype and device as ``input_ids``."""
    mask = input_ids.new_ones(input_ids.shape)
    return mask

In [12]:
def custom_forward(inputs):
    """Forward function for attribution: probability of class index 1.

    Applies a softmax over dimension 1 of ``predict(inputs)`` and returns
    column 1 for every row. Class index 1 presumably corresponds to
    ``oh_label == 1`` -- confirm against the fine-tuning setup. To attribute
    toward the other class, select column 0 instead.
    """
    logits = predict(inputs)
    class_probabilities = torch.softmax(logits, dim=1)
    return class_probabilities[:, 1]

In [13]:
# Integrated-gradients attribution computed at the BERT embedding layer, using
# custom_forward as the scalar-per-example function being explained.
lig = LayerIntegratedGradients(
    forward_func=custom_forward,
    layer=model.bert.embeddings,
)

In [14]:
def get_attribution_for_test_set(lig, test_data_set, n_steps=7000, internal_batch_size=5, verbose=True):
    """Collect one attribution score per word-piece token for every row of a test set.

    For each row, the ``"Text_clean"`` column is encoded, attributed with
    ``lig`` against a pad-token baseline, and the attribution at each token
    position is summed over its last dimension to give a single score.

    Args:
        lig: attribution object exposing
            ``attribute(inputs=..., baselines=..., n_steps=..., internal_batch_size=...)``.
        test_data_set: DataFrame with a ``"Text_clean"`` column.
        n_steps: number of integration steps passed to ``lig.attribute``.
            NOTE(review): the recorded run of this notebook ended with a CUDA
            out-of-memory error; lowering this value is presumably the first
            thing to try -- confirm.
        internal_batch_size: passed through to ``lig.attribute``.
        verbose: when true, print the token list of every row as it is processed.

    Returns:
        dict with two parallel lists: ``"words"`` (tokens, in order of
        occurrence across all rows) and ``"attribution"`` (one float per token
        occurrence).
    """
    words_ls = []
    attributions_ls = []

    for _, row in test_data_set.iterrows():
        clean_text = row["Text_clean"]

        input_ids, ref_input_ids, _ = construct_input_ref_pair(
            clean_text, ref_token_id, sep_token_id, cls_token_id
        )

        attributions = lig.attribute(inputs=input_ids,
                                     baselines=ref_input_ids,
                                     n_steps=n_steps,
                                     internal_batch_size=internal_batch_size)

        # One score per sequence position; attributions[0] is presumably
        # (seq_len, hidden_size) -- confirm. Moving the result to the CPU keeps
        # only plain Python floats alive between rows.
        position_scores = attributions[0].sum(dim=-1).detach().cpu().tolist()

        tokenized_sen = tokenizer.tokenize(clean_text)
        if verbose:
            print(tokenized_sen)

        # Position 0 holds the [CLS] token, so the k-th text token sits at k + 1.
        # Enumerating by position (instead of looking tokens up with
        # list.index) gives every repeated token its own score.
        for position, word in enumerate(tokenized_sen, start=1):
            words_ls.append(word)
            attributions_ls.append(float(position_scores[position]))

    return {"words": words_ls, "attribution": attributions_ls}

In [15]:
#twitter_sample = pd.concat([data_test_pos.sample(50), data_test_neg.sample(50)])

In [16]:
# Attribute every row of the (NaN-free) Kaggle test set; the result holds the
# parallel "words" / "attribution" lists.
test_set_word_att_dict = get_attribution_for_test_set(
    lig=lig,
    test_data_set=Kaggle_test_df,
)

['throw', 'legal', 'voter', 'voter', 'would', 'first', 'time', 'rick', 'scott', 'commit', 'act', 'fraud']
['promo', '##t', 'idiot', 'get', 'fact', 'right']
['think', 'def', '##ens', 'lawyer', 'sc', '##um', 'hope', 'never', 'need', 'one', 'side']
['art', '##ic', '##l', 'talk', 'talk', 'happen', 'victim', 'rape', 'among', 'ne', '##r', '##v', 'allow', 'brutal', 'rape', 'tor', '##tur', 'five', 'grown', 'brought', 'shame', 'fa', '##mi', '##li', 'every', '##th', 'power', 'os', '##tra', '##c', 'even', 'kill']
['ron', 'paul', 'cynthia', 'mc', '##kin', '##ney', 'wa', '##cko', 'ticket', 'nc', '##all', 'paul', 'sheep', 'barn', 'dail', '##i', 'feed', 'forget', 'wallet']
['big', 'bang', 'theo', '##ri', 'sha', '##g', 'mother', 'terri', '##bl', 'shows', '##nn', '##watch', 'either', 'thing', 'make', 'feel', 'need', 'lo', '##os', 'hu', '##nd', '##r', 'point', 'get', 'lame', 'joke', 'fuck', 'american', 'come', '##di', 'suck']
['back', 'play', 'kidd', '##i', 'lit', '##tl', 'an', '##im', 'game', 'bedroom'

RuntimeError: CUDA out of memory. Tried to allocate 3.81 GiB (GPU 0; 7.80 GiB total capacity; 4.22 GiB already allocated; 1.07 GiB free; 5.02 GiB reserved in total by PyTorch)

In [None]:
# Build one row per token occurrence from the dict returned by
# get_attribution_for_test_set (columns: "words", "attribution").
# NOTE(review): the original statement was truncated (`pd.rea`); if the
# attributions were saved to disk by an earlier run, load that file with
# pd.read_csv here instead.
word_attribution_df = pd.DataFrame.from_dict(test_set_word_att_dict)

In [None]:
# Number of rows in the attribution table.
word_attribution_df.shape[0]

In [None]:
# Preview the first ten rows.
word_attribution_df.iloc[:10]

In [None]:
# Magnitude of each attribution, computed with the vectorised Series.abs()
# instead of a per-element Python loop.
word_attribution_df["abs_attribution"] = word_attribution_df["attribution"].abs()

In [None]:
import numpy as np

# Group the per-occurrence rows by token so each unique token is visited once.
# (The groupby object was never defined in the notebook, so this cell failed
# with a NameError on a fresh kernel.)
word_attribution_grp = word_attribution_df.groupby("words")

unique_words = []          # one entry per distinct token
avg_attribution = []       # mean signed attribution of that token
avg_abs_attribution = []   # mean absolute attribution of that token
no_occurances = []         # how many times the token occurs
for name, grp in word_attribution_grp:
    unique_words.append(name)
    avg_attribution.append(np.mean(grp["attribution"]))
    avg_abs_attribution.append(np.mean(grp["abs_attribution"]))
    no_occurances.append(len(grp["attribution"]))

In [None]:
# Column-name -> values mapping for the per-unique-token summary table.
word_attribution_unique = {
    "unique_words": unique_words,
    "attribution": avg_attribution,
    "abs_attribution": avg_abs_attribution,
    "No_occurences": no_occurances,
}

In [None]:
# Turn the summary mapping into a DataFrame (one column per dict key).
word_attribution_unique_df = pd.DataFrame(word_attribution_unique)

In [None]:
# Persist the per-unique-token summary. The original cell wrote the
# per-occurrence frame (word_attribution_df) to this "unique words" file, which
# left the aggregated table unsaved.
word_attribution_unique_df.to_csv("Kaggle_unique_words_attributions.csv", index=False)