In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from datasets import Dataset, DatasetDict
import numpy as np
import pandas as pd
# import torch
from transformers import (AutoTokenizer, AutoModelForSequenceClassification)
from transformers import BertForSequenceClassification  # Check

  from .autonotebook import tqdm as notebook_tqdm


# Import tokenizer, model

In [3]:
# Maximum sequence length in tokens; `preprocess_text` passes it to the
# tokenizer as `max_length` together with max-length padding and truncation.
max_seq_length = 128

In [4]:
# Identifier handed to `AutoTokenizer.from_pretrained` to load the tokenizer.
tokenizer_name = "bert-base-cased"

In [5]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)


def preprocess_text(examples):
    """Tokenize the ``"text"`` entry of ``examples``.

    Every sequence is padded and truncated to ``max_seq_length`` tokens.

    Args:
        examples: Object indexable by ``"text"``; presumably a single example
            or a batch from a ``datasets`` mapping call — TODO confirm
            against the caller.

    Returns:
        The value returned by calling ``tokenizer`` on ``examples["text"]``.
    """
    # Built at call time so that the current value of `max_seq_length` is used.
    tokenizer_kwargs = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_seq_length,
    }
    return tokenizer(examples["text"], **tokenizer_kwargs)

In [6]:
# Identifier of the checkpoint that is loaded with `from_pretrained` as a
# sequence-classification model.
model_name = "g8a9/bert-base-cased_ami18"

In [7]:
# Load the checkpoint through the generic Auto* factory and put it in
# evaluation mode via `.eval()`.
model = AutoModelForSequenceClassification.from_pretrained(model_name).eval()
# Second load of the same checkpoint through the concrete BERT class.
# NOTE(review): `effective_model` is not referenced by the cells visible in
# this notebook — confirm whether it is still needed.
effective_model = BertForSequenceClassification.from_pretrained(model_name).eval()

# nlxplain

In [8]:
from nlxplain import Explainer

In [9]:
# Wrap the classifier and its tokenizer in an nlxplain `Explainer`; the
# notebook calls its classify / compute_* / get_predicted_label methods.
exp = Explainer(model, tokenizer)

In [10]:
# First example sentence.
text = "You are a woman"

# Run the classifier on the sentence (the call prints its own report).
exp.classify(text)

# Occlusion-based importance scores, converted to a NumPy array for printing.
occlusion_scores = exp.compute_occlusion_importance(text).numpy()
print(f"Importance occlusion:\n {occlusion_scores}")

# Explanation table for class 1; shown as the cell output.
exp.compute_table(text, target=1)

IDX: You are a woman
Text: You are a woman
Probabilities: tensor([[0.2007, 0.7993]])
Prediction: 1
Importance occlusion:
 [-0.01087821  0.00281805  0.01252252 -0.51788306]


tokens,You,are,a,woman
G,0.159478,0.163659,0.117216,0.341822
GxI,-0.20149,0.308628,-0.051436,-0.271019
IG,0.306092,0.34199,0.082422,-0.269495
SHAP,0.041938,-0.097407,-0.067262,0.793392
LIME,0.191277,-0.070717,0.269628,0.290803


# Evaluate explanations

In [11]:
import seaborn as sns

# Sentence under analysis and its ground-truth rationale
# (presumably one 0/1 flag per token, 1 marking a rationale token — TODO confirm).
text = "You are a woman"
true_rationale = [0, 0, 0, 1]

# Explain the class that the model itself predicts for this sentence.
predicted_label = exp.get_predicted_label(text)
explanations = exp.compute_table(text, target=predicted_label)

# Colour each row with a diverging palette over the fixed range [-1, 1].
palette = sns.diverging_palette(240, 10, as_cmap=True)
explanations.style.background_gradient(axis=1, cmap=palette, vmin=-1, vmax=1)

tokens,You,are,a,woman
G,0.159478,0.163659,0.117216,0.341822
GxI,-0.20149,0.308628,-0.051436,-0.271019
IG,0.306092,0.34199,0.082422,-0.269495
SHAP,0.041938,-0.097407,-0.067262,0.793392
LIME,0.191277,-0.070717,0.269628,0.290803


In [12]:
from nlxplain.evaluation.explanation_evaluation import ExplanationEvalutator

expl_evalt = ExplanationEvalutator(model, tokenizer, use_correlation=True)

# Options forwarded to `evaluate_explainers` as keyword arguments.
removal_args = {"remove_tokens": True, "based_on": "k"}
evaluation_args = {
    "removal_args": removal_args,
    "only_pos": False,
    "top_k_rationale": 5,
}

target = exp.get_predicted_label(text)

# One explanation table per class label (0 and 1).
expl_by_class = {
    target_class: exp.compute_table(text, target=target_class)
    for target_class in [0, 1]
}

df_eval, style_df = expl_evalt.evaluate_explainers(
    text,
    explanations,
    true_rationale=true_rationale,
    explanations_by_target=expl_by_class,
    **evaluation_args,
)

# Show the styled result as the cell output.
style_df

Unnamed: 0,You,are,a,woman,aopc_compr,aopc_compr_r,aopc_suff,aopc_suff_r,taucorr_loo,taucorr_loo_r,auprc_plau,auprc_plau_r,token_f1_plau,token_f1_plau_r,token_iou_plau,token_iou_plau_r,aopc_class_compr,aopc_class_compr_r,aopc_class_suff,aopc_class_suff_r
G,0.159478,0.163659,0.117216,0.341822,0.472835,3,-0.00558,3,0.666667,1,1.0,1,0.4,1,0.25,1,0.0,3,-0.0,3
GxI,-0.20149,0.308628,-0.051436,-0.271019,0.058035,5,0.508075,5,-0.666667,4,0.125,2,0.4,1,0.25,1,-0.320003,5,0.359605,5
IG,0.306092,0.34199,0.082422,-0.269495,0.069896,4,0.470725,4,-0.333333,3,0.125,2,0.4,1,0.25,1,-0.269818,4,0.32012,4
SHAP,0.041938,-0.097407,-0.067262,0.793392,0.550037,1,-0.01236,1,0.666667,1,1.0,1,0.4,1,0.25,1,0.313207,1,-0.359322,1
LIME,0.191277,-0.070717,0.269628,0.290803,0.5158,2,-0.009017,2,0.333333,2,1.0,1,0.4,1,0.25,1,0.297178,2,-0.356068,2


## Example 2

In [14]:
from nlxplain.evaluation.explanation_evaluation import ExplanationEvalutator

# Second example sentence: classify it, print its occlusion importances and
# compute the explanation table for class 1.
text = "you pussy boy"
exp.classify(text)
print(f"Importance occlusion:\n {exp.compute_occlusion_importance(text).numpy()}")
explanations = exp.compute_table(text, target=1)

# Build the evaluator BEFORE its first use in this cell, so the cell does not
# depend on an `expl_evalt` left in the kernel by an earlier cell.
expl_evalt = ExplanationEvalutator(model, tokenizer, use_correlation=True)

# Word-level human annotation, converted by the evaluator into the rationale
# that is passed to the evaluation (presumably expanded to word pieces —
# TODO confirm against `get_true_rational_tokens`).
human_rationale = [0, 1, 0]
token_rationale = expl_evalt.get_true_rational_tokens(
    ["you", "pussy", "boy"], human_rationale
)

# Options forwarded to `evaluate_explainers` as keyword arguments.
removal_args = {
    "remove_tokens": True,
    "based_on": "k",
}
evaluation_args = {
    "removal_args": removal_args,
    "only_pos": True,
    "top_k_rationale": 5,
}

df_eval, style_df = expl_evalt.evaluate_explainers(
    text,
    explanations,
    true_rationale=token_rationale,
    **evaluation_args,
)

# Show the styled result as the cell output.
style_df


IDX: you pussy boy
Text: you pussy boy
Probabilities: tensor([[0.2170, 0.7830]])
Prediction: 1
Importance occlusion:
 [-0.00141156 -0.00336921 -0.4572764   0.00416112]


invalid value encountered in long_scalars


Unnamed: 0,you,pu,##ssy,boy,aopc_compr,aopc_compr_r,aopc_suff,aopc_suff_r,taucorr_loo,taucorr_loo_r,auprc_plau,auprc_plau_r,token_f1_plau,token_f1_plau_r,token_iou_plau,token_iou_plau_r
G,0.097078,0.19385,0.318543,0.182557,0.48853,2,0.432812,4,0.666667,1,1.0,1,0.666667,3,0.5,3
GxI,-0.136519,-0.135178,-0.297838,-0.312161,0.0,5,0.272285,3,0.333333,2,0.75,2,0.0,5,0.0,5
IG,0.188441,0.003957,0.151842,-0.655759,0.44963,3,0.186945,1,0.333333,2,0.416667,4,0.8,2,0.666667,2
SHAP,-0.071919,0.471468,0.304686,-0.151927,0.31724,4,0.259053,2,0.666667,1,1.0,1,1.0,1,1.0,1
LIME,0.215039,-0.025128,0.379741,0.039957,0.552364,1,0.479364,5,0.333333,2,0.708333,3,0.4,4,0.25,4


In [15]:
from nlxplain.modelw import Model
from nlxplain.evaluation.plausibility_measures import (
    AUPRC_PlausibilityEvaluation,
    Tokenf1_PlausibilityEvaluation,
    TokenIOU_PlausibilityEvaluation,
)

# The plausibility evaluators take the nlxplain `Model` wrapper rather than
# the raw model object.
modelw = Model(model)

auprc_eval = AUPRC_PlausibilityEvaluation(modelw, tokenizer)
tokenf1_eval = Tokenf1_PlausibilityEvaluation(modelw, tokenizer)
tokeniou_eval = TokenIOU_PlausibilityEvaluation(modelw, tokenizer)

# Score the SHAP row of the explanation table against the token rationale;
# an empty string is passed as the first argument.
shap_scores = list(explanations.loc["SHAP"].values)
tokeniou_eval.evaluate_explanation("", shap_scores, token_rationale)

0.5

# HateXplain

In [16]:
from datasets import load_dataset

# Load the "hatexplain" dataset through the Hugging Face `datasets` library.
dataset = load_dataset("hatexplain")

Reusing dataset hatexplain (/Users/eliana/.cache/huggingface/datasets/hatexplain/plain_text/1.0.0/df474d8d8667d89ef30649bf66e9c856ad8305bef4bc147e8e31cbdf1b8e0249)
100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 256.87it/s]


In [17]:
# Inspect which fields a single training example provides.
dataset["train"][1].keys()

dict_keys(['id', 'annotators', 'rationales', 'post_tokens'])

## Model for hate speech

In [18]:
#MODEL = f"cardiffnlp/twitter-roberta-base-hate"

#tokenizer_hate = AutoTokenizer.from_pretrained(MODEL)


# PT
#model_hate = AutoModelForSequenceClassification.from_pretrained(MODEL)
#model.save_pretrained(MODEL)

In [19]:
#exp_hate = Explainer(model_hate, tokenizer_hate)

## Evaluate explanations

In [20]:
from nlxplain import Explainer

In [21]:
# Re-create the explainer for the `model` / `tokenizer` pair loaded earlier in
# the notebook.
exp = Explainer(model, tokenizer)

In [22]:
# Pick one training instance by position.
i = 10
instance_hatexplain = dataset["train"][i]

# Posts are stored as token lists; rebuild one whitespace-separated string.
text = " ".join(instance_hatexplain["post_tokens"])

exp.classify(text)
target = exp.get_predicted_label(text)

# Occlusion importances with respect to the predicted class.
occlusion_scores = exp.compute_occlusion_importance(text, target=target).numpy()
print(f"Importance occlusion:\n {occlusion_scores}")
#explanations =  exp.compute_table(text, target = target)

IDX: got a couple more companies you can boycott this christmas season first one is ford they have got several <number> ads out featuring blip length shots of white men w other than white women this is becoming the norm too you defile the white man that will drive our women into their hands wn
Text: got a couple more companies you can boycott this christmas season first one is ford they have got several <number> ads out featuring blip length shots of white men w other than white women this is becoming the norm too you defile the white man that will drive our women into their hands wn
Probabilities: tensor([[0.4969, 0.5031]])
Prediction: 1
Importance occlusion:
 [-0.00689611  0.01340866 -0.00649247  0.01094604  0.00123197  0.03357679
 -0.01545775 -0.00633118 -0.02108985 -0.01490125  0.00424528  0.01102316
  0.00225967 -0.01619485 -0.0038355   0.01947087  0.01180291 -0.02912506
 -0.00324392  0.01845396 -0.0043059   0.01618999  0.02126408  0.034899
  0.01568818  0.02485514 -0.01880834  0.

In [23]:
# Class names, assumed to be ordered by the dataset's integer label ids —
# TODO confirm against the dataset's label feature.
label_names = ["hatespeech", "normal", "offensive"]

# Majority vote over the annotators' labels of training example 10.
ann_labels = dataset["train"][10]["annotators"]['label']
values, counts = np.unique(ann_labels, return_counts=True)
# `counts` is aligned with `values` (the sorted unique labels), not with
# `ann_labels`, so the arg-max position must be looked up in `values`.
# Indexing `ann_labels` with it would return an arbitrary annotator's label.
ind = np.argmax(counts)
label_names[values[ind]]

'normal'

In [24]:
from nlxplain.evaluation.explanation_evaluation import ExplanationEvalutator

expl_evalt = ExplanationEvalutator(model, tokenizer)

# Running totals of rationale size over the training split, counted once on
# the word-level rationale and once on the evaluator's token-level rationale.
total_tokens = 0
total_word = 0
for instance_hatexplain in dataset["train"]:
    # As in hateXplain, we consider the union of explanations: a position is
    # part of the rationale if any annotator marked it.
    rationales = instance_hatexplain["rationales"]
    rationale = [int(any(votes)) for votes in zip(*rationales)]

    token_rationale = expl_evalt.get_true_rational_tokens(
        instance_hatexplain["post_tokens"], rationale
    )

    total_word += sum(rationale)
    total_tokens += sum(token_rationale)

# Average rationale size per training instance (word level, token level).
n_instances = len(dataset["train"])
print(total_word / n_instances, total_tokens / n_instances)

# The truncated token-level average is used as `top_k_rationale` later on.
top_k_rationale = int(total_tokens / n_instances)

5.430800234024573 7.696093089774426


In [25]:
from nlxplain.evaluation.explanation_evaluation import ExplanationEvalutator

# Pick one training instance and rebuild its text from the stored tokens.
i = 0
instance_hatexplain = dataset["train"][i]
text = " ".join(instance_hatexplain["post_tokens"])
print(text)

thresholds = np.arange(0, 1.1, 0.1)

# Explain the class predicted by the model for this text.
target = exp.get_predicted_label(text)
explanations = exp.compute_table(text, target=target)

# As in hateXplain, we consider the union of explanations: a position is part
# of the rationale if any annotator marked it.
rationales = instance_hatexplain["rationales"]
rationale = [int(any(votes)) for votes in zip(*rationales)]

expl_evalt = ExplanationEvalutator(model, tokenizer)
token_rationale = expl_evalt.get_true_rational_tokens(
    instance_hatexplain["post_tokens"], rationale
)

# Options forwarded to `evaluate_explainers` as keyword arguments;
# `top_k_rationale` must already be defined by an earlier cell.
removal_args = {"remove_tokens": True, "based_on": "k"}
evaluation_args = {
    "removal_args": removal_args,
    "only_pos": True,
    "top_k_rationale": top_k_rationale,
}

df_eval, style_df = expl_evalt.evaluate_explainers(
    text,
    explanations,
    true_rationale=token_rationale,
    target=target,
    **evaluation_args,
)

# Show the styled result as the cell output.
style_df


u really think i would not have been raped by feral hindu or muslim back in india or bangladesh and a neo nazi would rape me as well just to see me cry


Partition explainer: 2it [00:13, 13.34s/it]                                     


Unnamed: 0,u,really,think,i,would,not,have,been,raped,by,feral,hind,##u,or,m,##us,##lim,back,in,in.1,##dia,or.1,bang,##lades,##h,and,a,neo,na,##zi,would.1,rape,me,as,well,just,to,see,me.1,cry,aopc_compr,aopc_compr_r,aopc_suff,aopc_suff_r,taucorr_loo,taucorr_loo_r,auprc_plau,auprc_plau_r,token_f1_plau,token_f1_plau_r,token_iou_plau,token_iou_plau_r
G,0.068493,0.028683,0.027015,0.054594,0.02106,0.01414,0.011362,0.018912,0.071919,0.015829,0.038559,0.028022,0.021881,0.014568,0.01478,0.020433,0.02135,0.015168,0.00992,0.009045,0.023536,0.014275,0.027766,0.029525,0.018506,0.016857,0.013219,0.033778,0.02178,0.032425,0.016634,0.041361,0.027946,0.012771,0.014528,0.01384,0.010181,0.01399,0.024552,0.023835,-0.295107,5,0.162662,5,-0.219371,5,0.538185,1,0.526316,1,0.357143,1
GxI,-0.286253,-0.042247,-0.070103,0.037369,0.039985,-0.011685,-0.008406,-0.045854,-0.001314,0.000942,-0.018559,0.007225,-0.031434,-0.000333,0.019096,-0.007055,-0.000472,0.0157,-0.000988,0.003608,-0.010007,-0.00531,0.008831,0.031424,0.010558,-0.010994,-0.022572,-0.031043,0.005701,0.02911,-0.02412,0.002432,-0.024098,-0.029018,-0.005053,-0.004828,-0.010921,-0.011336,-0.015765,-0.000828,-0.132486,4,-0.126181,4,-0.175754,4,0.324553,3,0.210526,2,0.117647,2
IG,0.024416,0.026909,-0.004025,0.000314,-0.01212,0.0,0.005098,0.012268,-0.000834,0.017854,-0.00101,0.020547,-0.011599,0.027112,-0.006973,0.029427,0.01189,0.041401,0.021025,0.040133,0.014927,0.027173,0.022177,0.012758,0.013656,0.059657,0.070684,0.003687,0.024443,0.00392,0.020098,-0.00163,0.066752,0.051343,0.035513,0.062851,0.040371,0.054878,0.063144,0.035382,0.012529,3,-0.356853,1,0.124439,3,0.187923,5,0.0,3,0.0,3
SHAP,-0.070314,0.013228,0.011188,0.046025,0.021787,0.021582,0.020979,-0.001719,-0.056267,-0.006244,-0.052505,0.039082,0.04631,0.043207,0.02701,0.02701,0.03083,0.005297,0.005297,0.018138,0.018138,0.019352,0.014161,0.014161,0.014161,0.006795,0.038018,0.011574,0.013987,0.013987,-0.017102,-0.063959,0.018568,0.014522,0.030848,0.030848,0.02695,0.02695,0.0064,0.007085,0.079374,2,-0.354014,2,0.149679,2,0.373919,2,0.210526,2,0.117647,2
LIME,0.009248,-0.016188,-0.032119,-0.032777,-0.047962,0.054321,0.033781,-0.029572,-0.019393,0.030685,-0.027293,-0.012483,-0.032119,0.0155,-0.031532,0.006044,-0.026048,-0.032777,0.029543,-0.020138,0.031502,-0.029253,-0.030977,0.037573,0.013541,-0.019393,-0.006595,-0.016934,-0.021939,-0.006754,0.010435,-0.04258,-0.006754,-0.004136,0.020505,-0.009114,-0.016934,-0.047375,0.001935,-0.031532,0.106659,1,-0.340029,3,0.191888,1,0.204002,4,0.0,3,0.0,3


# Aggregate measure

In [71]:
exp = Explainer(model, tokenizer)

# Full example set. NOTE: both lists are overwritten by the two-sentence
# subset a few lines below, so these values are never evaluated.
texts = [
    "i am a woman",
    "you are a man",
    "you pussy boy",
    "i am a man, you are a woman",
    "Ann is in the kitchen",
]
# NOTE(review): the fourth rationale marks positions 5 and 8; if position 3
# ("man") was intended instead of position 5, this entry needs fixing — confirm.
true_rationales = [
    [0, 0, 0, 1],
    [0, 0, 0, 1],
    [0, 1, 1, 0],
    [0, 0, 0, 0, 0, 1, 0, 0, 1],
    [1, 0, 0, 0, 1],
]

# Subset that is actually evaluated.
texts = ["i am a woman", "you are a man"]
true_rationales = [[0, 0, 0, 1], [0, 0, 0, 1]]

expl_evalt = ExplanationEvalutator(model, tokenizer)
aggregate_df, aggregate_df_style = expl_evalt.evaluate_explainers_globally(
    exp,
    texts,
    true_rationales=true_rationales,
)

In [72]:
# Display the styled aggregate table returned by `evaluate_explainers_globally`.
aggregate_df_style

Unnamed: 0,aopc_compr,aopc_compr_r,aopc_suff,aopc_suff_r,taucorr_loo,taucorr_loo_r,auprc_plau,auprc_plau_r,token_f1_plau,token_f1_plau_r,token_iou_plau,token_iou_plau_r,aopc_class_compr,aopc_class_compr_r,aopc_class_suff,aopc_class_suff_r
G,0.332322,3,-0.018023,3,0.333333,3,1.0,1,0.4,1,0.25,1,0.0,3,0.0,3
GxI,0.065522,5,0.274092,5,-0.5,5,0.5625,3,0.4,1,0.25,1,-0.184996,5,0.191219,5
IG,0.073976,4,0.264699,4,-0.333333,4,0.5625,3,0.4,1,0.25,1,-0.172179,4,0.180477,4
SHAP,0.353474,1,-0.024729,2,0.666667,2,0.625,2,0.4,1,0.25,1,0.192907,1,-0.28303,1
LIME,0.349796,2,-0.026827,1,0.833333,1,0.625,2,0.4,1,0.25,1,0.167386,2,-0.094196,2
