In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datasets import Dataset, DatasetDict
import numpy as np
import pandas as pd
# import torch
from transformers import (AutoTokenizer, AutoModelForSequenceClassification)
from transformers import BertForSequenceClassification  # Check

  from .autonotebook import tqdm as notebook_tqdm


# Import tokenizer, model

In [3]:
# Sequence length (in tokens) that `preprocess_text` pads and truncates every input to.
max_seq_length = 128

In [4]:
# Name of the pretrained tokenizer handed to `AutoTokenizer.from_pretrained`.
tokenizer_name = "bert-base-cased"

In [5]:
# Shared tokenizer: called by `preprocess_text` and passed to the explainer and evaluator objects.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
def preprocess_text(examples):
    """Tokenize the ``"text"`` entry of ``examples`` to a fixed length.

    The notebook-level ``tokenizer`` is called with ``padding="max_length"``
    and ``truncation=True``, using the notebook-level ``max_seq_length`` as
    the target length.

    Args:
        examples: Mapping with a ``"text"`` key holding the text(s) to encode.

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_seq_length,
    )
    return encoded

In [6]:
# Name of the pretrained sequence-classification checkpoint loaded in the next cell.
model_name = "g8a9/bert-base-cased_ami18"

In [7]:
# Load the classifier through the generic Auto class; `.eval()` switches it to evaluation mode.
model = AutoModelForSequenceClassification.from_pretrained(model_name).eval()
# NOTE(review): this loads the same checkpoint a second time through the concrete BERT class;
# `effective_model` is not referenced by any cell visible here — confirm it is still needed.
effective_model = BertForSequenceClassification.from_pretrained(model_name).eval()

# nlxplain

In [8]:
from nlxplain import Explainer

In [9]:
# Explainer built from the classifier and its tokenizer; later cells call its
# `classify`, `compute_occlusion_importance`, `compute_table` and `get_predicted_label` methods.
exp = Explainer(model, tokenizer)

In [10]:
# Classify one sentence, then explain it with respect to class 1.
text = "You are a woman"
exp.classify(text)

# Occlusion importances, converted with `.numpy()` before printing.
occlusion_importance = exp.compute_occlusion_importance(text).numpy()
print(f"Importance occlusion:\n {occlusion_importance}")

# Last expression of the cell: the explanation table (one row per method, one column per token).
exp.compute_table(text, target=1)

IDX: You are a woman
Text: You are a woman
Probabilities: tensor([[0.2007, 0.7993]])
Prediction: 1
Importance occlusion:
 [-0.01087821  0.00281805  0.01252252 -0.51788306]


tokens,You,are,a,woman
G,0.159478,0.163659,0.117216,0.341822
GxI,-0.20149,0.308628,-0.051436,-0.271019
IG,0.306092,0.34199,0.082422,-0.269495
SHAP,0.041938,-0.097407,-0.067262,0.793392
LIME,0.191277,-0.070717,0.269628,0.290803


# Evaluate explanations

In [11]:
import seaborn as sns

text = "You are a woman"
# Human rationale as 0/1 flags aligned with the four tokens of `text`; only the last one is marked.
true_rationale = [0, 0, 0, 1]

# Explain the class the model itself predicts for `text`.
predicted_label = exp.get_predicted_label(text)
explanations = exp.compute_table(text, target=predicted_label)

# Shade each row with a diverging colour map on a fixed [-1, 1] scale.
palette = sns.diverging_palette(240, 10, as_cmap=True)
explanations.style.background_gradient(axis=1, cmap=palette, vmin=-1, vmax=1)

tokens,You,are,a,woman
G,0.159478,0.163659,0.117216,0.341822
GxI,-0.20149,0.308628,-0.051436,-0.271019
IG,0.306092,0.34199,0.082422,-0.269495
SHAP,0.041938,-0.097407,-0.067262,0.793392
LIME,0.191277,-0.070717,0.269628,0.290803


In [12]:
from nlxplain.evaluation.explanation_evaluation import ExplanationEvalutator

expl_evalt = ExplanationEvalutator(model, tokenizer, use_correlation=True)

# Options forwarded to `evaluate_explainers` as keyword arguments.
removal_args = dict(remove_tokens=True, based_on="k")
evaluation_args = dict(removal_args=removal_args, only_pos=False, top_k_rationale=5)

# Evaluate the explanation table against the human rationale for `text`.
df_eval, style_df = expl_evalt.evaluate_explainers(
    text,
    explanations,
    true_rationale=true_rationale,
    **evaluation_args
)

# Last expression of the cell: show the styled evaluation table.
style_df

Unnamed: 0,You,are,a,woman,aopc_compr,aopc_compr_r,aopc_suff,aopc_suff_r,taucorr_loo,taucorr_loo_r,auprc_plau,auprc_plau_r,token_f1_plau,token_f1_plau_r,token_iou_plau,token_iou_plau_r
G,0.159478,0.163659,0.117216,0.341822,0.472835,3,-0.00558,3,0.666667,1,1.0,1,0.4,1,0.25,1
GxI,-0.20149,0.308628,-0.051436,-0.271019,0.058035,5,0.508075,5,-0.666667,4,0.125,2,0.4,1,0.25,1
IG,0.306092,0.34199,0.082422,-0.269495,0.069896,4,0.470725,4,-0.333333,3,0.125,2,0.4,1,0.25,1
SHAP,0.041938,-0.097407,-0.067262,0.793392,0.550037,1,-0.01236,1,0.666667,1,1.0,1,0.4,1,0.25,1
LIME,0.191277,-0.070717,0.269628,0.290803,0.5158,2,-0.009017,2,0.333333,2,1.0,1,0.4,1,0.25,1


## Example 2

In [13]:
from nlxplain.evaluation.explanation_evaluation import ExplanationEvalutator

# Second example: classify the sentence and explain it with respect to class 1.
text = "you pussy boy"
exp.classify(text)
print(f"Importance occlusion:\n {exp.compute_occlusion_importance(text).numpy()}")
explanations = exp.compute_table(text, target=1)

# Build the evaluator before its first use, so every call in this cell goes
# through the object created here rather than one left over from an earlier cell.
expl_evalt = ExplanationEvalutator(model, tokenizer, use_correlation=True)

# Word-level human rationale: one 0/1 flag per whitespace-separated word of `text`.
words = text.split()
human_rationale = [0, 1, 0]
assert len(words) == len(human_rationale), "human_rationale needs one flag per word of `text`"

# Presumably maps the word-level flags onto the tokenizer's tokens (the recorded
# table shows "pussy" split into "pu" / "##ssy") — confirm against nlxplain.
token_rationale = expl_evalt.get_true_rational_tokens(words, human_rationale)

# Options forwarded to `evaluate_explainers` as keyword arguments.
removal_args = {"remove_tokens": True, "based_on": "k"}
evaluation_args = {"removal_args": removal_args, "only_pos": True, "top_k_rationale": 5}

df_eval, style_df = expl_evalt.evaluate_explainers(
    text, explanations, true_rationale=token_rationale, **evaluation_args
)

# Last expression of the cell: show the styled evaluation table.
style_df


IDX: you pussy boy
Text: you pussy boy
Probabilities: tensor([[0.2170, 0.7830]])
Prediction: 1
Importance occlusion:
 [-0.00141156 -0.00336921 -0.4572764   0.00416112]


invalid value encountered in long_scalars


Unnamed: 0,you,pu,##ssy,boy,aopc_compr,aopc_compr_r,aopc_suff,aopc_suff_r,taucorr_loo,taucorr_loo_r,auprc_plau,auprc_plau_r,token_f1_plau,token_f1_plau_r,token_iou_plau,token_iou_plau_r
G,0.097078,0.19385,0.318543,0.182557,0.48853,2,0.432812,4,0.666667,1,1.0,1,0.666667,3,0.5,3
GxI,-0.136519,-0.135178,-0.297838,-0.312161,0.0,5,0.272285,3,0.333333,2,0.75,2,0.0,5,0.0,5
IG,0.188441,0.003957,0.151842,-0.655759,0.44963,3,0.186945,1,0.333333,2,0.416667,4,0.8,2,0.666667,2
SHAP,-0.071919,0.471468,0.304686,-0.151927,0.31724,4,0.259053,2,0.666667,1,1.0,1,1.0,1,1.0,1
LIME,0.215039,-0.025128,0.379741,0.039957,0.552364,1,0.479364,5,0.333333,2,0.708333,3,0.4,4,0.25,4


In [14]:
# Wrap the classifier in nlxplain's `Model` class; the wrapper is what the evaluators below receive.
from nlxplain.modelw import Model
modelw = Model(model)


# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'nlxplain.evaluation.plausibility_eval'",
# so this import fails with the installed nlxplain and nothing below it runs.
# Confirm the current location of these classes before relying on this cell.
from nlxplain.evaluation.plausibility_eval import AUPRC_PlausibilityEvaluation, Tokenf1_PlausibilityEvaluation, TokenIOU_PlausibilityEvaluation

# One evaluator per plausibility metric, all built from the same wrapper and tokenizer.
auprc_eval = AUPRC_PlausibilityEvaluation(modelw, tokenizer)
tokenf1_eval = Tokenf1_PlausibilityEvaluation(modelw, tokenizer)
tokeniou_eval = TokenIOU_PlausibilityEvaluation(modelw, tokenizer)
# Evaluate the SHAP row of `explanations` against `token_rationale`.
# NOTE(review): the first argument is an empty string — presumably the input text; confirm it is unused.
tokeniou_eval.evaluate_explanation("", list(explanations.loc["SHAP"].values), token_rationale)

ModuleNotFoundError: No module named 'nlxplain.evaluation.plausibility_eval'

# HateXplain

In [None]:
from datasets import load_dataset

# Load the "hatexplain" dataset; the cells below read its "train" split.
dataset = load_dataset("hatexplain")

In [None]:
# List the fields available on a single training instance.
dataset["train"][1].keys()

## Model for hate speech

In [None]:
#MODEL = f"cardiffnlp/twitter-roberta-base-hate"

#tokenizer_hate = AutoTokenizer.from_pretrained(MODEL)


# PT
#model_hate = AutoModelForSequenceClassification.from_pretrained(MODEL)
#model_hate.save_pretrained(MODEL)

In [None]:
#exp_hate = Explainer(model_hate, tokenizer_hate)

## Evaluate explanations

In [None]:
from nlxplain import Explainer

In [None]:
# Re-create the explainer for the HateXplain section (same `model` and `tokenizer` as before).
exp = Explainer(model, tokenizer)

In [None]:
# Pick one training instance and rebuild its text by joining its `post_tokens` with spaces.
i = 10
instance_hatexplain = dataset["train"][i]
post_tokens = instance_hatexplain["post_tokens"]
text = " ".join(post_tokens)

exp.classify(text)
# Explain with respect to the label the model predicts for this post.
target = exp.get_predicted_label(text)

occlusion_importance = exp.compute_occlusion_importance(text, target=target).numpy()
print(f"Importance occlusion:\n {occlusion_importance}")
#explanations =  exp.compute_table(text, target = target)

In [None]:
# Class names indexed by integer label id.
# NOTE(review): assumes this order matches the dataset's label encoding — confirm against the dataset card.
label_names = ["hatespeech", "normal", "offensive"]

# Majority vote over the annotators' labels for the instance selected in the
# previous cell (`i`), instead of repeating its index as a literal.
ann_labels = dataset["train"][i]["annotators"]["label"]
values, counts = np.unique(ann_labels, return_counts=True)
# `counts` is aligned with `values` (the sorted distinct labels), not with
# `ann_labels`, so the winning label has to be read from `values`.
ind = np.argmax(counts)
label_names[values[ind]]

In [None]:
from nlxplain.evaluation.explanation_evaluation import ExplanationEvalutator

expl_evalt = ExplanationEvalutator(model, tokenizer)

# Running totals of marked positions over the whole training split.
total_tokens = 0
total_word = 0
for instance_hatexplain in dataset["train"]:
    # As in HateXplain, take the union of the annotators' rationales:
    # a position is marked (1) if any annotator marked it.
    rationales = instance_hatexplain["rationales"]
    rationale = [int(any(each)) for each in zip(*rationales)]

    token_rationale = expl_evalt.get_true_rational_tokens(
        instance_hatexplain["post_tokens"], rationale
    )

    total_tokens += sum(token_rationale)
    total_word += sum(rationale)

# Average number of marked words / marked tokens per training instance.
num_instances = len(dataset["train"])
print(total_word / num_instances, total_tokens / num_instances)
# Truncated average of marked tokens, reused later as the `top_k_rationale` evaluation option.
top_k_rationale = int(total_tokens / num_instances)

In [None]:
from nlxplain.evaluation.explanation_evaluation import ExplanationEvalutator

# Select a training instance and rebuild its text by joining its `post_tokens` with spaces.
i = 0
instance_hatexplain = dataset["train"][i]
text = " ".join(instance_hatexplain["post_tokens"])
print(text)

thresholds = np.arange(0, 1.1, 0.1)

# Explain the label the model predicts for this post.
target = exp.get_predicted_label(text)
explanations = exp.compute_table(text, target=target)

# As in HateXplain, take the union of the annotators' rationales:
# a position is marked (1) if any annotator marked it.
rationales = instance_hatexplain["rationales"]
rationale = [int(any(each)) for each in zip(*rationales)]

expl_evalt = ExplanationEvalutator(model, tokenizer)
token_rationale = expl_evalt.get_true_rational_tokens(
    instance_hatexplain["post_tokens"], rationale
)

# Options forwarded to `evaluate_explainers` as keyword arguments;
# `top_k_rationale` comes from the dataset-wide average computed earlier.
removal_args = {"remove_tokens": True, "based_on": "k"}
evaluation_args = {
    "removal_args": removal_args,
    "only_pos": True,
    "top_k_rationale": top_k_rationale,
}

df_eval, style_df = expl_evalt.evaluate_explainers(
    text,
    explanations,
    true_rationale=token_rationale,
    target=target,
    **evaluation_args
)

# Last expression of the cell: show the styled evaluation table.
style_df
