# Faithful and Robust Local Interpretability for Textual Predictions
## Example of FRED for text classification

In [1]:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
# Seed NumPy's legacy global RNG so repeated runs draw the same random numbers.
# NOTE(review): this only affects code that samples from np.random's global state —
# confirm that the model training and the FRED sampler actually rely on it.
np.random.seed(42)

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
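# One-time setup: uncomment the next line to install the dependencies into this kernel's environment.
# %pip install -r requirements.txt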

In [4]:
# One-time setup: uncomment the next line to download the spaCy model `en_core_web_lg`.
# !python -m spacy download en_core_web_lg

In [5]:
from fred.explainer import Fred

In [6]:
# Import only the helpers this notebook calls, rather than `import *`,
# so it is clear where each name comes from and the namespace stays clean.
from utils.setup import setup_dataset, setup_model

* Model: *Random Forest*
* Dataset: *tweets hate speech detection* ([Hugging Face dataset card](https://huggingface.co/datasets/tweets_hate_speech_detection))

In [7]:
# Identifiers passed to the project's setup helpers below
dataset_name = 'tweets'
model_name = 'forest_classifier'

# Set up the dataset: train/test inputs and labels, plus the class names
X_train, X_test, y_train, y_test, class_names = setup_dataset(dataset_name)

# Set up the model
# NOTE(review): 'forest_classifier' presumably selects the Random Forest model — confirm in utils.setup
model = setup_model(model_name)

# Fit the model on the training split
model.train(X_train, y_train)

In [8]:
# Score the trained model on the held-out test split.
y_pred = model.predict(X_test)

# Compute both metrics first, then report them.
# The confusion matrix is shown alongside accuracy because it exposes per-class errors
# that a single accuracy figure can hide.
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f'Confusion matrix: \n {cm}')
print(f'accuracy: {accuracy}')

Confusion matrix: 
 [[7415   17]
 [ 315  244]]
accuracy: 0.958453259917407


In [9]:
# Keep only the test texts that the model predicted as class 1.
# Reuses `y_pred` from the evaluation cell (the result of model.predict(X_test))
# instead of running inference over the whole test set a second time.
corpus = np.asarray(X_test)[y_pred == 1]

In [10]:
# Pick one text from the predicted-positive subset; it is the instance explained below.
example = corpus[19]
print(f'Example: \'{example}\'\n')

# Class probabilities for this single text (passed as a one-element batch).
prediction = model.predict_proba([example])

# Index of the highest-probability class for the first (only) row, mapped to its name.
predicted_label = class_names[prediction.argmax(1)[0]]
print(f'Classified as \'{predicted_label}\'')

Example: 'user trumpusa when they call us homophobic misogynist they are really saying they are jealous of us w '

Classified as 'hate-speech'


### Part-of-speech sampling scheme

In [11]:
# Build the FRED explainer that uses the part-of-speech sampling scheme (pos=True).
fredpos_explainer = Fred(
    classifier_fn=model.predict_proba,  # the explainer queries the model through its probability function
    class_names=class_names,
    pos=True,
    # First 100 test texts.
    # NOTE(review): presumably the corpus FRED draws POS-based replacements from — confirm in fred.explainer
    pos_dataset=X_test[:100],
)

In [12]:
# Run the POS-sampling explainer on the chosen example; verbose=True prints the report.
fredpos_exp = fredpos_explainer.explain_instance(
    example,
    verbose=True,
)


FRED mode: 'pos sampling'.
Example to explain: 
	'user trumpusa when they call us homophobic misogynist they are really saying they are jealous of us w '
Original prediction: 'hate-speech'
Average confidence over the sample: 0.9

Explaining class 'hate-speech':
The minimal subset of tokens that make the confidence drop by 15.0% if perturbed is 
	['misogynist']

Saliency weights: 
	[('misogynist', 0.616), ('homophobic', 0.501), ('user', 0.466), ('trumpusa', 0.437), ('saying', 0.437), ('of', 0.435), ('call', 0.426), ('are', 0.423), ('us', 0.423), ('really', 0.422)]


#### Counterfactual explanations

In [13]:
# Ask the POS-sampling explanation for perturbed samples that the model labels 'no-hate-speech'.
# NOTE(review): k=5 appears to cap the number of returned samples — confirm in fred.explainer
counter_sample, perturbed_tokens = fredpos_exp.counterfactual(
    counter_label='no-hate-speech',
    verbose=True,
    k=5,
)


Counterfactual explanation for the example
	'user trumpusa when they call us homophobic misogynist they are really saying they are jealous of us w '

FRED mode: 'pos sampling'.
Original prediction: 'hate-speech'

Sample(s) with minimal perturbation predicted as 'no-hate-speech':

['user trumpusa barely they call us homophobic week they are really saying they are jealous of us via'
 'user trumpusa when they call us antiamerican colors anyone are really saying they are jealous at us w'
 'animals trumpusa when they call we homophobic flesh they are really saying they are jealous with us w'
 'user trumpusa never they stands us emotional guidance they are really saying they are jealous of us w'
 'user porn when they call us homophobic season6 they are really featuring they are natural of us w']

Perturbed tokens: 
	[['when', 'misogynist', 'w'], ['homophobic', 'misogynist', 'they', 'of'], ['user', 'us', 'misogynist', 'of'], ['when', 'call', 'homophobic', 'misogynist'], ['trumpusa', 'misogyn

### MASK sampling scheme

In [14]:
# Build the FRED explainer that uses the mask sampling scheme (pos=False).
fred_explainer = Fred(
    classifier_fn=model.predict_proba,  # the explainer queries the model through its probability function
    class_names=class_names,
    pos=False,
)

In [15]:
# Run the mask-sampling explainer on the same example; verbose=True prints the report.
fred_exp = fred_explainer.explain_instance(
    example,
    verbose=True,
)


FRED mode: 'mask sampling'.
Example to explain: 
	'user trumpusa when they call us homophobic misogynist they are really saying they are jealous of us w '
Original prediction: 'hate-speech'
Average confidence over the sample: 0.9

Explaining class 'hate-speech':
The minimal subset of tokens that make the confidence drop by 15.0% if perturbed is 
	['homophobic']

Saliency weights: 
	[('homophobic', 0.867), ('user', 0.84), ('misogynist', 0.825), ('really', 0.814), ('of', 0.813), ('they', 0.813), ('us', 0.812), ('they', 0.812), ('call', 0.812), ('trumpusa', 0.811)]


#### Counterfactual explanations

In [16]:
# Ask the mask-sampling explanation for perturbed samples that the model labels 'no-hate-speech'.
# Note: this rebinds `counter_sample` and `perturbed_tokens` from the POS-sampling section.
# NOTE(review): k=5 appears to cap the number of returned samples — confirm in fred.explainer
counter_sample, perturbed_tokens = fred_exp.counterfactual(
    counter_label='no-hate-speech',
    verbose=True,
    k=5,
)


Counterfactual explanation for the example
	'user trumpusa when they call us homophobic misogynist they are really saying they are jealous of us w '

FRED mode: 'mask sampling'.
Original prediction: 'hate-speech'

Sample(s) with minimal perturbation predicted as 'no-hate-speech':

['user trumpusa when they call us homophobic UNK they UNK really saying they UNK jealous of us w'
 'user trumpusa when they call us homophobic UNK they are really UNK they are UNK of us w'
 'user UNK when they call us UNK UNK they are really saying they are jealous of us w'
 'user trumpusa when they call us homophobic UNK they are really saying they UNK jealous of UNK w'
 'user trumpusa when they call us homophobic UNK they are UNK saying they are UNK of us w']

Perturbed tokens: 
	[['misogynist', 'are', 'are'], ['misogynist', 'saying', 'jealous'], ['trumpusa', 'homophobic', 'misogynist'], ['misogynist', 'are', 'us'], ['misogynist', 'really', 'jealous']]
