# Template for quantitative experiments: text

This template is meant as a basis for the quantitative text experiments as defined in issues [#474](https://github.com/dianna-ai/dianna/issues/474) and [#481](https://github.com/dianna-ai/dianna/issues/481).

It is based on the dianna [text tutorials](https://github.com/dianna-ai/dianna/tree/main/tutorials) for [RISE](https://github.com/dianna-ai/dianna/blob/main/tutorials/rise_text.ipynb) and [LIME](https://github.com/dianna-ai/dianna/blob/main/tutorials/lime_text.ipynb), which are largely overlapping.

### Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import spacy
from torchtext.vocab import Vectors
from scipy.special import expit as sigmoid

import dianna
from dianna import visualization
from dianna import utils
from dianna.utils.tokenizers import SpacyTokenizer

  from .autonotebook import tqdm as notebook_tqdm
2023-03-21 16:34:06.779842: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-21 16:34:06.934697: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-21 16:34:06.934735: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


### Downloading the model and related data

In [None]:
import os.path

# NOTE: wget would be a more generic solution, but for some reason it only downloads part of the file!
# WORKAROUND: the files were copied from the dianna repo into the dianna-exploration repo
# pip install wget
# import wget 
#url_model = 'https://zenodo.org/record/5910598/files/movie_review_model.onnx'
#url_model = 'https://github.com/dianna-ai/dianna/blob/main/tutorials/models/movie_review_model.onnx'
#url_word_vector = 'https://github.com/dianna-ai/dianna/blob/main/tutorials/data/movie_reviews_word_vectors.txt'
## first time download
#if not(os.path.isfile('movie_review_model.onnx')):
#    model_file = wget.download(url_model)
#if not(os.path.isfile('movie_reviews_word_vectors.txt')):    
#    word_vector_file = wget.download(url_word_vector)

# Paths (relative to the working directory) of the ONNX model and the word vectors.
model_file, word_vector_file = (
    '../models/movie_review_model.onnx',
    '../data/movie_reviews_word_vectors.txt',
)
# Names of the two sentiment classes; their positions are used as label indices.
labels = ('negative', 'positive')

## Explainable method

Here we define the XAI method and its parameters.

### Explainer

In [None]:
# Name of the XAI method to use; swap the commented line to select the other one.
Explainer_type = 'LIME'
# Explainer_type = 'RISE'

### Explainer's parameters

In [None]:
# Default (but editable) parameters for the explainer selected via Explainer_type.
if Explainer_type == 'RISE':
    print('Setting up RISE parameters')
    # here the default, but editable parameters
    n_masks = 1000  # (int) Number of masks to generate.
    feature_res = 8  # (int) Resolution of features in masks.
    p_keep = None  # (float) Fraction of input data to keep in each mask (Default: auto-tune this value).
    preprocess_function = None  # (callable, optional): Function to preprocess input data with
elif Explainer_type == 'LIME':
    print('Setting up LIME parameters')
    # here the default, but editable parameters
    # for the meaning of the parameters and their possible values see the LimeTextExplainer class
    # at https://github.com/marcotcr/lime/blob/master/lime/lime_text.py
    kernel_width = 25
    kernel = None
    verbose = False
    class_names = None  # same as labels in the call of dianna.explain_text()?
    feature_selection = 'auto'
    split_expression = r'\W+'
    bow = False
    mask_string = None
    random_state = None
    char_level = False
    preprocess_function = None
else:
    # Fail early: an unknown explainer name would otherwise leave every
    # parameter undefined without any message.
    raise ValueError(f"Unsupported Explainer_type {Explainer_type!r}; expected 'RISE' or 'LIME'")

## Loading the pre-trained Stanford movie reviews model

The model (sentiment classifier) is in [ONNX format](https://onnx.ai/). 
It accepts numerical tokens as input, and outputs a score between 0 and 1, where 0 means the review has a _negative_ sentiment and 1 that it is _positive_.
Here we define a class to run the model, which accepts a sentence (i.e. a string) as input and returns a score for each of the two classes: negative and positive.

### Tokenizer

In [None]:
# ensure the tokenizer for english is available;
# download it only when it is not installed yet, so re-running the cell stays cheap
if not spacy.util.is_package('en_core_web_sm'):
    spacy.cli.download('en_core_web_sm')

### Model runner

In [None]:
class MovieReviewsModelRunner:
    """Run the movie-review sentiment model on raw sentences.

    Instances are callable: given one sentence or a sequence of sentences they
    return one row per sentence holding the scores ``[negative, positive]``.
    """

    def __init__(self, model, word_vectors, max_filter_size):
        """Set up the model function, the vocabulary and the tokenizer.

        Args:
            model: Passed to ``utils.get_function`` to obtain the callable
                that runs the model.
            word_vectors: Path to the word-vector file; its directory is also
                used as the cache directory for ``Vectors``.
            max_filter_size: Minimum number of tokens fed to the model;
                shorter sentences are padded with ``'<pad>'``.
        """
        self.run_model = utils.get_function(model)
        self.vocab = Vectors(word_vectors, cache=os.path.dirname(word_vectors))
        self.max_filter_size = max_filter_size

        self.tokenizer = SpacyTokenizer(name='en_core_web_sm')

    def __call__(self, sentences):
        """Score one or more sentences.

        Args:
            sentences: A single string or a sequence of strings.

        Returns:
            Array with one row per sentence and two columns:
            ``[negativity, positivity]``, where ``negativity = 1 - positivity``.
        """
        # ensure the input has a batch axis
        if isinstance(sentences, str):
            sentences = [sentences]

        stoi = self.vocab.stoi

        output = []
        for sentence in sentences:
            # tokenize and pad to minimum length; work on a copy so the list
            # handed out by the tokenizer is never modified in place
            tokens = list(self.tokenizer.tokenize(sentence))
            # a non-positive multiplier yields an empty list, so long sentences stay untouched
            tokens += ['<pad>'] * (self.max_filter_size - len(tokens))

            # numericalize the tokens, mapping out-of-vocabulary tokens to '<unk>'
            tokens_numerical = [stoi[token] if token in stoi else stoi['<unk>']
                                for token in tokens]

            # run the model, applying a sigmoid because the model outputs logits.
            # Squeeze away any remaining batch axis before converting: calling
            # float() on an array with ndim > 0 is deprecated in recent NumPy.
            pred = float(np.squeeze(sigmoid(self.run_model([tokens_numerical]))))
            output.append(pred)

        # output two classes
        positivity = np.array(output)
        negativity = 1 - positivity
        return np.transpose([negativity, positivity])
            

In [None]:
# show which model and word-vector files will be loaded, one path per line
print(model_file, word_vector_file, sep='\n')

In [None]:
# Build the model runner; max_filter_size is a property of the model.
model_runner = MovieReviewsModelRunner(
    model=model_file,
    word_vectors=word_vector_file,
    max_filter_size=5,
)

## Loading the test data

At the moment only a single-sentence review is loaded. For the actual tests this should become a small batch of reviews.

In [None]:
# the single review (one sentence) that will be explained
review = 'A delectable and intriguing thriller filled with surprises'

## Explaining the model with the dianna explainer

The simplest way to run DIANNA on text data is with dianna.explain_text. The arguments are:

    The function that runs the model (a path to a model in ONNX format is also accepted)
    The text we want to explain
    The name of the explainable-AI method we want to use (RISE, LIME, etc.)
    The numerical indices of the classes we want an explanation for

dianna.explain_text returns a list of tuples. Each tuple contains a word, its location in the input text, and its relevance for the selected output class.

In [None]:
# An explanation is returned for each label, but we ask for just one label so the output is a list of length one.

# Collect the explainer-specific keyword arguments chosen above, so that
# dianna.explain_text is called in exactly one place.
if Explainer_type == 'RISE':
    explainer_kwargs = dict(n_masks=n_masks, feature_res=feature_res, p_keep=p_keep,
                            preprocess_function=preprocess_function)
elif Explainer_type == 'LIME':
    explainer_kwargs = dict(kernel_width=kernel_width, kernel=kernel, verbose=verbose,
                            class_names=class_names, feature_selection=feature_selection,
                            split_expression=split_expression, bow=bow, mask_string=mask_string,
                            random_state=random_state, char_level=char_level,
                            preprocess_function=preprocess_function)
else:
    # Fail with a clear message instead of a NameError on the last line of this cell.
    raise ValueError(f"Unsupported Explainer_type {Explainer_type!r}; expected 'RISE' or 'LIME'")

print(f'Explainer type is {Explainer_type}')
# explain only the 'positive' class and keep the explanation for that single label
explanation_relevances = dianna.explain_text(model_runner, review, model_runner.tokenizer, Explainer_type,
                                             labels=[labels.index('positive')],
                                             **explainer_kwargs)[0]

explanation_relevances

## Visualization

DIANNA includes a visualization package, capable of highlighting the relevance of each word in the text for a chosen class. Words in favour of the selected class are highlighted in red, while words against the selected class are highlighted in blue. The more relevant a word is, the darker its red or blue highlight.

In [None]:
# highlight each token of the review according to its relevance for the explained class
visualization.highlight_text(
    explanation_relevances,
    model_runner.tokenizer.tokenize(review),
)