## Exploration of DeepLiftSHAP with IMDB movie review

**Function        : Exploration of DeepLiftSHAP with IMDB movie review**<br>
**Author          : Team DIANNA**<br>
**Contributor     :**<br>
**First Built     : 2021.11.08**<br>
**Last Update     : 2021.11.11**<br>
**Library         : os, numpy, matplotlib, torch, shap**<br>
**Description     : In this notebook we test XAI method DeepLiftSHAP using trained movie review model.**<br>
**Return Values   : Shapley scores**<br>
**Note**          : We use shap library, which is the original implementation by the author of "SHAP" paper, to perform DeepLiftSHAP. This library works only with pytorch/tensorflow models and it is not compitable with onnx.<br>

In [29]:
%matplotlib inline
import os
import time as tt
import numpy as np
import pandas as pd
from pathlib import Path
# DL framework
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.data import get_tokenizer
from torchtext.vocab import Vectors
import shap
# for plotting
import matplotlib.pyplot as plt

### Path to the dataset and the model

In [2]:
# please specify data path
data_path = '/mnt/d/NLeSC/DIANNA/data/stanford_sentiment_treebank_v2'
# please specify model path
model_path = './'
# select device to run on
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'PyTorch will use {device}')

PyTorch will use cpu


### Load model (Pytorch model trained for IMDB movie review)

In [3]:
# create a model
class Model(nn.Module):
    def __init__(self, vocab_size, embedding_size, n_filters, filter_sizes, padding_idx,
                dropout, output_dim):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_idx)
        
        self.conv_layers = nn.ModuleList()
        for filter_size in filter_sizes:
            layer = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(filter_size, embedding_size))
            self.conv_layers.append(layer)

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(n_filters * len(filter_sizes), output_dim)
    
    def forward(self, text):
        # shape = [batch size, max nword per sentence]
        embedding = self.embedding(text).unsqueeze(1)
        # shape = [batch_size, 1, nword, embedding dim]
        conved = [F.relu(conv(embedding)).squeeze(3) for conv in self.conv_layers]
        # shape = len(filter_sizes) list of [batch_size, n_filter, nword - filter_size + 1]
        # note: max_pool1d does not work with ONNX when output shape is dynamic
        # therefore switched to adaptive_max_pool1d
        pooled = [F.adaptive_max_pool1d(out, 1).squeeze(2) for out in conved]
        # shape = len(filter_sizes) list of [batch_size, n_filter]
        concat = torch.cat(pooled, dim=1)
        # shape = [batch_size * len(filter_sizes), n_filter]
        dropped = self.dropout(concat)
        return self.fc(dropped)

In [4]:
# load best model from disk
model = torch.load(Path(model_path, 'movie_review_model.pytorch'),
                          map_location=torch.device('cpu'))
#model = loaded_model.to(device)
model.eval()

Model(
  (embedding): Embedding(13889, 100, padding_idx=1)
  (conv_layers): ModuleList(
    (0): Conv2d(1, 245, kernel_size=(3, 100), stride=(1, 1))
    (1): Conv2d(1, 245, kernel_size=(4, 100), stride=(1, 1))
    (2): Conv2d(1, 245, kernel_size=(5, 100), stride=(1, 1))
  )
  (dropout): Dropout(p=0.6913344449168243, inplace=False)
  (fc): Linear(in_features=735, out_features=1, bias=True)
)

In [5]:
# tokenizer for splitting input sentences
tokenizer = get_tokenizer('spacy', 'en_core_web_sm')

# word vectors as generated from data preparation notebook
vocab = Vectors('word_vectors.txt', cache='.')
# maximum samples of training set to use, only for background
max_samples = 2000
# print some predictions from the (unlabeled) test set
sentences = pd.read_csv(os.path.join(data_path, 'test.tsv'), delimiter='\t')['sentence']
classes = ['negative', 'positive']

In [41]:
max_filter_size = 5
# function to pad to some minimum length
def pad(tokens, max_filter_size, padding_token='<pad>'):
    npad = max(max_filter_size - len(tokens), 0)
    tokens += [padding_token] * npad
    return tokens
tokens_list = np.zeros(len(sentences), dtype=object)
for i in range(len(sentences)):
    tokens = tokenizer(sentences[i])
    if max_filter_size is not None:
        tokens = pad(tokens, max_filter_size)
    # numericalize
    tokens = [vocab.stoi[token] if token in vocab.stoi else vocab.stoi['<unk>'] for token in tokens]
    # move to device and add required batch axis
    tokens = torch.tensor(tokens).unsqueeze(0)
    # add tokens to the list
    tokens_list[i] = tokens


In [44]:
# we use the first 100 training examples as our background dataset to integrate over
explainer = shap.DeepExplainer(model, tokens_list[2])
# explaining each prediction requires 2 * background dataset size runs
shap_values = explainer.shap_values(tokens_list[1])

RuntimeError: Sizes of tensors must match except in dimension 0. Got 33 and 21 in dimension 1 (The offending index is 1)