In [3]:
import numpy as np
import sys
#!conda install --yes --prefix {sys.prefix} pytorch==1.7.1 torchtext==0.8.1 -c pytorch

In [2]:
#!{sys.executable} -m pip install -U torchtext==0.8.1

In [4]:
#!{sys.executable} -m pip install lime
import lime
import torch
import torch.nn.functional as F
from lime.lime_text import LimeTextExplainer
from torch.utils.data import DataLoader, Subset
from dataset import ArticleDataset
from sklearn.model_selection import train_test_split
import torchtext
from torchtext.data.utils import get_tokenizer
from cbow import BOW
from tqdm import tqdm
from captum.attr import LayerIntegratedGradients, TokenReferenceBase, visualization

In [5]:
print(torchtext.__version__)
print(torch.__version__)

0.8.1
1.7.1


In [6]:
model = torch.load('../CBOW/saved_models/bow_heading_body.pt')

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batch_size = 128

epochs = 5

# SET SEED
seed = np.random.seed(100)

articles = ArticleDataset()

train_indices, test_indices, _, _ = train_test_split(
range(len(articles)),
articles.labels,
stratify=articles.labels,
test_size=0.2,
random_state=seed
)

# generate subset based on indices
train_split = Subset(articles, train_indices)
test_split = Subset(articles, test_indices)

def collate_fn_attn(batch):
	return tuple(zip(*batch))

# create batches
train_batches = DataLoader(train_split, batch_size=batch_size, shuffle=True, collate_fn=collate_fn_attn)
test_batches = DataLoader(test_split, batch_size=batch_size, shuffle=True, collate_fn=collate_fn_attn)

In [8]:
test_str_1 = 'At a recent Miami art show a newly discovered Leonardo da Vinci sketch of Mona Lisa eating a banana sold for $1200000. But before the piece could be transferred to the new owners a man posing for a selfie with the newly discovered work of the master snatched the centuries old drawing from the gallery wall and ate it.Asked to explain his sudden appetite for art the vandal claimed that his eating of the piece was itself a type of art although he admitted that it would be hard to put a value on his performance which after digestion was likely worth less than $1200000.“All art is eventually consumed" explained Mr. Datura. "That was my point in eating it. Of course I was also very hungry. Besides the drawing was something new for me. I have lately grown bored with the taste of canvas and pigments not to mention wooden frames.'
test_str_2 = 'US Women Astronauts Set Space Records But Lack Reproduction Rights On Earth,A 40-year-old electrical engineer broke the 288-day record set by a former female space station commander in 2016-2017. The engineer is planning to spend a total of 328 days aboard the space station before returning to Earth.She has an additional two more months left to go on the International Space Station so all she has to do is float around in the space station for another two months to set another even more extended and impressive world record!As though her endurance record wasn’t impressive enough the electrical engineer was also cerebrated last October for being one-half of the first US women’s spacewalking team.Big stuff going on upstairs for women on the space station.News media watched as the women climbed out of the space station for a five to six hours spacewalk to replace a dead battery that went out the week before.Women replace dead batteries on Earth every day. In zero gravity it’s even easier. Women can do anything a man can do. The absence of gravity is the equalizer.Space records set by women are mere crumbs and a distraction of how women are treated on Earth.Women may be breaking records in outer space but down here on Earth men in the Republican-run Senate are still deciding if a woman can terminate a pregnancy.These men are doing away with Planned Parenthood a necessary form of health care for minority women. A clear violation of human rights. Men are forcing women to have an extra pelvic exam to humiliate women further. Men also insist women view ultrasound images of their pregnancy.Hook up Mitch McConnell to a labor simulator (Google it) and see how long he’ll last before screaming “Uncle!”The same pussy who claimed bone spurs during Vietnam is against abortion but quick to rip babies from mothers build walls declare wars and send other people’s sons and daughters to kill other people’s sons and daughters on the other side of the world.A US woman will one day walk on the Moon but still lack reproduction rights on Earth.Read more by this author:'
test_str_3 = 'Trump ID’s Whistleblower: Obama’s Mother In Law Was Trojan Horse,BILLINGSGATE POST: Not since Odysseus snuck into Troy in a wooden horse has such an audacious plot been attempted. In a replay of this mythological ploy of skullduggery Marian Lois Robinson mother of Michelle Obama was identified by President Trump as the whistleblower who blew the whistle on the President’s “perfect” phone call to Ukraine President Volodymyr Zelensky.Not heard of since the Obama family moved out of the White House prior to the Trump inauguration Michelle’s mother exchanged baby sitting services for free room and board in the White House. Although she once accused Barack of stealing her Sosha Shurity check she seemed very happy with her arrangement and was determined to hang on even after her family moved out.In the confusion of the transition Ms. Robinson posed as a senior staff member for the new administration. Easily integrating herself because she knew where the kitchen was she became a trusted consultant to then Chief of Staff Reince Priebus. She was then passed down to John Kelly and then to Mick Mulvaney; both who trusted her implicitly with the frenetic comings and goings of the president.Allowed to sleep in the Lincoln Bedroom she moved at will throughout the White House. Knowing that her time was running out she sought out Adam “Schifty” Schiff and turned over her transcript of the Trump-Zelensky phone call.The rest is history. Because of her President Trump has been impeached.Slim: “You can’t make shit like this up.”Dirty: “Yo Dude. Not unless you’re Dr. B.”'
test_str_4 = "WATCH: TREY GOWDY FURIOUS OVER LAWLESS Loretta Lynch During Clinton Email Hearing: “It was a total WASTE of time…The facts are embarrassing for her presidential candidate [Hillary]” The lawless and in-your-face behavior that this  President  and his regime have been able to get away with is simply breathtaking She [Lynch] could have answered every one of those questions, she just chose not to. It s really not that complicated. You take the facts as Director Comey gave  em to us, and as he found, and you apply the law, which it s public and everybody knows what it is. But the facts are embarrassing for her presidential candidate. So, discussing the facts necessarily leads to more questions like,  Well, if you had all those good facts, why didn t you indict her?' "

In [9]:
test_strs = [test_str_1, test_str_2, test_str_3]

In [11]:
import string
# accumalate couple samples in this array for visualization purposes
vis_data_records_ig = []

tokenizer = get_tokenizer("basic_english")

def remove_punc(s):
    #punc_dict = {'!': ' ! ', '"': ' " ', '#': ' # ', '$': ' $ ', '%': ' % ', '&': ' & ', "'": " ' ", '(': ' ( ', ')': ' ) ', '*': ' * ', '+': ' + ', ',': ' , ', '-': ' - ', '.': ' . ', '/': ' / ', ':': ' : ', ';': ' ; ', '<': ' less than ', '=': ' equals ', '>': ' greater than ', '?': ' ? ', '@': ' @ ', '[': ' [ ', '\\': ' \\ ', ']': ' ] ', '^': ' ^ ', '_': ' _ ', '`': ' ` ', '{': ' { ', '|': ' | ', '}': ' } ', '~': ' ~ ', '—':' — '}
    punc_dict = {key: ' ' for key in string.punctuation}
    table = str.maketrans(punc_dict)  # OR {key: None for key in string.punctuation}
    new_s = s.translate(table)
    return new_s

def forward_with_sigmoid(input):
    return torch.sigmoid(model(input))

def interpret_sentence(model, sentence, min_len = 300, label = 0):
    # Pad if needed
    vectorizer = CountVectorizer(stop_words='english', max_df=0.99, min_df=0.005)
    sequences = vectorizer.fit_transform(sentence.tolist())
        
    label_map = {'real' : 0, 'satire' : 1, 'fake' : 2}
        
    labels = df.label.apply(lambda x: label_map[x]).tolist()

    token2idx = vectorizer.vocabulary_
    idx2token = {idx: token for token, idx in token2idx.items()}

    model.zero_grad()

    input_indices = torch.tensor(sequences)
    input_indices = input_indices.unsqueeze(0)
    
    # input_indices dim: [sequence_length]
    seq_length = min_len

    pred = forward_with_sigmoid(input_indices)
    print(pred)
    pred_ind = torch.argmax(pred)
    if pred_ind != 1:
        pred_ind = torch.tensor(0)
        
    token_reference = TokenReferenceBase(reference_token_idx=len(token2idx))

    # generate reference indices for each sample
    reference_indices = token_reference.generate_reference(seq_length, device='cpu').unsqueeze(0)

    # compute attributions and approximation delta using layer integrated gradients
    attributions_ig, delta = lig.attribute(input_indices, reference_indices, \
                                           n_steps=500, return_convergence_delta=True, target=1)

    label_map = {0 : 'not satire', 1 : 'satire'}
    print('pred: ', label_map[pred_ind.item()], '(', '%.2f'%pred[0, pred_ind].item(), ')', ', delta: ', abs(delta))

    add_attributions_to_visualizer(attributions_ig, tokenized, pred, pred_ind, label, delta, vis_data_records_ig)
    
def add_attributions_to_visualizer(attributions, text, pred, pred_ind, label, delta, vis_data_records):
    attributions = attributions.sum(dim=2).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    attributions = attributions.cpu().detach().numpy()

    label_map = {0 : 'not satire', 1 : 'satire'}

    # storing couple samples in an array for visualization purposes
    vis_data_records.append(visualization.VisualizationDataRecord(
                            attributions,
                            #pred,
                            pred[0, pred_ind].item(),
                            label_map[pred_ind.item()],
                            label_map[label],
                            label_map[1],
                            attributions.sum(),
                            text,
                            delta))

In [12]:
from captum.attr import LayerIntegratedGradients, TokenReferenceBase, visualization
lig = LayerIntegratedGradients(model, model.glove_emb)

ModuleAttributeError: 'BOW' object has no attribute 'glove_emb'

In [22]:
interpret_sentence(model, test_str_4, label=0)

tensor([[5.5178e-02, 5.6457e-04, 9.8987e-01]], grad_fn=<SigmoidBackward>)
pred:  not satire ( 0.06 ) , delta:  tensor([1.3556], dtype=torch.float64)


In [23]:
visualization.visualize_text(vis_data_records_ig)

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
not satire,not satire (0.06),satire,-2.03,"watch trey gowdy furious over lawless loretta lynch during clinton email hearing “it was a total waste of time…the facts are embarrassing for her presidential candidate [ hillary ] ” the lawless and in - your - face behavior that this president and his regime have been able to get away with is simply breathtaking she [ lynch ] could have answered every one of those questions , she just chose not to . it s really not that complicated . you take the facts as director comey gave em to us , and as he found , and you apply the law , which it s public and everybody knows what it is . but the facts are embarrassing for her presidential candidate . so , discussing the facts necessarily leads to more questions like , well , if you had all those good facts , why didn t you indict her ? ' #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad"
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
not satire,not satire (0.06),satire,-2.03,"watch trey gowdy furious over lawless loretta lynch during clinton email hearing “it was a total waste of time…the facts are embarrassing for her presidential candidate [ hillary ] ” the lawless and in - your - face behavior that this president and his regime have been able to get away with is simply breathtaking she [ lynch ] could have answered every one of those questions , she just chose not to . it s really not that complicated . you take the facts as director comey gave em to us , and as he found , and you apply the law , which it s public and everybody knows what it is . but the facts are embarrassing for her presidential candidate . so , discussing the facts necessarily leads to more questions like , well , if you had all those good facts , why didn t you indict her ? ' #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad #pad"
,,,,


In [17]:
from sklearn.feature_extraction.text import CountVectorizer

def predictor(texts):
    vectorizer = CountVectorizer(stop_words='english', max_df=0.99, min_df=0.005)
    sequences = vectorizer.fit_transform(texts)
        
    label_map = {'real' : 0, 'satire' : 1, 'fake' : 2}
    token2idx = vectorizer.vocabulary_

    outputs = model(sequences)
    tensor_logits = outputs[0]
    probas = F.softmax(tensor_logits).detach().numpy()
    return probas

class_names = ['real', 'satire', 'fake']
explainer = LimeTextExplainer(class_names=class_names)
exp = explainer.explain_instance(test_str_1, predictor, num_features=20, num_samples=2000)
exp.show_in_notebook(text=text)

TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]