In [1]:
# ! pip install checklist
# ! pip install nltk
# ! pip install pandas
# ! pip install simpletransformers

In [2]:
import nltk
import spacy
# nltk.download('omw-1.4')

In [3]:
import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb

import pandas as pd
import numpy as np
from simpletransformers.classification import ClassificationModel

np.random.seed(42)

editor = Editor()

In [4]:
#load local model
model = ClassificationModel("bert", "outputs", use_cuda = False)


In [5]:
#load subset
subset = pd.read_csv("data\olid-subset-diagnostic-tests.csv", index_col = 0)
subset

Unnamed: 0_level_0,text,labels
id,Unnamed: 1_level_1,Unnamed: 2_level_1
89200,@USER @USER Who the hell does he think he is?,1
71294,#BREAKING. #Greece: Molotov cocktails fly afte...,1
55633,"#OrrinHatch I can’t believe this sexist , clue...",1
16856,@USER @USER I'll use that one the next time im...,1
26072,0-1 lost my acca on the first fucking fight cba,1
...,...,...
45518,@USER He is obviously getting suspended. He is...,0
51610,#Canada - EXCLUSIVE: #Trudeau #Liberals leave ...,0
26758,@USER @USER ...than why did you show us how ho...,0
30718,@USER @USER @USER You have yet to answer what ...,0


In [6]:
#need spacy model
nlp = spacy.load('en_core_web_sm')

In [7]:
#convert instances to list, then to spacy DOC tokens. Needed for checklist negation function
data = subset["text"].tolist()
pdataset = list(nlp.pipe(data))
pdataset


[@USER @USER Who the hell does he think he is?,
 #BREAKING. #Greece: Molotov cocktails fly after protest honouring killed antifa arti... URL via @USER URL,
 #OrrinHatch I can’t believe this sexist , clueless, old fart gets to weigh in on another woman’s charges against a Supreme Court nominee. And he is spouting the same old nasty shit he spewed 20+ years ago against Anita Hill. His time’s up! Good riddance Neanderthal!,
 @USER @USER I'll use that one the next time im in a gun control debate or in a debate about free speech or taxes. Yes you can choose to be irresponsible or choose not to be. I argue responsible. Whats wrong with that? Don't justify murder by saying it was never alive or its my right.,
 0-1 lost my acca on the first fucking fight cba,
 #Bakersfield is why we need gun control! Sorry for the victims other than the gunman himself I hope he rots in hell!,
 #Christian #America – If we go by #Trump’s example, where liberals support open borders, I guess conservatives support

In [8]:
#add negation to sentences
#only works on the second try for some reason, so quick patch
try:
    ret = Perturb.perturb(pdataset, Perturb.add_negation, keep_original=True)
except:
    ret = Perturb.perturb(pdataset, Perturb.add_negation, keep_original=True)

In [9]:
#question 2 & 3: look at first 10 examples
ret.data[:10]

[['@USER @USER Who the hell does he think he is?',
  "@USER @USER Who the hell doesn't he think he is?"],
 ['#BREAKING. #Greece: Molotov cocktails fly after protest honouring killed antifa arti... URL via @USER URL',
  "#BREAKING. #Greece: Molotov cocktails don't fly after protest honouring killed antifa arti... URL via @USER URL"],
 ['#OrrinHatch I can’t believe this sexist , clueless, old fart gets to weigh in on another woman’s charges against a Supreme Court nominee. And he is spouting the same old nasty shit he spewed 20+ years ago against Anita Hill. His time’s up! Good riddance Neanderthal!',
  '#OrrinHatch I can’t believe this sexist , clueless, old fart gets to weigh in on another woman’s charges against a Supreme Court nominee. And he is not spouting the same old nasty shit he spewed 20+ years ago against Anita Hill. His time’s up! Good riddance Neanderthal!'],
 ["@USER @USER I'll use that one the next time im in a gun control debate or in a debate about free speech or taxes.

In [17]:
len(ret.data)

#only 95 out of 100 perturbed

95

In [12]:
#make prediction for original data and perturbed data to compare
original_predictions=model.predict([sent[0] for _, sent in enumerate(ret.data)])
perturb_predictions=model.predict([sent[1] for _, sent in enumerate(ret.data)])

  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

In [21]:
print(f'gold labels: {subset["labels"].tolist()}')
print(f'perturb predictions: {perturb_predictions[0]}')
print(f'original predictions: {original_predictions[0]}')

gold labels: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
perturb predictions: [1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
original predictions: [1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [22]:
#check where model assigned different label after perturb
compare_order = [0 if i==j else 1 for i, j in zip(perturb_predictions[0],original_predictions[0])]
print(compare_order)
print(f'amount different: {sum(compare_order)}')
#only 1 sentence changed label

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
amount different: 1


In [23]:
#get index of tweet that was classified differently
idx=compare_order.index(1)
print(ret.data[idx][0])
print(ret.data[idx][1])

@USER #MAGA who cares about the farm.  He had no reason to commit murder. End of story.
@USER #MAGA who cares about the farm.  He didn't have no reason to commit murder. End of story.


In [16]:
#get id of tweet
subset.loc[subset['text'].str.contains("cause everyone knows a real leader falls into a van like a sack of potatoes")]

Unnamed: 0_level_0,text,labels
id,Unnamed: 1_level_1,Unnamed: 2_level_1
79934,@USER 'cause everyone knows a real leader fall...,1
