In [1]:
import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb
from checklist.test_types import MFT, INV, DIR
from checklist.test_suite import TestSuite
from checklist.expect import Expect

In [2]:
import sys
import spacy
import numpy as np
processor = spacy.load('en_core_web_sm')

from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
model_name = "textattack/bert-base-uncased-QQP"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# sentiment analysis is a general name in Huggingface to load the pipeline for text classification tasks.
# set device=-1 if you don't have a gpu
pipe = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, framework="pt", device=0)

In [3]:
# our problem is extrememely similar to QQP


from datasets import load_dataset
qqp_data = load_dataset('glue', 'qqp', split='validation')
all_questions = set()
q1s = [d["question1"] for d in qqp_data]
q2s = [d["question2"] for d in qqp_data]
labels = np.array([d["label"] for d in qqp_data]).astype(int)

qs = list(zip(q1s, q2s))
qqp_data[0]

Reusing dataset glue (/home/eculbertson/.cache/huggingface/datasets/glue/qqp/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


{'question1': 'Why are African-Americans so beautiful?',
 'question2': 'Why are hispanics so beautiful?',
 'label': 0,
 'idx': 0}

In [4]:
from tqdm import tqdm
all_questions.update(set(q1s))
all_questions.update(set(q2s))
print(f"Total count of unique questions: {len(all_questions)}")
processed_qs = list(tqdm(processor.pipe(all_questions, batch_size=64)))

Total count of unique questions: 73324


73324it [01:08, 1065.21it/s]


In [5]:
spacy_map = {q: processed_q for (q, processed_q) in zip(all_questions, processed_qs)}
parsed_qs = [(spacy_map[q[0]], spacy_map[q[1]]) for q in qs]


In [6]:
suite = TestSuite()
editor = Editor()


In [7]:
# Changing the name should lead to different questions. 
# Idea: changing name should lead to non entailment? (can check that entailment prob does NOT increase)

t = editor.template((
    'Is {first_name} {last_name} {mask}?',
    'Is {first_name2} {last_name} {mask}?',
    ),
    remove_duplicates=True, 
    nsamples=300)
test = MFT(**t, labels=0, name='same adjectives, different people', capability = 'NER',
          description='Different first name, same adjective and last name')
suite.add(test)
print(t.data[0])
print(t.data[1])

  to_pred = torch.tensor(to_pred, device=self.device).to(torch.int64)


('Is Julia Perry Married?', 'Is Robin Perry Married?')
('Is Rebecca Ross gone?', 'Is Grace Ross gone?')


In [12]:
t.data

[[('Who do you think will win, Trump or Hillary?',
   'Who is going to win, Trump or Hillary?'),
  ('Who do you think will win, Trump or Kayla?',
   'Who is going to win, Trump or Hillary?'),
  ('Who do you think will win, Trump or Kimberly?',
   'Who is going to win, Trump or Hillary?'),
  ('Who do you think will win, Trump or Karen?',
   'Who is going to win, Trump or Hillary?'),
  ('Who do you think will win, Trump or Nicole?',
   'Who is going to win, Trump or Hillary?'),
  ('Who do you think will win, Trump or Katie?',
   'Who is going to win, Trump or Hillary?'),
  ('Who do you think will win, Trump or Amanda?',
   'Who is going to win, Trump or Hillary?'),
  ('Who do you think will win, Trump or Monica?',
   'Who is going to win, Trump or Hillary?'),
  ('Who do you think will win, Trump or Vanessa?',
   'Who is going to win, Trump or Hillary?'),
  ('Who do you think will win, Trump or Lauren?',
   'Who is going to win, Trump or Hillary?'),
  ('Who do you think will win, Trump or

In [8]:
def change_name_on_one(qs):
    q1, q2 = qs
    c1 = Perturb.change_names(q1, seed=1, meta=True)
    c2 = Perturb.change_names(q2, seed=1, meta=True)
    if not c1 or not c2:
        return
    c1, m1 = c1
    c2, m2 = c2
    ret = []
    ret.extend([(q1_, str(q2)) for q1_, m1_ in zip(c1, m1) if m1_[0] in str(q2)])
    ret.extend([(str(q1), q2_) for q2_, m2_ in zip(c2, m2) if m2_[0] in str(q1)])
    return ret

In [10]:

expect_fn = Expect.eq(0) # expect predictions to be equal to a value
expect_fn = Expect.slice_orig(expect_fn, lambda orig, *args: orig == 1)

In [11]:
t = Perturb.perturb(parsed_qs, change_name_on_one, nsamples=200)
name = 'Change name in one of the questions'
desc = 'Take pairs that are originally predicted as duplicates, change name in one of them and expect new prediction to be non-duplicate'
test = DIR(**t, expect=expect_fn, name=name, description=desc, capability='NER')
suite.add(test)
print(t.data[0][0])
print(t.data[0][1])
print(t.data[0][2])

('Who do you think will win, Trump or Hillary?', 'Who is going to win, Trump or Hillary?')
('Who do you think will win, Trump or Kayla?', 'Who is going to win, Trump or Hillary?')
('Who do you think will win, Trump or Kimberly?', 'Who is going to win, Trump or Hillary?')


In [13]:
np.random.seed(14)
i = np.random.choice(len(qs))
qs[i]

('Which company should I join as a fresher, TCS or Virtusa?',
 'Is it a good decision to join Tcs as a fresher?')

In [14]:
', '.join(editor.suggest('{mask} is a large tech company.')[:40])


'Apple, Google, Facebook, This, Microsoft, Amazon, Uber, It, Intel, Samsung, Netflix, Tesla, Twitter, LinkedIn, Oracle, Target, Snap, Disney, AMD, Bloomberg, Sony, That, Wikipedia, China, Here, Fox, this, HP, FB, YouTube, Reddit, Ford, Pinterest, Harris, MIT, GE, CBS, Dialog, Square, Orange'

In [15]:
example = ('Which company should I join as a freshman, Google or Facebook?', 'Should I join Google as a freshman?')
pipe([example])

[{'label': 'LABEL_0', 'score': 0.987102210521698}]

In [16]:
def pred_and_conf(data):
    raw_preds = pipe(data)
    preds = np.array([ int(p["label"][-1]) for p in raw_preds])
    pp = np.array([[p["score"], 1-p["score"]] if int(p["label"][-1]) == 0 else [1-p["score"], p["score"]] for p in raw_preds])
    return preds, pp

In [19]:
suite.run(pred_and_conf, overwrite=True)


Running same adjectives, different people
Predicting 298 examples
Running Change name in one of the questions
Predicting 3912 examples


RuntimeError: CUDA out of memory. Tried to allocate 1.10 GiB (GPU 0; 8.00 GiB total capacity; 5.29 GiB already allocated; 650.44 MiB free; 5.36 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF