capabilities on how it can test models:

1. Data Generation
2. Data created from existing examples in dataset
3. The "checklist" of all tests (MFT, INV, DIR)

Other:

4. The actual process of tests (QQP and BERT tutorial on Github)

Test cases (in the paper, about 2-3 times more errors were found in the following models):

5. Sentiment Analysis
6. QQP

In [1]:
import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb
from checklist.test_types import MFT, INV, DIR
from pattern.en import sentiment
import numpy as np

# Data Generation

In [2]:
#1. Data Generation
editor = Editor()

In [3]:
#Custom data creation
ret = editor.template('This is a {adj} movie.', adj=['good', 'great', 'awesome', 'excellent'])
ret.data

['This is a good movie.',
 'This is a great movie.',
 'This is a awesome movie.',
 'This is a excellent movie.']

In [4]:
ret = editor.template({'question': 'Is this a {adj} movie?',
                       'context': 'This is a {adj} movie.' },
                      labels='Yes, this is {adj}.',
                      adj=['good', 'great', 'awesome', 'excellent'])
print(ret.data[0])
print(ret.labels[0])
print()
print(ret.data[1])
print(ret.labels[1])
print()

{'question': 'Is this a good movie?', 'context': 'This is a good movie.'}
Yes, this is good.

{'question': 'Is this a great movie?', 'context': 'This is a great movie.'}
Yes, this is great.



In [5]:
#Available Lexicons
print(list(editor.lexicons.keys()))

['male', 'female', 'first_name', 'first_pronoun', 'last_name', 'country', 'nationality', 'city', 'religion', 'religion_adj', 'sexual_adj', 'sentiment', 'country_city', 'male_from', 'female_from', 'last_from']


In [6]:
ret = editor.template('{male} is not friends with {female}')
ret.data[0:4]

['John is not friends with Mary',
 'John is not friends with Elizabeth',
 'John is not friends with Margaret',
 'John is not friends with Sarah']

In [7]:
#can add different flags
editor.add_lexicon('adj', ['good', 'bad', 'great', 'terrible'])

In [8]:
#MLM capability also available using flag {mask}
ret = editor.template('This is {a:adj} {mask}.', remove_duplicates=True)
ret.data[:5]

['This is a good idea.',
 'This is a good example.',
 'This is a good sign.',
 'This is a good thing.',
 'This is a good one.']

In [9]:
editor.suggest('This is {a:adj} {mask}.')[:5]

['idea', 'example', 'sign', 'thing', 'one']

In [10]:
editor.visual_suggest('This is {a:mask} movie.')

TemplateEditor(bert_suggests=['amazing', 'excellent', 'interesting', 'awful', 'action', 'awesome', 'incredible…

# Data Perturbation

In [11]:
#creating example dataset
data = ['John is a very smart person, and lives in Ireland.',
        'Mark Stewart was born and raised in Chicago',
        'Luke Smith has 2 sisters.',
        'Mary is not a nurse.',
        'Julianne is an engineer.',
        'My brother Andrew used to be a doctor.']

In [12]:
#source for function: https://github.com/marcotcr/checklist
import re
def change_professions(x, *args, **kwargs):
    """Return variants of sentence *x* with each known profession swapped.

    For every profession that occurs in *x* as a whole word, one new
    sentence is produced per alternative profession. The result is an
    empty list when no known profession occurs. Extra positional and
    keyword arguments are accepted and ignored.
    """
    professions = ['doctor', 'nurse', 'engineer', 'lawyer']
    swapped = []
    for current in professions:
        # Word boundaries keep e.g. "nurses" from matching "nurse".
        pattern = re.compile(r'\b%s\b' % current)
        if pattern.search(x) is None:
            continue
        for replacement in professions:
            if replacement != current:
                swapped.append(pattern.sub(replacement, x))
    return swapped

In [13]:
change_professions(data[3])

['Mary is not a doctor.', 'Mary is not a engineer.', 'Mary is not a lawyer.']

In [14]:
#general use functions in documentation can be also used with spacy
import spacy
nlp = spacy.load('en_core_web_sm')

In [15]:
pdata = list(nlp.pipe(data))

1. Strip punctuation marks

In [16]:
ret = Perturb.perturb(pdata, Perturb.punctuation)
ret.data[:4]

[['John is a very smart person, and lives in Ireland.',
  'John is a very smart person, and lives in Ireland'],
 ['Mark Stewart was born and raised in Chicago',
  'Mark Stewart was born and raised in Chicago.'],
 ['Luke Smith has 2 sisters.', 'Luke Smith has 2 sisters'],
 ['Mary is not a nurse.', 'Mary is not a nurse']]

2. Generate typos

In [17]:
ret = Perturb.perturb(data, Perturb.add_typos, nsamples=1)
ret.data

[['Luke Smith has 2 sisters.', 'Luke Smit hhas 2 sisters.']]

3. Working with contractions

In [18]:
ret = Perturb.perturb(data, Perturb.contractions)
ret.data

[['Mary is not a nurse.', "Mary isn't a nurse."]]

4. Changing NE for NER

In [19]:
#can specifically edit first and last names, numbers and locations, showing first name functionality here
ret = Perturb.perturb(pdata, Perturb.change_names, nsamples=1, first_only=True, n=3)
ret.data

[['Luke Smith has 2 sisters.',
  'Edward Smith has 2 sisters.',
  'Joshua Smith has 2 sisters.',
  'Alexander Smith has 2 sisters.']]

5. Adding/Removing Negation

In [21]:
for t in ['This is not good', 'He didn\'t play the harmonica', 'He doesn\'t play football', 'She will not be sad']:
    print(t)
    print(Perturb.remove_negation(nlp(t)))
    print()

This is not good
This is good

He didn't play the harmonica
He played the harmonica

He doesn't play football
He plays football

She will not be sad
She will be sad



# Minimum Functionality Test (MFT) 

In [22]:
pos = ['good', 'enjoyable', 'exciting', 'excellent', 'amazing', 'great', 'engaging']
neg = ['bad', 'terrible', 'awful', 'horrible']

In [23]:
# Negated positive adjectives ("not good") are labelled 0 and negated negative
# adjectives ("not bad") are labelled 1; 100 samples are drawn per template.
# {mask} is the masked-language-model slot filled in by the editor.
# NOTE(review): label 1 presumably means "positive", matching the column order
# returned by predict_proba — confirm.
ret = editor.template('This is not {a:pos} {mask}.', pos=pos, labels=0, save=True, nsamples=100)
ret += editor.template('This is not {a:neg} {mask}.', neg=neg, labels=1, save=True, nsamples=100)

In [24]:
#adapted from Github Demo on https://github.com/marcotcr/checklist/blob/master/notebooks/tutorials/4.%20The%20CheckList%20process.ipynb
test = MFT(ret.data, labels=ret.labels, name='Simple negation',
           capability='Negation', description='Very simple negations.')

In [25]:
#sentiment analysis model imported from https://github.com/clips/pattern
def predict_proba(inputs):
    """Return an (n, 2) array of two-class scores for the given texts.

    The first element of ``sentiment(text)`` (presumably a polarity in
    [-1, 1] -- confirm against the pattern.en docs) is rescaled with
    ``(value + 1) / 2`` and stored in column 1; column 0 holds its
    complement, so each row sums to 1.
    """
    positive = np.array([(sentiment(text)[0] + 1) / 2. for text in inputs])
    negative = 1 - positive
    # column_stack turns the two 1-D score vectors into the columns of an (n, 2) array.
    return np.column_stack((negative, positive))

In [26]:
#checklist uses wrapped functions to adapt the predictor's output to the format its tests expect
from checklist.pred_wrapper import PredictorWrapper
wrapped_pp = PredictorWrapper.wrap_softmax(predict_proba)

In [27]:
test.run(wrapped_pp)
test.summary()

Predicting 200 examples
Test cases:      200
Fails (rate):    94 (47.0%)

Example fails:
0.8 This is not an amazing list.
----
0.0 This is not an awful problem.
----
0.8 This is not an enjoyable substitute.
----


In [28]:
test.visual_summary()

TestSummarizer(stats={'npassed': 106, 'nfailed': 94, 'nfiltered': 0}, summarizer={'name': 'Simple negation', '…

In terms of additional functionality, tests can also be run on predictions saved to a file (e.g. in softmax format), using the run_from_file() function.

# INV Tests (should showcase same behavior) 

In [29]:
#creating new dataset, movie statements
dataset = ['This was a very nice movie directed by John Smith.',
           'Mary Keen was amazing.', 
          'I hated everything about this.',
          'This movie was terrible.',
          'I really liked this movie.',
          'just bad.',
          'amazing.',
          ]
pdataset = list(nlp.pipe(dataset))

In [30]:
#similar process of creating perturbed data using checklist's function; here, adding typos
#IMP: the perturbed data must be passed to the test as a dictionary (unpacked with **); a function cannot be passed directly
t_dict = Perturb.perturb(dataset, Perturb.add_typos)
test = INV(**t_dict)

In [31]:
test.run(wrapped_pp)
test.summary()

Predicting 14 examples
Test cases:      7
Fails (rate):    2 (28.6%)

Example fails:
0.8 amazing.
0.5 amazin.g

----
0.8 Mary Keen was amazing.
0.5 Mary Keen was amzaing.

----


In [32]:
test.visual_summary()

TestSummarizer(stats={'npassed': 5, 'nfailed': 2, 'nfiltered': 0}, summarizer={'name': None, 'description': No…

# DIR tests (specific behavior expectation analysis)

In [33]:
#allows tests on paired and singular example cases, allows use of custom expectation functions as well
from checklist.expect import Expect
def changed_pred(orig_pred, pred, orig_conf, conf, labels=None, meta=None):
    """Report whether the new prediction differs from the original one.

    Only ``orig_pred`` and ``pred`` are compared; the confidence values,
    ``labels`` and ``meta`` are accepted but not used.
    """
    prediction_differs = pred != orig_pred
    return prediction_differs
expect_fn = Expect.pairwise(changed_pred)

In [34]:
t = Perturb.perturb(pdataset, Perturb.add_negation)

In [35]:
# Build a directional (DIR) test from the negation-perturbed data in `t`,
# passing expect_fn (the pairwise wrapper around changed_pred) as the expectation.
test = DIR(**t, expect=expect_fn)
# Run the test against the wrapped sentiment predictor, then show the visual summary.
test.run(wrapped_pp)
test.visual_summary()

Predicting 10 examples


TestSummarizer(stats={'npassed': 4, 'nfailed': 1, 'nfiltered': 0}, summarizer={'name': None, 'description': No…