In [1]:
import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb

In [2]:
# Create a CheckList Editor instance.
# NOTE(review): `editor` is not referenced by any other cell visible here.
editor = Editor()


In [3]:
# Example sentences used as input for the perturbation demos in this notebook.
data = [
    'John is a very smart person, he lives in Ireland.',
    'Mark Stewart was born and raised in Chicago',
    'Luke Smith has 3 sisters.',
    'Mary is not a nurse.',
    'Julianne is an engineer.',
    'My brother Andrew used to be a lawyer.',
]

In [4]:
# Simple regex substitutions can be used to perturb words in test-set and training-set sentences.


import re
def change_professions(x, *args, **kwargs):
    """Swap each known profession found in ``x`` for every other known profession.

    Args:
        x: The input sentence (a string).
        *args: Accepted and ignored.
        **kwargs: Accepted and ignored.

    Returns:
        A list with one new sentence per (profession found, other profession)
        pair, or an empty list when no known profession occurs in ``x`` as a
        whole word. Only the profession word is replaced; the surrounding text
        (including the article "a"/"an") is left untouched.
    """
    professions = ['doctor', 'nurse', 'engineer', 'lawyer']
    variants = []
    for current in professions:
        # Whole-word match so that substrings of longer words are not replaced.
        pattern = re.compile(r'\b%s\b' % current)
        if not pattern.search(x):
            continue
        for replacement in professions:
            if replacement != current:
                variants.append(pattern.sub(replacement, x))
    return variants

In [5]:
# Try the perturbation on a single sentence ('Mary is not a nurse.').
change_professions(data[3])


['Mary is not a doctor.', 'Mary is not a engineer.', 'Mary is not a lawyer.']

In [6]:
# Perturb.perturb is basically a wrapper that applies a perturbation function
# to every element of a dataset. With keep_original=True, the recorded output
# lists each original sentence first, followed by its perturbed versions;
# sentences for which the function returned nothing do not appear at all.
ret = Perturb.perturb(data, change_professions, keep_original=True)
ret.data

[['Mary is not a nurse.',
  'Mary is not a doctor.',
  'Mary is not a engineer.',
  'Mary is not a lawyer.'],
 ['Julianne is an engineer.',
  'Julianne is an doctor.',
  'Julianne is an nurse.',
  'Julianne is an lawyer.'],
 ['My brother Andrew used to be a lawyer.',
  'My brother Andrew used to be a doctor.',
  'My brother Andrew used to be a nurse.',
  'My brother Andrew used to be a engineer.']]

In [17]:
# can also keep metadata to keep track of what was changed!

def change_professions(x, meta=False, *args, **kwargs):
    """Swap each known profession found in ``x`` for every other known profession.

    This redefinition shadows the earlier ``change_professions`` and adds
    optional metadata describing each substitution.

    Args:
        x: The input sentence (a string).
        meta: When truthy, also return the list of ``(old, new)`` profession
            pairs, aligned index-by-index with the perturbed sentences.
        *args: Accepted and ignored.
        **kwargs: Accepted and ignored.

    Returns:
        The list of perturbed sentences (empty when no known profession occurs
        in ``x`` as a whole word), or a ``(sentences, pairs)`` tuple when
        ``meta`` is truthy. Only the profession word is replaced; the
        surrounding text (including the article "a"/"an") is left untouched.
    """
    professions = ['doctor', 'nurse', 'engineer', 'lawyer']
    variants = []
    swaps = []
    for current in professions:
        # Whole-word match so that substrings of longer words are not replaced.
        pattern = re.compile(r'\b%s\b' % current)
        if pattern.search(x) is None:
            continue
        for replacement in professions:
            if replacement == current:
                continue
            variants.append(pattern.sub(replacement, x))
            swaps.append((current, replacement))
    return (variants, swaps) if meta else variants

In [19]:
# Same perturbation, now asking the function for metadata (meta=True) and
# limiting the result to a single example (nsamples=1).
# In the recorded output, ret.meta is aligned with ret.data: the original
# sentence gets None and each perturbed sentence gets its (old, new) pair.
# NOTE(review): which example is returned is presumably sampled at random — confirm.
ret = Perturb.perturb(data, change_professions, keep_original=True, nsamples=1, meta=True)
print('Data')
print(ret.data)
print('Metadata')
print(ret.meta)

Data
[['My brother Andrew used to be a lawyer.', 'My brother Andrew used to be a doctor.', 'My brother Andrew used to be a nurse.', 'My brother Andrew used to be a engineer.']]
Metadata
[[None, ('lawyer', 'doctor'), ('lawyer', 'nurse'), ('lawyer', 'engineer')]]


In [4]:
import spacy
# Load spaCy's small English pipeline and parse every sentence in `data`.
nlp = spacy.load('en_core_web_sm')

# NOTE(review): this cell needs `data` from the earlier cell. The NameError
# saved as this cell's output means it was run before `data` was defined
# (the execution counts are out of order) — re-run top to bottom.
pdata = list(nlp.pipe(data))


NameError: name 'data' is not defined

In [21]:
# strip_punctuation is given the parsed Doc; in the recorded output the result
# is a string with the trailing period removed and the inner comma kept.
pdata[0], Perturb.strip_punctuation(pdata[0])


(John is a very smart person, he lives in Ireland.,
 'John is a very smart person, he lives in Ireland')

In [22]:
# add_typos is called on the raw string here; the recorded output shows two
# adjacent characters swapped ('smart' -> 'smrat').
# NOTE(review): presumably random, so a re-run may produce a different typo — confirm.
data[0], Perturb.add_typos(data[0])

('John is a very smart person, he lives in Ireland.',
 'John is a very smrat person, he lives in Ireland.')

In [23]:
# contract is called on the raw string; the recorded output turns "is not" into "isn't".
data[3], Perturb.contract(data[3])


('Mary is not a nurse.', "Mary isn't a nurse.")

In [25]:
# Check what nlp.pipe produced: each element of pdata is a spaCy Doc.
type(pdata[0])

spacy.tokens.doc.Doc

In [28]:
# Entities spaCy found in the first sentence.
pdata[0].ents

(John, Ireland)

In [35]:
# Change the numbers found in the parsed sentences, keeping one example
# (nsamples=1) and requesting metadata.
# The perturbation function was missing from this call (`pdata,,` is a syntax
# error). Perturb.change_number matches the recorded output: the number is
# replaced and the metadata holds (old, new) pairs.
# n=3 and meta=True are extra keyword arguments on the perturb call; the
# recorded output has three perturbed versions after the original.
ret = Perturb.perturb(pdata, Perturb.change_number, nsamples=1, n=3, meta=True)
ret.data[0], ret.meta[0]

(['Luke Smith has 3 sisters.',
  'Luke Smith has 2 sisters.',
  'Luke Smith has 4 sisters.',
  'Luke Smith has 2 sisters.'],
 [None, ('3', '2'), ('3', '4'), ('3', '2')])

In [44]:
# Apply Perturb.add_negation to every parsed sentence.
# The recorded output pairs each original sentence with its negated version.
# Only four of the six input sentences appear there — presumably add_negation
# produced no result for the others (verify).
ret = Perturb.perturb(pdata, Perturb.add_negation)
ret.data

[['John is a very smart person, he lives in Ireland.',
  "John is a very smart person, he doesn't live in Ireland."],
 ['Mark Stewart was born and raised in Chicago',
  'Mark Stewart was not born and raised in Chicago'],
 ['Luke Smith has 3 sisters.', "Luke Smith doesn't have 3 sisters."],
 ['Julianne is an engineer.', 'Julianne is not an engineer.']]

In [50]:
# We can try to add negations on both or just one of the examples.
# NOTE(review): the call below is identical to the previous add_negation cell
# and its recorded output is the same; nothing here restricts the negation to
# one example, so the intent described above is not implemented yet.


ret = Perturb.perturb(pdata, Perturb.add_negation)
ret.data

[['John is a very smart person, he lives in Ireland.',
  "John is a very smart person, he doesn't live in Ireland."],
 ['Mark Stewart was born and raised in Chicago',
  'Mark Stewart was not born and raised in Chicago'],
 ['Luke Smith has 3 sisters.', "Luke Smith doesn't have 3 sisters."],
 ['Julianne is an engineer.', 'Julianne is not an engineer.']]

In [9]:
# Idea: train an ML model that auto-generates negations (since this perturbation doesn't work too well).



# Show what Perturb.remove_negation returns for a few negated sentences.
negated_sentences = ['This is not good', 'He didn\'t play the guitar', 'He doesn\'t play anything', 'She wasn\'t sad']
for sentence in negated_sentences:
    print(sentence)
    doc = nlp(sentence)  # remove_negation is given a parsed spaCy Doc, not a raw string
    print(Perturb.remove_negation(doc))
    print()

This is not good
This is good

He didn't play the guitar
He played the guitar

He doesn't play anything
He plays anything

She wasn't sad
She was sad

