# Creating checklist test suite for SQuAD

Source: code from https://github.com/marcotcr/checklist/blob/115f123de47ab015b2c3a6baebaffb40bab80c9f/notebooks/SQuAD.ipynb with some changes

In [1]:
%load_ext autoreload
%autoreload 2

import checklist
import spacy
import itertools

import checklist.editor
import checklist.text_generation
from checklist.test_types import MFT, INV, DIR
from checklist.expect import Expect
from checklist.test_suite import TestSuite
import numpy as np
from checklist.perturb import Perturb


### load the default hugging face model for question-answering
[`distilbert-base-cased-distilled-squad`](https://huggingface.co/distilbert-base-cased-distilled-squad)

In [78]:
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
    AutoModelForQuestionAnswering, Trainer, TrainingArguments, HfArgumentParser
from transformers import pipeline 

model = pipeline('question-answering', model="distilbert-base-cased-distilled-squad")
model({
    'context': 'A new strain of flu that has the potential to become a pandemic has been identified by scientists.',
    'question': 'What has been discovered by scientists?'
})


{'score': 0.38112872838974,
 'start': 0,
 'end': 19,
 'answer': 'A new strain of flu'}

In [3]:
def predconfs(context_question_pairs):
    """
    output: predictions, confidence 
    source: https://github.com/marcotcr/checklist/blob/115f123de47ab015b2c3a6baebaffb40bab80c9f/notebooks/tutorials/5.%20Testing%20transformer%20pipelines.ipynb
    """
    preds = []
    confs = []
    for c, q in context_question_pairs:
        try:
            p = model(question=q, context=c, truncation=True, )
        except:
            print('Failed', q)
            preds.append(' ')
            confs.append(1)
        preds.append(p['answer'])
        confs.append(p['score'])
    return preds, np.array(confs)

def format_squad_with_context(x, pred, conf, label=None, *args, **kwargs):
    c, q = x
    ret = 'C: %s\nQ: %s\n' % (c, q)
    if label is not None:
        ret += 'A: %s\n' % label
    ret += 'P: %s\n' % pred
    return ret

def format_squad(x, pred, conf, label=None, *args, **kwargs):
    c, q = x
    ret = 'Q: %s\n' % (q)
    if label is not None:
        ret += 'A: %s\n' % label
    ret += 'P: %s\n' % pred
    return ret

In [4]:
editor = checklist.editor.Editor()
suite = TestSuite()

# Generating Tests

## Vocabulary

In [5]:
suggestions1 = editor.suggest('{first_name} is {mask} than {first_name2}.')[:60]
print(suggestions1)

  to_pred = torch.tensor(to_pred, device=self.device).to(torch.int64)


['smarter', 'older', 'better', 'younger', 'taller', 'worse', 'stronger', 'different', 'shorter', 'cooler', 'nicer', 'tougher', 'bigger', 'hotter', 'happier', 'smaller', 'wiser', 'more', 'faster', 'richer', 'darker', 'thinner', 'weaker', 'less', 'larger', 'quieter', 'cleaner', 'heavier', 'healthier', 'closer', 'colder', 'slower', 'wealthier', 'quicker', 'longer', 'harder', 'safer', 'lighter', 'warmer', 'brighter', 'cheaper', 'sharper', 'higher', 'louder', 'thicker', 'greater', 'lower', 'easier', 'deeper', 'poorer', 'softer', 'smoother', 'simpler', 'stranger', 'newer', 'other', 'superior', 'clearer', 'stricter', 'tighter']


In [6]:
suggestions1 = [x for x in suggestions1 if x.endswith("er")][:20]

### A is COMP than B. Who is more COMP?

In [7]:
t = editor.template(
    [(
    '{first_name} is {adj} than {first_name1}.',
    'Who is {adj}?'
    )
    ],
    labels = ['{first_name}'],
    adj=suggestions1,
    remove_duplicates=True,
    nsamples=500,
    save=True
    )
name = 'A is COMP than B. Who is more COMP?'
description = ''
test = MFT(**t, name=name, description=description, capability='Vocabulary')
suite.add(test)

In [8]:
test.run(predconfs, n=100, overwrite=True)

Predicting 100 examples


In [9]:
test.summary(format_example_fn=format_squad_with_context)

Test cases:      498
Test cases run:  100
Fails (rate):    4 (4.0%)

Example fails:
C: Claire is stronger than Donald.
Q: Who is stronger?
A: Claire
P: Donald

----
C: Christine is stronger than Richard.
Q: Who is stronger?
A: Christine
P: Christine is stronger than Richard

----
C: Katie is older than Adam.
Q: Who is older?
A: Katie
P: Katie is older than Adam

----


### A is COMP than B. Who is less COMP?

In [10]:
t = editor.template(
    [(
    '{first_name} is {adj} than {first_name1}.',
    'Who is less {adj}?'
    )
    ],
    labels = ['{first_name1}'],
    adj=suggestions1,
    remove_duplicates=True,
    nsamples=500,
    save=True
    )
name = 'A is COMP than B. Who is less COMP?'
description = ''
test = MFT(**t, name=name, description=description, capability='Vocabulary')
suite.add(test)

In [11]:
test.run(predconfs, n=100)

Predicting 100 examples


In [12]:
test.summary(format_example_fn=format_squad_with_context, n=3)

Test cases:      499
Test cases run:  100
Fails (rate):    100 (100.0%)

Example fails:
C: Tom is tougher than Jennifer.
Q: Who is less tougher?
A: Jennifer
P: Tom

----
C: Kim is bigger than Heather.
Q: Who is less bigger?
A: Heather
P: Kim

----
C: Alan is bigger than Bobby.
Q: Who is less bigger?
A: Bobby
P: Alan

----


### Intensifiers (very, super, extremely) and reducers (somewhat, kinda, etc)?

In [13]:
def crossproduct(t):
    # takes the output of editor.template and does the cross product of contexts and qas
    ret = []
    ret_labels = []
    for x in t.data:
        cs = x['contexts']
        qas = x['qas']
        d = list(itertools.product(cs, qas))
        ret.append([(x[0], x[1][0]) for x in d])
        ret_labels.append([x[1][1] for x in d])
    t.data = ret
    t.labels = ret_labels
    return t


In [14]:
state = editor.suggest('John is very {mask} about the project.')[:20]
print(', '.join(editor.suggest('John is {mask} {state} about the project.', state=state)[:30]))
very = ['very', 'extremely', 'really', 'quite', 'incredibly', 'particularly', 'highly', 'super']
somewhat = ['a little', 'somewhat', 'slightly', 'mildly']

very, pretty, extremely, also, still, quite, more, really, not, clearly, fairly, incredibly, particularly, now, understandably, rather, cautiously, surprisingly, certainly, feeling, so, especially, definitely, generally, most, highly, super, reportedly, being, obviously


In [15]:
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is {very} {s} about the project. {first_name1} is {s} about the project.',
            '{first_name1} is {s} about the project. {first_name} is {very} {s} about the project.',
            '{first_name} is {s} about the project. {first_name1} is {somewhat} {s} about the project.',
            '{first_name1} is {somewhat} {s} about the project. {first_name} is {s} about the project.',
            '{first_name} is {very} {s} about the project. {first_name1} is {somewhat} {s} about the project.',
            '{first_name1} is {somewhat} {s} about the project. {first_name} is {very} {s} about the project.',
        ],
        'qas': [
            (
                'Who is most {s} about the project?',
                '{first_name}'
            ), 
            (
                'Who is least {s} about the project?',
                '{first_name1}'
            ), 
            
        ]
        
    },
    s = state,
    very=very,
    somewhat=somewhat,
    remove_duplicates=True,
    nsamples=500,
    save=True
    ))
name = 'Intensifiers (very, super, extremely) and reducers (somewhat, kinda, etc)?'
desc = ''
test = MFT(**t, name=name, description=desc, capability='Vocabulary')
suite.add(test)


In [16]:
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)


Predicting 1200 examples
Test cases:      500
Test cases run:  100
Fails (rate):    100 (100.0%)

Example fails:
C: Billy is somewhat passionate about the project. Lisa is passionate about the project.
Q: Who is least passionate about the project?
A: Billy
P: Lisa

C: Lisa is passionate about the project. Billy is somewhat passionate about the project.
Q: Who is least passionate about the project?
A: Billy
P: Lisa

C: Lisa is extremely passionate about the project. Billy is somewhat passionate about the project.
Q: Who is least passionate about the project?
A: Billy
P: Lisa


----
C: Diane is highly skeptical about the project. Jay is slightly skeptical about the project.
Q: Who is least skeptical about the project?
A: Jay
P: Diane

C: Diane is skeptical about the project. Jay is slightly skeptical about the project.
Q: Who is most skeptical about the project?
A: Diane
P: Jay

C: Diane is highly skeptical about the project. Jay is skeptical about the project.
Q: Who is most skeptical a

## Taxonomy

### Size, chape, color, age, material

In [17]:
import munch
order = ['size', 'shape', 'age', 'color']
props = []
properties = {
    'color' : ['red', 'blue','yellow', 'green', 'pink', 'white', 'black', 'orange', 'grey', 'purple', 'brown'],
    'size' : ['big', 'small', 'tiny', 'enormous'],
    'age' : ['old', 'new'],
    'shape' : ['round', 'oval', 'square', 'triangular'],
    'material' : ['iron', 'wooden', 'ceramic', 'glass', 'stone']
}
for i in range(len(order)):
    for j in range(i + 1, len(order)):
        p1, p2 = order[i], order[j]
        for v1, v2 in itertools.product(properties[p1], properties[p2]):
            props.append(munch.Munch({
                'p1': p1,
                'p2': p2,
                'v1': v1,
                'v2': v2,
            }))


In [18]:
print(', '.join(editor.suggest('There is {a:p.v1} {p.v2} {mask} in the room.', p=props, verbose=False)[:30]))
objects = ['box', 'clock', 'table', 'object', 'toy', 'painting', 'sculpture', 'thing', 'figure']


sofa, couch, wall, carpet, chair, table, light, lamp, door, clock, mirror, desk, bed, TV, bar, television, window, box, tree, painting, curtain, fan, fridge, screen, wallpaper, piano, rug, shelf, camera, candle


In [19]:
t = crossproduct(editor.template(
    {
        'contexts': [
            'There is {a:p.v1} {p.v2} {obj} in the room.',
            'There is {a:obj} in the room. The {obj} is {p.v1} and {p.v2}.',
        ],
        'qas': [
            (
                'What {p.p1} is the {obj}?',
                '{p.v1}'
            ), 
            (
                'What {p.p2} is the {obj}?',
                '{p.v2}'
            ), 
            
        ]
    },
    obj=objects,
    p=props,
    remove_duplicates=True,
    nsamples=500,
    save=True
    ))
name = 'size, shape, age, color'
desc = ''
test = MFT(**t, name=name, description=desc, capability='Taxonomy')
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 400 examples
Test cases:      500
Test cases run:  100
Fails (rate):    99 (99.0%)

Example fails:
C: There is a figure in the room. The figure is tiny and white.
Q: What size is the figure?
A: tiny
P: tiny and white

C: There is a tiny white figure in the room.
Q: What size is the figure?
A: tiny
P: white


----
C: There is a new yellow thing in the room.
Q: What age is the thing?
A: new
P: yellow

C: There is a thing in the room. The thing is new and yellow.
Q: What age is the thing?
A: new
P: new and yellow


----
C: There is a box in the room. The box is tiny and orange.
Q: What size is the box?
A: tiny
P: tiny and orange

C: There is a tiny orange box in the room.
Q: What size is the box?
A: tiny
P: orange


----


### Professions vs nationalities

In [20]:
professions = editor.suggest('{first_name} works as {a:mask}.')[:30]
professions += editor.suggest('{first_name} {last_name} works as {a:mask}.')[:30]
professions = list(set(professions))
if 'translator' in professions:
    professions.remove('translator')
print(professions)

['journalist', 'attorney', 'producer', 'economist', 'executive', 'escort', 'administrator', 'waitress', 'editor', 'entrepreneur', 'analyst', 'educator', 'activist', 'intern', 'organizer', 'assistant', 'photographer', 'reporter', 'DJ', 'author', 'engineer', 'actress', 'nurse', 'investigator', 'agent', 'architect', 'actor', 'secretary', 'artist', 'model', 'interpreter', 'investor', 'accountant', 'historian']


In [21]:
def clean(string):
    return string.lstrip('[a,the,an,in,at] ').rstrip('.')

In [22]:
def expect_squad(x, pred, conf, label=None, meta=None):
    return clean(pred) == clean(label)
expect_squad = Expect.single(expect_squad)

In [23]:
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is {a:nat} {prof}.',
            '{first_name} is {a:prof}. {first_name} is {nat}.',
            '{first_name} is {nat}. {first_name} is {a:prof}.',
            '{first_name} is {nat} and {a:prof}.',
            '{first_name} is {a:prof} and {nat}.',
        ],
        'qas': [
            (
                'What is {first_name}\'s job?',
                '{prof}'
            ), 
            (
                'What is {first_name}\'s nationality?',
                '{nat}'
            ), 
            
        ]
        
    },
    nat = editor.lexicons['nationality'][:10],
    prof=professions,
    remove_duplicates=True,
    nsamples=500,
    save=True,
    ))
name = 'Profession vs nationality'
test = MFT(**t, name=name, expect=expect_squad, description='',  capability='Taxonomy')
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 1000 examples
Test cases:      500
Test cases run:  100
Fails (rate):    20 (20.0%)

Example fails:
C: Keith is a Pakistani escort.
Q: What is Keith's job?
A: escort
P: Pakistani escort


----
C: Marie is a Russian executive.
Q: What is Marie's job?
A: executive
P: Russian executive


----
C: Susan is a Bangladeshi secretary.
Q: What is Susan's job?
A: secretary
P: Bangladeshi secretary


----


### Animal vs vehicle

In [24]:
animals = ['dog', 'cat', 'bull', 'cow', 'fish', 'serpent', 'snake', 'lizard', 'hamster', 'rabbit', 'guinea pig', 'iguana', 'duck']
vehicles = ['car', 'truck', 'train', 'motorcycle', 'bike', 'firetruck', 'tractor', 'van', 'SUV', 'minivan']
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} has {a:animal} and {a:vehicle}.',
            '{first_name} has {a:vehicle} and {a:animal}.',
        ],
        'qas': [
            (
                'What animal does {first_name} have?',
                '{animal}'
            ), 
            (
                'What vehicle does {first_name} have?',
                '{vehicle}'
            ), 
            
        ]
        
    },
    animal=animals,
    vehicle=vehicles,
    remove_duplicates=True,
    nsamples=500,
    save=True
    ))
name = 'Animal vs Vehicle'
test = MFT(**t, name=name, description='', capability='Taxonomy', expect=expect_squad)
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test, overwrite=True)


Predicting 400 examples
Test cases:      500
Test cases run:  100
Fails (rate):    57 (57.0%)

Example fails:
C: Donna has a fish and a van.
Q: What vehicle does Donna have?
A: van
P: a fish and a van


----
C: Joe has a serpent and a car.
Q: What vehicle does Joe have?
A: car
P: a serpent and a car

C: Joe has a car and a serpent.
Q: What vehicle does Joe have?
A: car
P: a car and a serpent


----
C: Alex has a cow and a firetruck.
Q: What vehicle does Alex have?
A: firetruck
P: a cow and a firetruck

C: Alex has a firetruck and a cow.
Q: What animal does Alex have?
A: cow
P: firetruck and a cow

C: Alex has a firetruck and a cow.
Q: What vehicle does Alex have?
A: firetruck
P: firetruck and a cow


----


### Animal vs vehicle v2
who bought?

In [25]:
animals = ['dog', 'cat', 'bull', 'cow', 'fish', 'serpent', 'snake', 'lizard', 'hamster', 'rabbit', 'guinea pig', 'iguana', 'duck']
vehicles = ['car', 'truck', 'train', 'motorcycle', 'bike', 'firetruck', 'tractor', 'van', 'SUV', 'minivan']
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} bought {a:animal}. {first_name2} bought {a:vehicle}.',
            '{first_name2} bought {a:vehicle}. {first_name} bought {a:animal}.',
        ],
        'qas': [
            (
                'Who bought an animal?',
                '{first_name}'
            ), 
            (
                'Who bought a vehicle?',
                '{first_name2}'
            ), 
            
        ]
        
    },
    animal=animals,
    vehicle=vehicles,
    remove_duplicates=True,
    nsamples=500,
    save=True
    ))
name = 'Animal vs Vehicle v2'
test = MFT(**t, name=name, description='', capability='Taxonomy', expect=expect_squad)
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test, overwrite=True)

Predicting 400 examples
Test cases:      500
Test cases run:  100
Fails (rate):    68 (68.0%)

Example fails:
C: Bill bought a snake. Heather bought a minivan.
Q: Who bought a vehicle?
A: Heather
P: Bill

C: Heather bought a minivan. Bill bought a snake.
Q: Who bought an animal?
A: Bill
P: Heather bought a minivan. Bill


----
C: Cynthia bought a snake. Harriet bought a train.
Q: Who bought a vehicle?
A: Harriet
P: Cynthia

C: Harriet bought a train. Cynthia bought a snake.
Q: Who bought a vehicle?
A: Harriet
P: Harriet bought a train. Cynthia


----
C: Judith bought a minivan. Pamela bought a bull.
Q: Who bought an animal?
A: Pamela
P: Judith


----


### Synonyms

In [26]:
synonyms = [ ('spiritual', 'religious'), ('angry', 'furious'), ('organized', 'organised'),
            ('vocal', 'outspoken'), ('grateful', 'thankful'), ('intelligent', 'smart'),
            ('humble', 'modest'), ('courageous', 'brave'), ('happy', 'joyful'), ('scared', 'frightened'),
           ]

t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is very {s1[0]}. {first_name2} is very {s2[0]}.',
            '{first_name2} is very {s2[0]}. {first_name} is very {s1[0]}.',
        ],
        'qas': [
            (
                'Who is {s1[1]}?',
                '{first_name}'
            ), 
            (
                'Who is {s2[1]}?',
                '{first_name2}'
            ), 
            
        ]
        
    },
    s=synonyms,
    remove_duplicates=True,
    nsamples=250,
    save=True
   ))
t += crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is very {s1[1]}. {first_name2} is very {s2[1]}.',
            '{first_name2} is very {s2[1]}. {first_name} is very {s1[1]}.',
        ],
        'qas': [
            (
                'Who is {s1[0]}?',
                '{first_name}'
            ), 
            (
                'Who is {s2[0]}?',
                '{first_name2}'
            ), 
            
        ]
        
    },
    s=synonyms,
    remove_duplicates=True,
    nsamples=250,
    save=True
    )) 
name = 'Synonyms'
test = MFT(**t, name=name, description='', capability='Taxonomy', expect=expect_squad)
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 400 examples
Test cases:      439
Test cases run:  100
Fails (rate):    17 (17.0%)

Example fails:
C: Larry is very happy. Tim is very grateful.
Q: Who is joyful?
A: Larry
P: Tim


----
C: Alex is very happy. Jennifer is very courageous.
Q: Who is joyful?
A: Alex
P: Jennifer


----
C: Sally is very outspoken. Hugh is very smart.
Q: Who is vocal?
A: Sally
P: Hugh


----


### A is COMP than B. Who is antonym(COMP)? B


In [27]:
comp_pairs = [('better', 'worse'), ('older', 'younger'), ('smarter', 'dumber'), ('taller', 'shorter'), ('bigger', 'smaller'), ('stronger', 'weaker'), ('faster', 'slower'), ('darker', 'lighter'), ('richer', 'poorer'), ('happier', 'sadder'), ('louder', 'quieter'), ('warmer', 'colder')]
comp_pairs = list(set(comp_pairs))#list(set(comp_pairs + [(x[1], x[0]) for x in comp_pairs]))

In [28]:
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is {comp[0]} than {first_name1}.',
            '{first_name1} is {comp[1]} than {first_name}.',
        ],
        'qas': [
            (
                'Who is {comp[1]}?',
                '{first_name1}',
            ),
            (
                'Who is {comp[0]}?',
                '{first_name}',
            )
            
        ]
        ,
    },
    comp=comp_pairs,
    remove_duplicates=True,
    nsamples=500,
    save=True
    ))
name = 'A is COMP than B. Who is antonym(COMP)? B'
test = MFT(**t, name=name, description='', capability='Taxonomy')
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 400 examples
Test cases:      498
Test cases run:  100
Fails (rate):    100 (100.0%)

Example fails:
C: Adam is weaker than Helen.
Q: Who is stronger?
A: Helen
P: Adam

C: Helen is stronger than Adam.
Q: Who is weaker?
A: Adam
P: Helen


----
C: Arthur is older than Kate.
Q: Who is younger?
A: Kate
P: Arthur

C: Kate is younger than Arthur.
Q: Who is older?
A: Arthur
P: Kate


----
C: Nick is faster than Rose.
Q: Who is slower?
A: Rose
P: Nick

C: Rose is slower than Nick.
Q: Who is faster?
A: Nick
P: Rose


----


### A is more X than B. Who is more antonym(X)? B. Who is less X? B. Who is more X? A. Who is less antonym(X)? A.

In [29]:
antonym_adjs = [('progressive', 'conservative'),('religious', 'secular'),('positive', 'negative'),('defensive', 'offensive'),('rude',  'polite'),('optimistic', 'pessimistic'),('stupid', 'smart'),('negative', 'positive'),('unhappy', 'happy'),('active', 'passive'),('impatient', 'patient'),('powerless', 'powerful'),('visible', 'invisible'),('fat', 'thin'),('bad', 'good'),('cautious', 'brave'), ('hopeful', 'hopeless'),('insecure', 'secure'),('humble', 'proud'),('passive', 'active'),('dependent', 'independent'),('pessimistic', 'optimistic'),('irresponsible', 'responsible'),('courageous', 'fearful')]
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is more {a[0]} than {first_name1}.',
            '{first_name1} is more {a[1]} than {first_name}.',
            '{first_name} is less {a[1]} than {first_name1}.',
            '{first_name1} is less {a[0]} than {first_name}.',
        ],
        'qas': [
            (
                'Who is more {a[0]}?',
                '{first_name}',
            ),
            (
                'Who is less {a[0]}?',
                '{first_name1}',
            ),
            (
                'Who is more {a[1]}?',
                '{first_name1}',
            ),
            (
                'Who is less {a[1]}?',
                '{first_name}',
            ),
        ]
        ,
    },
    a = antonym_adjs,
    remove_duplicates=True,
    nsamples=500,
    save=True
    ))
name = 'A is more X than B. Who is more antonym(X)? B. Who is less X? B. Who is more X? A. Who is less antonym(X)? A.'
test = MFT(**t, name=name, description='', capability='Taxonomy')
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 1600 examples
Test cases:      498
Test cases run:  100
Fails (rate):    100 (100.0%)

Example fails:
C: Helen is more smart than Bobby.
Q: Who is less smart?
A: Bobby
P: Helen

C: Helen is less stupid than Bobby.
Q: Who is less smart?
A: Bobby
P: Helen

C: Helen is less stupid than Bobby.
Q: Who is more stupid?
A: Bobby
P: Helen


----
C: Elizabeth is more passive than Jane.
Q: Who is less passive?
A: Jane
P: Elizabeth

C: Elizabeth is less active than Jane.
Q: Who is less passive?
A: Jane
P: Elizabeth

C: Elizabeth is more passive than Jane.
Q: Who is more active?
A: Jane
P: Elizabeth


----
C: Howard is more thin than Fiona.
Q: Who is less thin?
A: Fiona
P: Howard

C: Howard is less fat than Fiona.
Q: Who is more fat?
A: Fiona
P: Howard

C: Howard is less fat than Fiona.
Q: Who is less thin?
A: Fiona
P: Howard


----


## Robustness

In [30]:
dataset = datasets.load_dataset('squad')
pairs = [(x['context'], x['question']) for x in dataset['train']]

Found cached dataset squad (C:/Users/fgmal/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

In [31]:
# import pickle
# # IF YOU ALREADY RAN THIS CELL, JUMP TO NEXT
# # source: https://github.com/marcotcr/checklist/blob/115f123de47ab015b2c3a6baebaffb40bab80c9f/notebooks/QQP.ipynb
# nlp = spacy.load('en_core_web_sm')
# all_questions = set() # a set of all questions and context
# for x in dataset['train']:
#     all_questions.add(x['question'])
#     all_questions.add(x['context'])


# all_questions = list(all_questions)
# parsed_questions = list(nlp.pipe(all_questions)) # this takes a while to run 
# spacy_map = dict([(x, y) for x, y in zip(all_questions, parsed_questions)])


# pickle.dump(spacy_map, open('processed_squad.pkl', 'wb'))

In [32]:
import pickle
spacy_map =  pickle.load(open('processed_squad.pkl', 'rb'))

In [33]:
processed_pairs = [(spacy_map[x[0]], spacy_map[x[1]]) for x in pairs]

In [34]:
# check
print("text:",pairs[0][0])
print("entities:",spacy_map[pairs[0][0]].ents)

text: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
entities: (Catholic, the Main Building's, the Virgin Mary., the Main Building, Venite Ad Me Omnes, the Main Building, the Sacred Heart, Grotto, Marian, Lourdes, France, the Virgin Mary, Saint Bernadette Soubirous, 1858, 3, the Gold Dome, Mary)


### Question Typo

In [35]:
def question_typo(x):
    """
    x[0]: context
    x[1]: question 
    Perturb.add_typos(x[1]): add a typo to question 
    """
    return (x[0], Perturb.add_typos(x[1]))
t = Perturb.perturb(pairs, question_typo, nsamples=500)
test = INV(**t, name='Question typo', capability='Robustness', description='')
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad)
suite.add(test, overwrite=True)

Predicting 200 examples
Test cases:      500
Test cases run:  100
Fails (rate):    16 (16.0%)

Example fails:
Q: What is resistance to antibiotics a cause of?
P: evolutionary processes

Q: What is resistance to atnibiotics a cause of?
P: evolutionary processes that take place during antibiotic therapy


----
Q: What was the second reform?
P: the provision for the conscription of every male Prussian of military age

Q: What was the secon dreform?
P: reorganization of the army that integrated the regular army and the Landwehr reserves


----
Q: What was Darwin's response to the claims that certain animals could not have evolved through natural selection?
P: Darwin proposed scenarios

Q: What was Dariwn's response to the claims that certain animals could not have evolved through natural selection?
P: George Jackson Mivart


----


### Question contractions

In [36]:
def contractions(x):
    conts = Perturb.contractions(x[1])
    return [(x[0], a) for a in conts]
t = Perturb.perturb(pairs, contractions, nsamples=500)
test = INV(**t, name='Question contractions', capability='Robustness', description='')
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad)
suite.add(test)

Predicting 200 examples
Test cases:      500
Test cases run:  100
Fails (rate):    11 (11.0%)

Example fails:
Q: Where is Paris economic Centre located?
P: west of the city

Q: Where's Paris economic Centre located?
P: west


----
Q: Where did Gershwin and Heyward write their folk opera?
P: Folly Beach

Q: Where'd Gershwin and Heyward write their folk opera?
P: Folly Beach outside of Charleston


----
Q: What is the longest half life of the isotopes?
P: 243.66 days

Q: What's the longest half life of the isotopes?
P: 13.76 h


----


Add random sentence

In [37]:
random_sentences = set()
for x, _ in processed_pairs:
    for y in x.sents:
        random_sentences.add(y.text)
random_sentences = list(random_sentences)

In [38]:
# check
for y in processed_pairs[0][0].sents:
    print(y.text)

Architecturally, the school has a Catholic character.
Atop the Main Building's gold dome is a golden statue of the Virgin Mary.
Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".
Next to the Main Building is the Basilica of the Sacred Heart.
Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection.
It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.
At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.


In [39]:
len(random_sentences)

92328

### Add random sentence to context

In [40]:
def add_random_sentence(x, **kwargs):
    random_s = np.random.choice(random_sentences)
    while random_s in x[0]:
        random_s = np.random.choice(random_sentences)
    random_s = random_s.strip('.') + '. '
    meta = ['add to end: %s' % random_s, 'add to beg: %s' % random_s]
    return [(x[0] + random_s, x[1]), (random_s + x[0], x[1])], meta

def format_add(x, pred, conf, label=None, meta=None):
    ret = format_squad(x, pred, conf, label, meta)
    if meta:
        ret += 'Perturb: %s\n' % meta
    return ret

t = Perturb.perturb(pairs, add_random_sentence, nsamples=500, meta=True)
test = INV(**t, name='Add random sentence to context', capability='Robustness', description='')
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_add)
suite.add(test)

Predicting 300 examples
Test cases:      500
Test cases run:  100
Fails (rate):    20 (20.0%)

Example fails:
Q: What does corruption disregard in politics?
P: democracy and good governance by flouting or even subverting formal processes

Q: What does corruption disregard in politics?
P: democracy and good governance
Perturb: add to beg: In May 2008, Israel confirmed it had been discussing a peace treaty with Syria for a year, with Turkey as a go-between. 


----
Q: There is a museum on Whitney Avenue that contain a variety of historical treasure, what is its' name?
P: Eli Whitney Museum

Q: There is a museum on Whitney Avenue that contain a variety of historical treasure, what is its' name?
P: New Haven Museum and Historical Society
Perturb: add to beg: During the embryonic stage of many insects and the postembryonic stage of primitive insects, 11 abdominal segments are present. 


----
Q: How were some of Marvel's genre titles published in the 1970s?
P: larger-format black and white 

## NER

In [41]:
import re
def change_thing(change_fn):
    def change_both(cq, **kwargs):
        context, question = cq
        a = change_fn(context, meta=True)
        if not a:
            return None
        changed, meta = a
        ret = []
        for c, m in zip(changed, meta):
            new_q = re.sub(r'\b%s\b' % re.escape(m[0]), m[1], question.text)
            ret.append((c, new_q))
        return ret, meta
    return change_both
            

In [42]:
def expect_same(orig_pred, pred, orig_conf, conf, labels=None, meta=None):
    if not meta:
        return pred == orig_pred
    return pred == re.sub(r'\b%s\b' % re.escape(meta[0]), meta[1], orig_pred)

def format_replace(x, pred, conf, label=None, meta=None):
    ret = format_squad(x, pred, conf, label, meta)
    if meta:
        ret += 'Perturb: %s -> %s\n' % meta
    return ret

def format_replace_context(x, pred, conf, label=None, meta=None):
    ret = format_squad_with_context(x, pred, conf, label, meta)
    if meta:
        ret += 'Perturb: %s -> %s\n' % meta
    return ret

In [43]:
t = Perturb.perturb(processed_pairs, change_thing(Perturb.change_names), nsamples=500, meta=True)

test = INV(**t, name='Change name everywhere', capability='NER',
          description='', expect=Expect.pairwise(expect_same))
test.run(predconfs, n=100)
test.summary(3, format_example_fn=format_replace)
suite.add(test, overwrite=True)

Predicting 1100 examples
Test cases:      500
Test cases run:  100
Fails (rate):    5 (5.0%)

Example fails:
Q: Who broke the world record for simultaneous fireworks?
P: Roy Lowry

Q: Who broke the world record for simultaneous fireworks?
P: Joseph Cook of the University of Plymouth
Perturb: Roy Lowry -> Joseph Cook


----
Q: What was Henry's reply to Bell's statement that he lacked information to complete his telegraph?
P: Get it!"

Q: What was Peter's reply to Bell's statement that he lacked information to complete his telegraph?
P: "Get it!"
Perturb: Henry -> Peter


----
Q: Prior to what century had there been little contact between western europe and greek civilization?
P: 18th

Q: Prior to what century had there been little contact between western europe and greek civilization?
P: 18th century
Perturb: James Stuart -> John Hall


----


### Change location everywhere

In [44]:
t = Perturb.perturb(processed_pairs, change_thing(Perturb.change_location), nsamples=500, meta=True)

test = INV(**t, name='Change location everywhere', capability='NER',
          description='', expect=Expect.pairwise(expect_same))
test.run(predconfs, n=100)
test.summary(3, format_example_fn=format_replace)
suite.add(test, overwrite=True)

Predicting 1100 examples
Test cases:      500
Test cases run:  100
Fails (rate):    10 (10.0%)

Example fails:
Q: Which was the last amateur team to compete for the Canadian football championship?
P: The Ontario Rugby Football Union

Q: Which was the last amateur team to compete for the Canadian football championship?
P: Ontario Rugby Football Union
Perturb: Canada -> Ukraine

Q: Which was the last amateur team to compete for the Canadian football championship?
P: Ontario Rugby Football Union
Perturb: Canada -> Thailand


----
Q: Demotic speaking Greek refugees were placed where?
P: Greek Isles and cities

Q: Demotic speaking Greek refugees were placed where?
P: the Greek Isles and cities
Perturb: Greece -> Nepal

Q: Demotic speaking Greek refugees were placed where?
P: the Greek Isles and cities
Perturb: Turkey -> Italy


----
Q: After Estonia achieved independence what was made their state language?
P: Estonian

Q: After Mexico achieved independence what was made their state language

## Temporal

### There was a change in profession

In [45]:
t = crossproduct(editor.template(
    {
        'contexts': [
            'Both {first_name} and {first_name2} were {prof1}s, but there was a change in {first_name}, who is now {a:prof2}.',
            'Both {first_name2} and {first_name} were {prof1}s, but there was a change in {first_name}, who is now {a:prof2}.',
        ],
        'qas': [
            (
                'Who is {a:prof2}?',
                '{first_name}'
            ), 
        ]
        
    },
    save=True,
    prof=professions,
    remove_duplicates=True,
    nsamples=500,
    ))
name = 'There was a change in profession'
test = MFT(**t, expect=expect_squad, capability='Temporal', name=name, description='' )
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 200 examples
Test cases:      485
Test cases run:  100
Fails (rate):    0 (0.0%)


### Understanding before / after -> first / last.

In [46]:
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} became a {prof} before {first_name2} did.',
            '{first_name2} became a {prof} after {first_name} did.',
        ],
        'qas': [
            (
                'Who became a {prof} first?',
                '{first_name}'
            ), 
            (
                'Who became a {prof} last?',
                '{first_name2}'
            ), 
        ]
        
    },
    save=True,
    prof=professions,
    remove_duplicates=True,
    nsamples=500,
    ))
name = 'Understanding before / after -> first / last.'
test = MFT(**t, expect=expect_squad, capability='Temporal', name=name, description='' )
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)


Predicting 400 examples
Test cases:      495
Test cases run:  100
Fails (rate):    100 (100.0%)

Example fails:
C: Victoria became a editor before Pamela did.
Q: Who became a editor first?
A: Victoria
P: Pamela

C: Pamela became a editor after Victoria did.
Q: Who became a editor first?
A: Victoria
P: Pamela


----
C: Ian became a actress before Stephanie did.
Q: Who became a actress last?
A: Stephanie
P: Ian

C: Stephanie became a actress after Ian did.
Q: Who became a actress first?
A: Ian
P: Stephanie


----
C: Philip became a attorney before Jonathan did.
Q: Who became a attorney last?
A: Jonathan
P: Philip

C: Jonathan became a attorney after Philip did.
Q: Who became a attorney last?
A: Jonathan
P: Philip


----


## Negation

### Negation in context, may or may not be in question

In [47]:
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is not {a:prof}. {first_name2} is.',
            '{first_name2} is {a:prof}. {first_name} is not.',
        ],
        'qas': [
            (
                'Who is {a:prof}?',
                '{first_name2}'
            ), 
            (
                'Who is not {a:prof}?',
                '{first_name}'
            ), 
        ]
        
    },
    save=True,
    prof=professions,
    remove_duplicates=True,
    nsamples=500,
    ))
name = 'Negation in context, may or may not be in question'
test = MFT(**t, expect=expect_squad, capability='Negation', name=name, description='' )
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 400 examples
Test cases:      497
Test cases run:  100
Fails (rate):    92 (92.0%)

Example fails:
C: Amy is not an interpreter. Jean is.
Q: Who is an interpreter?
A: Jean
P: Amy


----
C: Martha is not a photographer. Jay is.
Q: Who is a photographer?
A: Jay
P: Martha


----
C: Sam is not an agent. Tom is.
Q: Who is an agent?
A: Tom
P: Sam


----


### Negation in question only.
Not in context:

In [None]:

t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} is {a:prof}. {first_name2} is {a:prof2}.',
            '{first_name2} is {a:prof2}. {first_name} is {a:prof}.',
        ],
        'qas': [
            (
                'Who is {a:prof}?',
                '{first_name}'
            ), 
            (
                'Who is not {a:prof}?',
                '{first_name2}'
            ), 
            (
                'Who is {a:prof2}?',
                '{first_name2}'
            ), 
            (
                'Who is not {a:prof2}?',
                '{first_name}'
            ), 
        ]
        
    },
    prof=professions,
    remove_duplicates=True,
    nsamples=500,
    ))
name = 'Negation in question only.'
test = MFT(**t, expect=expect_squad, capability='Negation', name=name, description='' )
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 800 examples
Test cases:      486
Test cases run:  100
Fails (rate):    100 (100.0%)

Example fails:
C: Lucy is an investigator. Edward is an escort.
Q: Who is not an investigator?
A: Edward
P: Lucy

C: Lucy is an investigator. Edward is an escort.
Q: Who is not an escort?
A: Lucy
P: Edward

C: Edward is an escort. Lucy is an investigator.
Q: Who is not an investigator?
A: Edward
P: Lucy


----
C: Thomas is an agent. Steven is a journalist.
Q: Who is not an agent?
A: Steven
P: Thomas

C: Thomas is an agent. Steven is a journalist.
Q: Who is not a journalist?
A: Thomas
P: Steven

C: Steven is a journalist. Thomas is an agent.
Q: Who is not an agent?
A: Steven
P: Thomas


----
C: Melissa is an interpreter. Catherine is a waitress.
Q: Who is not an interpreter?
A: Catherine
P: Melissa

C: Melissa is an interpreter. Catherine is a waitress.
Q: Who is not a waitress?
A: Melissa
P: Catherine

C: Catherine is a waitress. Melissa is an interpreter.
Q: Who is not an interpreter?
A: C

## Fairness spinoff

### M/F failure rates should be similar for different professions

In [61]:
import collections
fewer_profs = ['doctor', 'nurse', 'secretary', 'CEO']
t = editor.template(
    [
        ('{male} is not {a:prof}, {female} is.', 'Who is {a:prof}?', '{female}', 'woman', '{prof}'),
        ('{female} is not {a:prof}, {male} is.', 'Who is {a:prof}?', '{male}', 'man', '{prof}'),
    ],
#     prof=professions + ['doctor'],
    prof=fewer_profs,
    remove_duplicates=True,
    nsamples=1000,
    unroll=True,
    save=True,
    )
data = [(d[0], d[1]) for d in t.data]
labels = [d[2] for d in t.data]
meta = [(d[3], d[4]) for d in t.data]

test = MFT(data, expect=expect_squad, labels=labels, meta=meta, templates=t.templates,
          name='M/F failure rates should be similar for different professions', capability='Fairness',
          description='Using negation in context.')
test.run(predconfs, n=100)

def print_fair(test):
    c = collections.Counter(test.meta)
    fail = collections.Counter([tuple(x) for x in np.array(test.meta)[test.fail_idxs()]])
    profs = set()
    for sex, prof in fail:
        profs.add(prof)
    prof_fail = {}
    get_fail = lambda f:fail[f] / c[f]
    for prof in profs:
        fail_m = get_fail(('man', prof))
        fail_f = get_fail(('woman', prof))
        prof_fail[prof] = (fail_m, fail_f)
    print('%-13s fail_men fail_women (count)' % 'profession')
    for prof, vs in sorted(prof_fail.items(), key=lambda x:max(x[1][0], x[1][1]), reverse=True):
        fail_m, fail_f = vs
        print('%-13s   %.1f      %.1f     (%d)' % (prof, 100 * fail_m, 100 * fail_f, c[('man', prof)]))
print_fair(test)
suite.add(test)

Predicting 100 examples
profession    fail_men fail_women (count)
doctor          5.2      5.6     (232)
CEO             5.1      4.8     (294)
secretary       3.0      4.2     (236)
nurse           4.2      4.2     (238)


In [52]:
test.summary(format_example_fn=format_squad_with_context)

Test cases:      2000
Test cases run:  100
Fails (rate):    84 (84.0%)

Example fails:
C: Emma is not a CEO, Alfred is.
Q: Who is a CEO?
A: Alfred
P: Emma

----
C: John is not a nurse, Kate is.
Q: Who is a nurse?
A: Kate
P: John

----
C: Lisa is not a CEO, Benjamin is.
Q: Who is a CEO?
A: Benjamin
P: Lisa

----


## Coref

### Basic coref, he / she

In [62]:
if 'actress' in professions:
    professions.remove('actress')

In [63]:
t = crossproduct(editor.template(
    {
        'contexts': [
            '{male} and {female} are friends. He is {a:prof1}, and she is {a:prof2}.',
            '{female} and {male} are friends. He is {a:prof1}, and she is {a:prof2}.',
            '{male} and {female} are friends. She is {a:prof2}, and he is {a:prof1}.',
            '{female} and {male} are friends. She is {a:prof2}, and he is {a:prof1}.',
        ],
        'qas': [
            (
                'Who is {a:prof1}?',
                '{male}'
            ), 
            (
                'Who is {a:prof2}?',
                '{female}'
            ), 
        ]
        
    },
    save=True,
    prof=professions,
    remove_duplicates=True,
    nsamples=500,
    ))
name = 'Basic coref, he / she'
test = MFT(**t, expect=expect_squad, name=name, description='', capability='Coref')
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 800 examples
Test cases:      488
Test cases run:  100
Fails (rate):    100 (100.0%)

Example fails:
C: Kenneth and Betty are friends. He is an attorney, and she is an entrepreneur.
Q: Who is an attorney?
A: Kenneth
P: Kenneth and Betty

C: Kenneth and Betty are friends. He is an attorney, and she is an entrepreneur.
Q: Who is an entrepreneur?
A: Betty
P: Kenneth and Betty

C: Betty and Kenneth are friends. He is an attorney, and she is an entrepreneur.
Q: Who is an attorney?
A: Kenneth
P: Betty and Kenneth


----
C: Joseph and Nancy are friends. He is an editor, and she is an investigator.
Q: Who is an editor?
A: Joseph
P: Joseph and Nancy

C: Joseph and Nancy are friends. He is an editor, and she is an investigator.
Q: Who is an investigator?
A: Nancy
P: Joseph and Nancy

C: Nancy and Joseph are friends. He is an editor, and she is an investigator.
Q: Who is an editor?
A: Joseph
P: Nancy and Joseph


----
C: Lawrence and Amy are friends. He is an attorney, and she is an ar

### Basic coref, his / her

In [64]:
t = crossproduct(editor.template(
    {
        'contexts': [
            '{male} and {female} are friends. His mom is {a:prof}.',
            '{female} and {male} are friends. His mom is {a:prof}.',
        ],
        'qas': [
            (
                'Whose mom is {a:prof}?',
                '{male}'
            ), 
        ]
        
    },
    save=True,
    prof=professions,
    remove_duplicates=True,
    nsamples=250,
    ))
t += crossproduct(editor.template(
    {
        'contexts': [
            '{male} and {female} are friends. Her mom is {a:prof}.',
            '{female} and {male} are friends. Her mom is {a:prof}.',
        ],
        'qas': [
            (
                'Whose mom is {a:prof}?',
                '{female}'
            ), 
        ]
        
    },
    save=True,
    prof=professions,
    remove_duplicates=True,
    nsamples=250,
    ))

name = 'Basic coref, his / her'
test = MFT(**t, expect=expect_squad, name=name, description='', capability='Coref')
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 200 examples
Test cases:      500
Test cases run:  100
Fails (rate):    94 (94.0%)

Example fails:
C: Lawrence and Evelyn are friends. Her mom is an educator.
Q: Whose mom is an educator?
A: Evelyn
P: Lawrence and Evelyn

C: Evelyn and Lawrence are friends. Her mom is an educator.
Q: Whose mom is an educator?
A: Evelyn
P: Evelyn and Lawrence


----
C: James and Grace are friends. Her mom is a model.
Q: Whose mom is a model?
A: Grace
P: James and Grace

C: Grace and James are friends. Her mom is a model.
Q: Whose mom is a model?
A: Grace
P: Grace and James


----
C: Arthur and Michelle are friends. His mom is an actor.
Q: Whose mom is an actor?
A: Arthur
P: Arthur and Michelle

C: Michelle and Arthur are friends. His mom is an actor.
Q: Whose mom is an actor?
A: Arthur
P: Michelle and Arthur


----


### Former / latter

In [65]:
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} and {first_name2} are friends. The former is {a:prof1}.',
            '{first_name2} and {first_name} are friends. The latter is {a:prof1}.',
            '{first_name} and {first_name2} are friends. The former is {a:prof1} and the latter is {a:prof2}.',
            '{first_name2} and {first_name} are friends. The former is {a:prof2} and the latter is {a:prof1}.',
        ],
        'qas': [
            (
                'Who is {a:prof1}?',
                '{first_name}'
            ), 
        ]
        
    },
    prof=professions,
    remove_duplicates=True,
    nsamples=500,
    save=True
    ))
name = 'Former / Latter'
test = MFT(**t, expect=expect_squad, name=name, description='', capability='Coref')
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 400 examples
Test cases:      485
Test cases run:  100
Fails (rate):    100 (100.0%)

Example fails:
C: Catherine and Jim are friends. The former is an organizer.
Q: Who is an organizer?
A: Catherine
P: Catherine and Jim

C: Jim and Catherine are friends. The latter is an organizer.
Q: Who is an organizer?
A: Catherine
P: Jim and Catherine

C: Catherine and Jim are friends. The former is an organizer and the latter is an interpreter.
Q: Who is an organizer?
A: Catherine
P: Catherine and Jim


----
C: Alison and Frank are friends. The former is a model.
Q: Who is a model?
A: Alison
P: Alison and Frank

C: Frank and Alison are friends. The latter is a model.
Q: Who is a model?
A: Alison
P: Frank and Alison

C: Alison and Frank are friends. The former is a model and the latter is a nurse.
Q: Who is a model?
A: Alison
P: Alison and Frank


----
C: Jay and Keith are friends. The former is a producer.
Q: Who is a producer?
A: Jay
P: Jay and Keith

C: Keith and Jay are friends. The

## SRL
Semantic Role Labelling

### Agent / object distinction

In [67]:
import pattern
import pattern.en
pverb = ['love', 'hate', 'like', 'remember', 'recognize', 'trust', 'deserve', 'understand', 'blame', 'dislike', 'prefer', 'follow', 'notice', 'hurt', 'bother', 'support', 'believe', 'accept', 'attack']
a = pattern.en.tenses('loves')[0]
b = pattern.en.tenses('stolen')[0]
pverb = [(pattern.en.conjugate(v, *a), pattern.en.conjugate(v, *b)) for v in pverb]

t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} {v[0]} {first_name2}.',
            '{first_name2} is {v[1]} by {first_name}.',
        ],
        'qas': [
            (
                'Who {v[0]}?',
                '{first_name}'
            ), 
            (
                'Who is {v[1]}?',
                '{first_name2}'
            ), 
        ]
        
    },
    v=pverb,
    remove_duplicates=True,
    nsamples=500,
    ))
name = 'Agent / object distinction'
test = MFT(**t, expect=expect_squad, name=name, description='', capability='SRL')
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)

Predicting 400 examples
Test cases:      499
Test cases run:  100
Fails (rate):    72 (72.0%)

Example fails:
C: Jonathan hurts Ann.
Q: Who hurts?
A: Jonathan
P: Ann

C: Ann is hurt by Jonathan.
Q: Who hurts?
A: Jonathan
P: Ann


----
C: David is preferred by Sally.
Q: Who prefers?
A: Sally
P: David is preferred by Sally


----
C: Caroline hates Nancy.
Q: Who is hated?
A: Nancy
P: Caroline hates Nancy

C: Nancy is hated by Caroline.
Q: Who hates?
A: Caroline
P: Nancy is hated by Caroline


----


In [72]:
pattern.en.tenses('loves')[0]

('present', 3, 'singular', 'indicative', 'imperfective')

### Agent / object distinction with 3 agents

In [74]:
t = crossproduct(editor.template(
    {
        'contexts': [
            '{first_name} {v[0]} {first_name2}. {first_name2} {v[0]} {first_name3}.',
            '{first_name} {v[0]} {first_name2}. {first_name3} is {v[1]} by {first_name2}.',
            '{first_name2} is {v[1]} by {first_name}. {first_name2} {v[0]} {first_name3}.',
            '{first_name2} is {v[1]} by {first_name}. {first_name3} is {v[1]} by {first_name2}.',
        ],
        'qas': [
            (
                'Who {v[0]} {first_name2}?',
                '{first_name}'
            ), 
            (
                'Who {v[0]} {first_name3}?',
                '{first_name2}'
            ), 
            (
                'Who is {v[1]} by {first_name}?',
                '{first_name2}'
            ), 
            (
                'Who is {v[1]} by {first_name2}?',
                '{first_name3}'
            ), 
        ]
        
    },
    save=True,
    v=pverb,
    remove_duplicates=True,
    nsamples=500,
    ))
name = 'Agent / object distinction with 3 agents'
test = MFT(**t, expect=expect_squad, name=name, description='', capability='SRL')
test.run(predconfs, n=100)
test.summary(n=3, format_example_fn=format_squad_with_context)
suite.add(test)


Predicting 1600 examples
Test cases:      495
Test cases run:  100
Fails (rate):    98 (98.0%)

Example fails:
C: Alan blames Jimmy. Jimmy blames Kevin.
Q: Who is blamed by Alan?
A: Jimmy
P: Kevin

C: Alan blames Jimmy. Kevin is blamed by Jimmy.
Q: Who blames Kevin?
A: Jimmy
P: Alan

C: Alan blames Jimmy. Kevin is blamed by Jimmy.
Q: Who is blamed by Alan?
A: Jimmy
P: Kevin


----
C: Edward prefers Rebecca. Rebecca prefers Judy.
Q: Who prefers Judy?
A: Rebecca
P: Edward

C: Edward prefers Rebecca. Judy is preferred by Rebecca.
Q: Who prefers Judy?
A: Rebecca
P: Edward

C: Edward prefers Rebecca. Judy is preferred by Rebecca.
Q: Who is preferred by Edward?
A: Rebecca
P: Judy


----
C: Julia follows Nicole. Nicole follows Marie.
Q: Who follows Marie?
A: Nicole
P: Julia

C: Julia follows Nicole. Nicole follows Marie.
Q: Who is followed by Nicole?
A: Marie
P: Julia

C: Julia follows Nicole. Marie is followed by Nicole.
Q: Who follows Marie?
A: Nicole
P: Julia


----


In [None]:
path = 'squad_suite2.pkl'
suite.save(path)

In [None]:
suite.summary(n=3, format_example_fn=format_squad_with_context)

Vocabulary

A is COMP than B. Who is more COMP?
Test cases:      497
Test cases run:  100
Fails (rate):    1 (1.0%)

Example fails:
C: Alison is faster than Sue.
Q: Who is faster?
A: Alison
P: Alison is faster than Sue

----


A is COMP than B. Who is less COMP?
Test cases:      497
Test cases run:  100
Fails (rate):    98 (98.0%)

Example fails:
C: Marilyn is happier than Ed.
Q: Who is less happier?
A: Ed
P: Marilyn

----
C: Susan is happier than Matt.
Q: Who is less happier?
A: Matt
P: Susan

----
C: Ed is thinner than Ann.
Q: Who is less thinner?
A: Ann
P: Ed

----


Intensifiers (very, super, extremely) and reducers (somewhat, kinda, etc)?
Test cases:      496
Test cases run:  100
Fails (rate):    100 (100.0%)

Example fails:
C: Evelyn is incredibly open about the project. Kathy is open about the project.
Q: Who is most open about the project?
A: Evelyn
P: Kathy

C: Evelyn is incredibly open about the project. Kathy is a little open about the project.
Q: Who is most open about the 

# save the generated Q&As in json

In [None]:
import json

In [None]:
format_fn = lambda x: json.dumps({'passage': x[0], 'question': x[1]})
suite.to_raw_file('squad.jsonl', format_fn=format_fn)

In [None]:

format_fn = lambda x: {'passage': x[0], 'question': x[1]}
suite.to_raw_file('squad.json', format_fn=format_fn, file_format='squad')

In [76]:
test_ranges = {}
current_idx = 0
all_examples = []
for name, t in suite.tests.items():
    examples = t.to_raw_examples()
    suite.test_ranges[name] = (current_idx, current_idx + len(examples))
    current_idx += len(examples)
    all_examples.extend(examples)


In [77]:
suite.test_ranges

{'A is COMP than B. Who is more COMP?': (0, 498),
 'A is COMP than B. Who is less COMP?': (498, 997),
 'Intensifiers (very, super, extremely) and reducers (somewhat, kinda, etc)?': (997,
  6997),
 'size, shape, age, color': (6997, 8997),
 'Profession vs nationality': (8997, 13997),
 'Animal vs Vehicle': (13997, 15997),
 'Animal vs Vehicle v2': (15997, 17997),
 'Synonyms': (17997, 19753),
 'A is COMP than B. Who is antonym(COMP)? B': (19753, 21745),
 'A is more X than B. Who is more antonym(X)? B. Who is less X? B. Who is more X? A. Who is less antonym(X)? A.': (21745,
  29713),
 'Question typo': (29713, 30713),
 'Question contractions': (30713, 31716),
 'Add random sentence to context': (31716, 33216),
 'Change name everywhere': (33216, 38716),
 'Change location everywhere': (38716, 44216),
 'There was a change in profession': (44216, 45186),
 'Understanding before / after -> first / last.': (45186, 47166),
 'Negation in context, may or may not be in question': (47166, 49154),
 'M/F fa

In [None]:
all_examples

["('Victoria is stranger than Frank.', 'Who is stranger?')",
 "('Ashley is stronger than Alan.', 'Who is stronger?')",
 "('Bobby is nicer than Thomas.', 'Who is nicer?')",
 "('Benjamin is slower than Simon.', 'Who is slower?')",
 "('Sally is older than Melissa.', 'Who is older?')",
 "('Martha is richer than Hugh.', 'Who is richer?')",
 "('Charlie is nicer than Colin.', 'Who is nicer?')",
 "('Ron is nicer than Christopher.', 'Who is nicer?')",
 "('Ken is cooler than Al.', 'Who is cooler?')",
 "('Jane is smarter than Alice.', 'Who is smarter?')",
 "('Angela is older than Amanda.', 'Who is older?')",
 "('Fred is slower than Lucy.', 'Who is slower?')",
 "('Don is faster than Patricia.', 'Who is faster?')",
 "('Judith is richer than Jessica.', 'Who is richer?')",
 "('Samuel is cleaner than Bob.', 'Who is cleaner?')",
 "('Edith is weaker than Lisa.', 'Who is weaker?')",
 "('Kathy is smaller than Walter.', 'Who is smaller?')",
 "('Michael is faster than Rose.', 'Who is faster?')",
 "('Ed is c