In this notebook, I use the CheckList framework to build a template-based evaluation set comprising four tests of pronoun-related capabilities:
* linking names and pronouns
* the usage of multiple pronouns by an individual
* distinguishing the singular and plural usage of *hen*
* distinguishing the usage of *die* as a relative and as a personal pronoun

The suite is stored in the file `suite.pkl`, and the individual sentences are stored in `suite.txt`. The evaluation itself is done in the notebook `Test_suite_Evaluation.ipynb`.

In [1]:
# NOTE(review): hard-coded absolute path to one user's checkout. The relative paths used
# later in this notebook (e.g. 'config.toml', '../eval/gn_names.txt') are resolved from here.
%cd /hpc/uu_cs_nlpsoc/gvanboven/wl-coref

/hpc/uu_cs_nlpsoc/gvanboven/wl-coref


In [21]:
import argparse

import jsonlines
import torch
from tqdm import tqdm
import numpy as np

from coref.config import Config
from coref.const import Doc, Span
from typing import List, TextIO, Dict, Callable, Type
from collections import defaultdict

import checklist
from checklist.pred_wrapper import PredictorWrapper
from checklist.editor import Editor
from checklist.perturb import Perturb
from checklist.test_types import MFT, INV, DIR
from checklist.expect import Expect
from checklist.test_suite import TestSuite
import json

from coref import CorefModel
from coref.tokenizer_customization import *

In [7]:
# Configuration file and experiment name handed to CorefModel in load_model().
config_file = 'config.toml'
experiment = 'xlm-roberta'
# NOTE(review): not referenced anywhere else in this notebook.
input_file = 'sample_input.jsonlines'

# Checkpoint paths for the two model variants (relative to the working directory).
weights_debiased = 'data/model_checkpoints/xlm_gn_comb_fine_248/xlm-roberta_e26.pt'
weights_regular = 'data/model_checkpoints/xlm_regular_248/xlm-roberta_e18.pt'

In [8]:
def build_doc(doc: dict, model: CorefModel) -> dict:
    """
    Add the subword-level fields needed for prediction to a raw document.

    The document is modified in place and also returned.

    :param doc: document dict; must contain "cased_words" (a list of word strings)
    :param model: model object providing `tokenizer` and `config.bert_model`

    :returns doc: the same dict, extended with "word2subword", "subwords", "word_id",
                  "head2span", "word_clusters", "span_clusters" and, when it was
                  missing, "speaker"
    """
    bert_name = model.config.bert_model
    keep_piece = TOKENIZER_FILTERS.get(bert_name, lambda _: True)
    overrides = TOKENIZER_MAPS.get(bert_name, {})

    spans = []   # per word: (first subword index, one past the last subword index)
    pieces = []  # all kept subwords of the document, in order
    owners = []  # per subword: index of the word it came from
    for idx, word in enumerate(doc["cased_words"]):
        # A word with a hand-written tokenisation bypasses the tokenizer.
        if word in overrides:
            raw_pieces = overrides[word]
        else:
            raw_pieces = model.tokenizer.tokenize(word)
        kept = [piece for piece in raw_pieces if keep_piece(piece)]
        first = len(pieces)
        spans.append((first, first + len(kept)))
        pieces += kept
        owners += [idx] * len(kept)

    doc["word2subword"] = spans
    doc["subwords"] = pieces
    doc["word_id"] = owners

    doc["head2span"] = []
    if "speaker" not in doc:
        # No speaker information available: use a placeholder for every word.
        doc["speaker"] = ["_" for _ in doc["cased_words"]]
    doc["word_clusters"] = []
    doc["span_clusters"] = []

    return doc

In [9]:
def return_predictions(doc: Doc,
                clusters: List[List[Span]]):
    """
    Convert predicted clusters into one bracketed cluster label per word.

    A word that belongs to no mention gets "-". Otherwise it gets the markers of all
    mentions touching it, joined by "|": "(k" opens a multi-word mention of cluster k,
    "(k)" marks a single-word mention and "k)" closes a multi-word mention.

    Nothing is written to a file; only "cased_words" is read from the document, so
    an empty document simply yields empty lists.

    :param doc: document dict containing "cased_words"
    :param clusters: list of clusters, each a list of (start, end) word spans with an
                     exclusive end index

    :returns dict with 'words' (the document's words) and 'preds' (one label per word)
    """
    words = doc["cased_words"]

    starts = defaultdict(list)
    ends = defaultdict(list)
    single_word = defaultdict(list)

    for cluster_id, cluster in enumerate(clusters):
        for start, end in cluster:
            if end - start == 1:
                single_word[start].append(cluster_id)
            else:
                starts[start].append(cluster_id)
                ends[end - 1].append(cluster_id)

    cluster_list = []
    for word_id in range(len(words)):
        # Marker order per word: openings, single-word mentions, closings.
        markers = [f"({cluster_id}" for cluster_id in starts.get(word_id, ())]
        markers += [f"({cluster_id})" for cluster_id in single_word.get(word_id, ())]
        markers += [f"{cluster_id})" for cluster_id in ends.get(word_id, ())]
        cluster_list.append("|".join(markers) if markers else "-")

    return {'words': words, 'preds': cluster_list}


In [10]:
def preprocess_sent(sent: str):
    """
    Turn a raw sentence into a minimal document dict.

    :param sent: the sentence; it is tokenised by splitting on whitespace

    :returns dict with an empty "document_id", the tokens as "cased_words" and a
             "sent_id" list assigning sentence 0 to every token
    """
    words = sent.split()
    return {
        "document_id": "",
        "cased_words": words,
        "sent_id": [0] * len(words),
    }

In [11]:
def load_model(weights):
    """
    Build a CorefModel from the notebook-level configuration and load a checkpoint.

    :param weights: path to the checkpoint file to load

    :returns model: the model with weights loaded (on CPU) and its `training`
                    attribute set to False
    """
    # Optimizer and scheduler states are not loaded from the checkpoint.
    skipped_entries = {"bert_optimizer", "general_optimizer",
                       "bert_scheduler", "general_scheduler"}

    model = CorefModel(config_file, experiment,
                       build_optimizers=False, lr=5e-4, bert_lr=3e-5)
    model.load_weights(path=weights, map_location="cpu", ignore=skipped_entries)
    model.training = False
    return model

def make_pred(model, input_data):
    """
    Run the model on a collection of documents and return per-word cluster labels.

    :param model: model object exposing `run(doc)`; also passed to build_doc
    :param input_data: iterable of document dicts (see preprocess_sent)

    :returns list with one return_predictions() result per document
    """
    prepared = [build_doc(doc, model) for doc in input_data]

    collected = []
    with torch.no_grad():
        for doc in tqdm(prepared, unit="docs"):
            result, _ = model.run(doc)
            doc["span_clusters"] = result.span_clusters
            doc["word_clusters"] = result.word_clusters

            # Drop the helper fields that build_doc added.
            for helper_key in ("word2subword", "subwords", "word_id", "head2span"):
                del doc[helper_key]

            # Express every clustered word index i as the one-word span (i, i + 1).
            word_spans = [[(i, i + 1) for i in cluster]
                          for cluster in result.word_clusters]
            collected.append(return_predictions(doc, word_spans))
    return collected

In [12]:
def get_arg_2mentions(predictions: Dict, mention1: str, mention2: str) -> (str, str):
    """
    Look up the predicted cluster labels of two mention tokens.

    Each token is located at its first occurrence in the sentence.

    :param predictions: model prediction dict with aligned 'words' and 'preds' lists
    :param mention1: the token of the first mention
    :param mention2: the token of the second mention

    :returns the predicted labels of the first and the second mention
    """
    tokens, labels = predictions['words'], predictions['preds']
    positions = [tokens.index(mention) for mention in (mention1, mention2)]
    return tuple(labels[position] for position in positions)

def get_arg_3mentions(predictions: Dict, mention1: str, mention2: str, mention3: str) -> (str, str, str):
    """ 
    Helper function to extract target arguments, when there are three targets.
    Each token is located at its first occurrence in the sentence.

    :param predictions: model prediction dict with aligned 'words' and 'preds' lists
    :param mention1: the token of the first mention
    :param mention2: the token of the second mention
    :param mention3: the token of the third mention

    :returns the predictions for the first, second and third mention

    """
    words = predictions['words']
    preds = predictions['preds']

    m1_idx = words.index(mention1)
    m2_idx = words.index(mention2)
    m3_idx = words.index(mention3)

    return (preds[m1_idx], preds[m2_idx], preds[m3_idx])

def get_arg_4mentions(predictions: Dict, mention1: str, mention2: str, mention3: str, mention4: str) -> (str, str, str, str):
    """ 
    Helper function to extract target arguments, when there are four targets.
    Each token is located at its first occurrence in the sentence.

    :param predictions: model prediction dict with aligned 'words' and 'preds' lists
    :param mention1: the token of the first mention
    :param mention2: the token of the second mention
    :param mention3: the token of the third mention
    :param mention4: the token of the fourth mention

    :returns the predictions for the first, second, third and fourth mention

    """
    words = predictions['words']
    preds = predictions['preds']

    m1_idx = words.index(mention1)
    m2_idx = words.index(mention2)
    m3_idx = words.index(mention3)
    m4_idx = words.index(mention4)

    return (preds[m1_idx], preds[m2_idx], preds[m3_idx], preds[m4_idx])


def get_model_results(model: Type[CorefModel], n : int, sentence : str, capability : str, testcase_name : str) -> Dict:
    """
    Collects the stored outcome of one test case into a dict that can be written to
    an output file.

    :param model: object whose `results` attribute holds 'preds', 'confs' and 'passed'
                  sequences indexed by sample number (extract_data_and_predictions
                  passes the CheckList test objects here)
    :param n: the number of the current data sample
    :param sentence: the current data sample sentence
    :param capability: the name of the linguistic capability of interest
    :param testcase_name: the name of the current test

    :returns results: dict with all relevant info to store in output file
    """
    outcome = model.results
    return {
        'sentence': sentence,
        'capability': capability,
        'testcase_name': testcase_name,
        'preds': outcome['preds'][n],
        'confs': outcome['confs'][n],
        # Cast to a plain bool before storing.
        'passed': bool(outcome['passed'][n]),
    }

def extract_data_and_predictions(t: Dict, capability: str, testcase_name: str, test_regular: Dict, 
                                 test_debiased: Dict, test_data: List,  regular_predictions: List, 
                                 debiased_predictions : List) -> (List, List, List):
    """ 
    Gathers, per test sentence, the input description and the outcome of both tests,
    and appends them to the three output lists (which are modified in place).

    :param t: template output holding the sentences under 'data' and their metadata under 'meta'
    :param capability: the name of the linguistic capability of interest
    :param testcase_name: the name of the current test
    :param test_regular: executed test holding the results of the regular model
    :param test_debiased: executed test holding the results of the debiased model
    :param test_data: output list in which all test cases should be stored
    :param regular_predictions: output list in which all predictions of the regular model should be stored
    :param debiased_predictions: output list in which all predictions of the debiased model should be stored

    :returns test_data: output list to which new test cases are added
    :returns regular_predictions: output list to which new predictions of the regular model are added
    :returns debiased_predictions: output list to which new predictions of the debiased model are added
    """
    for n, sentence in enumerate(t['data']):
        # Description of the input sentence.
        case = dict(sentence=sentence,
                    meta=t['meta'][n],
                    capability=capability,
                    testcase_name=testcase_name)
        # Outcome for the regular and the debiased model, in that order.
        regular_outcome, debiased_outcome = [
            get_model_results(test, n, sentence, capability, testcase_name)
            for test in (test_regular, test_debiased)
        ]
        # Store only after everything for this sample has been collected.
        test_data.append(case)
        regular_predictions.append(regular_outcome)
        debiased_predictions.append(debiased_outcome)
    return test_data, regular_predictions, debiased_predictions


def predict_and_store(t: Dict, capability: str, testcase_name: str, expect: Callable, formattype: Callable, 
                      predict_regular: Type[CorefModel], predict_debiased: Type[CorefModel], \
                      test_data: List, regular_predictions: List, debiased_predictions: List) -> (List, List, List):
    """ 
    Builds an MFT from a template for each of the two models, runs it, prints its
    summary, and stores the test cases together with both sets of results.

    :param t: template output containing all test case information
    :param capability: the name of the linguistic capability of interest
    :param testcase_name: the name of the current test
    :param expect: function that checks if the prediction is as expected
    :param formattype: function that formats an example for the printed summary
    :param predict_regular: predictor for the regular model, handed to the test's `run`
    :param predict_debiased: predictor for the debiased model, handed to the test's `run`
    :param test_data: output list in which all test cases should be stored
    :param regular_predictions: output list in which all predictions of the regular model should be stored
    :param debiased_predictions: output list in which all predictions of the debiased model should be stored

    :returns test_data: output list to which new test cases are added
    :returns regular_predictions: output list to which new predictions of the regular model are added
    :returns debiased_predictions: output list to which new predictions of the debiased model are added
    """
    executed = []
    for heading, predictor in (('regular model', predict_regular),
                               ('debiased model', predict_debiased)):
        print(heading)
        test = MFT(**t, expect=expect)
        test.run(predictor)
        test.summary(format_example_fn=formattype)
        executed.append(test)
    test_regular, test_debiased = executed

    # Store the samples and both models' results.
    return extract_data_and_predictions(t, capability, testcase_name,
                                        test_regular, test_debiased, test_data,
                                        regular_predictions, debiased_predictions)

def store_data(path: str, data: List, new_file: bool=True):
    """
    Function that saves a given list to a json file on the given path

    :param path: path to a .json file to store data in
    :param data: list containing information to be stored
    :param new_file: setting indicating whether previous information in the file should be deleted (True) or not (False)

    When `new_file` is False the file must already exist and contain a JSON list;
    the items of `data` are added to the end of that list.
    """
    # If there already is content in the file, make sure we do not lose it.
    if not new_file:
        with open(path, "r") as file:
            stored = json.load(file)

        # Merge item by item so the file stays one flat list; appending the list
        # itself would nest it as a single element.
        if isinstance(data, list):
            stored.extend(data)
        else:
            stored.append(data)
        data = stored

    with open(path, "w") as file:
        json.dump(data, file, indent=4, sort_keys=True)


# Helper function to display failures
def format_sent(x, pred, conf, label=None, meta=None):
    """
    Pair every word of an example with its predicted cluster label.

    :param pred: prediction dict with aligned 'words' and 'preds' lists
    (x, conf, label and meta are accepted but not used)

    :returns list of (word, label) tuples
    """
    return list(zip(pred['words'], pred['preds']))


def found_name_with_pronouns(x, pred, conf, label=None, meta=None):
    """
    Expectation: the name and the pronoun carry the same cluster annotation.

    :param pred: prediction dict with aligned 'words' and 'preds' lists
    :param meta: template metadata providing the 'pronoun' and 'name' tokens

    :returns True if both tokens have the same label and that label is not '-'
    """
    pronoun_label, name_label = get_arg_2mentions(pred, meta['pronoun'], meta['name'])
    return bool(pronoun_label == name_label and pronoun_label != '-')

def found_pronoun_with_pronoun(x, pred, conf, label=None, meta=None):
    """
    Expectation: the two pronouns carry the same cluster annotation.

    :param pred: prediction dict with aligned 'words' and 'preds' lists
    :param meta: template metadata providing the 'pronoun_subj' and 'pronoun_poss' tokens

    :returns True if both tokens have the same label and that label is not '-'
    """
    subj_label, poss_label = get_arg_2mentions(pred, meta['pronoun_subj'], meta['pronoun_poss'])
    return bool(subj_label == poss_label and subj_label != '-')


def found_name_pronoun_pronoun(x, pred, conf, label=None, meta=None):
    """
    Expectation: the name and both pronouns carry the same cluster annotation.

    :param pred: prediction dict with aligned 'words' and 'preds' lists
    :param meta: template metadata providing the 'name', 'pronoun_poss' and
                 'pronoun_subj' tokens

    :returns True if all three tokens share one label and that label is not '-'
    """
    name_label, poss_label, subj_label = get_arg_3mentions(
        pred, meta['name'], meta['pronoun_poss'], meta['pronoun_subj'])

    # All three mentions must be in the same cluster, and in a cluster at all.
    return bool(name_label == poss_label == subj_label and name_label != '-')

def found_name_sing_plur(x, pred, conf, label=None, meta=None):
    """
    Expectation for sentences in which the singular pronoun precedes the plural one:
    the name shares a cluster with the singular pronoun, the team shares a different
    cluster with the plural pronoun.

    :param pred: prediction dict with aligned 'words' and 'preds' lists
    :param meta: template metadata providing 'name', 'sport', 'pronoun_singular' and
                 'pronoun_plural'

    :returns True if both links are predicted and the two clusters differ
    """
    name = meta['name']
    team = meta['sport'] + 'team'
    singular = meta['pronoun_singular']
    plural = meta['pronoun_plural']

    tokens = pred['words']
    labels = pred['preds']

    name_pos = tokens.index(name)
    team_pos = tokens.index(team)
    singular_pos = tokens.index(singular)
    # Both pronouns can be the same token, so take the LAST occurrence for the plural one.
    plural_pos = [pos for pos, token in enumerate(tokens) if token == plural][-1]

    name_label = labels[name_pos]
    team_label = labels[team_pos]
    singular_label = labels[singular_pos]
    plural_label = labels[plural_pos]

    return bool(name_label == singular_label and name_label != '-'
                and team_label == plural_label and team_label != '-'
                and name_label != team_label)

def found_name_plur_sing(x, pred, conf, label=None, meta=None):
    """
    Expectation for sentences in which the plural pronoun precedes the singular one:
    the name shares a cluster with the singular pronoun, the team shares a different
    cluster with the plural pronoun.

    :param pred: prediction dict with aligned 'words' and 'preds' lists
    :param meta: template metadata providing 'name', 'sport', 'pronoun_plural' and
                 'pronoun_singular'

    :returns True if both links are predicted and the two clusters differ
    """
    name = meta['name']
    team = meta['sport'] + 'team'
    plural = meta['pronoun_plural']
    singular = meta['pronoun_singular']

    name_label, team_label, plural_label, singular_label = get_arg_4mentions(
        pred, name, team, plural, singular)

    return bool(name_label == singular_label and name_label != '-'
                and team_label == plural_label and team_label != '-'
                and name_label != team_label)

def found_die_die(x, pred, conf, label=None, meta=None):
    """
    Expectation for the die/die sentences: the name and the pronoun after it share a
    cluster, while the first occurrence of the 'die' token is not in any cluster.

    :param pred: prediction dict with aligned 'words' and 'preds' lists
    :param meta: template metadata providing 'name', 'die' and 'pronoun_subj'

    :returns True if the name and the pronoun are linked and the first 'die' is unlabelled
    """
    name = meta['name']
    relative = meta['die']
    pronoun = meta['pronoun_subj']

    tokens = pred['words']
    labels = pred['preds']

    name_pos = tokens.index(name)
    relative_pos = tokens.index(relative)
    # The pronoun can be the same token as the relative 'die': take its LAST occurrence.
    pronoun_pos = [pos for pos, token in enumerate(tokens) if token == pronoun][-1]

    name_label = labels[name_pos]
    relative_label = labels[relative_pos]
    pronoun_label = labels[pronoun_pos]

    return bool(name_label == pronoun_label and name_label != '-'
                and relative_label == '-')

In [None]:
# CheckList editor: fills the sentence templates below and provides the name lexicons.
editor = Editor()

## First names + pronoun recognition

In [178]:
# Gender-neutral first names: whitespace-separated tokens read from a text file.
with open("../eval/gn_names.txt", "r") as namelist:
    gn_names = namelist.read().split()
# Pronoun inventories per grammatical role (poss / subj / obj). Each full list is the
# gendered subset followed by the gender-neutral ("gn") subset.
pronouns_poss = ['zijn', 'haar', 'diens', 'hun']
pronouns_poss_gendered = ['zijn', 'haar']
pronouns_poss_gn = ['diens', 'hun']
pronouns_subj = ['hij', 'zij', 'die', 'hen']
pronouns_subj_gendered = ['hij', 'zij']
pronouns_subj_gn = ['die', 'hen']
pronouns_obj = ['hem', 'haar', 'hen', 'hen']
pronouns_obj_gendered = ['hem', 'haar']
# NOTE(review): 'hen' is listed twice, presumably to keep this list index-aligned with
# pronouns_subj_gn / pronouns_poss_gn — confirm the duplicate is intentional.
pronouns_obj_gn= ['hen', 'hen']

testcase_name = 'gn_names'

In [179]:
# Wrap every per-example check with Expect.single; the wrapped versions are passed as
# the `expect` argument of the MFT tests below.
expect_name_pronouns = Expect.single(found_name_with_pronouns)
expect_pronoun_pronoun = Expect.single(found_pronoun_with_pronoun)
expect_name_pronoun_pronoun = Expect.single(found_name_pronoun_pronoun)
expect_sing_plur = Expect.single(found_name_sing_plur)
expect_plur_sing = Expect.single(found_name_plur_sing)
expect_die_die = Expect.single(found_die_die)

In [199]:
# Empty suite; the tests built below are registered with suite.add(...).
suite = TestSuite()

### gender neutral names

In [200]:
capability = 'Name pronoun link'
# Scratch lists; they are only reset here, not filled in this notebook.
test_data = []
regular_predictions = []
debiased_predictions = []

# Seed NumPy's global RNG before the first sampled template so that a fresh
# "Restart & Run All" regenerates the same suite (a later cell already seeds with the
# same value; presumably the template sampling draws from this RNG — confirm against
# the CheckList documentation).
np.random.seed(2020)

# Create samples: gender-neutral names with gendered possessive pronouns.
testcase_name = 'gn_names'
t = editor.template(
    '{name} gaat naar {pronoun} {place}.',
    name=gn_names,
    pronoun=pronouns_poss_gendered,
    place=['werk', 'dokter', 'afspraak', 'buren', 'ouders', 'interview'],
    meta=True,
    nsamples=100,
)

name_pronoun_gn_g = MFT(
    **t,
    name='Name pronoun, gender neutral name, gendered pronouns',
    description='Linking a pronoun with a name, using gender neutral names and gendered pronouns',
    expect=expect_name_pronouns,
)

In [201]:
# Scratch lists; they are only reset here.
test_data = []
regular_predictions = []
debiased_predictions = []

# Create samples: gender-neutral names with gender-neutral possessive pronouns.
testcase_name = 'gn_names'
t = editor.template(
    '{name} gaat naar {pronoun} {place}.',
    name=gn_names,
    pronoun=pronouns_poss_gn,
    place=['werk', 'dokter', 'afspraak', 'buren', 'ouders', 'interview'],
    meta=True,
    nsamples=100,
)

name_pronoun_gn_gn = MFT(
    **t,
    name='Name pronoun, gender neutral name, gender neutral pronouns',
    description='Linking a pronoun with a name, using gender neutral names and gender neutral pronouns',
    expect=expect_name_pronouns,
)

### male names

In [202]:
# Scratch lists; they are only reset here.
test_data = []
regular_predictions = []
debiased_predictions = []

# Create samples: male names from the editor's lexicon with gendered possessive pronouns.
t = editor.template(
    '{name} gaat naar {pronoun} {place}.',
    name=editor.lexicons.male_from['the_Netherlands'],
    pronoun=pronouns_poss_gendered,
    place=['werk', 'dokter', 'afspraak', 'buren', 'ouders', 'interview'],
    meta=True,
    nsamples=100,
)

name_pronoun_m_g = MFT(
    **t,
    name='Name pronoun, male name, gendered pronouns',
    description='Linking a pronoun with a name, using male names and gendered pronouns',
    expect=expect_name_pronouns,
)

In [203]:
# Scratch lists; they are only reset here.
test_data = []
regular_predictions = []
debiased_predictions = []

# Create samples: male names from the editor's lexicon with gender-neutral possessive pronouns.
t = editor.template(
    '{name} gaat naar {pronoun} {place}.',
    name=editor.lexicons.male_from['the_Netherlands'],
    pronoun=pronouns_poss_gn,
    place=['werk', 'dokter', 'afspraak', 'buren', 'ouders', 'interview'],
    meta=True,
    nsamples=100,
)

name_pronoun_m_gn = MFT(
    **t,
    name='Name pronoun, male name, gender neutral pronouns',
    description='Linking a pronoun with a name, using male names and gender neutral pronouns',
    expect=expect_name_pronouns,
)

### female names

In [204]:
# Scratch lists; they are only reset here.
test_data = []
regular_predictions = []
debiased_predictions = []

# Create samples: female names from the editor's lexicon with gendered possessive pronouns.
t = editor.template(
    '{name} gaat naar {pronoun} {place}.',
    name=editor.lexicons.female_from['the_Netherlands'],
    pronoun=pronouns_poss_gendered,
    place=['werk', 'dokter', 'afspraak', 'buren', 'ouders', 'interview'],
    meta=True,
    nsamples=100,
)

name_pronoun_f_g = MFT(
    **t,
    name='Name pronoun, female name, gendered pronouns',
    description='Linking a pronoun with a name, using female names and gendered pronouns',
    expect=expect_name_pronouns,
)

In [205]:
# Scratch lists; they are only reset here.
test_data = []
regular_predictions = []
debiased_predictions = []

# Create samples: female names from the editor's lexicon with gender-neutral possessive pronouns.
t = editor.template(
    '{name} gaat naar {pronoun} {place}.',
    name=editor.lexicons.female_from['the_Netherlands'],
    pronoun=pronouns_poss_gn,
    place=['werk', 'dokter', 'afspraak', 'buren', 'ouders', 'interview'],
    meta=True,
    nsamples=100,
)

name_pronoun_f_gn = MFT(
    **t,
    name='Name pronoun, female name, gender neutral pronouns',
    description='Linking a pronoun with a name, using female names and gender neutral pronouns',
    expect=expect_name_pronouns,
)

In [206]:
# Register the six name/pronoun tests under the current capability, in this order.
for mft_test in (name_pronoun_gn_g, name_pronoun_gn_gn,
                 name_pronoun_m_g, name_pronoun_m_gn,
                 name_pronoun_f_g, name_pronoun_f_gn):
    suite.add(mft_test, capability=capability)


## Multiple pronouns per entity
### short sentence with only pronouns

In [207]:
capability = 'pronoun-pronoun'
# Scratch lists; they are only reset here.
test_data = []
regular_predictions = []
debiased_predictions = []

# Create samples: gendered subject pronoun with gendered possessive pronoun.
t = editor.template(
    '{pronoun_subj} gaat naar {pronoun_poss} {place}.',
    pronoun_subj=pronouns_subj_gendered,
    pronoun_poss=pronouns_poss_gendered,
    place=['werk', 'dokter', 'afspraak', 'buren', 'ouders', 'interview'],
    meta=True,
    nsamples=100,
)
pronoun_pronoun_g = MFT(
    **t,
    name='Pronoun pronoun, gendered pronouns',
    description='Linking a pronoun with a pronoun, using gendered pronouns',
    expect=expect_pronoun_pronoun,
)

In [208]:
# Scratch lists; they are only reset here.
test_data = []
regular_predictions = []
debiased_predictions = []

# Fix NumPy's global random state before sampling.
np.random.seed(2020)
# Create samples: gender-neutral subject pronoun with gender-neutral possessive pronoun.
t = editor.template(
    '{pronoun_subj} gaat naar {pronoun_poss} {place}.',
    pronoun_subj=pronouns_subj_gn,
    pronoun_poss=pronouns_poss_gn,
    place=['werk', 'dokter', 'afspraak', 'buren', 'ouders', 'interview'],
    meta=True,
    nsamples=100,
)
pronoun_pronoun_gn = MFT(
    **t,
    name='Pronoun pronoun, gender neutral pronouns',
    description='Linking a pronoun with a pronoun, using gender neutral pronouns',
    expect=expect_pronoun_pronoun,
)

In [209]:
# Scratch lists; they are only reset here.
test_data = []
regular_predictions = []
debiased_predictions = []

# Create samples: any subject pronoun combined with any possessive pronoun.
t = editor.template(
    '{pronoun_subj} gaat naar {pronoun_poss} {place}.',
    pronoun_subj=pronouns_subj,
    pronoun_poss=pronouns_poss,
    place=['werk', 'dokter', 'afspraak', 'buren', 'ouders', 'interview'],
    meta=True,
    nsamples=100,
)
pronoun_pronoun_mix = MFT(
    **t,
    name='Pronoun pronoun, mixed pronouns',
    description='Linking a pronoun with a pronoun, using mixed pronouns',
    expect=expect_pronoun_pronoun,
)

In [210]:
# Register the three pronoun/pronoun tests under the current capability, in this order.
for mft_test in (pronoun_pronoun_g, pronoun_pronoun_gn, pronoun_pronoun_mix):
    suite.add(mft_test, capability=capability)

### longer sentence with a name and different pronouns

In [211]:
capability = 'pronoun-pronoun-name'
# Scratch lists; they are only reset here.
test_data = []
regular_predictions = []
debiased_predictions = []

# Create samples: gender-neutral name, gendered possessive and subject pronouns.
t = editor.template(
    '{name} heeft {pronoun_poss} afspraak bij de {place} om 3 uur. {pronoun_subj} gaat zo heen.',
    name=gn_names,
    pronoun_subj=pronouns_subj_gendered,
    pronoun_poss=pronouns_poss_gendered,
    place=['dokters', 'huisartsenpost', 'buren', 'gemeente', 'winkel', 'sportschool'],
    meta=True,
    nsamples=100,
)

pronoun_pronoun_name_g = MFT(
    **t,
    name='Pronoun pronoun name, gendered pronouns',
    description='Linking a pronoun with a pronoun and a name, using gendered pronouns',
    expect=expect_name_pronoun_pronoun,
)

In [212]:
# Scratch lists; they are only reset here.
test_data = []
regular_predictions = []
debiased_predictions = []

# Create samples: gender-neutral name, gender-neutral possessive and subject pronouns.
t = editor.template(
    '{name} heeft {pronoun_poss} afspraak bij de {place} om 3 uur. {pronoun_subj} gaat zo heen.',
    name=gn_names,
    pronoun_subj=pronouns_subj_gn,
    pronoun_poss=pronouns_poss_gn,
    place=['dokters', 'huisartsenpost', 'buren', 'gemeente', 'winkel', 'sportschool'],
    meta=True,
    nsamples=100,
)

pronoun_pronoun_name_gn = MFT(
    **t,
    name='Pronoun pronoun name, gender neutral pronouns',
    description='Linking a pronoun with a pronoun and a name, using gender neutral pronouns',
    expect=expect_name_pronoun_pronoun,
)

In [213]:
# Scratch lists; they are only reset here.
test_data = []
regular_predictions = []
debiased_predictions = []

# Create samples: gender-neutral name, any possessive and any subject pronoun.
t = editor.template(
    '{name} heeft {pronoun_poss} afspraak bij de {place} om 3 uur. {pronoun_subj} gaat zo heen.',
    name=gn_names,
    pronoun_subj=pronouns_subj,
    pronoun_poss=pronouns_poss,
    place=['dokters', 'huisartsenpost', 'buren', 'gemeente', 'winkel', 'sportschool'],
    meta=True,
    nsamples=100,
)

pronoun_pronoun_name_mix = MFT(
    **t,
    name='Pronoun pronoun name, mixed pronouns',
    description='Linking a pronoun with a pronoun and a name, using mixed pronouns',
    expect=expect_name_pronoun_pronoun,
)

In [214]:
# Register the three name/pronoun/pronoun tests under the current capability, in this order.
for mft_test in (pronoun_pronoun_name_g, pronoun_pronoun_name_gn, pronoun_pronoun_name_mix):
    suite.add(mft_test, capability=capability)

## singular plural distinction

In [223]:
capability = 'hen-hen-amb'
# Scratch lists; they are only reset here.
test_data = []
regular_predictions = []
debiased_predictions = []

# Create samples: gendered singular subject pronoun, followed by plural 'hen'.
t = editor.template(
    '{name} gaat met het {sport}team op vakantie. {pronoun_singular} is al vaker met {pronoun_plural} weggeweest.',
    name=gn_names,
    pronoun_singular=pronouns_subj_gendered,
    pronoun_plural=['hen'],
    sport=['voetbal', 'volleybal', 'hockey', 'handbal', 'softbal', 'honkbal', 'basketbal'],
    meta=True,
    nsamples=100,
)

hen_hen_p_g = MFT(
    **t,
    name='Hen-hen ambiguity, hen plural, gendered pronouns',
    description='Distinguishing the plural and singular usage of hen, using gendered pronouns',
    expect=expect_sing_plur,
)

In [224]:
# Scratch lists; they are only reset here.
test_data = []
regular_predictions = []
debiased_predictions = []

# Create samples: gender-neutral singular subject pronoun, followed by plural 'hen'.
t = editor.template(
    '{name} gaat met het {sport}team op vakantie. {pronoun_singular} is al vaker met {pronoun_plural} weggeweest.',
    name=gn_names,
    pronoun_singular=pronouns_subj_gn,
    pronoun_plural=['hen'],
    sport=['voetbal', 'volleybal', 'hockey', 'handbal', 'softbal', 'honkbal', 'basketbal'],
    meta=True,
    nsamples=100,
)

hen_hen_p_gn = MFT(
    **t,
    name='Hen-hen ambiguity, hen plural, gender neutral pronouns',
    description='Distinguishing the plural and singular usage of hen, using gender neutral pronouns',
    expect=expect_sing_plur,
)

In [225]:
# Scratch lists; they are only reset here.
test_data = []
regular_predictions = []
debiased_predictions = []

# Create samples: plural 'Zij' first, followed by a gendered singular object pronoun.
t = editor.template(
    '{name} gaat met het {sport}team op vakantie. {pronoun_plural} zijn al vaker met {pronoun_singular} weggeweest .',
    name=gn_names,
    pronoun_singular=pronouns_obj_gendered,
    pronoun_plural=['Zij'],
    sport=['voetbal', 'volleybal', 'hockey', 'handbal', 'softbal', 'honkbal', 'basketbal'],
    meta=True,
    nsamples=100,
)

hen_hen_s_g = MFT(
    **t,
    name='Hen-hen ambiguity, hen singular, gendered pronouns',
    description='Distinguishing the plural and singular usage of hen, using gendered pronouns',
    expect=expect_plur_sing,
)

In [226]:
# Scratch lists; they are only reset here.
test_data = []
regular_predictions = []
debiased_predictions = []

# Create samples: plural 'Zij' first, followed by a gender-neutral singular object pronoun.
t = editor.template(
    '{name} gaat met het {sport}team op vakantie. {pronoun_plural} zijn al vaker met {pronoun_singular} weggeweest .',
    name=gn_names,
    pronoun_singular=pronouns_obj_gn,
    pronoun_plural=['Zij'],
    sport=['voetbal', 'volleybal', 'hockey', 'handbal', 'softbal', 'honkbal', 'basketbal'],
    meta=True,
    nsamples=100,
)

hen_hen_s_gn = MFT(
    **t,
    name='Hen-hen ambiguity, hen singular, gender neutral pronouns',
    description='Distinguishing the plural and singular usage of hen, using gender neutral pronouns',
    expect=expect_plur_sing,
)

In [227]:
# Register the four singular/plural tests under the current capability, in this order.
for mft_test in (hen_hen_p_g, hen_hen_p_gn, hen_hen_s_g, hen_hen_s_gn):
    suite.add(mft_test, capability=capability)

## die/die distinction

In [233]:
# Scratch lists; they are only reset here.
test_data = []
regular_predictions = []
debiased_predictions = []

# Create samples: 'die' after the name, followed later by a gendered subject pronoun.
t = editor.template(
    '{name} {die} hier net werkt is te laat omdat {pronoun_subj} een lekke band had .',
    name=gn_names,
    pronoun_subj=pronouns_subj_gendered,
    die=['die'],
    meta=True,
    nsamples=100,
)

die_die_g = MFT(
    **t,
    name='Die-die ambiguity, gendered pronouns',
    description='Distinguishing two usages of die, using gendered pronouns',
    expect=expect_die_die,
)


In [234]:
# Scratch lists; they are only reset here.
test_data = []
regular_predictions = []
debiased_predictions = []

# Create samples: 'die' after the name, followed later by 'die' as the subject pronoun.
t = editor.template(
    '{name} {die} hier net werkt is te laat omdat {pronoun_subj} een lekke band had .',
    name=gn_names,
    pronoun_subj=['die'],
    die=['die'],
    meta=True,
    nsamples=100,
)

die_die_gn = MFT(
    **t,
    name='Die-die ambiguity, gender neutral pronouns',
    description='Distinguishing two usages of die, using gender neutral pronouns',
    expect=expect_die_die,
)

In [235]:
# The die/die tests form their own capability. Without this assignment they would be
# registered under the value left over from the singular/plural section ('hen-hen-amb').
capability = 'die-die-amb'
suite.add(die_die_g, capability=capability)
suite.add(die_die_gn, capability=capability)

# Storing the data

In [238]:
# Write the individual test sentences to a plain-text file.
suite.to_raw_file("suite.txt")

# Copy the name, description and capability registered in the suite onto each test
# object itself.
for test in suite.tests:
    suite.tests[test].name = test
    # The key is 'description'; the previous 'description]' raised a KeyError.
    suite.tests[test].description = suite.info[test]['description']
    suite.tests[test].capability = suite.info[test]['capability']

In [239]:
# Save the complete suite to disk under this file name.
path = "suite.pkl"
suite.save(path)