In [1]:
%cd /hpc/uu_cs_nlpsoc/gvanboven/wl-coref

/hpc/uu_cs_nlpsoc/gvanboven/wl-coref


In [2]:
import argparse

import jsonlines
import torch
from tqdm import tqdm
import numpy as np

from coref.config import Config
from coref.const import Doc, Span
from typing import List, TextIO, Dict, Callable, Type
from collections import defaultdict

import checklist
from checklist.pred_wrapper import PredictorWrapper
from checklist.editor import Editor
from checklist.perturb import Perturb
from checklist.test_types import MFT, INV, DIR
from checklist.expect import Expect
from checklist.test_suite import TestSuite
import json

from coref import CorefModel
from coref.tokenizer_customization import *

In [3]:
# Configuration file and experiment name handed to CorefModel in load_model.
config_file = 'config.toml'
experiment = 'xlm-roberta'
# NOTE(review): input_file is not referenced in the visible part of this notebook — confirm it is still needed.
input_file = 'sample_input.jsonlines'

# Checkpoints of the five "regular" models.
# NOTE(review): the numeric directory suffix is presumably the training seed — TODO confirm.
weights_regular1 = 'data/model_checkpoints/xlm_regular_248/xlm-roberta_e18.pt'
weights_regular2 = 'data/model_checkpoints/xlm_regular_2020/xlm-roberta_e15.pt'
weights_regular3 = 'data/model_checkpoints/xlm_regular_2023/xlm-roberta_e17.pt'
weights_regular4 = 'data/model_checkpoints/xlm_regular_123/xlm-roberta_e18.pt'
weights_regular5 = 'data/model_checkpoints/xlm_regular_1234/xlm-roberta_e15.pt'

# Checkpoints of the five "debiased" models; the directory suffixes mirror the regular ones above.
weights_debiased1 = 'data/model_checkpoints/xlm_gn_comb_fine_248/xlm-roberta_e26.pt'
weights_debiased2 = 'data/model_checkpoints/xlm_gn_comb_fine_2020/xlm-roberta_e25.pt'
weights_debiased3 = 'data/model_checkpoints/xlm_gn_comb_fine_2023/xlm-roberta_e23.pt'
weights_debiased4 = 'data/model_checkpoints/xlm_gn_comb_fine_123/xlm-roberta_e23.pt'
weights_debiased5 = 'data/model_checkpoints/xlm_gn_comb_fine_1234/xlm-roberta_e23.pt'

In [4]:
def build_doc(doc: dict, model: CorefModel) -> dict:
    """
    Adds the subword-level fields to a document dict, in place.

    Every word of doc["cased_words"] is split into subwords, either through
    the override table registered in TOKENIZER_MAPS for the model's
    bert_model or through model.tokenizer.tokenize; subwords rejected by the
    filter registered in TOKENIZER_FILTERS are dropped.

    :param doc: document dict with at least a "cased_words" entry
    :param model: model providing config.bert_model and tokenizer

    :returns doc: the same dict, extended with "word2subword" ((start, end)
                  subword offsets per word, end exclusive), "subwords",
                  "word_id" (index of the owning word per subword), empty
                  "head2span", "word_clusters" and "span_clusters" lists, and
                  a "speaker" list of "_" when no speaker was given
    """
    bert_name = model.config.bert_model
    keep_subword = TOKENIZER_FILTERS.get(bert_name, lambda _: True)
    overrides = TOKENIZER_MAPS.get(bert_name, {})

    spans, pieces, owners = [], [], []
    for index, word in enumerate(doc["cased_words"]):
        if word in overrides:
            raw_pieces = overrides[word]
        else:
            raw_pieces = model.tokenizer.tokenize(word)
        kept = [piece for piece in raw_pieces if keep_subword(piece)]

        # the word covers the subwords appended in this iteration
        begin = len(pieces)
        pieces += kept
        spans.append((begin, len(pieces)))
        owners += [index] * len(kept)

    doc["word2subword"] = spans
    doc["subwords"] = pieces
    doc["word_id"] = owners

    doc["head2span"] = []
    if "speaker" not in doc:
        doc["speaker"] = ["_" for _ in doc["cased_words"]]
    doc["word_clusters"] = []
    doc["span_clusters"] = []

    return doc

In [5]:
def return_predictions(doc: Doc,
                clusters: List[List[Span]]):
    """
    Builds one coreference label per word of the document.

    Nothing is written to disk; the labels are returned to the caller.
    Only doc["cased_words"] is read, so an empty document yields empty
    output instead of raising.

    :param doc: document dict with a "cased_words" list
    :param clusters: list of clusters, each a list of (start, end) word spans
                     with an exclusive end; the position of a cluster in this
                     list is used as its id

    :returns dict with
             'words': the list stored in doc["cased_words"]
             'preds': one label per word. "(k" marks the first word of a
                      multi-word span of cluster k, "k)" its last word and
                      "(k)" a single-word span. Several markers on the same
                      word are joined with "|" (span starts, then single-word
                      spans, then span ends); "-" means no marker.
    """
    words = doc["cased_words"]

    starts = defaultdict(list)
    ends = defaultdict(list)
    single_word = defaultdict(list)

    for cluster_id, cluster in enumerate(clusters):
        for start, end in cluster:
            if end - start == 1:
                single_word[start].append(cluster_id)
            else:
                starts[start].append(cluster_id)
                # end is exclusive, so the span closes on the previous word
                ends[end - 1].append(cluster_id)

    cluster_list = []
    for word_id in range(len(words)):
        markers = [f"({cluster_id}" for cluster_id in starts[word_id]]
        markers += [f"({cluster_id})" for cluster_id in single_word[word_id]]
        markers += [f"{cluster_id})" for cluster_id in ends[word_id]]
        cluster_list.append("|".join(markers) if markers else "-")

    return {'words': words, 'preds': cluster_list}

In [6]:
def preprocess_sent(sent: str):
    """
    Turns a raw sentence into a minimal document dict.

    :param sent: the sentence; it is split on whitespace, so punctuation
                 stays attached to the neighbouring token

    :returns dict with an empty "document_id", the tokens as "cased_words"
             and a "sent_id" list that assigns sentence 0 to every token
    """
    words = sent.split()
    return {
        "document_id": "",
        "cased_words": words,
        "sent_id": [0] * len(words),
    }

In [7]:
def load_model(weights):
    """
    Creates a CorefModel and loads a checkpoint into it on the CPU.

    The model is built from the notebook-level config_file and experiment
    values, without optimizers; optimizer and scheduler entries of the
    checkpoint are skipped while loading.

    :param weights: path to the checkpoint file

    :returns the model with the checkpoint loaded and training set to False
    """
    skipped_entries = {"bert_optimizer", "general_optimizer",
                       "bert_scheduler", "general_scheduler"}

    coref_model = CorefModel(config_file, experiment, build_optimizers=False, lr=5e-4, bert_lr=3e-5)
    coref_model.load_weights(path=weights, map_location="cpu", ignore=skipped_entries)
    # NOTE(review): presumably this puts the model in evaluation mode — confirm against CorefModel.training
    coref_model.training = False
    return coref_model

def make_pred(model, input_data):
    """
    Runs the model on every document and returns the per-word predictions.

    Each document dict is extended in place by build_doc, passed to
    model.run under torch.no_grad(), updated with the predicted
    "span_clusters" and "word_clusters", and stripped of the helper fields
    again. The predicted word clusters are then converted to labels with
    return_predictions, treating every clustered word index i as the span
    (i, i + 1).

    :param model: model offering run(doc), as well as what build_doc needs
    :param input_data: iterable of document dicts

    :returns outputs: list with one return_predictions result per document
    """
    prepared_docs = [build_doc(doc, model) for doc in input_data]
    helper_fields = ("word2subword", "subwords", "word_id", "head2span")

    outputs = []
    with torch.no_grad():
        for doc in tqdm(prepared_docs, unit="docs"):
            result, _ = model.run(doc)
            doc["span_clusters"] = result.span_clusters
            doc["word_clusters"] = result.word_clusters

            for field in helper_fields:
                del doc[field]

            word_spans = [[(word, word + 1) for word in cluster]
                          for cluster in result.word_clusters]
            outputs.append(return_predictions(doc, word_spans))
    return outputs

In [8]:
def get_arg_2mentions(predictions: Dict, mention1: str, mention2: str) -> (str, str):
    """
    Helper function to extract the predictions for two target mentions

    Each mention is located with list.index, so the first token equal to the
    mention is used and a ValueError is raised when no token matches.

    :param predictions: dict with the parallel lists 'words' and 'preds'
    :param mention1: the token of the first mention
    :param mention2: the token of the second mention

    :returns the predictions for the first and second mention
    """
    tokens, labels = predictions['words'], predictions['preds']

    # resolve all positions first, then read the labels
    positions = [tokens.index(mention) for mention in (mention1, mention2)]
    return tuple(labels[position] for position in positions)

def get_arg_3mentions(predictions: Dict, mention1: str, mention2: str, mention3: str) -> (str, str, str):
    """
    Helper function to extract the predictions for three target mentions

    Each mention is located with list.index, so the first token equal to the
    mention is used and a ValueError is raised when no token matches.

    :param predictions: dict with the parallel lists 'words' and 'preds'
    :param mention1: the token of the first mention
    :param mention2: the token of the second mention
    :param mention3: the token of the third mention

    :returns the predictions for the first, second and third mention
    """
    words = predictions['words']
    preds = predictions['preds']

    m1_idx = words.index(mention1)
    m2_idx = words.index(mention2)
    m3_idx = words.index(mention3)

    return (preds[m1_idx], preds[m2_idx], preds[m3_idx])

def get_arg_4mentions(predictions: Dict, mention1: str, mention2: str, mention3: str, mention4: str) -> (str, str, str, str):
    """
    Helper function to extract the predictions for four target mentions

    Each mention is located with list.index, so the first token equal to the
    mention is used and a ValueError is raised when no token matches.

    :param predictions: dict with the parallel lists 'words' and 'preds'
    :param mention1: the token of the first mention
    :param mention2: the token of the second mention
    :param mention3: the token of the third mention
    :param mention4: the token of the fourth mention

    :returns the predictions for the first, second, third and fourth mention
    """
    words = predictions['words']
    preds = predictions['preds']

    m1_idx = words.index(mention1)
    m2_idx = words.index(mention2)
    m3_idx = words.index(mention3)
    m4_idx = words.index(mention4)

    return (preds[m1_idx], preds[m2_idx], preds[m3_idx], preds[m4_idx])


def get_model_results(model: Type[CorefModel], n : int, sentence : str, capability : str, testcase_name : str) -> Dict:
    """
    Collects what was recorded for one data sample in a dict that is to be
    stored in an output file

    Only model.results is read: the n-th entries of its 'preds', 'confs' and
    'passed' sequences.
    NOTE(review): extract_data_and_predictions passes the objects it receives
    as test_regular / test_debiased here, so the annotation above looks
    stale — confirm the intended type.

    :param model: object exposing a results mapping with 'preds', 'confs' and 'passed'
    :param n: the number of the current data sample
    :param sentence: the current data sample sentence
    :param capability: the name of the linguistic capability of interest
    :param testcase_name: the name of the current test

    :returns results: dict with the sentence, capability, test name, prediction,
                      confidence and pass flag (converted to bool) of sample n
    """
    recorded = model.results

    results = dict(sentence=sentence, capability=capability, testcase_name=testcase_name)
    results['preds'] = recorded['preds'][n]
    results['confs'] = recorded['confs'][n]
    results['passed'] = bool(recorded['passed'][n])
    return results

def extract_data_and_predictions(t: Dict, capability: str, testcase_name: str, test_regular: Dict,
                                 test_debiased: Dict, test_data: List, regular_predictions: List,
                                 debiased_predictions: List) -> (List, List, List):
    """
    Function that gathers, for every test sentence, the information that is to be stored in the output files

    For the n-th sentence in t['data'] one input record (sentence, t['meta'][n], capability, test name)
    and one result record per model (built by get_model_results) are appended to the three output lists.
    The lists are modified in place and also returned.

    :param t: dict containing all test case information; its 'data' and 'meta' entries are read
    :param capability: the name of the linguistic capability of interest
    :param testcase_name: the name of the current test
    :param test_regular: results holder of the regular model, handed to get_model_results
    :param test_debiased: results holder of the debiased model, handed to get_model_results
    :param test_data: output list in which all test cases should be stored
    :param regular_predictions: output list in which all predictions of the regular model should be stored
    :param debiased_predictions: output list in which all predictions of the debiased model should be stored

    :returns test_data: output list to which new test cases are added
    :returns regular_predictions: output list to which new predictions of the regular model are added
    :returns debiased_predictions: output list to which new predictions of the debiased model are added
    """
    for n, sentence in enumerate(t['data']):
        # description of the input sentence
        labels = {'capability': capability, 'testcase_name': testcase_name}
        input_item = {'sentence': sentence, 'meta': t['meta'][n], **labels}

        # results of both models, regular first
        per_model = [get_model_results(test, n, sentence, capability, testcase_name)
                     for test in (test_regular, test_debiased)]

        # nothing is stored before both results are available
        test_data.append(input_item)
        regular_predictions.append(per_model[0])
        debiased_predictions.append(per_model[1])
    return test_data, regular_predictions, debiased_predictions


def predict_and_store(t: Dict, capability: str, testcase_name: str, expect: Callable, formattype: Callable,
                      predict_regular: Type[CorefModel], predict_debiased: Type[CorefModel],
                      test_data: List, regular_predictions: List, debiased_predictions: List) -> (List, List, List):
    """
    Function that builds an MFT from a template for each of the two models, runs and summarises it,
    and stores the test cases as well as the predictions

    The regular model is handled first, then the debiased one. Each gets its own MFT built from
    the same template and expectation.

    :param t: dict containing all test case information; it is unpacked into the MFT constructor
    :param capability: the name of the linguistic capability of interest
    :param testcase_name: the name of the current test
    :param expect: function that checks if the argument of interest is predicted as expected
    :param formattype: function that formats an example for the test summary
    :param predict_regular: predictor of the regular model, handed to the run method of its test
    :param predict_debiased: predictor of the debiased model, handed to the run method of its test
    :param test_data: output list in which all test cases should be stored
    :param regular_predictions: output list in which all predictions of the regular model should be stored
    :param debiased_predictions: output list in which all predictions of the debiased model should be stored

    :returns test_data: output list to which new test cases are added
    :returns regular_predictions: output list to which new predictions of the regular model are added
    :returns debiased_predictions: output list to which new predictions of the debiased model are added
    """
    finished_tests = []
    for title, predictor in (('regular model', predict_regular),
                             ('debiased model', predict_debiased)):
        print(title)
        test = MFT(**t, expect=expect)
        test.run(predictor)
        test.summary(format_example_fn=formattype)
        finished_tests.append(test)
    test_regular, test_debiased = finished_tests

    # store samples and predictions
    return extract_data_and_predictions(t, capability, testcase_name,
                                        test_regular, test_debiased, test_data,
                                        regular_predictions, debiased_predictions)

def store_data(path: str, data: List, new_file: bool=True):
    """
    Function that saves a given list to a json file on the given path

    With new_file=False the list already stored in the file is read first and
    the items of data are added behind it, so the file keeps holding one flat
    list. The file must already exist in that case.

    :param path: path to a .json file to store data in
    :param data: list containing information to be stored
    :param new_file: setting indicating whether previous information in the file should be deleted (True) or not (False)
    """
    # if there already is content in the file, make sure we do not lose it.
    if not new_file:
        with open(path, "r") as file:
            old_data = json.load(file)

        # extend rather than append: append would store the whole new list
        # as one nested element of the old list
        old_data.extend(data)
        data = old_data

    with open(path, "w") as file:
        json.dump(data, file, indent=4, sort_keys=True)


# Helper function to display failures 
def format_sent(x, pred, conf, label=None, meta=None):
    """
    Formats one example for a test summary

    Only pred is used; the other parameters are accepted but ignored.

    :param pred: dict with the parallel lists 'words' and 'preds'

    :returns list of (word, prediction) tuples
    """
    return list(zip(pred['words'], pred['preds']))

# Load models


In [15]:
debiased_model1 = load_model(weights_debiased1)

Loading xlm-roberta-base...
Bert successfully loaded.
Loading from data/model_checkpoints/xlm_gn_comb_fine_248/xlm-roberta_e26.pt...
Loaded bert
Loaded we
Loaded rough_scorer
Loaded pw
Loaded a_scorer
Loaded sp


In [9]:
debiased_model2 = load_model(weights_debiased2)

Loading xlm-roberta-base...
Bert successfully loaded.
Loading from data/model_checkpoints/xlm_gn_comb_fine_2020/xlm-roberta_e25.pt...
Loaded bert
Loaded we
Loaded rough_scorer
Loaded pw
Loaded a_scorer
Loaded sp


In [10]:
debiased_model3 = load_model(weights_debiased3)

Loading xlm-roberta-base...
Bert successfully loaded.
Loading from data/model_checkpoints/xlm_gn_comb_fine_2023/xlm-roberta_e23.pt...
Loaded bert
Loaded we
Loaded rough_scorer
Loaded pw
Loaded a_scorer
Loaded sp


In [9]:
debiased_model4 = load_model(weights_debiased4)

Loading xlm-roberta-base...
Bert successfully loaded.
Loading from data/model_checkpoints/xlm_gn_comb_fine_123/xlm-roberta_e23.pt...
Loaded bert
Loaded we
Loaded rough_scorer
Loaded pw
Loaded a_scorer
Loaded sp


In [10]:
debiased_model5 = load_model(weights_debiased5)

Loading xlm-roberta-base...
Bert successfully loaded.
Loading from data/model_checkpoints/xlm_gn_comb_fine_1234/xlm-roberta_e23.pt...
Loaded bert
Loaded we
Loaded rough_scorer
Loaded pw
Loaded a_scorer
Loaded sp


In [9]:
regular_model1 = load_model(weights_regular1)

Loading xlm-roberta-base...
Bert successfully loaded.
Loading from data/model_checkpoints/xlm_regular_248/xlm-roberta_e18.pt...
Loaded bert
Loaded we
Loaded rough_scorer
Loaded pw
Loaded a_scorer
Loaded sp


In [10]:
regular_model2 = load_model(weights_regular2)

Loading xlm-roberta-base...
Bert successfully loaded.
Loading from data/model_checkpoints/xlm_regular_2020/xlm-roberta_e15.pt...
Loaded bert
Loaded we
Loaded rough_scorer
Loaded pw
Loaded a_scorer
Loaded sp


In [9]:
regular_model3 = load_model(weights_regular3)

Loading xlm-roberta-base...
Bert successfully loaded.
Loading from data/model_checkpoints/xlm_regular_2023/xlm-roberta_e17.pt...
Loaded bert
Loaded we
Loaded rough_scorer
Loaded pw
Loaded a_scorer
Loaded sp


In [9]:
regular_model4 = load_model(weights_regular4)

Loading xlm-roberta-base...
Bert successfully loaded.
Loading from data/model_checkpoints/xlm_regular_123/xlm-roberta_e18.pt...
Loaded bert
Loaded we
Loaded rough_scorer
Loaded pw
Loaded a_scorer
Loaded sp


In [10]:
regular_model5 = load_model(weights_regular5)

Loading xlm-roberta-base...
Bert successfully loaded.
Loading from data/model_checkpoints/xlm_regular_1234/xlm-roberta_e15.pt...
Loaded bert
Loaded we
Loaded rough_scorer
Loaded pw
Loaded a_scorer
Loaded sp


In [11]:
def _predict_with(model, data):
    """Preprocesses every sentence in data and returns the model's predictions for them."""
    return make_pred(model, [preprocess_sent(sentence) for sentence in data])


# One predictor per checkpoint. The model variable is looked up when the
# predictor is called, not when it is defined.
def predict_regular1(data):
    """Predictions of regular_model1 for a list of sentences."""
    return _predict_with(regular_model1, data)
def predict_regular2(data):
    """Predictions of regular_model2 for a list of sentences."""
    return _predict_with(regular_model2, data)
def predict_regular3(data):
    """Predictions of regular_model3 for a list of sentences."""
    return _predict_with(regular_model3, data)
def predict_regular4(data):
    """Predictions of regular_model4 for a list of sentences."""
    return _predict_with(regular_model4, data)
def predict_regular5(data):
    """Predictions of regular_model5 for a list of sentences."""
    return _predict_with(regular_model5, data)

def predict_debiased1(data):
    """Predictions of debiased_model1 for a list of sentences."""
    return _predict_with(debiased_model1, data)
def predict_debiased2(data):
    """Predictions of debiased_model2 for a list of sentences."""
    return _predict_with(debiased_model2, data)
def predict_debiased3(data):
    """Predictions of debiased_model3 for a list of sentences."""
    return _predict_with(debiased_model3, data)
def predict_debiased4(data):
    """Predictions of debiased_model4 for a list of sentences."""
    return _predict_with(debiased_model4, data)
def predict_debiased5(data):
    """Predictions of debiased_model5 for a list of sentences."""
    return _predict_with(debiased_model5, data)


In [13]:
predict_regular1 = PredictorWrapper.wrap_predict(predict_regular1)
predict_regular2 = PredictorWrapper.wrap_predict(predict_regular2)

In [11]:
predict_regular3 = PredictorWrapper.wrap_predict(predict_regular3)

In [12]:

predict_regular4 = PredictorWrapper.wrap_predict(predict_regular4)


In [13]:
predict_regular5 = PredictorWrapper.wrap_predict(predict_regular5)

In [16]:
predict_debiased1 = PredictorWrapper.wrap_predict(predict_debiased1)

In [12]:
predict_debiased2 = PredictorWrapper.wrap_predict(predict_debiased2)
predict_debiased3 = PredictorWrapper.wrap_predict(predict_debiased3)

In [None]:
predict_debiased4 = PredictorWrapper.wrap_predict(predict_debiased4)
predict_debiased5 = PredictorWrapper.wrap_predict(predict_debiased5)

# Load suite

In [14]:
# Load the previously built test suite from disk.
# NOTE(review): the '.pkl' extension suggests a pickled object — only load suite files from a trusted source.
suite = TestSuite().from_file('suite.pkl')

# Eval regular model, using 5 seeds

In [23]:
suite.run(predict_regular1, overwrite=True)

Running Name pronoun, gender neutral name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.26docs/s]


Running Name pronoun, gender neutral name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 41.30docs/s]


Running Name pronoun, male name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.51docs/s]


Running Name pronoun, male name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 41.27docs/s]


Running Name pronoun, female name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.33docs/s]


Running Name pronoun, female name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 41.14docs/s]


Running Pronoun pronoun, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.42docs/s]


Running Pronoun pronoun, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 41.92docs/s]


Running Pronoun pronoun, mixed pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 41.23docs/s]


Running Pronoun pronoun name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.36docs/s]


Running Pronoun pronoun name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.39docs/s]


Running Pronoun pronoun name, mixed pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.35docs/s]


Running Hen-hen ambiguity, hen plural, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.36docs/s]


Running Hen-hen ambiguity, hen plural, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.31docs/s]


Running Hen-hen ambiguity, hen singular, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.39docs/s]


Running Hen-hen ambiguity, hen singular, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.32docs/s]


Running Die-die ambiguity, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.41docs/s]


Running Die-die ambiguity, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.46docs/s]


In [24]:
suite.summary(format_example_fn = format_sent)

pronoun-pronoun

Pronoun pronoun, gendered pronouns
Test cases:      100
Fails (rate):    0 (0.0%)


Pronoun pronoun, gender neutral pronouns
Test cases:      100
Fails (rate):    44 (44.0%)

Example fails:
[('hen', '-'), ('gaat', '-'), ('naar', '-'), ('diens', '-'), ('dokter.', '-')]
----
[('die', '-'), ('gaat', '-'), ('naar', '-'), ('diens', '-'), ('werk.', '-')]
----
[('die', '-'), ('gaat', '-'), ('naar', '-'), ('diens', '-'), ('afspraak.', '-')]
----


Pronoun pronoun, mixed pronouns
Test cases:      100
Fails (rate):    23 (23.0%)

Example fails:
[('hij', '-'), ('gaat', '-'), ('naar', '-'), ('hun', '-'), ('dokter.', '-')]
----
[('die', '-'), ('gaat', '-'), ('naar', '-'), ('diens', '-'), ('werk.', '-')]
----
[('hij', '-'), ('gaat', '-'), ('naar', '-'), ('diens', '-'), ('interview.', '-')]
----




die-die-amb

Die-die ambiguity, gendered pronouns
Test cases:      100
Fails (rate):    0 (0.0%)


Die-die ambiguity, gender neutral pronouns
Test cases:      100
Fails (rate):    3 (3.0%

In [17]:
suite.run(predict_regular2, overwrite=True)

Running Name pronoun, gender neutral name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.08docs/s]


Running Name pronoun, gender neutral name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 41.39docs/s]


Running Name pronoun, male name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.42docs/s]


Running Name pronoun, male name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 41.91docs/s]


Running Name pronoun, female name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.53docs/s]


Running Name pronoun, female name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 41.35docs/s]


Running Pronoun pronoun, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.73docs/s]


Running Pronoun pronoun, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 42.75docs/s]


Running Pronoun pronoun, mixed pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 41.82docs/s]


Running Pronoun pronoun name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.13docs/s]


Running Pronoun pronoun name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.66docs/s]


Running Pronoun pronoun name, mixed pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.47docs/s]


Running Hen-hen ambiguity, hen plural, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.30docs/s]


Running Hen-hen ambiguity, hen plural, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.32docs/s]


Running Hen-hen ambiguity, hen singular, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.28docs/s]


Running Hen-hen ambiguity, hen singular, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.30docs/s]


Running Die-die ambiguity, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.56docs/s]


Running Die-die ambiguity, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.96docs/s]


In [18]:
suite.summary(format_example_fn = format_sent)

pronoun-pronoun

Pronoun pronoun, gendered pronouns
Test cases:      100
Fails (rate):    8 (8.0%)

Example fails:
[('hij', '-'), ('gaat', '-'), ('naar', '-'), ('haar', '-'), ('interview.', '-')]
----
[('zij', '-'), ('gaat', '-'), ('naar', '-'), ('zijn', '-'), ('interview.', '-')]
----
[('zij', '-'), ('gaat', '-'), ('naar', '-'), ('zijn', '-'), ('ouders.', '-')]
----


Pronoun pronoun, gender neutral pronouns
Test cases:      100
Fails (rate):    65 (65.0%)

Example fails:
[('hen', '-'), ('gaat', '-'), ('naar', '-'), ('diens', '-'), ('buren.', '-')]
----
[('die', '-'), ('gaat', '-'), ('naar', '-'), ('diens', '-'), ('dokter.', '-')]
----
[('hen', '-'), ('gaat', '-'), ('naar', '-'), ('diens', '-'), ('dokter.', '-')]
----


Pronoun pronoun, mixed pronouns
Test cases:      100
Fails (rate):    40 (40.0%)

Example fails:
[('hen', '-'), ('gaat', '-'), ('naar', '-'), ('diens', '-'), ('werk.', '-')]
----
[('die', '-'), ('gaat', '-'), ('naar', '-'), ('diens', '-'), ('werk.', '-')]
----
[('hij',

In [13]:
suite.run(predict_regular3, overwrite=True)

Running Name pronoun, gender neutral name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 38.87docs/s]


Running Name pronoun, gender neutral name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 42.03docs/s]


Running Name pronoun, male name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.55docs/s]


Running Name pronoun, male name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 42.88docs/s]


Running Name pronoun, female name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.50docs/s]


Running Name pronoun, female name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 42.14docs/s]


Running Pronoun pronoun, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 41.07docs/s]


Running Pronoun pronoun, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 42.90docs/s]


Running Pronoun pronoun, mixed pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 42.04docs/s]


Running Pronoun pronoun name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.42docs/s]


Running Pronoun pronoun name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.85docs/s]


Running Pronoun pronoun name, mixed pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.62docs/s]


Running Hen-hen ambiguity, hen plural, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.31docs/s]


Running Hen-hen ambiguity, hen plural, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.29docs/s]


Running Hen-hen ambiguity, hen singular, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.39docs/s]


Running Hen-hen ambiguity, hen singular, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.37docs/s]


Running Die-die ambiguity, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.24docs/s]


Running Die-die ambiguity, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.50docs/s]


In [14]:
suite.summary(format_example_fn = format_sent)

pronoun-pronoun-name

Pronoun pronoun name, gendered pronouns
Test cases:      100
Fails (rate):    0 (0.0%)


Pronoun pronoun name, gender neutral pronouns
Test cases:      100
Fails (rate):    60 (60.0%)

Example fails:
[('Sydney', '(0)'), ('heeft', '-'), ('diens', '(0)'), ('afspraak', '-'), ('bij', '-'), ('de', '-'), ('huisartsenpost', '-'), ('om', '-'), ('3', '-'), ('uur.', '-'), ('die', '-'), ('gaat', '-'), ('zo', '-'), ('heen.', '-')]
----
[('Charly', '(0)'), ('heeft', '-'), ('hun', '(0)'), ('afspraak', '-'), ('bij', '-'), ('de', '-'), ('buren', '-'), ('om', '-'), ('3', '-'), ('uur.', '-'), ('die', '-'), ('gaat', '-'), ('zo', '-'), ('heen.', '-')]
----
[('Reese', '-'), ('heeft', '-'), ('hun', '-'), ('afspraak', '-'), ('bij', '-'), ('de', '-'), ('dokters', '-'), ('om', '-'), ('3', '-'), ('uur.', '-'), ('die', '-'), ('gaat', '-'), ('zo', '-'), ('heen.', '-')]
----


Pronoun pronoun name, mixed pronouns
Test cases:      100
Fails (rate):    49 (49.0%)

Example fails:
[('Haven', '(0)

In [15]:
suite.run(predict_regular4, overwrite=True)

Running Name pronoun, gender neutral name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 34.16docs/s]


Running Name pronoun, gender neutral name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 42.27docs/s]


Running Name pronoun, male name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.64docs/s]


Running Name pronoun, male name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 42.43docs/s]


Running Name pronoun, female name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.94docs/s]


Running Name pronoun, female name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 42.14docs/s]


Running Pronoun pronoun, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.76docs/s]


Running Pronoun pronoun, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 42.69docs/s]


Running Pronoun pronoun, mixed pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 41.67docs/s]


Running Pronoun pronoun name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.56docs/s]


Running Pronoun pronoun name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.64docs/s]


Running Pronoun pronoun name, mixed pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.56docs/s]


Running Hen-hen ambiguity, hen plural, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.62docs/s]


Running Hen-hen ambiguity, hen plural, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.46docs/s]


Running Hen-hen ambiguity, hen singular, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.61docs/s]


Running Hen-hen ambiguity, hen singular, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.54docs/s]


Running Die-die ambiguity, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.52docs/s]


Running Die-die ambiguity, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.53docs/s]


In [16]:
suite.summary(format_example_fn = format_sent)

pronoun-pronoun

Pronoun pronoun, gendered pronouns
Test cases:      100
Fails (rate):    2 (2.0%)

Example fails:
[('hij', '-'), ('gaat', '-'), ('naar', '-'), ('haar', '-'), ('interview.', '-')]
----
[('hij', '-'), ('gaat', '-'), ('naar', '-'), ('haar', '-'), ('interview.', '-')]
----


Pronoun pronoun, gender neutral pronouns
Test cases:      100
Fails (rate):    62 (62.0%)

Example fails:
[('hen', '-'), ('gaat', '-'), ('naar', '-'), ('diens', '-'), ('afspraak.', '-')]
----
[('hen', '-'), ('gaat', '-'), ('naar', '-'), ('diens', '-'), ('interview.', '-')]
----
[('die', '-'), ('gaat', '-'), ('naar', '-'), ('diens', '-'), ('interview.', '-')]
----


Pronoun pronoun, mixed pronouns
Test cases:      100
Fails (rate):    29 (29.0%)

Example fails:
[('hen', '-'), ('gaat', '-'), ('naar', '-'), ('diens', '-'), ('werk.', '-')]
----
[('hij', '-'), ('gaat', '-'), ('naar', '-'), ('diens', '-'), ('interview.', '-')]
----
[('hij', '-'), ('gaat', '-'), ('naar', '-'), ('diens', '-'), ('ouders.', '-')

In [17]:
# Run every test in the suite with the predict_regular5 predictor
# (presumably wraps the model loaded from weights_regular5 — defined earlier).
# overwrite=True is needed because `suite` still holds the previous run's results.
suite.run(predict_regular5, overwrite=True)

Running Name pronoun, gender neutral name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.73docs/s]


Running Name pronoun, gender neutral name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 42.18docs/s]


Running Name pronoun, male name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.69docs/s]


Running Name pronoun, male name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 42.40docs/s]


Running Name pronoun, female name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 41.16docs/s]


Running Name pronoun, female name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 42.26docs/s]


Running Pronoun pronoun, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 41.50docs/s]


Running Pronoun pronoun, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 43.27docs/s]


Running Pronoun pronoun, mixed pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 42.81docs/s]


Running Pronoun pronoun name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.53docs/s]


Running Pronoun pronoun name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.98docs/s]


Running Pronoun pronoun name, mixed pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.70docs/s]


Running Hen-hen ambiguity, hen plural, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.55docs/s]


Running Hen-hen ambiguity, hen plural, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.56docs/s]


Running Hen-hen ambiguity, hen singular, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.56docs/s]


Running Hen-hen ambiguity, hen singular, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.57docs/s]


Running Die-die ambiguity, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 41.30docs/s]


Running Die-die ambiguity, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 41.29docs/s]


In [18]:
# Print, per test, the number of cases, the failure count/rate and a few
# example failures from the results currently stored on `suite`.
# format_sent (defined earlier in the notebook) renders each example.
suite.summary(format_example_fn=format_sent)

pronoun-pronoun

Pronoun pronoun, gendered pronouns
Test cases:      100
Fails (rate):    22 (22.0%)

Example fails:
[('hij', '-'), ('gaat', '-'), ('naar', '-'), ('haar', '-'), ('buren.', '-')]
----
[('zij', '-'), ('gaat', '-'), ('naar', '-'), ('zijn', '-'), ('werk.', '-')]
----
[('zij', '-'), ('gaat', '-'), ('naar', '-'), ('zijn', '-'), ('werk.', '-')]
----


Pronoun pronoun, gender neutral pronouns
Test cases:      100
Fails (rate):    76 (76.0%)

Example fails:
[('hen', '-'), ('gaat', '-'), ('naar', '-'), ('diens', '-'), ('ouders.', '-')]
----
[('hen', '-'), ('gaat', '-'), ('naar', '-'), ('diens', '-'), ('ouders.', '-')]
----
[('die', '-'), ('gaat', '-'), ('naar', '-'), ('diens', '-'), ('afspraak.', '-')]
----


Pronoun pronoun, mixed pronouns
Test cases:      100
Fails (rate):    62 (62.0%)

Example fails:
[('hen', '-'), ('gaat', '-'), ('naar', '-'), ('hun', '-'), ('afspraak.', '-')]
----
[('die', '-'), ('gaat', '-'), ('naar', '-'), ('hun', '-'), ('werk.', '-')]
----
[('hen', '-'),

# Evaluate the debiased model using 5 seeds

In [17]:
# Run every test in the suite with the predict_debiased1 predictor
# (presumably wraps the model loaded from weights_debiased1 — defined earlier).
# overwrite=True is needed because `suite` still holds the previous run's results.
# NOTE(review): the execution counter drops at this cell, so it was run out of
# order or after a kernel restart — confirm with Restart & Run All.
suite.run(predict_debiased1, overwrite=True)

Running Name pronoun, gender neutral name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 39.85docs/s]


Running Name pronoun, gender neutral name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.44docs/s]


Running Name pronoun, male name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.41docs/s]


Running Name pronoun, male name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.41docs/s]


Running Name pronoun, female name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.45docs/s]


Running Name pronoun, female name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.48docs/s]


Running Pronoun pronoun, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.39docs/s]


Running Pronoun pronoun, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.48docs/s]


Running Pronoun pronoun, mixed pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.47docs/s]


Running Pronoun pronoun name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.13docs/s]


Running Pronoun pronoun name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.33docs/s]


Running Pronoun pronoun name, mixed pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.24docs/s]


Running Hen-hen ambiguity, hen plural, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.26docs/s]


Running Hen-hen ambiguity, hen plural, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.42docs/s]


Running Hen-hen ambiguity, hen singular, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.33docs/s]


Running Hen-hen ambiguity, hen singular, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.38docs/s]


Running Die-die ambiguity, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.43docs/s]


Running Die-die ambiguity, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.65docs/s]


In [18]:
# Print, per test, the number of cases, the failure count/rate and a few
# example failures from the results currently stored on `suite`.
# format_sent (defined earlier in the notebook) renders each example.
suite.summary(format_example_fn=format_sent)

pronoun-pronoun-name

Pronoun pronoun name, gendered pronouns
Test cases:      100
Fails (rate):    0 (0.0%)


Pronoun pronoun name, gender neutral pronouns
Test cases:      100
Fails (rate):    0 (0.0%)


Pronoun pronoun name, mixed pronouns
Test cases:      100
Fails (rate):    0 (0.0%)




Name pronoun link

Name pronoun, gender neutral name, gendered pronouns
Test cases:      100
Fails (rate):    1 (1.0%)

Example fails:
[('London', '-'), ('gaat', '-'), ('naar', '-'), ('zijn', '-'), ('interview.', '-')]
----


Name pronoun, gender neutral name, gender neutral pronouns
Test cases:      100
Fails (rate):    0 (0.0%)


Name pronoun, male name, gendered pronouns
Test cases:      100
Fails (rate):    0 (0.0%)


Name pronoun, male name, gender neutral pronouns
Test cases:      100
Fails (rate):    1 (1.0%)

Example fails:
[('Cor', '-'), ('gaat', '-'), ('naar', '-'), ('hun', '-'), ('interview.', '-')]
----


Name pronoun, female name, gendered pronouns
Test cases:      100
Fails (rate):  

In [15]:
# Run every test in the suite with the predict_debiased2 predictor
# (presumably wraps the model loaded from weights_debiased2 — defined earlier).
# overwrite=True is needed because `suite` still holds the previous run's results.
# NOTE(review): the execution counter drops at this cell, so it was run out of
# order or after a kernel restart — confirm with Restart & Run All.
suite.run(predict_debiased2, overwrite=True)

Running Name pronoun, gender neutral name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:03<00:00, 27.32docs/s]


Running Name pronoun, gender neutral name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.70docs/s]


Running Name pronoun, male name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.72docs/s]


Running Name pronoun, male name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.67docs/s]


Running Name pronoun, female name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.76docs/s]


Running Name pronoun, female name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.81docs/s]


Running Pronoun pronoun, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.74docs/s]


Running Pronoun pronoun, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.72docs/s]


Running Pronoun pronoun, mixed pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.78docs/s]


Running Pronoun pronoun name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.62docs/s]


Running Pronoun pronoun name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.73docs/s]


Running Pronoun pronoun name, mixed pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.62docs/s]


Running Hen-hen ambiguity, hen plural, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.62docs/s]


Running Hen-hen ambiguity, hen plural, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.67docs/s]


Running Hen-hen ambiguity, hen singular, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.70docs/s]


Running Hen-hen ambiguity, hen singular, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.65docs/s]


Running Die-die ambiguity, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.60docs/s]


Running Die-die ambiguity, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.78docs/s]


In [16]:
# Print, per test, the number of cases, the failure count/rate and a few
# example failures from the results currently stored on `suite`.
# format_sent (defined earlier in the notebook) renders each example.
suite.summary(format_example_fn=format_sent)

die-die-amb

Die-die ambiguity, gendered pronouns
Test cases:      100
Fails (rate):    3 (3.0%)

Example fails:
[('Lyric', '-'), ('die', '-'), ('hier', '-'), ('net', '-'), ('werkt', '-'), ('is', '-'), ('te', '-'), ('laat', '-'), ('omdat', '-'), ('zij', '-'), ('een', '-'), ('lekke', '-'), ('band', '-'), ('had', '-'), ('.', '-')]
----
[('Jazz', '-'), ('die', '-'), ('hier', '-'), ('net', '-'), ('werkt', '-'), ('is', '-'), ('te', '-'), ('laat', '-'), ('omdat', '-'), ('zij', '-'), ('een', '-'), ('lekke', '-'), ('band', '-'), ('had', '-'), ('.', '-')]
----
[('Lyric', '-'), ('die', '-'), ('hier', '-'), ('net', '-'), ('werkt', '-'), ('is', '-'), ('te', '-'), ('laat', '-'), ('omdat', '-'), ('zij', '-'), ('een', '-'), ('lekke', '-'), ('band', '-'), ('had', '-'), ('.', '-')]
----


Die-die ambiguity, gender neutral pronouns
Test cases:      100
Fails (rate):    5 (5.0%)

Example fails:
[('Haven', '-'), ('die', '(0)'), ('hier', '-'), ('net', '-'), ('werkt', '-'), ('is', '-'), ('te', '-'), ('laat'

In [17]:
# Run every test in the suite with the predict_debiased3 predictor
# (presumably wraps the model loaded from weights_debiased3 — defined earlier).
# overwrite=True is needed because `suite` still holds the previous run's results.
suite.run(predict_debiased3, overwrite=True)

Running Name pronoun, gender neutral name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.80docs/s]


Running Name pronoun, gender neutral name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.68docs/s]


Running Name pronoun, male name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.64docs/s]


Running Name pronoun, male name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.76docs/s]


Running Name pronoun, female name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.71docs/s]


Running Name pronoun, female name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.77docs/s]


Running Pronoun pronoun, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.74docs/s]


Running Pronoun pronoun, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.77docs/s]


Running Pronoun pronoun, mixed pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.71docs/s]


Running Pronoun pronoun name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.08docs/s]


Running Pronoun pronoun name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.58docs/s]


Running Pronoun pronoun name, mixed pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.58docs/s]


Running Hen-hen ambiguity, hen plural, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.65docs/s]


Running Hen-hen ambiguity, hen plural, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.49docs/s]


Running Hen-hen ambiguity, hen singular, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.59docs/s]


Running Hen-hen ambiguity, hen singular, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.33docs/s]


Running Die-die ambiguity, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.50docs/s]


Running Die-die ambiguity, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.67docs/s]


In [18]:
# Print, per test, the number of cases, the failure count/rate and a few
# example failures from the results currently stored on `suite`.
# format_sent (defined earlier in the notebook) renders each example.
suite.summary(format_example_fn=format_sent)

die-die-amb

Die-die ambiguity, gendered pronouns
Test cases:      100
Fails (rate):    1 (1.0%)

Example fails:
[('Jos', '-'), ('die', '(0)'), ('hier', '-'), ('net', '-'), ('werkt', '-'), ('is', '-'), ('te', '-'), ('laat', '-'), ('omdat', '-'), ('zij', '(0)'), ('een', '-'), ('lekke', '-'), ('band', '-'), ('had', '-'), ('.', '-')]
----


Die-die ambiguity, gender neutral pronouns
Test cases:      100
Fails (rate):    0 (0.0%)




pronoun-pronoun

Pronoun pronoun, gendered pronouns
Test cases:      100
Fails (rate):    0 (0.0%)


Pronoun pronoun, gender neutral pronouns
Test cases:      100
Fails (rate):    0 (0.0%)


Pronoun pronoun, mixed pronouns
Test cases:      100
Fails (rate):    0 (0.0%)




hen-hen-amb

Hen-hen ambiguity, hen plural, gendered pronouns
Test cases:      100
Fails (rate):    100 (100.0%)

Example fails:
[('Phoenix', '(0)'), ('gaat', '-'), ('met', '-'), ('het', '-'), ('softbalteam', '-'), ('op', '-'), ('vakantie.', '-'), ('hij', '(0)'), ('is', '-'), ('al', '-'), ('

In [14]:
# Run every test in the suite with the predict_debiased4 predictor
# (presumably wraps the model loaded from weights_debiased4 — defined earlier).
# overwrite=True is needed because `suite` still holds the previous run's results.
# NOTE(review): the execution counter drops at this cell, so it was run out of
# order or after a kernel restart — confirm with Restart & Run All.
suite.run(predict_debiased4, overwrite=True)

Running Name pronoun, gender neutral name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:03<00:00, 26.53docs/s]


Running Name pronoun, gender neutral name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.46docs/s]


Running Name pronoun, male name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.44docs/s]


Running Name pronoun, male name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.31docs/s]


Running Name pronoun, female name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.43docs/s]


Running Name pronoun, female name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.47docs/s]


Running Pronoun pronoun, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.52docs/s]


Running Pronoun pronoun, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.39docs/s]


Running Pronoun pronoun, mixed pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.59docs/s]


Running Pronoun pronoun name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.34docs/s]


Running Pronoun pronoun name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.41docs/s]


Running Pronoun pronoun name, mixed pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.38docs/s]


Running Hen-hen ambiguity, hen plural, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.40docs/s]


Running Hen-hen ambiguity, hen plural, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.41docs/s]


Running Hen-hen ambiguity, hen singular, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.39docs/s]


Running Hen-hen ambiguity, hen singular, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.40docs/s]


Running Die-die ambiguity, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.48docs/s]


Running Die-die ambiguity, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.55docs/s]


In [15]:
# Print, per test, the number of cases, the failure count/rate and a few
# example failures from the results currently stored on `suite`.
# format_sent (defined earlier in the notebook) renders each example.
suite.summary(format_example_fn=format_sent)

pronoun-pronoun

Pronoun pronoun, gendered pronouns
Test cases:      100
Fails (rate):    2 (2.0%)

Example fails:
[('hij', '-'), ('gaat', '-'), ('naar', '-'), ('haar', '-'), ('interview.', '-')]
----
[('hij', '-'), ('gaat', '-'), ('naar', '-'), ('haar', '-'), ('interview.', '-')]
----


Pronoun pronoun, gender neutral pronouns
Test cases:      100
Fails (rate):    0 (0.0%)


Pronoun pronoun, mixed pronouns
Test cases:      100
Fails (rate):    2 (2.0%)

Example fails:
[('hij', '-'), ('gaat', '-'), ('naar', '-'), ('haar', '-'), ('interview.', '-')]
----
[('hij', '-'), ('gaat', '-'), ('naar', '-'), ('haar', '-'), ('interview.', '-')]
----




Name pronoun link

Name pronoun, gender neutral name, gendered pronouns
Test cases:      100
Fails (rate):    1 (1.0%)

Example fails:
[('London', '-'), ('gaat', '-'), ('naar', '-'), ('zijn', '-'), ('interview.', '-')]
----


Name pronoun, gender neutral name, gender neutral pronouns
Test cases:      100
Fails (rate):    0 (0.0%)


Name pronoun, ma

In [16]:
# Run every test in the suite with the predict_debiased5 predictor
# (presumably wraps the model loaded from weights_debiased5 — defined earlier).
# overwrite=True is needed because `suite` still holds the previous run's results.
suite.run(predict_debiased5, overwrite=True)

Running Name pronoun, gender neutral name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.29docs/s]


Running Name pronoun, gender neutral name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.50docs/s]


Running Name pronoun, male name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.45docs/s]


Running Name pronoun, male name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.56docs/s]


Running Name pronoun, female name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.46docs/s]


Running Name pronoun, female name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.48docs/s]


Running Pronoun pronoun, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.50docs/s]


Running Pronoun pronoun, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.56docs/s]


Running Pronoun pronoun, mixed pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.47docs/s]


Running Pronoun pronoun name, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.40docs/s]


Running Pronoun pronoun name, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.37docs/s]


Running Pronoun pronoun name, mixed pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.30docs/s]


Running Hen-hen ambiguity, hen plural, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.38docs/s]


Running Hen-hen ambiguity, hen plural, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.35docs/s]


Running Hen-hen ambiguity, hen singular, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.24docs/s]


Running Hen-hen ambiguity, hen singular, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 39.98docs/s]


Running Die-die ambiguity, gendered pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.18docs/s]


Running Die-die ambiguity, gender neutral pronouns
Predicting 100 examples


100%|██████████| 100/100 [00:02<00:00, 40.33docs/s]


In [17]:
# Print, per test, the number of cases, the failure count/rate and a few
# example failures from the results currently stored on `suite`.
# format_sent (defined earlier in the notebook) renders each example.
suite.summary(format_example_fn=format_sent)

pronoun-pronoun

Pronoun pronoun, gendered pronouns
Test cases:      100
Fails (rate):    2 (2.0%)

Example fails:
[('hij', '-'), ('gaat', '-'), ('naar', '-'), ('haar', '-'), ('interview.', '-')]
----
[('hij', '-'), ('gaat', '-'), ('naar', '-'), ('haar', '-'), ('interview.', '-')]
----


Pronoun pronoun, gender neutral pronouns
Test cases:      100
Fails (rate):    0 (0.0%)


Pronoun pronoun, mixed pronouns
Test cases:      100
Fails (rate):    2 (2.0%)

Example fails:
[('hij', '-'), ('gaat', '-'), ('naar', '-'), ('haar', '-'), ('interview.', '-')]
----
[('hij', '-'), ('gaat', '-'), ('naar', '-'), ('haar', '-'), ('interview.', '-')]
----




Name pronoun link

Name pronoun, gender neutral name, gendered pronouns
Test cases:      100
Fails (rate):    1 (1.0%)

Example fails:
[('London', '-'), ('gaat', '-'), ('naar', '-'), ('zijn', '-'), ('interview.', '-')]
----


Name pronoun, gender neutral name, gender neutral pronouns
Test cases:      100
Fails (rate):    0 (0.0%)


Name pronoun, ma