In [1]:
from api_tools import StringMetric, run_experiment, run_t5_prompt, T5Prompt

## How does T5 finetuned for QA do on object control? 

In [13]:
# Candidate matrix verbs for the object-control frame "Tom <verb> Mary <infinitive>."
object_control_verbs = [
    "told", "ordered", "exhorted", "begged", "entreated", "called upon", "reminded",
    "urged", "asked", "pleaded with", "persuaded", "convinced", "forced", "pushed",
]

# (infinitive used in the context sentence, past-tense form used in the question)
actions = [
    ("to leave", "left"),
    ("to call home", "called home"),
    ("to reply", "replied"),
]

# Answers are bucketed by name; the printed counts also include an 'other' bucket.
class_lookups = {"Tom": ["tom"], "Mary": ["mary"]}
metric = StringMetric(class_lookups)
responses_by_verb = {}

replicants = 1
kwargs = {"max_tokens": 2, "temperature": 0.0}

# Only the last seven verbs are probed, each crossed with every action
# (verbs vary slowest, actions fastest).
conditions = [
    (verb, infinitive, past)
    for verb in object_control_verbs[-7:]
    for infinitive, past in actions
]

for verb, infinitive, past in conditions:
    text = str(T5Prompt([f"Tom {verb} Mary {infinitive}."], f"Who {past}?"))
    # run_experiment hands back the metric together with the collected responses;
    # a fresh kwargs dict is passed on every call.
    metric, responses = run_experiment(run_t5_prompt, text, replicants, metric, dict(kwargs))
    responses_by_verb[f"{verb}, {infinitive}"] = responses

counts, classes = metric.get_metric()
print(counts)



100%|██████████| 1/1 [00:00<00:00,  3.20it/s]
100%|██████████| 1/1 [00:00<00:00,  4.11it/s]
100%|██████████| 1/1 [00:00<00:00,  3.99it/s]
100%|██████████| 1/1 [00:00<00:00,  3.85it/s]
100%|██████████| 1/1 [00:00<00:00,  1.88it/s]
100%|██████████| 1/1 [00:00<00:00,  3.77it/s]
100%|██████████| 1/1 [00:00<00:00,  3.70it/s]
100%|██████████| 1/1 [00:00<00:00,  3.78it/s]
100%|██████████| 1/1 [00:00<00:00,  4.18it/s]
100%|██████████| 1/1 [00:00<00:00,  4.20it/s]
100%|██████████| 1/1 [00:00<00:00,  3.92it/s]
100%|██████████| 1/1 [00:00<00:00,  3.91it/s]
100%|██████████| 1/1 [00:00<00:00,  4.04it/s]
100%|██████████| 1/1 [00:00<00:00,  4.12it/s]
100%|██████████| 1/1 [00:00<00:00,  4.23it/s]
100%|██████████| 1/1 [00:00<00:00,  3.35it/s]
100%|██████████| 1/1 [00:00<00:00,  4.16it/s]
100%|██████████| 1/1 [00:00<00:00,  4.02it/s]
100%|██████████| 1/1 [00:00<00:00,  3.88it/s]
100%|██████████| 1/1 [00:00<00:00,  4.00it/s]
100%|██████████| 1/1 [00:00<00:00,  3.99it/s]

{'Tom': 0, 'Mary': 21, 'other': 0}





## What about subject control? 

In [25]:
# Subject control with "promised": Mary is the matrix subject, Tom the object.
class_lookups = {"Tom": ["tom"], "Mary": ["mary"]}
metric = StringMetric(class_lookups)
responses_by_verb = {}

replicants = 1
kwargs = {"max_tokens": 2, "temperature": 0.0}

for infinitive, past in actions:
    text = str(T5Prompt([f"Mary promised Tom {infinitive}."], f"Who {past}?"))
    # The metric is threaded through every call so the counts accumulate over actions.
    metric, responses = run_experiment(run_t5_prompt, text, replicants, metric, dict(kwargs))
    # Only one verb here, so responses are keyed by the infinitive alone.
    responses_by_verb[infinitive] = responses

counts, classes = metric.get_metric()
print(counts)



100%|██████████| 1/1 [00:00<00:00,  3.66it/s]
100%|██████████| 1/1 [00:00<00:00,  4.01it/s]
100%|██████████| 1/1 [00:00<00:00,  3.73it/s]

{'Tom': 3, 'Mary': 0, 'other': 0}





In [26]:
# Same subject-control probe with the two names swapped: Tom is now the promiser.
class_lookups = {"Tom": ["tom"], "Mary": ["mary"]}
metric = StringMetric(class_lookups)
responses_by_verb = {}

replicants = 1
kwargs = {"max_tokens": 2, "temperature": 0.0}

for infinitive, past in actions:
    text = str(T5Prompt([f"Tom promised Mary {infinitive}."], f"Who {past}?"))
    # The metric is threaded through every call so the counts accumulate over actions.
    metric, responses = run_experiment(run_t5_prompt, text, replicants, metric, dict(kwargs))
    # Only one verb here, so responses are keyed by the infinitive alone.
    responses_by_verb[infinitive] = responses

counts, classes = metric.get_metric()
print(counts)

100%|██████████| 1/1 [00:00<00:00,  6.82it/s]
100%|██████████| 1/1 [00:00<00:00,  3.58it/s]
100%|██████████| 1/1 [00:00<00:00,  3.80it/s]

{'Tom': 0, 'Mary': 3, 'other': 0}





## Results (Object and Subject Control)
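Finetuned T5 gets a perfect score on object control, but fails completely on subject control. This is consistent with the Minimal Distance Principle (MDP): in every case the model answers with the name closest to the infinitive.  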
But how does it do on passives? 

## Passives 

In [33]:
# Passivised object control: Mary is the surface subject, Tom sits in the by-phrase.
class_lookups = {"Tom": ["tom"], "Mary": ["mary"]}
metric = StringMetric(class_lookups)
responses_by_verb = {}

replicants = 1
kwargs = {"max_tokens": 2, "temperature": 0.0}

# Last seven verbs crossed with every action (verbs vary slowest).
conditions = [
    (verb, infinitive, past)
    for verb in object_control_verbs[-7:]
    for infinitive, past in actions
]

for verb, infinitive, past in conditions:
    text = str(T5Prompt([f"Mary was {verb} by Tom {infinitive}."], f"Who {past}?"))
    # The metric is threaded through every call so the counts accumulate.
    metric, responses = run_experiment(run_t5_prompt, text, replicants, metric, dict(kwargs))
    responses_by_verb[f"{verb}, {infinitive}"] = responses

counts, classes = metric.get_metric()
print(counts)


100%|██████████| 1/1 [00:00<00:00,  3.50it/s]
100%|██████████| 1/1 [00:00<00:00,  3.68it/s]
100%|██████████| 1/1 [00:00<00:00,  4.12it/s]
100%|██████████| 1/1 [00:00<00:00,  3.98it/s]
100%|██████████| 1/1 [00:00<00:00,  3.77it/s]
100%|██████████| 1/1 [00:00<00:00,  4.07it/s]
100%|██████████| 1/1 [00:00<00:00,  3.79it/s]
100%|██████████| 1/1 [00:00<00:00,  3.84it/s]
100%|██████████| 1/1 [00:00<00:00,  3.21it/s]
100%|██████████| 1/1 [00:03<00:00,  3.59s/it]
100%|██████████| 1/1 [00:00<00:00,  3.71it/s]
100%|██████████| 1/1 [00:00<00:00,  1.17it/s]
100%|██████████| 1/1 [00:00<00:00,  2.38it/s]
100%|██████████| 1/1 [00:00<00:00,  3.98it/s]
100%|██████████| 1/1 [00:00<00:00,  4.17it/s]
100%|██████████| 1/1 [00:00<00:00,  4.02it/s]
100%|██████████| 1/1 [00:00<00:00,  4.26it/s]
100%|██████████| 1/1 [00:00<00:00,  4.05it/s]
100%|██████████| 1/1 [00:00<00:00,  4.00it/s]
100%|██████████| 1/1 [00:00<00:00,  4.07it/s]
100%|██████████| 1/1 [00:00<00:00,  4.23it/s]

{'Tom': 4, 'Mary': 17, 'other': 0}





In [34]:
# Passive frame with the names swapped: Tom is the surface subject, Mary is in the by-phrase.
class_lookups = {"Tom": ["tom"], "Mary": ["mary"]}
metric = StringMetric(class_lookups)
responses_by_verb = {}

replicants = 1
kwargs = {"max_tokens": 2, "temperature": 0.0}

# Last seven verbs crossed with every action (verbs vary slowest).
conditions = [
    (verb, infinitive, past)
    for verb in object_control_verbs[-7:]
    for infinitive, past in actions
]

for verb, infinitive, past in conditions:
    text = str(T5Prompt([f"Tom was {verb} by Mary {infinitive}."], f"Who {past}?"))
    # The metric is threaded through every call so the counts accumulate.
    metric, responses = run_experiment(run_t5_prompt, text, replicants, metric, dict(kwargs))
    responses_by_verb[f"{verb}, {infinitive}"] = responses

counts, classes = metric.get_metric()
print(counts)

100%|██████████| 1/1 [00:00<00:00,  3.87it/s]
100%|██████████| 1/1 [00:00<00:00,  4.18it/s]
100%|██████████| 1/1 [00:00<00:00,  3.87it/s]
100%|██████████| 1/1 [00:00<00:00,  3.95it/s]
100%|██████████| 1/1 [00:00<00:00,  4.08it/s]
100%|██████████| 1/1 [00:00<00:00,  3.84it/s]
100%|██████████| 1/1 [00:00<00:00,  3.97it/s]
100%|██████████| 1/1 [00:00<00:00,  3.93it/s]
100%|██████████| 1/1 [00:00<00:00,  4.02it/s]
100%|██████████| 1/1 [00:00<00:00,  4.06it/s]
100%|██████████| 1/1 [00:00<00:00,  4.02it/s]
100%|██████████| 1/1 [00:00<00:00,  4.00it/s]
100%|██████████| 1/1 [00:00<00:00,  4.03it/s]
100%|██████████| 1/1 [00:00<00:00,  4.17it/s]
100%|██████████| 1/1 [00:00<00:00,  3.63it/s]
100%|██████████| 1/1 [00:00<00:00,  3.91it/s]
100%|██████████| 1/1 [00:00<00:00,  4.05it/s]
100%|██████████| 1/1 [00:00<00:00,  4.09it/s]
100%|██████████| 1/1 [00:00<00:00,  4.03it/s]
100%|██████████| 1/1 [00:00<00:00,  3.79it/s]
100%|██████████| 1/1 [00:00<00:00,  3.94it/s]

{'Tom': 20, 'Mary': 1, 'other': 0}





## Passives results
The model sometimes answers Tom even when the order of the names is reversed. What happens if we use two men instead? (Maybe gender is confusing it.)

In [35]:
# Passive frame with two male names: Tom is the surface subject, Bill is in the by-phrase.
class_lookups = {"Tom": ["tom"], "Bill": ["bill"]}
metric = StringMetric(class_lookups)
responses_by_verb = {}

replicants = 1
kwargs = {"max_tokens": 2, "temperature": 0.0}

# Last seven verbs crossed with every action (verbs vary slowest).
conditions = [
    (verb, infinitive, past)
    for verb in object_control_verbs[-7:]
    for infinitive, past in actions
]

for verb, infinitive, past in conditions:
    text = str(T5Prompt([f"Tom was {verb} by Bill {infinitive}."], f"Who {past}?"))
    # The metric is threaded through every call so the counts accumulate.
    metric, responses = run_experiment(run_t5_prompt, text, replicants, metric, dict(kwargs))
    responses_by_verb[f"{verb}, {infinitive}"] = responses

counts, classes = metric.get_metric()
print(counts)

100%|██████████| 1/1 [00:00<00:00,  2.26it/s]
100%|██████████| 1/1 [00:00<00:00,  2.18it/s]
100%|██████████| 1/1 [00:00<00:00,  3.91it/s]
100%|██████████| 1/1 [00:00<00:00,  2.97it/s]
100%|██████████| 1/1 [00:00<00:00,  3.93it/s]
100%|██████████| 1/1 [00:00<00:00,  3.02it/s]
100%|██████████| 1/1 [00:00<00:00,  2.40it/s]
100%|██████████| 1/1 [00:00<00:00,  1.43it/s]
100%|██████████| 1/1 [00:00<00:00,  3.60it/s]
100%|██████████| 1/1 [00:00<00:00,  3.42it/s]
100%|██████████| 1/1 [00:01<00:00,  1.04s/it]
100%|██████████| 1/1 [00:00<00:00,  3.55it/s]
100%|██████████| 1/1 [00:00<00:00,  2.94it/s]
100%|██████████| 1/1 [00:00<00:00,  3.46it/s]
100%|██████████| 1/1 [00:00<00:00,  4.11it/s]
100%|██████████| 1/1 [00:00<00:00,  3.93it/s]
100%|██████████| 1/1 [00:01<00:00,  1.57s/it]
100%|██████████| 1/1 [00:00<00:00,  2.94it/s]
100%|██████████| 1/1 [00:00<00:00,  1.47it/s]
100%|██████████| 1/1 [00:00<00:00,  3.13it/s]
100%|██████████| 1/1 [00:00<00:00,  1.06it/s]

{'Tom': 18, 'Bill': 3, 'other': 0}





In [41]:
# Two male names, order swapped: Bill is the surface subject, Tom is in the by-phrase.
class_lookups = {"Tom": ["tom"], "Bill": ["bill"]}
metric = StringMetric(class_lookups)
responses_by_verb = {}

replicants = 1
kwargs = {"max_tokens": 2, "temperature": 0.0}

# Last seven verbs crossed with every action (verbs vary slowest).
conditions = [
    (verb, infinitive, past)
    for verb in object_control_verbs[-7:]
    for infinitive, past in actions
]

for verb, infinitive, past in conditions:
    text = str(T5Prompt([f"Bill was {verb} by Tom {infinitive}."], f"Who {past}?"))
    # The metric is threaded through every call so the counts accumulate.
    metric, responses = run_experiment(run_t5_prompt, text, replicants, metric, dict(kwargs))
    responses_by_verb[f"{verb}, {infinitive}"] = responses

counts, classes = metric.get_metric()
print(counts)

100%|██████████| 1/1 [00:00<00:00,  1.62it/s]
100%|██████████| 1/1 [00:00<00:00,  3.42it/s]
100%|██████████| 1/1 [00:00<00:00,  3.74it/s]
100%|██████████| 1/1 [00:00<00:00,  3.59it/s]
100%|██████████| 1/1 [00:00<00:00,  3.74it/s]
100%|██████████| 1/1 [00:00<00:00,  3.68it/s]
100%|██████████| 1/1 [00:00<00:00,  3.63it/s]
100%|██████████| 1/1 [00:00<00:00,  3.77it/s]
100%|██████████| 1/1 [00:00<00:00,  3.94it/s]
100%|██████████| 1/1 [00:00<00:00,  3.03it/s]
100%|██████████| 1/1 [00:00<00:00,  2.07it/s]
100%|██████████| 1/1 [00:00<00:00,  3.45it/s]
100%|██████████| 1/1 [00:00<00:00,  3.60it/s]
100%|██████████| 1/1 [00:00<00:00,  3.67it/s]
100%|██████████| 1/1 [00:01<00:00,  1.31s/it]
100%|██████████| 1/1 [00:00<00:00,  3.43it/s]
100%|██████████| 1/1 [00:00<00:00,  3.21it/s]
100%|██████████| 1/1 [00:00<00:00,  3.31it/s]
100%|██████████| 1/1 [00:00<00:00,  3.14it/s]
100%|██████████| 1/1 [00:00<00:00,  2.77it/s]
100%|██████████| 1/1 [00:00<00:00,  3.36it/s]

{'Tom': 2, 'Bill': 19, 'other': 0}





In [36]:
# Passive frame with two female names: Mary is the surface subject, Emma is in the by-phrase.
class_lookups = {"Mary": ["mary"], "Emma": ["emma"]}
metric = StringMetric(class_lookups)
responses_by_verb = {}

replicants = 1
kwargs = {"max_tokens": 2, "temperature": 0.0}

# Last seven verbs crossed with every action (verbs vary slowest).
conditions = [
    (verb, infinitive, past)
    for verb in object_control_verbs[-7:]
    for infinitive, past in actions
]

for verb, infinitive, past in conditions:
    text = str(T5Prompt([f"Mary was {verb} by Emma {infinitive}."], f"Who {past}?"))
    # The metric is threaded through every call so the counts accumulate.
    metric, responses = run_experiment(run_t5_prompt, text, replicants, metric, dict(kwargs))
    responses_by_verb[f"{verb}, {infinitive}"] = responses

counts, classes = metric.get_metric()
print(counts)

100%|██████████| 1/1 [00:00<00:00,  2.56it/s]
100%|██████████| 1/1 [00:00<00:00,  2.86it/s]
100%|██████████| 1/1 [00:00<00:00,  3.82it/s]
100%|██████████| 1/1 [00:00<00:00,  3.97it/s]
100%|██████████| 1/1 [00:00<00:00,  1.32it/s]
100%|██████████| 1/1 [00:00<00:00,  2.91it/s]
100%|██████████| 1/1 [00:00<00:00,  3.73it/s]
100%|██████████| 1/1 [00:00<00:00,  3.67it/s]
100%|██████████| 1/1 [00:00<00:00,  3.13it/s]
100%|██████████| 1/1 [00:00<00:00,  3.67it/s]
100%|██████████| 1/1 [00:00<00:00,  4.01it/s]
100%|██████████| 1/1 [00:00<00:00,  4.18it/s]
100%|██████████| 1/1 [00:00<00:00,  3.85it/s]
100%|██████████| 1/1 [00:00<00:00,  3.10it/s]
100%|██████████| 1/1 [00:00<00:00,  2.00it/s]
100%|██████████| 1/1 [00:00<00:00,  3.83it/s]
100%|██████████| 1/1 [00:00<00:00,  3.39it/s]
100%|██████████| 1/1 [00:00<00:00,  4.01it/s]
100%|██████████| 1/1 [00:00<00:00,  2.94it/s]
100%|██████████| 1/1 [00:00<00:00,  3.52it/s]
100%|██████████| 1/1 [00:00<00:00,  3.85it/s]

{'Mary': 4, 'Emma': 17, 'other': 0}





In [37]:
# Two female names, order swapped: Emma is the surface subject, Mary is in the by-phrase.
class_lookups = {"Mary": ["mary"], "Emma": ["emma"]}
metric = StringMetric(class_lookups)
responses_by_verb = {}

replicants = 1
kwargs = {"max_tokens": 2, "temperature": 0.0}

# Last seven verbs crossed with every action (verbs vary slowest).
conditions = [
    (verb, infinitive, past)
    for verb in object_control_verbs[-7:]
    for infinitive, past in actions
]

for verb, infinitive, past in conditions:
    text = str(T5Prompt([f"Emma was {verb} by Mary {infinitive}."], f"Who {past}?"))
    # The metric is threaded through every call so the counts accumulate.
    metric, responses = run_experiment(run_t5_prompt, text, replicants, metric, dict(kwargs))
    responses_by_verb[f"{verb}, {infinitive}"] = responses

counts, classes = metric.get_metric()
print(counts)

100%|██████████| 1/1 [00:00<00:00,  3.32it/s]
100%|██████████| 1/1 [00:00<00:00,  1.30it/s]
100%|██████████| 1/1 [00:01<00:00,  1.43s/it]
100%|██████████| 1/1 [00:00<00:00,  2.46it/s]
100%|██████████| 1/1 [00:00<00:00,  1.94it/s]
100%|██████████| 1/1 [00:00<00:00,  3.33it/s]
100%|██████████| 1/1 [00:00<00:00,  3.75it/s]
100%|██████████| 1/1 [00:00<00:00,  3.62it/s]
100%|██████████| 1/1 [00:00<00:00,  3.08it/s]
100%|██████████| 1/1 [00:00<00:00,  1.98it/s]
100%|██████████| 1/1 [00:00<00:00,  2.83it/s]
100%|██████████| 1/1 [00:00<00:00,  4.02it/s]
100%|██████████| 1/1 [00:00<00:00,  3.05it/s]
100%|██████████| 1/1 [00:00<00:00,  4.24it/s]
100%|██████████| 1/1 [00:00<00:00,  2.44it/s]
100%|██████████| 1/1 [00:00<00:00,  3.85it/s]
100%|██████████| 1/1 [00:00<00:00,  3.57it/s]
100%|██████████| 1/1 [00:00<00:00,  3.82it/s]
100%|██████████| 1/1 [00:00<00:00,  2.37it/s]
100%|██████████| 1/1 [00:00<00:00,  2.47it/s]
100%|██████████| 1/1 [00:00<00:00,  4.20it/s]

{'Mary': 7, 'Emma': 14, 'other': 0}





In [38]:
# Passive frame with gender-neutral placeholders instead of names.
# NOTE(review): the lookup strings are the single letters "a" and "b"; how StringMetric
# matches them against a response is not visible here — confirm they cannot match by accident.
class_lookups = {"Person A": ["a"], "Person B": ["b"]}
metric = StringMetric(class_lookups)
responses_by_verb = {}

replicants = 1
kwargs = {"max_tokens": 2, "temperature": 0.0}

# Last seven verbs crossed with every action (verbs vary slowest).
conditions = [
    (verb, infinitive, past)
    for verb in object_control_verbs[-7:]
    for infinitive, past in actions
]

for verb, infinitive, past in conditions:
    text = str(T5Prompt([f"Person A was {verb} by Person B {infinitive}."], f"Who {past}?"))
    # The metric is threaded through every call so the counts accumulate.
    metric, responses = run_experiment(run_t5_prompt, text, replicants, metric, dict(kwargs))
    responses_by_verb[f"{verb}, {infinitive}"] = responses

counts, classes = metric.get_metric()
print(counts)

100%|██████████| 1/1 [00:00<00:00,  3.53it/s]
100%|██████████| 1/1 [00:00<00:00,  3.69it/s]
100%|██████████| 1/1 [00:00<00:00,  2.15it/s]
100%|██████████| 1/1 [00:00<00:00,  2.85it/s]
100%|██████████| 1/1 [00:00<00:00,  3.16it/s]
100%|██████████| 1/1 [00:00<00:00,  1.03it/s]
100%|██████████| 1/1 [00:00<00:00,  3.52it/s]
100%|██████████| 1/1 [00:00<00:00,  2.35it/s]
100%|██████████| 1/1 [00:00<00:00,  3.26it/s]
100%|██████████| 1/1 [00:00<00:00,  2.86it/s]
100%|██████████| 1/1 [00:00<00:00,  1.60it/s]
100%|██████████| 1/1 [00:00<00:00,  3.55it/s]
100%|██████████| 1/1 [00:00<00:00,  3.35it/s]
100%|██████████| 1/1 [00:00<00:00,  2.28it/s]
100%|██████████| 1/1 [00:00<00:00,  2.25it/s]
100%|██████████| 1/1 [00:00<00:00,  3.73it/s]
100%|██████████| 1/1 [00:00<00:00,  3.23it/s]
100%|██████████| 1/1 [00:00<00:00,  3.45it/s]
100%|██████████| 1/1 [00:00<00:00,  3.65it/s]
100%|██████████| 1/1 [00:00<00:00,  3.62it/s]
100%|██████████| 1/1 [00:00<00:00,  3.27it/s]

{'Person A': 1, 'Person B': 20, 'other': 0}





In [39]:
# Show one line per condition instead of dumping the whole dict repr on a single line,
# which is hard to read and gets truncated in the saved output.
for condition, prompt_answer_pairs in responses_by_verb.items():
    # Each stored response is a (prompt, answer) pair; the prompt is fully determined
    # by the condition key, so only the model's answers are printed.
    answers = [answer for _prompt, answer in prompt_answer_pairs]
    print(f"{condition}: {answers}")

{'urged, to leave': [('question: Who left? context: Person A was urged by Person B to leave. </s>', 'Person A')], 'urged, to call home': [('question: Who called home? context: Person A was urged by Person B to call home. </s>', 'Person B')], 'urged, to reply': [('question: Who replied? context: Person A was urged by Person B to reply. </s>', 'Person B')], 'asked, to leave': [('question: Who left? context: Person A was asked by Person B to leave. </s>', 'Person B')], 'asked, to call home': [('question: Who called home? context: Person A was asked by Person B to call home. </s>', 'Person B')], 'asked, to reply': [('question: Who replied? context: Person A was asked by Person B to reply. </s>', 'Person B')], 'pleaded with, to leave': [('question: Who left? context: Person A was pleaded with by Person B to leave. </s>', 'Person B')], 'pleaded with, to call home': [('question: Who called home? context: Person A was pleaded with by Person B to call home. </s>', 'Person B')], 'pleaded with, t

In [40]:
# Passive frame with common nouns instead of names: the fox is the surface subject.
class_lookups = {"fox": ["fox"], "cat": ["cat"]}
metric = StringMetric(class_lookups)
responses_by_verb = {}

replicants = 1
kwargs = {"max_tokens": 2, "temperature": 0.0}

# Last seven verbs crossed with every action (verbs vary slowest).
conditions = [
    (verb, infinitive, past)
    for verb in object_control_verbs[-7:]
    for infinitive, past in actions
]

for verb, infinitive, past in conditions:
    text = str(T5Prompt([f"The fox was {verb} by the cat {infinitive}."], f"Who {past}?"))
    # The metric is threaded through every call so the counts accumulate.
    metric, responses = run_experiment(run_t5_prompt, text, replicants, metric, dict(kwargs))
    responses_by_verb[f"{verb}, {infinitive}"] = responses

counts, classes = metric.get_metric()
print(counts)

100%|██████████| 1/1 [00:00<00:00,  2.35it/s]
100%|██████████| 1/1 [00:00<00:00,  2.44it/s]
100%|██████████| 1/1 [00:00<00:00,  2.31it/s]
100%|██████████| 1/1 [00:00<00:00,  2.51it/s]
100%|██████████| 1/1 [00:00<00:00,  3.16it/s]
100%|██████████| 1/1 [00:00<00:00,  3.29it/s]
100%|██████████| 1/1 [00:00<00:00,  3.24it/s]
100%|██████████| 1/1 [00:00<00:00,  2.94it/s]
100%|██████████| 1/1 [00:00<00:00,  3.40it/s]
100%|██████████| 1/1 [00:00<00:00,  2.20it/s]
100%|██████████| 1/1 [00:00<00:00,  3.03it/s]
100%|██████████| 1/1 [00:00<00:00,  3.14it/s]
100%|██████████| 1/1 [00:00<00:00,  2.66it/s]
100%|██████████| 1/1 [00:00<00:00,  2.49it/s]
100%|██████████| 1/1 [00:00<00:00,  1.75it/s]
100%|██████████| 1/1 [00:00<00:00,  2.95it/s]
100%|██████████| 1/1 [00:00<00:00,  3.14it/s]
100%|██████████| 1/1 [00:00<00:00,  2.81it/s]
100%|██████████| 1/1 [00:00<00:00,  2.22it/s]
100%|██████████| 1/1 [00:00<00:00,  2.46it/s]
100%|██████████| 1/1 [00:00<00:00,  1.41it/s]

{'fox': 20, 'cat': 1, 'other': 0}





## Gender results
It looks like gender does make a difference. When it's Tom and Mary, the model always picks Tom as the agent. When it's Tom and Bill, the model picks the right one depending on the order.

## Does gender matter for subject control? 

In [21]:
# Single subject-control probe with two male names: Tom is the promiser.
class_lookups = {"Tom": ["tom"], "Bill": ["bill"]}
metric = StringMetric(class_lookups)

replicants = 1
kwargs = {"max_tokens": 2, "temperature": 0.0}

# One fixed context sentence and question; no loop over verbs or actions here.
text = str(T5Prompt(["Tom promised Bill to leave."], "Who left?"))

metric, responses = run_experiment(run_t5_prompt, text, replicants, metric, kwargs)
counts, classes = metric.get_metric()
print(counts)

100%|██████████| 1/1 [00:00<00:00,  3.44it/s]

{'Tom': 0, 'Bill': 1, 'other': 0}





In [24]:
# Same single subject-control probe with the names swapped: Bill is the promiser.
class_lookups = {"Tom": ["tom"], "Bill": ["bill"]}
metric = StringMetric(class_lookups)

replicants = 1
kwargs = {"max_tokens": 2, "temperature": 0.0}

# One fixed context sentence and question; no loop over verbs or actions here.
text = str(T5Prompt(["Bill promised Tom to leave."], "Who left?"))

metric, responses = run_experiment(run_t5_prompt, text, replicants, metric, kwargs)
counts, classes = metric.get_metric()
print(counts)

100%|██████████| 1/1 [00:00<00:00,  7.95it/s]

{'Tom': 1, 'Bill': 0, 'other': 0}





## Conclusions
- Finetuned T5 succeeds on object control and fails on subject control. This would be consistent with the MDP. 
- However, on passives it violates the MDP when one person is male and the other is female, or when both people are female. In that case, it just chooses one name and answers that no matter what. 