In [1]:
import pandas as pd 

from experiment import Experiment
from api_tools import (FixedGPTPrompt, 
                       FixedPassiveGPTPrompt, 
                       FixedT5Prompt, 
                       FixedPassiveT5Prompt, 
                       run_ai21_prompt, 
                       run_gpt_prompt, 
                       run_t5_prompt)

from metrics import accuracy_report


# Subject Control
Starting with the **subject** control experiments, we will look at how 4 LMs do:
1. GPT3
2. T5 for QA 
3. Jurassic large 
4. Jurassic Jumbo

## Experimental settings
We're looking at different name pairs to control for gender. We have 2 male-female name pairs, 1 male-male pair, 1 female-female pair, and 1 neutral-neutral. 
We're still looking at 5 different actions, but only one verb: promise. 

We can later coerce (with gender) subject control on "suggested" and "proposed".

In [2]:
# Name pairs used to control for gender (mixed, same-gender, and neutral pairs).
# A larger candidate set, currently disabled:
# names = [("Tom", "Mary"), ("Tom", "Bill"), ("Mary", "Bill"), ("Ellen", "Mary"), ("Morgan", "Jaime"), ("Iago", "Hamlet"), ("Jules", "Yves"), ("Kurt", "Lena")]
names = [
    ("Tom", "Mary"),
    ("Tom", "Bill"),
    ("Mary", "Bill"),
    ("Ellen", "Mary"),
    ("Morgan", "Jaime"),
]

# Only a single verb is tested here; a different, larger verb list is kept disabled below.
# verbs = ["told", "ordered", "called upon", "reminded", "urged", "asked", "persuaded", "convinced", "forced", "pushed"]
verbs = ["promised"]

# Each action is an (infinitive form, past-tense form) pair.
actions = [
    ("to leave", "left"),
    ("to call home", "called home"),
    ("to reply", "replied"),
    ("to wipe the counter", "wiped the counter"),
    ("to dance", "danced"),
]

# NOTE(review): presumably the position within each name pair of the expected
# answer (0 = first name) -- confirm against Experiment.run.
correct_index = 0


## GPT 3
For GPT3, inference is not deterministic, so we're running 5 replicates per prompt.

In [None]:

# Disabled: presumably the live GPT-3 run whose saved results are loaded from CSV
# in the next cell. Uncomment to re-run the experiment.
# gpt_kwargs = {"max_tokens": 2, "temperature": 0.0}
# gpt_subject_control_experiment  = Experiment("gpt3", "subject-control", FixedGPTPrompt, run_gpt_prompt, 5, gpt_kwargs)

# gpt_subject_control_experiment.run(names, correct_index, verbs, actions)

In [None]:
# Disabled: formats the results of a fresh run and writes them to the CSV below.
# gpt_df = gpt_subject_control_experiment.format_results()

# gpt_df.to_csv("/Users/Elias/child-lm/results/gpt_subject_control_swap_names.csv")

# Load the previously saved GPT-3 subject-control results.
gpt_results_csv = "/Users/Elias/child-lm/results/gpt_subject_control_swap_names.csv"
gpt_df = pd.read_csv(gpt_results_csv)

In [None]:
# Split the GPT-3 results into rows where the 'pred' column equals the 'true'
# column and rows where it does not.
correct = gpt_df[gpt_df['true'] == gpt_df['pred']]
incorrect = gpt_df[gpt_df['true'] != gpt_df['pred']]

# Share of matching rows, as a percentage with two decimals.
print(f"accuracy: {(len(correct)/ len(gpt_df)) * 100:.2f}")

## T5 for QA

In [None]:

# Disabled: presumably the live T5 run whose saved results are loaded from CSV
# in the next cell. Note that this run passes do_swap = False.
# t5_subject_control_experiment  = Experiment("t5", "subject-control", FixedT5Prompt, run_t5_prompt, 1, None)

# t5_subject_control_experiment.run(names, correct_index, verbs, actions, do_swap = False)

In [None]:
# Disabled: formats the results of a fresh run and writes them to the CSV below.
# t5_df = t5_subject_control_experiment.format_results()

# t5_df.to_csv("/Users/Elias/child-lm/results/t5_subject_control.csv")

# Load the previously saved T5 subject-control results.
t5_results_csv = "/Users/Elias/child-lm/results/t5_subject_control.csv"
t5_df = pd.read_csv(t5_results_csv)

In [None]:
# Split the T5 results into rows where the 'pred' column equals the 'true'
# column and rows where it does not.
correct = t5_df[t5_df['true'] == t5_df['pred']]
incorrect = t5_df[t5_df['true'] != t5_df['pred']]

# Share of matching rows, as a percentage with two decimals.
print(f"accuracy: {(len(correct)/ len(t5_df)) * 100:.2f}")

## Jurassic Large

In [None]:

# Disabled: presumably the live Jurassic Large run whose saved results are loaded
# from CSV in the next cell. Uncomment to re-run the experiment.
# jurassic_kwargs = {"maxTokens": 2, "temperature": 0.0}
# jurassic_subject_control_experiment  = Experiment("jurassic-large", "subject-control", FixedGPTPrompt, run_ai21_prompt, 1, jurassic_kwargs)

# jurassic_subject_control_experiment.run(names, correct_index, verbs, actions)

In [None]:
# Disabled: formats the results of a fresh run and writes them to the CSV below.
# jurassic_df = jurassic_subject_control_experiment.format_results()

# jurassic_df.to_csv("/Users/Elias/child-lm/results/jurassic_subject_control_swap_names.csv")

# Load the previously saved Jurassic subject-control results.
jurassic_results_csv = "/Users/Elias/child-lm/results/jurassic_subject_control_swap_names.csv"
jurassic_df = pd.read_csv(jurassic_results_csv)

In [None]:
# Split the Jurassic results into rows where the 'pred' column equals the 'true'
# column and rows where it does not.
correct = jurassic_df[jurassic_df['true'] == jurassic_df['pred']]
incorrect = jurassic_df[jurassic_df['true'] != jurassic_df['pred']]

# Share of matching rows, as a percentage with two decimals.
print(f"accuracy: {(len(correct)/ len(jurassic_df)) * 100:.2f}")


## Coerced examples with gender

By using gendered names and pronouns, we can coerce subject or object control from "suggested", "offered", and "proposed", e.g. 

- Mary proposed to Tom to be his editor
- Tom suggested to Mary to be her editor 
- Mary offered to Tom to be his editor 


In [None]:
# Verbs for the gender-coerced experiments.
verbs = [
    "promised",
    "offered",
    "suggested",
    "proposed",
]

# Every pairing of a first name from (Tom, Bill, James) with a second name from
# (Mary, Sally), grouped by the second name.
his_names = [
    (first, second)
    for second in ("Mary", "Sally")
    for first in ("Tom", "Bill", "James")
]

# A single action, given as an (infinitive form, past-tense form) pair.
actions = [
    ("to be her editor", "was the editor"),
]

# NOTE(review): presumably the position within each name pair of the expected
# answer (0 = first name) -- confirm against Experiment.run.
correct_index = 0

## GPT 3

In [None]:

# Disabled: presumably the live gendered GPT-3 run whose saved results are loaded
# from CSV in the next cell. Uncomment to re-run the experiment.
# gpt_kwargs = {"max_tokens": 2, "temperature": 0.0}
# gendered_gpt_subject_control_experiment  = Experiment("gpt3", "subject-control", FixedGPTPrompt, run_gpt_prompt, 5, gpt_kwargs)
# gendered_gpt_subject_control_experiment.run(his_names, correct_index, verbs, actions)


In [None]:
# Disabled: formats the results of a fresh run and writes them to the CSV below.
# gendered_gpt_df = gendered_gpt_subject_control_experiment.format_results()

# gendered_gpt_df.to_csv("/Users/Elias/child-lm/results/gpt_gendered_subject_control_swap_names.csv")

# Load the previously saved gendered GPT-3 subject-control results.
gendered_gpt_results_csv = "/Users/Elias/child-lm/results/gpt_gendered_subject_control_swap_names.csv"
gendered_gpt_df = pd.read_csv(gendered_gpt_results_csv)


In [None]:
# Rows of the gendered GPT-3 results where the 'pred' column equals the 'true' column.
correct = gendered_gpt_df[gendered_gpt_df['true'] == gendered_gpt_df['pred']]
# Disabled counterpart (the mismatching rows):
# incorrect = gendered_gpt_df[gendered_gpt_df['true'] != gendered_gpt_df['pred']]

# Share of matching rows, as a percentage with two decimals.
print(f"accuracy: {(len(correct)/ len(gendered_gpt_df)) * 100:.2f}")

## T5 for QA 

In [None]:

# Build and run the gendered subject-control experiment with T5.
# NOTE(review): the 5th argument is presumably the number of runs per prompt and
# the 6th the model kwargs -- confirm against Experiment.
gendered_t5_subject_control_experiment = Experiment(
    "t5",
    "subject-control",
    FixedT5Prompt,
    run_t5_prompt,
    1,
    None,
)
gendered_t5_subject_control_experiment.run(his_names, correct_index, verbs, actions)

# Tabulate the results and save them as CSV.
gendered_t5_df = gendered_t5_subject_control_experiment.format_results()
gendered_t5_df.to_csv("/Users/Elias/child-lm/results/t5_gendered_subject_control_swap_names.csv")


In [None]:
# Rows of the gendered T5 results where the 'pred' column equals the 'true' column.
correct = gendered_t5_df[gendered_t5_df['true'] == gendered_t5_df['pred']]
# Disabled counterpart (the mismatching rows):
# incorrect = gendered_t5_df[gendered_t5_df['true'] != gendered_t5_df['pred']]

# Share of matching rows, as a percentage with two decimals.
print(f"accuracy: {(len(correct)/ len(gendered_t5_df)) * 100:.2f}")

## Jurassic Large

In [None]:
# Disabled: presumably the live gendered Jurassic run that produced the CSV loaded below.
# jurassic_kwargs = {"maxTokens": 2, "temperature": 0.0}
# gendered_jurassic_subject_control_experiment  = Experiment("jurassic", "subject-control", FixedGPTPrompt, run_ai21_prompt, 1, jurassic_kwargs)
# gendered_jurassic_subject_control_experiment.run(his_names, correct_index, verbs, actions)
# gendered_jurassic_df = gendered_jurassic_subject_control_experiment.format_results()
# gendered_jurassic_df.to_csv("/Users/Elias/child-lm/results/jurassic_gendered_subject_control_swap_names.csv")

# Load the previously saved gendered Jurassic subject-control results.
gendered_jurassic_results_csv = "/Users/Elias/child-lm/results/jurassic_gendered_subject_control_swap_names.csv"
gendered_jurassic_df = pd.read_csv(gendered_jurassic_results_csv)



In [None]:
# Rows of the gendered Jurassic results where the 'pred' column equals the 'true' column.
correct = gendered_jurassic_df[gendered_jurassic_df['true'] == gendered_jurassic_df['pred']]
# Disabled counterpart (the mismatching rows):
# incorrect = gendered_jurassic_df[gendered_jurassic_df['true'] != gendered_jurassic_df['pred']]

# Share of matching rows, as a percentage with two decimals.
print(f"accuracy: {(len(correct)/ len(gendered_jurassic_df)) * 100:.2f}")

In [None]:
# accuracy_report is already imported in the notebook's first cell, so it is not
# re-imported here (keeps all imports in one place).
# Show the accuracy report for the gendered Jurassic results as the cell output.
accuracy_report(gendered_jurassic_df)

# Passives 

Do passives make sense here? To me,
- Mary was promised by Tom to leave

does not sound acceptable, or if it is acceptable, Mary is the one leaving, unlike in "Tom promised Mary to leave".