In [1]:
import pandas as pd 

from experiment import Experiment
from api_tools import (FixedGPTPrompt, 
                       FixedPassiveGPTPrompt, 
                       FixedT5Prompt, 
                       FixedPassiveT5Prompt, 
                       run_ai21_prompt, 
                       run_gpt_prompt, 
                       run_t5_prompt)

from metrics import accuracy_report


# Subject Control
Starting with the **subject** control experiments, we will look at how 4 LMs do:
1. GPT3
2. T5 for QA 
3. Jurassic large 
4. Jurassic Jumbo

## Experimental settings
We're looking at different name pairs to control for gender. We have 2 male-female name pairs, 1 male-male pair, 1 female-female pair, and 1 neutral-neutral. 
We're still looking at 5 different actions, but only one verb: promise. 

We can later coerce subject control (using gender) on "suggested" and "proposed".

In [2]:
# Experimental grid for the plain subject-control runs.

# Disabled alternative: a larger pool of name pairs.
# names = [("Tom", "Mary"), ("Tom", "Bill"), ("Mary", "Bill"), ("Ellen", "Mary"), ("Morgan", "Jaime"), ("Iago", "Hamlet"), ("Jules", "Yves"), ("Kurt", "Lena")]
names = [
    ("Tom", "Mary"),
    ("Tom", "Bill"),
    ("Mary", "Bill"),
    ("Ellen", "Mary"),
    ("Morgan", "Jaime"),
]

# Disabled alternative: a longer verb list.
# verbs = ["told", "ordered", "called upon", "reminded", "urged", "asked", "persuaded", "convinced", "forced", "pushed"]
verbs = ["promised"]

# Each action is an (infinitive form, past-tense form) pair.
actions = [
    ("to leave", "left"),
    ("to call home", "called home"),
    ("to reply", "replied"),
    ("to wipe the counter", "wiped the counter"),
    ("to dance", "danced"),
]

# Passed as `correct_index` to Experiment.run; presumably the position within
# each name pair of the expected answer -- TODO confirm against Experiment.
correct_index = 0


## GPT 3
For GPT3, inference is not deterministic, so we're running 5 replicates per prompt.

In [None]:

# Live GPT-3 run, left commented out: the following cell loads `gpt_df` from a
# saved CSV rather than from this experiment object.
# NOTE(review): temperature is 0.0 here although the section text describes
# inference as non-deterministic -- confirm which is intended.
# gpt_kwargs = {"max_tokens": 2, "temperature": 0.0}
# gpt_subject_control_experiment  = Experiment("gpt3", "subject-control", FixedGPTPrompt, run_gpt_prompt, 5, gpt_kwargs)

# gpt_subject_control_experiment.run(names, correct_index, verbs, actions)

In [3]:
# Disabled: build the results frame from a live experiment and write it to disk.
# gpt_df = gpt_subject_control_experiment.format_results()

# gpt_df.to_csv("/Users/Elias/child-lm/results/gpt_subject_control_swap_names.csv")

# Read the GPT-3 subject-control results from CSV.
# NOTE(review): absolute, machine-specific path -- consider a configurable results directory.
gpt_results_csv = "/Users/Elias/child-lm/results/gpt_subject_control_swap_names.csv"
gpt_df = pd.read_csv(gpt_results_csv)

In [4]:
# Display the accuracy report for the GPT-3 results: an overall figure plus
# breakdowns by name pair, action, verb, and action/verb combination.
accuracy_report(gpt_df)

{'total': (0.38, 50),
 'acc_by_name': {'Tom,Mary': (0.3, 10),
  'Tom,Bill': (0.5, 10),
  'Mary,Bill': (0.7, 10),
  'Morgan,Jaime': (0.4, 10),
  'Ellen,Mary': (0.0, 10)},
 'acc_by_action': {'to call home': (0.5, 10),
  'to reply': (0.1, 10),
  'to leave': (0.4, 10),
  'to wipe the counter': (0.5, 10),
  'to dance': (0.4, 10)},
 'acc_by_verb': {'promised': (0.38, 50)},
 'acc_by_action_by_verb': {'to call home,promised': (0.5, 10),
  'to reply,promised': (0.1, 10),
  'to leave,promised': (0.4, 10),
  'to wipe the counter,promised': (0.5, 10),
  'to dance,promised': (0.4, 10)}}

## T5 for QA

In [None]:

# Live T5 run, left commented out: the following cell loads `t5_df` from a
# saved CSV rather than from this experiment object.
# Unlike the GPT-3 call, `run` is given do_swap = False here.
# t5_subject_control_experiment  = Experiment("t5", "subject-control", FixedT5Prompt, run_t5_prompt, 1, None)

# t5_subject_control_experiment.run(names, correct_index, verbs, actions, do_swap = False)

In [5]:
# Disabled: build the results frame from a live experiment and write it to disk.
# t5_df = t5_subject_control_experiment.format_results()

# t5_df.to_csv("/Users/Elias/child-lm/results/t5_subject_control.csv")

# Read the T5 subject-control results from CSV.
# NOTE(review): absolute, machine-specific path -- consider a configurable results directory.
t5_results_csv = "/Users/Elias/child-lm/results/t5_subject_control.csv"
t5_df = pd.read_csv(t5_results_csv)

In [6]:
# Display the accuracy report for the T5 results: an overall figure plus
# breakdowns by name pair, action, verb, and action/verb combination.
accuracy_report(t5_df)

{'total': (0.08, 25),
 'acc_by_name': {'Tom,Mary': (0.0, 5),
  'Tom,Bill': (0.0, 5),
  'Mary,Bill': (0.0, 5),
  'Morgan,Jaime': (0.4, 5),
  'Ellen,Mary': (0.0, 5)},
 'acc_by_action': {'to call home': (0.2, 5),
  'to reply': (0.0, 5),
  'to leave': (0.0, 5),
  'to wipe the counter': (0.2, 5),
  'to dance': (0.0, 5)},
 'acc_by_verb': {'promised': (0.08, 25)},
 'acc_by_action_by_verb': {'to call home,promised': (0.2, 5),
  'to reply,promised': (0.0, 5),
  'to leave,promised': (0.0, 5),
  'to wipe the counter,promised': (0.2, 5),
  'to dance,promised': (0.0, 5)}}

## Jurassic Large

In [7]:

# Live Jurassic run, left commented out: the following cell loads
# `jurassic_df` from a saved CSV rather than from this experiment object.
# The prompt class is FixedGPTPrompt, the same one used for the GPT-3 run.
# jurassic_kwargs = {"maxTokens": 2, "temperature": 0.0}
# jurassic_subject_control_experiment  = Experiment("jurassic-large", "subject-control", FixedGPTPrompt, run_ai21_prompt, 1, jurassic_kwargs)

# jurassic_subject_control_experiment.run(names, correct_index, verbs, actions)

In [8]:
# Disabled: build the results frame from a live experiment and write it to disk.
# jurassic_df = jurassic_subject_control_experiment.format_results()

# jurassic_df.to_csv("/Users/Elias/child-lm/results/jurassic_subject_control_swap_names.csv")

# Read the Jurassic subject-control results from CSV.
# NOTE(review): absolute, machine-specific path -- consider a configurable results directory.
jurassic_results_csv = "/Users/Elias/child-lm/results/jurassic_subject_control_swap_names.csv"
jurassic_df = pd.read_csv(jurassic_results_csv)

In [9]:
# Display the accuracy report for the Jurassic results: an overall figure plus
# breakdowns by name pair, action, verb, and action/verb combination.
accuracy_report(jurassic_df)

{'total': (0.84, 50),
 'acc_by_name': {'Tom,Mary': (0.9, 10),
  'Tom,Bill': (0.9, 10),
  'Mary,Bill': (0.5, 10),
  'Morgan,Jaime': (1.0, 10),
  'Ellen,Mary': (0.9, 10)},
 'acc_by_action': {'to call home': (0.9, 10),
  'to reply': (0.9, 10),
  'to leave': (0.7, 10),
  'to wipe the counter': (0.8, 10),
  'to dance': (0.9, 10)},
 'acc_by_verb': {'promised': (0.84, 50)},
 'acc_by_action_by_verb': {'to call home,promised': (0.9, 10),
  'to reply,promised': (0.9, 10),
  'to leave,promised': (0.7, 10),
  'to wipe the counter,promised': (0.8, 10),
  'to dance,promised': (0.9, 10)}}


## Coerced examples with gender

By using gendered names and pronouns, we can coerce subject or object control from "suggested", "offered", and "proposed", e.g. 

- Mary proposed to Tom to be his editor
- Tom suggested to Mary to be her editor 
- Mary offered to Tom to be his editor 


In [10]:
# Experimental grid for the gender-coerced runs.
# NOTE: this rebinds `verbs`, `actions`, and `correct_index` from the earlier setup cell.
verbs = [
    "promised",
    "offered",
    "suggested",
    "proposed",
]

# Name pairs; every first element here is one of Tom / Bill / James and every
# second element is one of Mary / Sally.
his_names = [
    ("Tom", "Mary"),
    ("Bill", "Mary"),
    ("James", "Mary"),
    ("Tom", "Sally"),
    ("Bill", "Sally"),
    ("James", "Sally"),
]

# A single action, given as a pair of phrasings.
actions = [
    ("to be her editor", "was the editor"),
]

# Passed as `correct_index` to Experiment.run; presumably the position within
# each name pair of the expected answer -- TODO confirm against Experiment.
correct_index = 0

## GPT 3

In [None]:

# Live GPT-3 run over the gendered name pairs, left commented out: the
# following cell loads `gendered_gpt_df` from a saved CSV instead.
# gpt_kwargs = {"max_tokens": 2, "temperature": 0.0}
# gendered_gpt_subject_control_experiment  = Experiment("gpt3", "subject-control", FixedGPTPrompt, run_gpt_prompt, 5, gpt_kwargs)
# gendered_gpt_subject_control_experiment.run(his_names, correct_index, verbs, actions)


In [11]:
# Disabled: build the results frame from a live experiment and write it to disk.
# gendered_gpt_df = gendered_gpt_subject_control_experiment.format_results()

# gendered_gpt_df.to_csv("/Users/Elias/child-lm/results/gpt_gendered_subject_control_swap_names.csv")

# Read the gendered GPT-3 subject-control results from CSV.
# NOTE(review): absolute, machine-specific path -- consider a configurable results directory.
gendered_gpt_results_csv = "/Users/Elias/child-lm/results/gpt_gendered_subject_control_swap_names.csv"
gendered_gpt_df = pd.read_csv(gendered_gpt_results_csv)


In [12]:
# Display the accuracy report for the gendered GPT-3 results: an overall figure
# plus breakdowns by name pair, action, verb, and action/verb combination.
accuracy_report(gendered_gpt_df)

{'total': (0.3125, 48),
 'acc_by_name': {'James,Sally': (0.5, 8),
  'James,Mary': (0.0, 8),
  'Tom,Sally': (0.625, 8),
  'Tom,Mary': (0.125, 8),
  'Bill,Sally': (0.625, 8),
  'Bill,Mary': (0.0, 8)},
 'acc_by_action': {'to be her editor': (0.3125, 48)},
 'acc_by_verb': {'promised': (0.4166666666666667, 12),
  'suggested': (0.25, 12),
  'proposed': (0.3333333333333333, 12),
  'offered': (0.25, 12)},
 'acc_by_action_by_verb': {'to be her editor,promised': (0.4166666666666667,
   12),
  'to be her editor,suggested': (0.25, 12),
  'to be her editor,proposed': (0.3333333333333333, 12),
  'to be her editor,offered': (0.25, 12)}}

## T5 for QA 

In [13]:

# Disabled: run T5 over the gendered name pairs, format the results, and write them to disk.
# gendered_t5_subject_control_experiment  = Experiment("t5", "subject-control", FixedT5Prompt, run_t5_prompt, 1, None)
# gendered_t5_subject_control_experiment.run(his_names, correct_index, verbs, actions)
# gendered_t5_df = gendered_t5_subject_control_experiment.format_results()
# gendered_t5_df.to_csv("/Users/Elias/child-lm/results/t5_gendered_subject_control_swap_names.csv")

# Read the gendered T5 subject-control results from CSV.
# NOTE(review): absolute, machine-specific path -- consider a configurable results directory.
gendered_t5_results_csv = "/Users/Elias/child-lm/results/t5_gendered_subject_control_swap_names.csv"
gendered_t5_df = pd.read_csv(gendered_t5_results_csv)


In [14]:
# Display the accuracy report for the gendered T5 results: an overall figure
# plus breakdowns by name pair, action, verb, and action/verb combination.
accuracy_report(gendered_t5_df)

{'total': (0.0, 48),
 'acc_by_name': {'James,Sally': (0.0, 8),
  'James,Mary': (0.0, 8),
  'Tom,Sally': (0.0, 8),
  'Tom,Mary': (0.0, 8),
  'Bill,Sally': (0.0, 8),
  'Bill,Mary': (0.0, 8)},
 'acc_by_action': {'to be her editor': (0.0, 48)},
 'acc_by_verb': {'promised': (0.0, 12),
  'suggested': (0.0, 12),
  'proposed': (0.0, 12),
  'offered': (0.0, 12)},
 'acc_by_action_by_verb': {'to be her editor,promised': (0.0, 12),
  'to be her editor,suggested': (0.0, 12),
  'to be her editor,proposed': (0.0, 12),
  'to be her editor,offered': (0.0, 12)}}

## Jurassic Large

In [15]:
# Disabled: run Jurassic over the gendered name pairs, format the results, and write them to disk.
# NOTE(review): the experiment is named "jurassic" here but "jurassic-large" in
# the earlier (non-gendered) Jurassic cell -- confirm whether that matters.
# jurassic_kwargs = {"maxTokens": 2, "temperature": 0.0}
# gendered_jurassic_subject_control_experiment  = Experiment("jurassic", "subject-control", FixedGPTPrompt, run_ai21_prompt, 1, jurassic_kwargs)
# gendered_jurassic_subject_control_experiment.run(his_names, correct_index, verbs, actions)
# gendered_jurassic_df = gendered_jurassic_subject_control_experiment.format_results()
# gendered_jurassic_df.to_csv("/Users/Elias/child-lm/results/jurassic_gendered_subject_control_swap_names.csv")

# Read the gendered Jurassic subject-control results from CSV.
# NOTE(review): absolute, machine-specific path -- consider a configurable results directory.
gendered_jurassic_results_csv = "/Users/Elias/child-lm/results/jurassic_gendered_subject_control_swap_names.csv"
gendered_jurassic_df = pd.read_csv(gendered_jurassic_results_csv)



In [16]:

# Display the accuracy report for the gendered Jurassic results: an overall
# figure plus breakdowns by name pair, action, verb, and action/verb combination.
accuracy_report(gendered_jurassic_df)

{'total': (0.6875, 48),
 'acc_by_name': {'James,Sally': (1.0, 8),
  'James,Mary': (0.625, 8),
  'Tom,Sally': (0.875, 8),
  'Tom,Mary': (0.625, 8),
  'Bill,Sally': (0.5, 8),
  'Bill,Mary': (0.5, 8)},
 'acc_by_action': {'to be her editor': (0.6875, 48)},
 'acc_by_verb': {'promised': (0.8333333333333334, 12),
  'suggested': (0.5833333333333334, 12),
  'proposed': (0.6666666666666666, 12),
  'offered': (0.6666666666666666, 12)},
 'acc_by_action_by_verb': {'to be her editor,promised': (0.8333333333333334,
   12),
  'to be her editor,suggested': (0.5833333333333334, 12),
  'to be her editor,proposed': (0.6666666666666666, 12),
  'to be her editor,offered': (0.6666666666666666, 12)}}

# Passives 

Do passives make sense here? To me,
- Mary was promised by Tom to leave

does not sound acceptable; or, if it is acceptable, Mary is the one leaving, unlike in "Tom promised Mary to leave".