In [1]:
import pandas as pd 

from experiment import Experiment
from api_tools import (FixedGPTPrompt, 
                       FixedPassiveGPTPrompt, 
                       FixedT5Prompt, 
                       FixedPassiveT5Prompt, 
                       run_ai21_prompt, 
                       run_gpt_prompt, 
                       run_t5_prompt)

from metrics import accuracy_report


# Object Control
Starting with the object control experiments, we will look at how 4 LMs do:
1. GPT3
2. T5 for QA 
3. Jurassic large 
4. Jurassic Jumbo

## Experimental settings
We're looking at different name pairs to control for gender. We have 2 male-female name pairs, 1 male-male pair, 1 female-female pair, and 1 neutral-neutral. 
We're also looking at 10 object control verbs and 5 different actions. 

In [2]:
# Extended name set, currently disabled:
# names = [("Tom", "Mary"), ("Tom", "Bill"), ("Mary", "Bill"), ("Ellen", "Mary"), ("Morgan", "Jaime"), ("Iago", "Hamlet"), ("Jules", "Yves"), ("Kurt", "Lena")]

# Name pairs shared by every experiment in this notebook.
names = [
    ("Tom", "Mary"),
    ("Tom", "Bill"),
    ("Mary", "Bill"),
    ("Ellen", "Mary"),
    ("Morgan", "Jaime"),
]

# The 10 object-control verbs.
verbs = [
    "told",
    "ordered",
    "called upon",
    "reminded",
    "urged",
    "asked",
    "persuaded",
    "convinced",
    "forced",
    "pushed",
]

# The 5 actions, each given as an (infinitive, past tense) pair.
actions = [
    ("to leave", "left"),
    ("to call home", "called home"),
    ("to reply", "replied"),
    ("to wipe the counter", "wiped the counter"),
    ("to dance", "danced"),
]

# Passed to Experiment.run as the expected-answer index.
# NOTE(review): presumably an index into each name pair -- confirm against Experiment.run.
correct_index = 1


## GPT 3
For GPT3, inference is not deterministic, so we're running 5 replicates per prompt.

In [10]:

# GPT-3, active object control.
# The experiment run is commented out below; this cell only loads the results
# CSV that the commented-out to_csv call targets. Uncomment the block to
# regenerate that file.
# NOTE(review): the path is absolute and machine-specific.
# gpt_kwargs = {"max_tokens": 2, "temperature": 0.0}
# gpt_object_control_experiment  = Experiment("gpt3", "object-control", FixedGPTPrompt, run_gpt_prompt, 5, gpt_kwargs)

# gpt_object_control_experiment.run(names, correct_index, verbs, actions)

# gpt_df = gpt_object_control_experiment.format_results()

# gpt_df.to_csv("/Users/Elias/child-lm/results/gpt_object_control_swap_names.csv")

gpt_df = pd.read_csv("/Users/Elias/child-lm/results/gpt_object_control_swap_names.csv")

In [11]:
# Accuracy breakdown for the GPT-3 object-control results currently held in gpt_df.
accuracy_report(gpt_df)

{'total': (0.562, 500),
 'acc_by_name': {'Ellen,Mary': (0.93, 100),
  'Morgan,Jaime': (0.0, 100),
  'Tom,Bill': (0.65, 100),
  'Tom,Mary': (0.81, 100),
  'Mary,Bill': (0.42, 100)},
 'acc_by_action': {'to call home': (0.56, 100),
  'to reply': (0.43, 100),
  'to dance': (0.59, 100),
  'to wipe the counter': (0.55, 100),
  'to leave': (0.68, 100)},
 'acc_by_verb': {'asked': (0.52, 50),
  'pushed': (0.56, 50),
  'ordered': (0.48, 50),
  'reminded': (0.62, 50),
  'told': (0.64, 50),
  'persuaded': (0.54, 50),
  'urged': (0.66, 50),
  'forced': (0.6, 50),
  'called upon': (0.42, 50),
  'convinced': (0.58, 50)},
 'acc_by_action_by_verb': {'to call home,asked': (0.5, 10),
  'to call home,pushed': (0.6, 10),
  'to call home,ordered': (0.5, 10),
  'to call home,reminded': (0.6, 10),
  'to call home,told': (0.6, 10),
  'to call home,persuaded': (0.5, 10),
  'to call home,urged': (0.7, 10),
  'to call home,forced': (0.6, 10),
  'to call home,called upon': (0.5, 10),
  'to call home,convinced': (0

In [8]:

# GPT-3, active object control, run with qa_pair=("Q", "A").
# This cell calls the model (run_gpt_prompt) every time it is executed.
gpt_kwargs = dict(max_tokens=2, temperature=0.0)

# NOTE(review): the positional 5 is presumably the number of replicates per
# prompt mentioned in the GPT 3 section -- confirm against Experiment's signature.
gpt_object_control_experiment = Experiment(
    "gpt3",
    "object-control",
    FixedGPTPrompt,
    run_gpt_prompt,
    5,
    gpt_kwargs,
)
gpt_object_control_experiment.run(
    names,
    correct_index,
    verbs,
    actions,
    qa_pair=("Q", "A"),
)

# Format the results, save them, then rebind gpt_df to the frame re-read from disk.
gpt_df = gpt_object_control_experiment.format_results()
gpt_df.to_csv("/Users/Elias/child-lm/results/gpt_object_control_qa_swap_names.csv")
gpt_df = pd.read_csv("/Users/Elias/child-lm/results/gpt_object_control_qa_swap_names.csv")

100%|██████████| 5/5 [09:38<00:00, 115.63s/it]


In [9]:
# Accuracy breakdown for the GPT-3 run that used qa_pair=("Q", "A");
# gpt_df here is the frame re-read from the qa CSV in the previous cell.
accuracy_report(gpt_df)

{'total': (0.504, 500),
 'acc_by_name': {'Ellen,Mary': (0.88, 100),
  'Morgan,Jaime': (0.0, 100),
  'Tom,Bill': (0.71, 100),
  'Tom,Mary': (0.46, 100),
  'Mary,Bill': (0.47, 100)},
 'acc_by_action': {'to call home': (0.53, 100),
  'to reply': (0.33, 100),
  'to dance': (0.52, 100),
  'to wipe the counter': (0.49, 100),
  'to leave': (0.65, 100)},
 'acc_by_verb': {'asked': (0.44, 50),
  'pushed': (0.6, 50),
  'ordered': (0.46, 50),
  'reminded': (0.56, 50),
  'told': (0.52, 50),
  'persuaded': (0.5, 50),
  'urged': (0.54, 50),
  'forced': (0.54, 50),
  'called upon': (0.36, 50),
  'convinced': (0.52, 50)},
 'acc_by_action_by_verb': {'to call home,asked': (0.5, 10),
  'to call home,pushed': (0.6, 10),
  'to call home,ordered': (0.4, 10),
  'to call home,reminded': (0.5, 10),
  'to call home,told': (0.6, 10),
  'to call home,persuaded': (0.5, 10),
  'to call home,urged': (0.6, 10),
  'to call home,forced': (0.6, 10),
  'to call home,called upon': (0.5, 10),
  'to call home,convinced': (0.

## T5 for QA

In [None]:

# t5_object_control_experiment  = Experiment("t5", "object-control", FixedT5Prompt, run_t5_prompt, 1, None)

# t5_object_control_experiment.run(names, correct_index, verbs, actions, do_swap = False)

In [None]:
# T5, active object control.
# Formatting and saving are commented out; this cell only loads the results
# CSV that the commented-out to_csv call targets.
# NOTE(review): the path is absolute and machine-specific.
# t5_df = t5_object_control_experiment.format_results()

# t5_df.to_csv("/Users/Elias/child-lm/results/t5_object_control.csv")

t5_df = pd.read_csv("/Users/Elias/child-lm/results/t5_object_control.csv")

In [None]:
# Exact-match accuracy for T5: a row is correct when its 'pred' value equals
# its 'true' value. Both subsets are kept for later inspection.
correct = t5_df[t5_df['true'] == t5_df['pred']]
incorrect = t5_df[t5_df['true'] != t5_df['pred']]

# Printed as a percentage with two decimals (label typo "accuraccy" fixed).
print(f"accuracy: {(len(correct)/ len(t5_df)) * 100:.2f}")

## Jurassic Large

In [14]:

# Jurassic-large, active object control.
# The experiment run is commented out below; this cell only loads the results
# CSV that the commented-out to_csv call targets. Uncomment the block to
# regenerate that file.
# NOTE(review): the path is absolute and machine-specific.
# jurassic_kwargs = {"maxTokens": 2, "temperature": 0.0}
# jurassic_object_control_experiment  = Experiment("jurassic-large", "object-control", FixedGPTPrompt, run_ai21_prompt, 1, jurassic_kwargs)

# jurassic_object_control_experiment.run(names, correct_index, verbs, actions)

# jurassic_df = jurassic_object_control_experiment.format_results()

# jurassic_df.to_csv("/Users/Elias/child-lm/results/jurassic_object_control_swap_names.csv")

jurassic_df = pd.read_csv("/Users/Elias/child-lm/results/jurassic_object_control_swap_names.csv")

In [15]:
# Accuracy breakdown for the Jurassic-large object-control results currently held in jurassic_df.
accuracy_report(jurassic_df)

{'total': (0.26, 500),
 'acc_by_name': {'Ellen,Mary': (0.26, 100),
  'Morgan,Jaime': (0.0, 100),
  'Tom,Bill': (0.2, 100),
  'Tom,Mary': (0.35, 100),
  'Mary,Bill': (0.49, 100)},
 'acc_by_action': {'to call home': (0.18, 100),
  'to reply': (0.27, 100),
  'to dance': (0.2, 100),
  'to wipe the counter': (0.34, 100),
  'to leave': (0.31, 100)},
 'acc_by_verb': {'asked': (0.36, 50),
  'pushed': (0.26, 50),
  'ordered': (0.24, 50),
  'reminded': (0.18, 50),
  'told': (0.26, 50),
  'persuaded': (0.18, 50),
  'urged': (0.28, 50),
  'forced': (0.28, 50),
  'called upon': (0.28, 50),
  'convinced': (0.28, 50)},
 'acc_by_action_by_verb': {'to call home,asked': (0.3, 10),
  'to call home,pushed': (0.1, 10),
  'to call home,ordered': (0.2, 10),
  'to call home,reminded': (0.1, 10),
  'to call home,told': (0.2, 10),
  'to call home,persuaded': (0.1, 10),
  'to call home,urged': (0.2, 10),
  'to call home,forced': (0.3, 10),
  'to call home,called upon': (0.1, 10),
  'to call home,convinced': (0.2

In [12]:

# Jurassic-large, active object control, run with qa_pair=("Q", "A").
# This cell calls the model (run_ai21_prompt) every time it is executed.
jurassic_kwargs = dict(maxTokens=2, temperature=0.0)

# NOTE(review): the positional 1 is presumably the number of replicates per
# prompt -- confirm against Experiment's signature.
jurassic_object_control_experiment = Experiment(
    "jurassic-large",
    "object-control",
    FixedGPTPrompt,
    run_ai21_prompt,
    1,
    jurassic_kwargs,
)
jurassic_object_control_experiment.run(
    names,
    correct_index,
    verbs,
    actions,
    qa_pair=("Q", "A"),
)

# Format the results and save them; jurassic_df stays bound to the in-memory
# frame because the read-back below is commented out.
jurassic_df = jurassic_object_control_experiment.format_results()
jurassic_df.to_csv("/Users/Elias/child-lm/results/jurassic_object_control_qa_swap_names.csv")

# jurassic_df = pd.read_csv("/Users/Elias/child-lm/results/jurassic_object_control_qa_swap_names.csv")

100%|██████████| 5/5 [09:37<00:00, 115.49s/it]


In [13]:
# Accuracy breakdown for the Jurassic-large run that used qa_pair=("Q", "A").
accuracy_report(jurassic_df)

{'total': (0.328, 500),
 'acc_by_name': {'Ellen,Mary': (0.49, 100),
  'Morgan,Jaime': (0.0, 100),
  'Tom,Bill': (0.3, 100),
  'Tom,Mary': (0.36, 100),
  'Mary,Bill': (0.49, 100)},
 'acc_by_action': {'to call home': (0.29, 100),
  'to reply': (0.33, 100),
  'to dance': (0.29, 100),
  'to wipe the counter': (0.38, 100),
  'to leave': (0.35, 100)},
 'acc_by_verb': {'asked': (0.34, 50),
  'pushed': (0.34, 50),
  'ordered': (0.3, 50),
  'reminded': (0.26, 50),
  'told': (0.3, 50),
  'persuaded': (0.3, 50),
  'urged': (0.36, 50),
  'forced': (0.32, 50),
  'called upon': (0.38, 50),
  'convinced': (0.38, 50)},
 'acc_by_action_by_verb': {'to call home,asked': (0.4, 10),
  'to call home,pushed': (0.3, 10),
  'to call home,ordered': (0.3, 10),
  'to call home,reminded': (0.2, 10),
  'to call home,told': (0.3, 10),
  'to call home,persuaded': (0.2, 10),
  'to call home,urged': (0.3, 10),
  'to call home,forced': (0.3, 10),
  'to call home,called upon': (0.3, 10),
  'to call home,convinced': (0.3,

# Passives 

The passive form reverses the linear order of the agent and patient and also changes their syntactic roles. For example:

- Mary told Tom to wipe the counter 
- Tom was told by Mary to wipe the counter 

Now Tom is linearly further away from "wipe" and is also the grammatical subject of the matrix clause (though he is still the patient).

In [None]:
# Expected-answer index for the passive experiments.
# NOTE(review): presumably flipped from 1 to 0 because the passive prompts
# reverse the order of the two names -- confirm against the passive prompt classes.
# This rebinds the notebook-wide correct_index that the active experiments
# above set to 1; re-run the parameter cell before re-running those cells.
correct_index = 0 

## GPT3 

In [None]:

# GPT-3, passive object control.
# The experiment run is commented out below; this cell only loads the results
# CSV that the commented-out to_csv call targets. Uncomment the block to
# regenerate that file.
# NOTE(review): the path is absolute and machine-specific.
# gpt_kwargs = {"max_tokens": 2, "temperature": 0.0}
# passive_gpt_object_control_experiment  = Experiment("gpt3", "object-control-passive", FixedPassiveGPTPrompt, run_gpt_prompt, 5, gpt_kwargs)

# passive_gpt_object_control_experiment.run(names, correct_index, verbs, actions)

# passive_gpt_df = passive_gpt_object_control_experiment.format_results()

# passive_gpt_df.to_csv("/Users/Elias/child-lm/results/gpt_passive_object_control_swap_names.csv")
passive_gpt_df = pd.read_csv("/Users/Elias/child-lm/results/gpt_passive_object_control_swap_names.csv")



In [None]:
# Accuracy report for the passive GPT-3 results loaded into passive_gpt_df.
accuracy_report(passive_gpt_df)


## T5 for QA

In [None]:

# T5, passive object control.
# The experiment run is commented out below; this cell only loads the results
# CSV that the commented-out to_csv call targets. Uncomment the block to
# regenerate that file.
# NOTE(review): the path is absolute and machine-specific.
# passive_t5_object_control_experiment  = Experiment("t5", "object-control-passive", FixedPassiveT5Prompt, run_t5_prompt, 1, None)

# passive_t5_object_control_experiment.run(names, correct_index, verbs, actions, do_swap = False)

# passive_t5_df = passive_t5_object_control_experiment.format_results()

# passive_t5_df.to_csv("/Users/Elias/child-lm/results/t5_passive_object_control.csv")
passive_t5_df = pd.read_csv("/Users/Elias/child-lm/results/t5_passive_object_control.csv")

In [None]:
# Accuracy report for the passive T5 results loaded into passive_t5_df.
accuracy_report(passive_t5_df)

## Jurassic Large 

In [None]:

# Jurassic-large, passive object control.
# The experiment run is commented out below; this cell only loads the results
# CSV that the commented-out to_csv call targets. Uncomment the block to
# regenerate that file.
# NOTE(review): the path is absolute and machine-specific.
# jurassic_kwargs = {"maxTokens": 2, "temperature": 0.0}
# passive_jurassic_object_control_experiment  = Experiment("jurassic-large", "object-control-passive", FixedPassiveGPTPrompt, run_ai21_prompt, 1, jurassic_kwargs)

# passive_jurassic_object_control_experiment.run(names, correct_index, verbs, actions)
# passive_jurassic_df = passive_jurassic_object_control_experiment.format_results()

# passive_jurassic_df.to_csv("/Users/Elias/child-lm/results/jurassic_passive_object_control_swap_names.csv")

# Load the PASSIVE results file (the one the to_csv call above writes). The
# previous path pointed at the active-voice file jurassic_object_control_swap_names.csv,
# so the passive report silently repeated the active results.
passive_jurassic_df = pd.read_csv("/Users/Elias/child-lm/results/jurassic_passive_object_control_swap_names.csv")

In [None]:
# Accuracy report for the results loaded into passive_jurassic_df.
accuracy_report(passive_jurassic_df)