In [1]:
import pandas as pd 
import json 

from experiment import Experiment
from api_tools import (FixedGPTPrompt, 
                       FixedPassiveGPTPrompt, 
                       FixedT5Prompt, 
                       FixedPassiveT5Prompt, 
                       run_ai21_prompt, 
                       run_ai21_jumbo_prompt, 
                       run_gpt_prompt, 
                       run_t5_prompt)

from metrics import accuracy_report


# Subject Control
Starting with the **subject** control experiments, we will look at how 4 LMs do:
1. GPT3
2. T5 for QA 
3. Jurassic large 
4. Jurassic Jumbo

## Experimental settings
We're looking at different name pairs to control for gender. We have 2 male-female name pairs, 1 male-male pair, 1 female-female pair, and 1 neutral-neutral. 
We're still looking at 5 different actions, but only one verb: promise. 

We can later coerce (with gender) subject control on "suggested" and "proposed" 

In [2]:
# Shared inputs for the subject-control experiments below.
# File handles are opened with `with` so they are closed as soon as the JSON
# has been parsed (the bare `json.load(open(...))` form leaks the handle).

# Earlier hand-written name pairs, kept for reference:
# names = [("Tom", "Mary"), ("Tom", "Bill"), ("Mary", "Bill"), ("Ellen", "Mary"), ("Morgan", "Jaime"), ("Iago", "Hamlet"), ("Jules", "Yves"), ("Kurt", "Lena")]
# names = [("Tom", "Mary"), ("Tom", "Bill"), ("Mary", "Bill"), ("Ellen", "Mary"), ("Morgan", "Jaime")]
with open("../data/names_top_2.json") as names_file:
    names = json.load(names_file)

# Earlier, larger verb list, kept for reference:
# verbs = ["told", "ordered", "called upon", "reminded", "urged", "asked", "persuaded", "convinced", "forced", "pushed"]
verbs = ["promised"]

# Earlier hand-written (infinitive, past-tense) action pairs, kept for reference:
# actions = [("to leave", "left"), ("to call home", "called home"), ("to reply", "replied"), ("to wipe the counter", "wiped the counter"), ("to dance", "danced")]
with open("../data/verbs.json") as actions_file:
    actions = json.load(actions_file)

# Second positional argument of Experiment.run.
# NOTE(review): presumably the position of the correct name within each pair
# (0 = first name) -- confirm against Experiment.run.
correct_index = 0

with open("../data/nicknames.json") as nicknames_file:
    nicknames = json.load(nicknames_file)


## GPT 3
For GPT3, inference is not deterministic, so we're running 5 replicates per prompt 

In [3]:

# Options handed to the Experiment alongside the GPT-3 runner:
# a 2-token completion at temperature 0.
gpt_kwargs = dict(max_tokens=2, temperature=0.0)

gpt_subject_control_experiment = Experiment(
    "gpt3",
    "subject-control",
    FixedGPTPrompt,
    run_gpt_prompt,
    1,
    gpt_kwargs,
)

# Run over every combination of the shared names / verbs / actions.
gpt_subject_control_experiment.run(
    names,
    correct_index,
    verbs,
    actions,
    nicknames=nicknames,
)

100%|██████████| 30/30 [05:58<00:00, 11.94s/it]


In [5]:
# Collect the GPT-3 run's results via format_results() and save them as CSV.
gpt_df = gpt_subject_control_experiment.format_results()
gpt_df.to_csv(path_or_buf="../results/gpt_subject_control_swap_names.csv")

# To reuse a saved run instead of querying the model again:
# gpt_df = pd.read_csv("/Users/Elias/child-lm/results/gpt_subject_control_swap_names.csv")

In [6]:
# Accuracy summary for the GPT-3 run; the displayed dict includes 'total',
# 'acc_by_name', 'acc_by_action' and further breakdowns.
accuracy_report(gpt_df)

{'total': (0.48333333333333334, 300, 0.5471698113207547, 265),
 'acc_by_name': {'Avery,Casey': (0.4, 20, 0.47058823529411764, 17),
  'Avery,Nicole': (0.5, 20, 0.5, 20),
  'Avery,Stephanie': (0.75, 20, 0.75, 20),
  'Avery,William': (0.55, 20, 0.5789473684210527, 19),
  'Avery,Joseph': (0.5, 20, 0.5263157894736842, 19),
  'Casey,Nicole': (0.4, 20, 0.7272727272727273, 11),
  'Casey,Stephanie': (0.35, 20, 0.4375, 16),
  'Casey,William': (0.45, 20, 0.5625, 16),
  'Casey,Joseph': (0.45, 20, 0.5294117647058824, 17),
  'Nicole,Stephanie': (0.5, 20, 0.5, 20),
  'Nicole,William': (0.5, 20, 0.5555555555555556, 18),
  'Nicole,Joseph': (0.5, 20, 0.5, 20),
  'Stephanie,William': (0.5, 20, 0.5, 20),
  'Stephanie,Joseph': (0.3, 20, 0.375, 16),
  'William,Joseph': (0.6, 20, 0.75, 16)},
 'acc_by_action': {'to read': (0.45, 60, 0.5294117647058824, 51),
  'to go': (0.5166666666666667, 60, 0.5535714285714286, 56),
  'to come': (0.4666666666666667, 60, 0.5283018867924528, 53),
  'to call': (0.46666666666666

## T5 base 

In [3]:
import os

# Point the Hugging Face cache at the shared disk BEFORE importing anything
# that pulls in `transformers`: the library reads TRANSFORMERS_CACHE when it is
# first imported, so setting it afterwards has no effect.
# NOTE(review): assumes hf_tools.hf imports transformers at module level and
# that transformers has not already been imported earlier in this kernel.
os.environ['TRANSFORMERS_CACHE'] = "/brtx/601-nvme1/estengel/.cache"

from hf_tools.hf import HuggingfaceRunFxn

# t5-base on CPU, with the wrapper's `constrained` option switched on.
wrapper_fxn = HuggingfaceRunFxn("t5-base", device="cpu", constrained=True)

t5_base_subject_control_experiment = Experiment("t5-base", "subject-control", FixedGPTPrompt, wrapper_fxn, 1, None)

# Local model: no name swapping, no rate limiting, overwrite=True.
t5_base_subject_control_experiment.run(names, correct_index, verbs, actions, do_swap=False, nicknames=nicknames, rate_limit_delay=None, overwrite=True)

t5_base_df = t5_base_subject_control_experiment.format_results()

# NOTE(review): the saved output of this cell shows an interrupted debugger
# session inside hf_tools/hf.py (KeyboardInterrupt / UnboundLocalError), so
# re-run it before relying on this CSV.
t5_base_df.to_csv("../results/t5_base_subject_control.csv")

accuracy_report(t5_base_df)


Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at t5-base and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/30 [00:00<?, ?it/s]

> [0;32m/home/estengel/child-lm/src/hf_tools/hf.py[0m(36)[0;36m__call__[0;34m()[0m
[0;32m     34 [0;31m[0;34m[0m[0m
[0m[0;32m     35 [0;31m[0;34m[0m[0m
[0m[0;32m---> 36 [0;31m        [0;32mreturn[0m [0moutput_text[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     37 [0;31m[0;34m[0m[0m
[0m[0;32m     38 [0;31m[0;34m[0m[0m
[0m
1
1
tensor([[   0, 6320,   63,   42,   71, 8461,   58,    1]])
tensor([[   0, 6320,   63,   42,   71, 8461,   58,    1]])
tensor([[   0, 6320,   63,   42,   71, 8461,   58,    1]])
tensor([[   0, 6320,   63,   42,   71, 8461,   58,    1]])
tensor([[   0, 6320,   63,   42,   71, 8461,   58,    1]])
--KeyboardInterrupt--


  0%|          | 0/30 [00:43<?, ?it/s]


KeyboardInterrupt: Interrupted by user





UnboundLocalError: local variable 'output_text' referenced before assignment

## T5 large 

In [4]:
import os

# Point the Hugging Face cache at the shared disk BEFORE importing anything
# that pulls in `transformers`: the library reads TRANSFORMERS_CACHE when it is
# first imported, so setting it afterwards has no effect.
# NOTE(review): assumes hf_tools.hf imports transformers at module level and
# that transformers has not already been imported earlier in this kernel.
os.environ['TRANSFORMERS_CACHE'] = "/brtx/601-nvme1/estengel/.cache"

from hf_tools.hf import HuggingfaceRunFxn

# t5-large on GPU 3.
wrapper_fxn = HuggingfaceRunFxn("t5-large", "cuda:3")

t5_large_subject_control_experiment = Experiment("t5-large", "subject-control", FixedGPTPrompt, wrapper_fxn, 1, None)

# Local model: no name swapping, no rate limiting, overwrite=True.
t5_large_subject_control_experiment.run(names, correct_index, verbs, actions, do_swap=False, nicknames=nicknames, rate_limit_delay=None, overwrite=True)




Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at t5-large and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 30/30 [01:29<00:00,  2.98s/it]


{'total': (0.86, 150, 0.8716216216216216, 148),
 'acc_by_name': {'Avery,William': (0.8, 10, 0.8, 10),
  'Avery,Casey': (0.6, 10, 0.6, 10),
  'Avery,Nicole': (0.8, 10, 0.8, 10),
  'Avery,Stephanie': (1.0, 10, 1.0, 10),
  'Avery,Joseph': (0.4, 10, 0.4, 10),
  'William,Casey': (1.0, 10, 1.0, 10),
  'William,Nicole': (1.0, 10, 1.0, 10),
  'William,Stephanie': (1.0, 10, 1.0, 10),
  'William,Joseph': (1.0, 10, 1.0, 10),
  'Casey,Nicole': (0.7, 10, 0.7, 10),
  'Casey,Stephanie': (0.9, 10, 0.9, 10),
  'Casey,Joseph': (0.8, 10, 0.8888888888888888, 9),
  'Nicole,Stephanie': (1.0, 10, 1.0, 10),
  'Nicole,Joseph': (0.9, 10, 1.0, 9),
  'Stephanie,Joseph': (1.0, 10, 1.0, 10)},
 'acc_by_action': {'to come': (0.8, 30, 0.8571428571428571, 28),
  'to go': (0.8, 30, 0.8, 30),
  'to run': (0.8666666666666667, 30, 0.8666666666666667, 30),
  'to call': (0.9333333333333333, 30, 0.9333333333333333, 30),
  'to read': (0.9, 30, 0.9, 30)},
 'acc_by_verb': {'promised': (0.86, 150, 0.8716216216216216, 148)},
 'acc

In [8]:
# Collect the t5-large run's results, save them as CSV, then display the
# accuracy summary as the cell output.
t5_large_df = t5_large_subject_control_experiment.format_results()
t5_large_df.to_csv(path_or_buf="../results/t5_large_subject_control.csv")

accuracy_report(t5_large_df)

{'total': (0.9933333333333333, 150, 0.9933333333333333, 150),
 'acc_by_name': {'Avery,William': (1.0, 10, 1.0, 10),
  'Avery,Casey': (1.0, 10, 1.0, 10),
  'Avery,Nicole': (1.0, 10, 1.0, 10),
  'Avery,Stephanie': (0.9, 10, 0.9, 10),
  'Avery,Joseph': (1.0, 10, 1.0, 10),
  'William,Casey': (1.0, 10, 1.0, 10),
  'William,Nicole': (1.0, 10, 1.0, 10),
  'William,Stephanie': (1.0, 10, 1.0, 10),
  'William,Joseph': (1.0, 10, 1.0, 10),
  'Casey,Nicole': (1.0, 10, 1.0, 10),
  'Casey,Stephanie': (1.0, 10, 1.0, 10),
  'Casey,Joseph': (1.0, 10, 1.0, 10),
  'Nicole,Stephanie': (1.0, 10, 1.0, 10),
  'Nicole,Joseph': (1.0, 10, 1.0, 10),
  'Stephanie,Joseph': (1.0, 10, 1.0, 10)},
 'acc_by_action': {'to come': (0.9666666666666667, 30, 0.9666666666666667, 30),
  'to go': (1.0, 30, 1.0, 30),
  'to run': (1.0, 30, 1.0, 30),
  'to call': (1.0, 30, 1.0, 30),
  'to read': (1.0, 30, 1.0, 30)},
 'acc_by_verb': {'promised': (0.9933333333333333,
   150,
   0.9933333333333333,
   150)},
 'acc_by_action_by_verb': 

## T5 3B 

In [10]:
import os

# Point the Hugging Face cache at the shared disk BEFORE importing anything
# that pulls in `transformers`: the library reads TRANSFORMERS_CACHE when it is
# first imported, so setting it afterwards has no effect.
# NOTE(review): assumes hf_tools.hf imports transformers at module level and
# that transformers has not already been imported earlier in this kernel.
os.environ['TRANSFORMERS_CACHE'] = "/brtx/601-nvme1/estengel/.cache"

from hf_tools.hf import HuggingfaceRunFxn

# t5-3b on GPU 3.
wrapper_fxn = HuggingfaceRunFxn("t5-3b", "cuda:3")

t5_3b_subject_control_experiment = Experiment("t5-3b", "subject-control", FixedGPTPrompt, wrapper_fxn, 1, None)

# Local model: no name swapping, no rate limiting, overwrite=True.
t5_3b_subject_control_experiment.run(names, correct_index, verbs, actions, do_swap=False, nicknames=nicknames, rate_limit_delay=None, overwrite=True)

t5_3b_df = t5_3b_subject_control_experiment.format_results()

t5_3b_df.to_csv("../results/t5_3b_subject_control.csv")

accuracy_report(t5_3b_df)

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at t5-3b and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 30/30 [01:29<00:00,  2.98s/it]


{'total': (0.28, 150, 1.0, 42),
 'acc_by_name': {'Avery,William': (0.7, 10, 1.0, 7),
  'Avery,Casey': (0.8, 10, 1.0, 8),
  'Avery,Nicole': (0.0, 10, -1, 0),
  'Avery,Stephanie': (0.3, 10, 1.0, 3),
  'Avery,Joseph': (0.2, 10, 1.0, 2),
  'William,Casey': (0.6, 10, 1.0, 6),
  'William,Nicole': (0.0, 10, -1, 0),
  'William,Stephanie': (0.3, 10, 1.0, 3),
  'William,Joseph': (0.5, 10, 1.0, 5),
  'Casey,Nicole': (0.3, 10, 1.0, 3),
  'Casey,Stephanie': (0.0, 10, -1, 0),
  'Casey,Joseph': (0.1, 10, 1.0, 1),
  'Nicole,Stephanie': (0.2, 10, 1.0, 2),
  'Nicole,Joseph': (0.0, 10, -1, 0),
  'Stephanie,Joseph': (0.2, 10, 1.0, 2)},
 'acc_by_action': {'to come': (0.36666666666666664, 30, 1.0, 11),
  'to go': (0.3333333333333333, 30, 1.0, 10),
  'to run': (0.2, 30, 1.0, 6),
  'to call': (0.4, 30, 1.0, 12),
  'to read': (0.1, 30, 1.0, 3)},
 'acc_by_verb': {'promised': (0.28, 150, 1.0, 42)},
 'acc_by_action_by_verb': {'to come,promised': (0.36666666666666664,
   30,
   1.0,
   11),
  'to go,promised': (0.

## T5 for QA

In [5]:
import os

# Point the Hugging Face cache at the shared disk BEFORE importing anything
# that pulls in `transformers`: the library reads TRANSFORMERS_CACHE when it is
# first imported, so setting it afterwards has no effect.
# NOTE(review): assumes hf_tools.hf imports transformers at module level and
# that transformers has not already been imported earlier in this kernel.
os.environ['TRANSFORMERS_CACHE'] = "/brtx/601-nvme1/estengel/.cache"

from hf_tools.hf import HuggingfaceRunFxn

# QA-fine-tuned T5 checkpoint on GPU 3; this is the only Hugging Face run in
# the notebook that uses FixedT5Prompt rather than FixedGPTPrompt.
wrapper_fxn = HuggingfaceRunFxn("valhalla/t5-base-qa-qg-hl", "cuda:3")

t5_subject_control_experiment = Experiment("t5-qa", "subject-control", FixedT5Prompt, wrapper_fxn, 1, None)

# Local model: no name swapping, no rate limiting, overwrite=True.
t5_subject_control_experiment.run(names, correct_index, verbs, actions, do_swap=False, nicknames=nicknames, rate_limit_delay=None, overwrite=True)

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

100%|██████████| 30/30 [00:08<00:00,  3.66it/s]


In [6]:
# Collect the T5-QA run's results via format_results() and save them as CSV.
t5_df = t5_subject_control_experiment.format_results()
t5_df.to_csv(path_or_buf="../results/t5_qa_subject_control.csv")

# To reuse an older saved run instead of re-running the model:
# t5_df = pd.read_csv("/Users/Elias/child-lm/results/t5_subject_control.csv")

In [7]:
# Accuracy summary for the T5-QA run; the displayed dict includes 'total',
# 'acc_by_name', 'acc_by_action' and 'acc_by_verb'.
accuracy_report(t5_df)

{'total': (0.006666666666666667, 150, 0.006666666666666667, 150),
 'acc_by_name': {'Avery,William': (0.0, 10, 0.0, 10),
  'Avery,Casey': (0.0, 10, 0.0, 10),
  'Avery,Nicole': (0.0, 10, 0.0, 10),
  'Avery,Stephanie': (0.0, 10, 0.0, 10),
  'Avery,Joseph': (0.0, 10, 0.0, 10),
  'William,Casey': (0.0, 10, 0.0, 10),
  'William,Nicole': (0.0, 10, 0.0, 10),
  'William,Stephanie': (0.0, 10, 0.0, 10),
  'William,Joseph': (0.0, 10, 0.0, 10),
  'Casey,Nicole': (0.0, 10, 0.0, 10),
  'Casey,Stephanie': (0.0, 10, 0.0, 10),
  'Casey,Joseph': (0.1, 10, 0.1, 10),
  'Nicole,Stephanie': (0.0, 10, 0.0, 10),
  'Nicole,Joseph': (0.0, 10, 0.0, 10),
  'Stephanie,Joseph': (0.0, 10, 0.0, 10)},
 'acc_by_action': {'to come': (0.0, 30, 0.0, 30),
  'to go': (0.0, 30, 0.0, 30),
  'to run': (0.0, 30, 0.0, 30),
  'to call': (0.0, 30, 0.0, 30),
  'to read': (0.03333333333333333, 30, 0.03333333333333333, 30)},
 'acc_by_verb': {'promised': (0.006666666666666667,
   150,
   0.006666666666666667,
   150)},
 'acc_by_action_

## T0

In [3]:
import os

# Point the Hugging Face cache at the shared disk BEFORE importing anything
# that pulls in `transformers`: the library reads TRANSFORMERS_CACHE when it is
# first imported, so setting it afterwards has no effect.
# NOTE(review): assumes hf_tools.hf imports transformers at module level and
# that transformers has not already been imported earlier in this kernel.
os.environ['TRANSFORMERS_CACHE'] = "/brtx/601-nvme1/estengel/.cache"

from hf_tools.hf import HuggingfaceRunFxn

# T0pp on CPU with the wrapper's `constrained` option off (the saved progress
# bar shows roughly 58 minutes for the 30 iterations).
wrapper_fxn = HuggingfaceRunFxn("bigscience/T0pp", device="cpu", constrained=False)

t0_subject_control_experiment = Experiment("t0", "subject-control", FixedGPTPrompt, wrapper_fxn, 1, None)

# Local model: no name swapping, no rate limiting, overwrite=True.
t0_subject_control_experiment.run(names, correct_index, verbs, actions, do_swap=False, nicknames=nicknames, rate_limit_delay=None, overwrite=True)

t0_df = t0_subject_control_experiment.format_results()

t0_df.to_csv("../results/t0_subject_control.csv")

accuracy_report(t0_df)


100%|██████████| 30/30 [57:55<00:00, 115.84s/it]


{'total': (0.6733333333333333, 150, 0.6733333333333333, 150),
 'acc_by_name': {'Casey,Joseph': (0.9, 10, 0.9, 10),
  'Casey,Stephanie': (0.9, 10, 0.9, 10),
  'Casey,William': (0.6, 10, 0.6, 10),
  'Casey,Nicole': (1.0, 10, 1.0, 10),
  'Casey,Avery': (0.6, 10, 0.6, 10),
  'Joseph,Stephanie': (0.5, 10, 0.5, 10),
  'Joseph,William': (0.6, 10, 0.6, 10),
  'Joseph,Nicole': (0.9, 10, 0.9, 10),
  'Joseph,Avery': (0.4, 10, 0.4, 10),
  'Stephanie,William': (0.6, 10, 0.6, 10),
  'Stephanie,Nicole': (0.7, 10, 0.7, 10),
  'Stephanie,Avery': (0.6, 10, 0.6, 10),
  'William,Nicole': (0.8, 10, 0.8, 10),
  'William,Avery': (0.4, 10, 0.4, 10),
  'Nicole,Avery': (0.6, 10, 0.6, 10)},
 'acc_by_action': {'to read': (0.6, 30, 0.6, 30),
  'to call': (0.9666666666666667, 30, 0.9666666666666667, 30),
  'to run': (0.7, 30, 0.7, 30),
  'to go': (0.5666666666666667, 30, 0.5666666666666667, 30),
  'to come': (0.5333333333333333, 30, 0.5333333333333333, 30)},
 'acc_by_verb': {'promised': (0.6733333333333333,
   150,

## GPT Neo

In [3]:
import os

# Point the Hugging Face cache at the shared disk BEFORE importing anything
# that pulls in `transformers`: the library reads TRANSFORMERS_CACHE when it is
# first imported, so setting it afterwards has no effect.
# NOTE(review): assumes hf_tools.hf imports transformers at module level and
# that transformers has not already been imported earlier in this kernel.
os.environ['TRANSFORMERS_CACHE'] = "/brtx/601-nvme1/estengel/.cache"

from hf_tools.hf import HuggingfaceRunFxn

# GPT-Neo 2.7B on CPU with the wrapper's `constrained` option off.
wrapper_fxn = HuggingfaceRunFxn("EleutherAI/gpt-neo-2.7B", device="cpu", constrained=False)

gptneo_subject_control_experiment = Experiment("gpt-neo-2.7b", "subject-control", FixedGPTPrompt, wrapper_fxn, 1, None)

# Local model: no name swapping, no rate limiting, overwrite=True.
gptneo_subject_control_experiment.run(names, correct_index, verbs, actions, do_swap=False, nicknames=nicknames, rate_limit_delay=None, overwrite=True)

gptneo_df = gptneo_subject_control_experiment.format_results()

gptneo_df.to_csv("../results/gpt_neo_2.7b_subject_control.csv")

accuracy_report(gptneo_df)


  0%|          | 0/30 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  3%|▎         | 1/30 [04:37<2:14:18, 277.88s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  7%|▋         | 2/30 [09:23<2:11:41, 282.19s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'total': (1.0, 150, 1.0, 150),
 'acc_by_name': {'Casey,William': (1.0, 10, 1.0, 10),
  'Casey,Stephanie': (1.0, 10, 1.0, 10),
  'Casey,Joseph': (1.0, 10, 1.0, 10),
  'Casey,Avery': (1.0, 10, 1.0, 10),
  'Casey,Nicole': (1.0, 10, 1.0, 10),
  'William,Stephanie': (1.0, 10, 1.0, 10),
  'William,Joseph': (1.0, 10, 1.0, 10),
  'William,Avery': (1.0, 10, 1.0, 10),
  'William,Nicole': (1.0, 10, 1.0, 10),
  'Stephanie,Joseph': (1.0, 10, 1.0, 10),
  'Stephanie,Avery': (1.0, 10, 1.0, 10),
  'Stephanie,Nicole': (1.0, 10, 1.0, 10),
  'Nicole,Joseph': (1.0, 10, 1.0, 10),
  'Nicole,Avery': (1.0, 10, 1.0, 10),
  'Joseph,Avery': (1.0, 10, 1.0, 10)},
 'acc_by_action': {'to read': (1.0, 30, 1.0, 30),
  'to come': (1.0, 30, 1.0, 30),
  'to run': (1.0, 30, 1.0, 30),
  'to go': (1.0, 30, 1.0, 30),
  'to call': (1.0, 30, 1.0, 30)},
 'acc_by_verb': {'promised': (1.0, 150, 1.0, 150)},
 'acc_by_action_by_verb': {'to read,promised': (1.0, 30, 1.0, 30),
  'to come,promised': (1.0, 30, 1.0, 30),
  'to run,promis

## Jurassic Large

In [7]:

# Options handed to the Experiment alongside the AI21 runner:
# a 2-token completion at temperature 0.
jurassic_kwargs = dict(maxTokens=2, temperature=0.0)

jurassic_subject_control_experiment = Experiment(
    "jurassic-large",
    "subject-control",
    FixedGPTPrompt,
    run_ai21_prompt,
    1,
    jurassic_kwargs,
)

# Run over every combination of the shared names / verbs / actions.
jurassic_subject_control_experiment.run(
    names,
    correct_index,
    verbs,
    actions,
    nicknames=nicknames,
)

100%|██████████| 30/30 [05:58<00:00, 11.93s/it]


In [8]:
# Collect the Jurassic-large run's results via format_results() and save them as CSV.
jurassic_df = jurassic_subject_control_experiment.format_results()
jurassic_df.to_csv(path_or_buf="../results/jurassic_subject_control_swap_names.csv")

# To reuse a saved run instead of querying the API again:
# jurassic_df = pd.read_csv("/Users/Elias/child-lm/results/jurassic_subject_control_swap_names.csv")

In [9]:
# Accuracy summary for the Jurassic-large run; the displayed dict includes
# 'total', 'acc_by_name' and 'acc_by_action'.
accuracy_report(jurassic_df)

{'total': (0.63, 300, 0.6385135135135135, 296),
 'acc_by_name': {'Avery,Casey': (0.6, 20, 0.6, 20),
  'Avery,Nicole': (0.65, 20, 0.7222222222222222, 18),
  'Avery,Stephanie': (0.75, 20, 0.75, 20),
  'Avery,William': (0.65, 20, 0.65, 20),
  'Avery,Joseph': (0.7, 20, 0.7, 20),
  'Casey,Nicole': (0.75, 20, 0.75, 20),
  'Casey,Stephanie': (0.55, 20, 0.55, 20),
  'Casey,William': (0.5, 20, 0.5, 20),
  'Casey,Joseph': (0.55, 20, 0.55, 20),
  'Nicole,Stephanie': (0.45, 20, 0.5, 18),
  'Nicole,William': (0.6, 20, 0.6, 20),
  'Nicole,Joseph': (0.7, 20, 0.7, 20),
  'Stephanie,William': (0.5, 20, 0.5, 20),
  'Stephanie,Joseph': (0.75, 20, 0.75, 20),
  'William,Joseph': (0.75, 20, 0.75, 20)},
 'acc_by_action': {'to read': (0.6333333333333333, 60, 0.6333333333333333, 60),
  'to go': (0.6166666666666667, 60, 0.6271186440677966, 59),
  'to come': (0.65, 60, 0.65, 60),
  'to call': (0.5666666666666667, 60, 0.5964912280701754, 57),
  'to run': (0.6833333333333333, 60, 0.6833333333333333, 60)},
 'acc_by

## Jurassic Jumbo

In [3]:

# Options handed to the Experiment alongside the AI21 jumbo runner:
# a 2-token completion at temperature 0.
jurassic_kwargs = dict(maxTokens=2, temperature=0.0)

jurassic_jumbo_subject_control_experiment = Experiment(
    "jurassic-jumbo",
    "subject-control",
    FixedGPTPrompt,
    run_ai21_jumbo_prompt,
    1,
    jurassic_kwargs,
)

# Unlike the other runs, this one passes do_swap=True and explicit
# rate-limit settings (delay 60, count 19).
jurassic_jumbo_subject_control_experiment.run(
    names,
    correct_index,
    verbs,
    actions,
    nicknames=nicknames,
    do_swap=True,
    rate_limit_delay=60,
    rate_limit_count=19,
)

jurassic_jumbo_df = jurassic_jumbo_subject_control_experiment.format_results()
jurassic_jumbo_df.to_csv(path_or_buf="../results/jurassic_jumbo_subject_control_swap_names.csv")

# Leftover from the Jurassic-large cell (loads that model's saved results):
# jurassic_df = pd.read_csv("/Users/Elias/child-lm/results/jurassic_subject_control_swap_names.csv")

100%|██████████| 30/30 [17:30<00:00, 35.00s/it]



## Coerced examples with gender

By using gendered names and pronouns, we can coerce subject or object control from "suggested", "offered", and "proposed", e.g. 

- Mary proposed to Tom to be his editor
- Tom suggested to Mary to be her editor 
- Mary offered to Tom to be his editor 


In [10]:
# Inputs for the gender-coerced variant. These rebind `verbs`, `actions` and
# `correct_index` from the earlier configuration cell.
verbs = [
    "promised",
    "offered",
    "suggested",
    "proposed",
]

# Every (male name, female name) pairing, grouped by the female name.
his_names = [
    (him, her)
    for her in ("Mary", "Sally")
    for him in ("Tom", "Bill", "James")
]

# One action; "her" refers to the second (female) name in each pair.
actions = [
    ("to be her editor", "was the editor"),
]

correct_index = 0

## GPT 3

In [None]:

# gpt_kwargs = {"max_tokens": 2, "temperature": 0.0}
# gendered_gpt_subject_control_experiment  = Experiment("gpt3", "subject-control", FixedGPTPrompt, run_gpt_prompt, 5, gpt_kwargs)
# gendered_gpt_subject_control_experiment.run(his_names, correct_index, verbs, actions)


In [11]:
# The gendered GPT-3 run is not re-executed here; its saved results are loaded
# from disk. The path is relative, like the other result files in this
# notebook, so it does not depend on one user's home directory.
# gendered_gpt_df = gendered_gpt_subject_control_experiment.format_results()

# gendered_gpt_df.to_csv("../results/gpt_gendered_subject_control_swap_names.csv")
gendered_gpt_df = pd.read_csv("../results/gpt_gendered_subject_control_swap_names.csv")


In [12]:
# Accuracy summary for the gendered GPT-3 results; the displayed dict includes
# 'total', 'acc_by_name', 'acc_by_action', 'acc_by_verb' and
# 'acc_by_action_by_verb'.
accuracy_report(gendered_gpt_df)

{'total': (0.3125, 48),
 'acc_by_name': {'James,Sally': (0.5, 8),
  'James,Mary': (0.0, 8),
  'Tom,Sally': (0.625, 8),
  'Tom,Mary': (0.125, 8),
  'Bill,Sally': (0.625, 8),
  'Bill,Mary': (0.0, 8)},
 'acc_by_action': {'to be her editor': (0.3125, 48)},
 'acc_by_verb': {'promised': (0.4166666666666667, 12),
  'suggested': (0.25, 12),
  'proposed': (0.3333333333333333, 12),
  'offered': (0.25, 12)},
 'acc_by_action_by_verb': {'to be her editor,promised': (0.4166666666666667,
   12),
  'to be her editor,suggested': (0.25, 12),
  'to be her editor,proposed': (0.3333333333333333, 12),
  'to be her editor,offered': (0.25, 12)}}

## T5 for QA 

In [13]:

# The gendered T5 run is not re-executed here; its saved results are loaded
# from disk. The path is relative, like the other result files in this
# notebook, so it does not depend on one user's home directory.
# gendered_t5_subject_control_experiment  = Experiment("t5", "subject-control", FixedT5Prompt, run_t5_prompt, 1, None)
# gendered_t5_subject_control_experiment.run(his_names, correct_index, verbs, actions)
# gendered_t5_df = gendered_t5_subject_control_experiment.format_results()
# gendered_t5_df.to_csv("../results/t5_gendered_subject_control_swap_names.csv")
gendered_t5_df = pd.read_csv("../results/t5_gendered_subject_control_swap_names.csv")


In [14]:
# Accuracy summary for the gendered T5 results; the displayed dict includes
# 'total', 'acc_by_name', 'acc_by_action', 'acc_by_verb' and
# 'acc_by_action_by_verb'.
accuracy_report(gendered_t5_df)

{'total': (0.0, 48),
 'acc_by_name': {'James,Sally': (0.0, 8),
  'James,Mary': (0.0, 8),
  'Tom,Sally': (0.0, 8),
  'Tom,Mary': (0.0, 8),
  'Bill,Sally': (0.0, 8),
  'Bill,Mary': (0.0, 8)},
 'acc_by_action': {'to be her editor': (0.0, 48)},
 'acc_by_verb': {'promised': (0.0, 12),
  'suggested': (0.0, 12),
  'proposed': (0.0, 12),
  'offered': (0.0, 12)},
 'acc_by_action_by_verb': {'to be her editor,promised': (0.0, 12),
  'to be her editor,suggested': (0.0, 12),
  'to be her editor,proposed': (0.0, 12),
  'to be her editor,offered': (0.0, 12)}}

## Jurassic Large

In [15]:
# The gendered Jurassic run is not re-executed here; its saved results are
# loaded from disk. The path is relative, like the other result files in this
# notebook, so it does not depend on one user's home directory.
# jurassic_kwargs = {"maxTokens": 2, "temperature": 0.0}
# gendered_jurassic_subject_control_experiment  = Experiment("jurassic", "subject-control", FixedGPTPrompt, run_ai21_prompt, 1, jurassic_kwargs)
# gendered_jurassic_subject_control_experiment.run(his_names, correct_index, verbs, actions)
# gendered_jurassic_df = gendered_jurassic_subject_control_experiment.format_results()
# gendered_jurassic_df.to_csv("../results/jurassic_gendered_subject_control_swap_names.csv")
gendered_jurassic_df = pd.read_csv("../results/jurassic_gendered_subject_control_swap_names.csv")



In [16]:

# Accuracy summary for the gendered Jurassic results; the displayed dict
# includes 'total', 'acc_by_name', 'acc_by_action', 'acc_by_verb' and
# 'acc_by_action_by_verb'.
accuracy_report(gendered_jurassic_df)

{'total': (0.6875, 48),
 'acc_by_name': {'James,Sally': (1.0, 8),
  'James,Mary': (0.625, 8),
  'Tom,Sally': (0.875, 8),
  'Tom,Mary': (0.625, 8),
  'Bill,Sally': (0.5, 8),
  'Bill,Mary': (0.5, 8)},
 'acc_by_action': {'to be her editor': (0.6875, 48)},
 'acc_by_verb': {'promised': (0.8333333333333334, 12),
  'suggested': (0.5833333333333334, 12),
  'proposed': (0.6666666666666666, 12),
  'offered': (0.6666666666666666, 12)},
 'acc_by_action_by_verb': {'to be her editor,promised': (0.8333333333333334,
   12),
  'to be her editor,suggested': (0.5833333333333334, 12),
  'to be her editor,proposed': (0.6666666666666666, 12),
  'to be her editor,offered': (0.6666666666666666, 12)}}

# Passives 

Do passives here make sense? To me 
- Mary was promised by Tom to leave
does not sound acceptable, or if it is acceptable, Mary is the one leaving, unlike "Tom promised Mary to leave".