In [1]:
import json
import pandas as pd 

from experiment import Experiment
from api_tools import (FixedGPTPrompt, 
                       FixedPassiveGPTPrompt, 
                       FixedT5Prompt, 
                       FixedPassiveT5Prompt, 
                       run_ai21_prompt, 
                       run_gpt_prompt, 
                       run_t5_prompt)

from metrics import accuracy_report


# Object Control
We start with the object control experiments and look at how well 4 LMs perform:
1. GPT3
2. T5 for QA 
3. Jurassic large 
4. Jurassic Jumbo

## Experimental settings
We're looking at different name pairs to control for gender. We have 2 male-female name pairs, 1 male-male pair, 1 female-female pair, and 1 neutral-neutral pair. 
We're also looking at 10 object control verbs and 5 different actions. 

In [2]:
# Shared inputs for every experiment cell below.
# Each JSON file is read inside a `with` block so its handle is closed as soon
# as parsing finishes, rather than being left open until garbage collection.
with open("../data/names_top_2.json") as fh:
    names = json.load(fh)

# Object-control verbs passed to Experiment.run.
verbs = ["told", "ordered", "called upon", "reminded", "urged", "asked", "persuaded", "convinced", "forced", "pushed"]

# Earlier inline definition of `actions`, kept for reference; the list is now loaded from disk.
# actions = [("to leave", "left"), ("to call home", "called home"), ("to reply", "replied"), ("to wipe the counter", "wiped the counter"), ("to dance", "danced")]
with open("../data/verbs.json") as fh:
    actions = json.load(fh)

# Passed to Experiment.run as the index of the correct answer.
# NOTE(review): presumably indexes into each name pair (1 = second name) — confirm in experiment.py.
correct_index = 1

with open("../data/nicknames.json") as fh:
    nicknames = json.load(fh)

## GPT 3
 

In [None]:



# GPT-3 object control: reload stored results, recompute, and write them back.
gpt_kwargs = dict(max_tokens=2, temperature=0.0)
gpt_object_control_experiment = Experiment(
    "gpt3",
    "object-control",
    FixedGPTPrompt,
    run_gpt_prompt,
    1,
    gpt_kwargs,
)
gpt_object_control_experiment.recover("../results/gpt_object_control_swap_names.csv")

# The API run is disabled; uncomment to query the model again instead of
# relying on the recovered CSV.
# gpt_object_control_experiment.run(names, correct_index, verbs, actions, rate_limit_delay=None, nicknames=nicknames)

# NOTE(review): recompute(nicknames) presumably re-scores the recovered outputs
# using the nickname table — confirm in experiment.py.
gpt_object_control_experiment.recompute(nicknames)

gpt_df = gpt_object_control_experiment.format_results()
# Writes to the same file that was recovered above.
gpt_df.to_csv("../results/gpt_object_control_swap_names.csv")


In [None]:
# Print only the 'total' entry of the accuracy report for the GPT-3 results.
print(accuracy_report(gpt_df)['total'])

## GPT Neo 2.7

In [9]:
import os

# Set the Hugging Face cache location BEFORE importing anything that may pull
# in `transformers`: the library resolves its cache directory from this
# variable at import time, so assigning it afterwards has no effect.
# NOTE(review): assumes hf_tools.hf imports transformers at module load — confirm.
os.environ['TRANSFORMERS_CACHE'] = "/brtx/601-nvme1/estengel/.cache"

from hf_tools.hf import HuggingfaceRunFxn

# GPT-Neo 2.7B through the local Hugging Face wrapper, on CPU.
wrapper_fxn = HuggingfaceRunFxn("EleutherAI/gpt-neo-2.7B", device="cpu", constrained=False)
# wrapper_fxn=None

gptneo_object_control_experiment  = Experiment("gpt-neo-2.7b", "object-control", FixedGPTPrompt, wrapper_fxn, 1, None)
# gptneo_object_control_experiment.recover("../results/gpt_neo_2.7b_object_control.csv")

# Full run with overwrite=True, followed by a recompute with the nickname table.
gptneo_object_control_experiment.run(names, correct_index, verbs, actions, do_swap = True, nicknames=nicknames, rate_limit_delay=None, overwrite=True)
gptneo_object_control_experiment.recompute(nicknames)
gptneo_df = gptneo_object_control_experiment.format_results()

gptneo_df.to_csv("../results/gpt_neo_2.7b_object_control.csv")

# print(accuracy_report(gptneo_df)['total'])
# old_gptneo_df = pd.read_csv("../results/gpt_neo_2.7b_object_control.csv")
# print(accuracy_report(old_gptneo_df)['total'])


 'You will be given a context and a question. Answer the question with either "Avery" or "Casey".\nContext: Avery told Casey to come.\n\nQuestion:  Who came, Avery or Casey?\nAnswer:  Avery.\n\nA:\n\n')
You will be given a context and a question. Answer the question with either "Avery" or "Casey".\nContext: Avery told Casey to come.\n\nQuestion:  Who came, Avery or Casey?\nAnswer:  Avery.\n\nA:\n\n
Casey
 'You will be given a context and a question. Answer the question with either "Avery" or "Casey".\nContext: Avery told Casey to go.\n\nQuestion:  Who went, Avery or Casey?\nAnswer:  Avery.\n\nA:\n\n')
You will be given a context and a question. Answer the question with either "Avery" or "Casey".\nContext: Avery told Casey to go.\n\nQuestion:  Who went, Avery or Casey?\nAnswer:  Avery.\n\nA:\n\n
Casey
 'You will be given a context and a question. Answer the question with either "Avery" or "Casey".\nContext: Avery told Casey to read.\n\nQuestion:  Who read, Avery or Casey?\nAnswer:  Aver

## T5 for QA

In [3]:
# T5-for-QA object control: recover saved results and recompute them.
from hf_tools.hf import HuggingfaceRunFxn

wrapper_fxn = HuggingfaceRunFxn("valhalla/t5-base-qa-qg-hl", "cuda:0")
# Set wrapper_fxn = None instead to skip constructing the wrapper.

t5_object_control_experiment = Experiment(
    "t5",
    "object-control",
    FixedT5Prompt,
    wrapper_fxn,
    1,
    None,
)
t5_object_control_experiment.recover("../results/t5_qa_object_control.csv")

# Model run is disabled; uncomment to regenerate rather than use the recovered CSV.
# t5_object_control_experiment.run(names, correct_index, verbs, actions, do_swap=False, nicknames=nicknames, rate_limit_delay=None, overwrite=False)

# NOTE(review): recompute(nicknames) presumably re-scores the recovered outputs — confirm in experiment.py.
t5_object_control_experiment.recompute(nicknames)

In [4]:
# Tabulate the recomputed T5 results.
t5_df = t5_object_control_experiment.format_results()

# Saving is disabled here, so the recovered CSV is left untouched.
# t5_df.to_csv("../results/t5_qa_object_control.csv")

# Load a separate, previously saved results file for comparison.
# NOTE(review): this reads t5_object_control.csv, not the t5_qa_... file recovered earlier — confirm intended.
old_t5_df = pd.read_csv("../results/t5_object_control.csv")

In [5]:
# Compare the 'total' accuracy entry of the recomputed results with the one
# from the previously saved CSV.
print(accuracy_report(t5_df)['total'])
print(accuracy_report(old_t5_df)['total'])

(1.0, 1500, 1.0, 1500)
(0.9593333333333334, 1500, 0.9965373961218836, 1444)


## T0 

In [7]:
import os

# Set the Hugging Face cache location BEFORE importing anything that may pull
# in `transformers`: the library resolves its cache directory from this
# variable at import time, so assigning it afterwards has no effect.
# NOTE(review): assumes hf_tools.hf imports transformers at module load — confirm.
os.environ['TRANSFORMERS_CACHE'] = "/brtx/601-nvme1/estengel/.cache"

from hf_tools.hf import HuggingfaceRunFxn

# T0pp through the local Hugging Face wrapper, on CPU.
wrapper_fxn = HuggingfaceRunFxn("bigscience/T0pp", device="cpu", constrained=False)

t0_object_control_experiment  = Experiment("t0", "object-control", FixedGPTPrompt, wrapper_fxn, 1, None)
# Reload saved results; the model run below is disabled.
t0_object_control_experiment.recover("../results/t0_object_control.csv")
# t0_object_control_experiment.run(names, correct_index, verbs, actions, do_swap = False, nicknames=nicknames, rate_limit_delay=None, overwrite=False)
t0_object_control_experiment.recompute(nicknames)

t0_df = t0_object_control_experiment.format_results()

# t0_df.to_csv("../results/t0_object_control.csv")
# Same file that was recovered above, read directly for comparison with the recomputed frame.
old_t0_df = pd.read_csv("../results/t0_object_control.csv")

print(accuracy_report(t0_df)['total'])
print(accuracy_report(old_t0_df)['total'])


100%|██████████| 30/30 [00:00<00:00, 125.09it/s]


 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Avery')
Avery
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')
Casey
Casey
 'Casey')


## Jurassic Large

In [None]:

# Jurassic-large object control: load the saved results.
# The commented-out code below runs the experiment through the AI21 API and
# writes the CSV; it is left disabled so this cell only reads the file.
# Paths are relative ("../results/...") to match the rest of the notebook
# instead of pointing at one machine's home directory.

# jurassic_kwargs = {"maxTokens": 2, "temperature": 0.0}
# jurassic_object_control_experiment  = Experiment("jurassic-large", "object-control", FixedGPTPrompt, run_ai21_prompt, 1, jurassic_kwargs)

# jurassic_object_control_experiment.run(names, correct_index, verbs, actions, nicknames=nicknames)

# jurassic_df = jurassic_object_control_experiment.format_results()

# jurassic_df.to_csv("../results/jurassic_object_control_swap_names.csv")

jurassic_df = pd.read_csv("../results/jurassic_object_control_swap_names.csv")

In [None]:
# Display the full accuracy report for the loaded Jurassic results (bare last expression is the cell output).
accuracy_report(jurassic_df)

In [None]:

# Jurassic-large object control with a ("Q", "A") question/answer pair.
# Results are written with a relative path ("../results/...") to match the rest
# of the notebook instead of pointing at one machine's home directory.
jurassic_kwargs = {"maxTokens": 2, "temperature": 0.0}
jurassic_object_control_experiment  = Experiment("jurassic-large", "object-control", FixedGPTPrompt, run_ai21_prompt, 1, jurassic_kwargs)
# NOTE(review): unlike the other runs in this notebook, `nicknames` is not passed here — confirm this is intended.
jurassic_object_control_experiment.run(names, correct_index, verbs, actions, qa_pair=("Q","A"))
# Rebinds `jurassic_df`, replacing the frame loaded from the non-QA CSV earlier in the notebook.
jurassic_df = jurassic_object_control_experiment.format_results()
jurassic_df.to_csv("../results/jurassic_object_control_qa_swap_names.csv")

# jurassic_df = pd.read_csv("../results/jurassic_object_control_qa_swap_names.csv")

In [None]:
# Display the full accuracy report for whatever `jurassic_df` currently holds.
accuracy_report(jurassic_df)

# Passives 

The passive form reverses both the linear order and the syntactic roles of the agent and patient. For example:

- Mary told Tom to wipe the counter 
- Tom was told by Mary to wipe the counter 

Now Tom is linearly further away from "wipe" and is also the grammatical subject of the matrix clause (though still the patient).

In [None]:
# Rebind the correct-answer index for all passive experiments below.
# NOTE(review): presumably 0 because the passive puts the patient first in the sentence — confirm against Experiment.run.
correct_index = 0 

## GPT3 Passive

In [None]:

# GPT-3 passive object control: run the experiment and save the results.
# Results are written with a relative path ("../results/...") to match the rest
# of the notebook instead of pointing at one machine's home directory.
gpt_kwargs = {"max_tokens": 2, "temperature": 0.0}
passive_gpt_object_control_experiment  = Experiment("gpt3", "object-control-passive", FixedPassiveGPTPrompt, run_gpt_prompt, 1, gpt_kwargs) 

passive_gpt_object_control_experiment.run(names, correct_index, verbs, actions, nicknames=nicknames)

passive_gpt_df = passive_gpt_object_control_experiment.format_results()

passive_gpt_df.to_csv("../results/gpt_passive_object_control_swap_names.csv")
# To reload the saved results instead of re-running:
# passive_gpt_df = pd.read_csv("../results/gpt_passive_object_control_swap_names.csv")



In [None]:
# Display the full accuracy report for the GPT-3 passive results.
accuracy_report(passive_gpt_df)


## T5 for QA Passive

In [None]:

# T5-for-QA passive object control, run through the local Hugging Face wrapper.
# An earlier, now-disabled version of this cell used run_t5_prompt with an
# absolute results path; it was replaced by the wrapper-based run below.
from hf_tools.hf import HuggingfaceRunFxn

wrapper_fxn = HuggingfaceRunFxn(
    "valhalla/t5-base-qa-qg-hl",
    device="cuda:1",
    constrained=False,
)

passive_t5_object_control_experiment = Experiment(
    "t5",
    "object-control-passive",
    FixedPassiveT5Prompt,
    wrapper_fxn,
    1,
    None,
)

# Full run with name swapping off and overwrite=True.
passive_t5_object_control_experiment.run(
    names,
    correct_index,
    verbs,
    actions,
    do_swap=False,
    nicknames=nicknames,
    rate_limit_delay=None,
    overwrite=True,
)

passive_t5_df = passive_t5_object_control_experiment.format_results()
passive_t5_df.to_csv("../results/t5_passive_object_control.csv")

In [None]:
# Display the full accuracy report for the T5 passive results.
accuracy_report(passive_t5_df)

## T0 Passive

In [None]:

import os

# Set the Hugging Face cache location BEFORE importing anything that may pull
# in `transformers`: the library resolves its cache directory from this
# variable at import time, so assigning it afterwards has no effect.
# NOTE(review): assumes hf_tools.hf imports transformers at module load — confirm.
os.environ['TRANSFORMERS_CACHE'] = "/brtx/601-nvme1/estengel/.cache"

from hf_tools.hf import HuggingfaceRunFxn

# T0pp through the local Hugging Face wrapper, on CPU.
wrapper_fxn = HuggingfaceRunFxn("bigscience/T0pp", device="cpu", constrained=False)

passive_t0_object_control_experiment  = Experiment("t0", "object-control-passive", FixedPassiveGPTPrompt, wrapper_fxn, 1, None)

# Full run with name swapping off and overwrite=True.
passive_t0_object_control_experiment.run(names, correct_index, verbs, actions, do_swap = False, nicknames=nicknames, rate_limit_delay=None, overwrite=True)

passive_t0_df = passive_t0_object_control_experiment.format_results()

passive_t0_df.to_csv("../results/t0_passive_object_control.csv")

# Bare last expression: the full report is shown as the cell output.
accuracy_report(passive_t0_df)



## GPT-Neo Passive

In [None]:
import os

# Set the Hugging Face cache location BEFORE importing anything that may pull
# in `transformers`: the library resolves its cache directory from this
# variable at import time, so assigning it afterwards has no effect.
# NOTE(review): assumes hf_tools.hf imports transformers at module load — confirm.
os.environ['TRANSFORMERS_CACHE'] = "/brtx/601-nvme1/estengel/.cache"

from hf_tools.hf import HuggingfaceRunFxn

# GPT-Neo 2.7B through the local Hugging Face wrapper, on the second GPU.
wrapper_fxn = HuggingfaceRunFxn("EleutherAI/gpt-neo-2.7B", device="cuda:1", constrained=False)

passive_gptneo_object_control_experiment  = Experiment("gpt-neo-2.7b", "object-control-passive", FixedPassiveGPTPrompt, wrapper_fxn, 1, None)

# Full run with name swapping off and overwrite=True.
passive_gptneo_object_control_experiment.run(names, correct_index, verbs, actions, do_swap = False, nicknames=nicknames, rate_limit_delay=None, overwrite=True)

passive_gptneo_df = passive_gptneo_object_control_experiment.format_results()

passive_gptneo_df.to_csv("../results/gpt_neo_2.7b_passive_object_control.csv")

# Bare last expression: the full report is shown as the cell output.
accuracy_report(passive_gptneo_df)


## Jurassic Large Passive

In [None]:

# Jurassic-large passive object control: run the experiment and save the results.
# Results are written with a relative path ("../results/...") to match the rest
# of the notebook instead of pointing at one machine's home directory.
jurassic_kwargs = {"maxTokens": 2, "temperature": 0.0}
passive_jurassic_object_control_experiment  = Experiment("jurassic-large", "object-control-passive", FixedPassiveGPTPrompt, run_ai21_prompt, 1, jurassic_kwargs)

passive_jurassic_object_control_experiment.run(names, correct_index, verbs, actions, nicknames=nicknames)
passive_jurassic_df = passive_jurassic_object_control_experiment.format_results()

passive_jurassic_df.to_csv("../results/jurassic_passive_object_control_swap_names.csv")

# To reload the saved passive results instead of re-running (same file as written above):
# passive_jurassic_df = pd.read_csv("../results/jurassic_passive_object_control_swap_names.csv")

In [None]:
# Display the full accuracy report for the Jurassic-large passive results.
accuracy_report(passive_jurassic_df)