In [None]:
import pandas as pd 
import json
import sys
import pathlib 

# Resolve the current working directory (an empty Path made absolute) and put
# the two directories above it at the head of the import search path, so the
# local modules imported further down (experiment, api_tools, metrics) can
# presumably be found there.
path_to_notebook = pathlib.Path("").absolute()
sys.path[:0] = [
    str(path_to_notebook.parent.parent),  # ends up first on the search path
    str(path_to_notebook.parent),
]

from experiment import Experiment
from api_tools import (FixedGPTPrompt, 
                       FixedPassiveGPTPrompt, 
                       run_ai21_prompt, 
                       run_ai21_jumbo_prompt, 
                       run_gpt_prompt)

from metrics import accuracy_report


# Subject Control
Starting with the **subject** control experiments, we will look at how 4 LMs do:
1. GPT3
2. T5 for QA 
3. Jurassic large 
4. Jurassic Jumbo

## Experimental settings
We're looking at different name pairs to control for gender. We have 2 male-female name pairs, 1 male-male pair, 1 female-female pair, and 1 neutral-neutral pair. 
We're still looking at 5 different actions, but only one verb: promise. 

Later, we can use gender cues to coerce a subject-control reading for "suggested" and "proposed".

In [None]:
def _load_json(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as parsing finishes, rather than being left open until garbage collection.
    """
    with open(path) as json_file:
        return json.load(json_file)


# Inputs reused by the experiment cells in this notebook.
names = _load_json("../../data/names_top_2.json")
verbs = ["promised"]  # the only control verb used in these runs
# NOTE: the list of actions is stored in verbs.json.
actions = _load_json("../../data/verbs.json")
# Passed to Experiment.run as its second argument; presumably the position of
# the expected answer within a name pair (0 = first name) -- TODO confirm.
correct_index = 0

nicknames = _load_json("../../data/nicknames.json")


## GPT 3
 

In [None]:

# Options handed to Experiment as its last positional argument.
gpt_kwargs = dict(max_tokens=2, temperature=0.0)

# Build the GPT-3 subject-control experiment.
gpt_subject_control_experiment = Experiment(
    "gpt3",
    "subject-control",
    FixedGPTPrompt,
    run_gpt_prompt,
    1,  # fifth positional argument; its meaning is defined by Experiment (not visible here)
    gpt_kwargs,
)

# Run it over the shared names / verbs / actions, without swapping.
gpt_subject_control_experiment.run(
    names,
    correct_index,
    verbs,
    actions,
    nicknames=nicknames,
    do_swap=False,
)

In [None]:
# Collect the results of the GPT-3 run and save them as CSV.
# NOTE(review): gpt_df is presumably a pandas DataFrame (it exposes to_csv) -- confirm.
gpt_df = gpt_subject_control_experiment.format_results()
gpt_df.to_csv("../../short_instructions/results/gpt_subject_control.csv")

In [None]:
# Last expression of the cell, so the return value of accuracy_report (if any)
# is rendered as the cell output.
accuracy_report(gpt_df)

## Jurassic Large

In [9]:

# Options handed to Experiment as its last positional argument
# (note the camelCase "maxTokens" key used for this backend).
jurassic_kwargs = dict(maxTokens=2, temperature=0.0)

# Build the Jurassic Large subject-control experiment; it uses the same
# FixedGPTPrompt class as the GPT-3 run, paired with the AI21 runner.
jurassic_subject_control_experiment = Experiment(
    "jurassic-large",
    "subject-control",
    FixedGPTPrompt,
    run_ai21_prompt,
    1,  # fifth positional argument; its meaning is defined by Experiment (not visible here)
    jurassic_kwargs,
)

# Run it over the shared names / verbs / actions, without swapping.
jurassic_subject_control_experiment.run(
    names,
    correct_index,
    verbs,
    actions,
    nicknames=nicknames,
    do_swap=False,
)

100%|██████████| 30/30 [02:38<00:00,  5.29s/it]


In [11]:
# Collect the results of the Jurassic Large run and save them as CSV.
# NOTE(review): jurassic_df is presumably a pandas DataFrame (it exposes to_csv) -- confirm.
jurassic_df = jurassic_subject_control_experiment.format_results()
jurassic_df.to_csv("../../short_instructions/results/jurassic_subject_control.csv")

In [None]:
# Last expression of the cell, so the return value of accuracy_report (if any)
# is rendered as the cell output.
accuracy_report(jurassic_df)

## Jurassic Jumbo

In [None]:

# Same options as the Jurassic Large cell; re-declared here so this cell does
# not depend on that one having been run.
jurassic_kwargs = dict(maxTokens=2, temperature=0.0)

# Build the Jurassic Jumbo subject-control experiment.
jurassic_jumbo_subject_control_experiment = Experiment(
    "jurassic-jumbo",
    "subject-control",
    FixedGPTPrompt,
    run_ai21_jumbo_prompt,
    1,  # fifth positional argument; its meaning is defined by Experiment (not visible here)
    jurassic_kwargs,
)

# Run with rate limiting enabled. The two rate_limit_* values are passed
# straight to Experiment.run; presumably "pause 60 s after every 19 requests"
# -- TODO confirm against Experiment.run.
jurassic_jumbo_subject_control_experiment.run(
    names,
    correct_index,
    verbs,
    actions,
    nicknames=nicknames,
    do_swap=False,
    rate_limit_delay=60,
    rate_limit_count=19,
)

# Collect the results and save them as CSV.
jurassic_jumbo_df = jurassic_jumbo_subject_control_experiment.format_results()
jurassic_jumbo_df.to_csv(
    "../../short_instructions/results/jurassic_jumbo_subject_control.csv"
)


# Hacked prompts
## Starting with the baseline: agent and patient ID only

## GPT3


In [10]:

# Baseline: GPT-3 run with just_prompt_agent=True.
# NOTE(review): this cell rebinds gpt_subject_control_experiment and gpt_df,
# the same names used by the main GPT-3 cells, so those earlier objects are
# replaced in the kernel once this cell has run.
correct_index = 0
gpt_kwargs = dict(max_tokens=2, temperature=0.0)

gpt_subject_control_experiment = Experiment(
    "gpt3",
    "subject-control",
    FixedGPTPrompt,
    run_gpt_prompt,
    1,  # fifth positional argument; its meaning is defined by Experiment (not visible here)
    gpt_kwargs,
)
gpt_subject_control_experiment.run(
    names,
    correct_index,
    verbs,
    actions,
    nicknames=nicknames,
    do_swap=False,
    just_prompt_agent=True,
)

# Collect the results and save them under the agent-only results directory.
gpt_df = gpt_subject_control_experiment.format_results()
gpt_df.to_csv(
    "../../short_instructions/results_just_prompt_agent/gpt_subject_control.csv"
)

100%|██████████| 30/30 [02:39<00:00,  5.32s/it]


In [12]:

# Baseline: GPT-3 run with just_prompt_patient=True.
# NOTE(review): this rebinds the notebook-wide correct_index to 1; any other
# cell that reads correct_index and is (re-)run after this one will see 1
# rather than 0.
correct_index = 1
gpt_kwargs = dict(max_tokens=2, temperature=0.0)

gpt_subject_control_experiment2 = Experiment(
    "gpt3",
    "subject-control",
    FixedGPTPrompt,
    run_gpt_prompt,
    1,  # fifth positional argument; its meaning is defined by Experiment (not visible here)
    gpt_kwargs,
)
gpt_subject_control_experiment2.run(
    names,
    correct_index,
    verbs,
    actions,
    nicknames=nicknames,
    do_swap=False,
    just_prompt_patient=True,
)

# Collect the results and save them under the patient-only results directory.
gpt_df2 = gpt_subject_control_experiment2.format_results()
gpt_df2.to_csv(
    "../../short_instructions/results_just_prompt_patient/gpt_subject_control.csv"
)

100%|██████████| 30/30 [02:40<00:00,  5.35s/it]
