In [1]:
from vis import recompute_agent_patient, do_mcnemar_agent_patient 
import pathlib 
import os 
import pandas as pd 
import numpy as np 

from agent_patient_experiment import AgentPatientExperiment
np.random.seed(12)
path_to_file = pathlib.Path("").absolute()
parent = path_to_file.parent


def get_df(condition, is_two = False, thresh=30):
    cos_df = pd.DataFrame(columns=["model", "num_prompts", "accuracy", "num_valid"], dtype=object)
    if is_two:
        two_affix = "_2"
    else:
        two_affix = "_1"
    for prefix in range(0,4):
        
        change_of_state_csvs = [parent.joinpath(f'agent_patient_results{two_affix}/gpt_{condition}_prefix_{prefix}.csv'),
                                parent.joinpath(f'agent_patient_results{two_affix}/gpt_neo_1.3b_{condition}{two_affix}_prefix_{prefix}.csv'),
                                parent.joinpath(f'agent_patient_results{two_affix}/gpt_neo_2.7b_{condition}{two_affix}_prefix_{prefix}.csv'),
                                parent.joinpath(f'agent_patient_results{two_affix}/gpt_j_{condition}{two_affix}_prefix_{prefix}.csv'),
                                parent.joinpath(f'agent_patient_results{two_affix}/jurassic_{condition}_prefix_{prefix}.csv'),
                                parent.joinpath(f'agent_patient_results{two_affix}/jurassic_jumbo_{condition}_prefix_{prefix}.csv'),
                                # parent.joinpath(f'agent_patient_results{two_affix}/t5_{condition}{two_affix}_prefix_{prefix}.csv'),
                                parent.joinpath(f'agent_patient_results{two_affix}/t0_{condition}{two_affix}_prefix_{prefix}.csv') ]
        names = ["gpt", "gpt-neo-1.3b", "gpt-neo-2.7b", "gpt-j", "jurassic-large", "jurassic-jumbo", "t0"]
        prompt_files = [parent.joinpath(f"data/agent_patient/{condition}{two_affix}_prefix_{prefix}.json") for i in range(len(change_of_state_csvs))]
        cos_acc = recompute_agent_patient(change_of_state_csvs, prompt_files, names)

        for model in names:
            try:
                __, __, acc, num = cos_acc[model]
            except KeyError:
                acc, num = -1.0, 0
            if model in cos_df['model'].values:
                curr_acc = cos_df[cos_df['model'] == model]['accuracy'].values[0]
                curr_num = cos_df[cos_df['model'] == model]['num_valid'].values[0]

                curr_weighted = curr_acc * curr_num
                if num > thresh:
                    # if acc > curr_acc:
                    weighted = acc * num
                    if weighted > curr_weighted:
                        cos_df.loc[cos_df['model'] == model, 'num_prompts'] = prefix
                        cos_df.loc[cos_df['model'] == model, 'accuracy'] = acc
                        cos_df.loc[cos_df['model'] == model, 'num_valid'] = num 
            else:
                if num > thresh: 
                    cos_df = cos_df.append({"model": model, "num_prompts": prefix, "accuracy": acc, "num_valid": num}, ignore_index=True)
                else:
                    cos_df = cos_df.append({"model": model, "num_prompts": prefix, "accuracy": -1.0, "num_valid": 0}, ignore_index=True)
        # print(prefix, cos_acc)
    # print(cos_df)
    return cos_df

In [10]:





def run_mcnemar_full(condition):
    models_levels_prefixes = []
    for model in ["gpt", "gpt_neo_1.3b", "gpt_neo_2.7b", "gpt_j", "jurassic", "jurassic_jumbo", "t0"]: 
        for level in range(4): 
            for aff in [1,2]: 
                models_levels_prefixes.append((model, level, aff))

    models_levels_prefixes.append(("random-yes", 0, 0))
    models_levels_prefixes.append(("random-no", 0, 0))
    models_levels_prefixes.append(("random-random", 0, 0))

    results = []
    done = []
    for model1, level1, two_affix1 in models_levels_prefixes: 
        if "random" in model1:
            continue
        for model2, level2, two_affix2 in models_levels_prefixes: 
            if model1 == model2 or ((model1, model2, level1, level2, two_affix1, two_affix2) in done) or ((model2,model1, level2, level1, two_affix2, two_affix1) in done): 
                continue
            pval, stat, table = do_mcnemar_agent_patient(model1, model2, level1, level2, f"_{two_affix1}", f"_{two_affix2}", condition=condition)
            try:
                model1_acc = len(table[model1]['correct'])/(len(table[model1]['correct']) + len(table[model1]['incorrect']))
            except ZeroDivisionError:
                model1_acc = -1
            try:
                model2_acc = len(table[model2]['correct'])/(len(table[model2]['correct']) + len(table[model2]['incorrect']))
            except ZeroDivisionError:
                model2_acc = -1
            results.append((model1, level1, two_affix1, model2, level2, two_affix2, f"{model1_acc:.2f}", f"{model2_acc:.2f}", pval, stat))  
            done.append((model1, model2, level1, level2, two_affix1, two_affix2))

    better_than_random = [x[0] for x in results if x[3] == "random-random" and x[-2] < 0.05]

    for x in results:
        if x[0] in better_than_random and x[-2] < 0.05:
            print(x)
    return results



In [32]:
import re
def combine_dfs(df1, df2, sig_res): 
    df_final = df1.copy()
    rand_lookup = {f'{x[0]}_{x[1]}_{x[2]}': x[-2] for x in sig_res if x[3] == "random-random"}
    print(rand_lookup)
    for l1, l2 in zip(df1.iterrows(), df2.iterrows()):
        __, l1 = l1
        __, l2 = l2
        model = re.sub("-", "_", l1['model']) 
        k1 = f"{model}_{l1['num_prompts']}_1"
        k2 = f"{model}_{l2['num_prompts']}_2"
        try:
            pval1 = rand_lookup[k1]
        except KeyError:
            pval1=1.0
        try:
            pval2 = rand_lookup[k2]
        except KeyError:
            pval2=1.0

        if pval1 < 0.05 and pval2 < 0.05:
            if l1['accuracy'] > l2['accuracy']: 
                df_final.loc[df_final['model'] == l1['model'], 'accuracy'] = l1['accuracy']
                df_final.loc[df_final['model'] == l1['model'], 'num_prompts'] = l1['num_prompts']
                df_final.loc[df_final['model'] == l1['model'], 'num_valid'] = l1['num_valid']
            else:
                df_final.loc[df_final['model'] == l2['model'], 'accuracy'] = l2['accuracy']
                df_final.loc[df_final['model'] == l2['model'], 'num_prompts'] = l2['num_prompts']
                df_final.loc[df_final['model'] == l2['model'], 'num_valid'] = l2['num_valid']
        elif pval1 < 0.05:
            df_final.loc[df_final['model'] == l1['model'], 'accuracy'] = l1['accuracy']
            df_final.loc[df_final['model'] == l1['model'], 'num_prompts'] = l1['num_prompts']
            df_final.loc[df_final['model'] == l1['model'], 'num_valid'] = l1['num_valid']
        elif pval2 < 0.05:
            df_final.loc[df_final['model'] == l2['model'], 'accuracy'] = l2['accuracy']
            df_final.loc[df_final['model'] == l2['model'], 'num_prompts'] = l2['num_prompts']
            df_final.loc[df_final['model'] == l2['model'], 'num_valid'] = l2['num_valid']
        else:
            df_final.loc[df_final['model'] == l2['model'], 'accuracy'] = -1.0
            df_final.loc[df_final['model'] == l2['model'], 'num_prompts'] = 0
            df_final.loc[df_final['model'] == l2['model'], 'num_valid'] = 0
    return df_final 

In [33]:
# if False: 
# print(f"change of state")
# cos_df1 = get_df("change_of_state")
# print(cos_df1)
# cos_df2 = get_df("change_of_state", is_two=True)
# print(cos_df2)

# cos_results = run_mcnemar_full("change_of_state")



df_final = combine_dfs(cos_df1, cos_df2, cos_results)
print(df_final)


{'gpt_0_1': 0.8974218269914307, 'gpt_0_2': 0.8974218269914307, 'gpt_1_1': 0.0463534743329112, 'gpt_1_2': 0.0463534743329112, 'gpt_2_1': 0.10478948242660333, 'gpt_2_2': 0.10478948242660333, 'gpt_3_1': 0.05806414679240905, 'gpt_3_2': 0.05806414679240905, 'gpt_neo_1.3b_0_1': 0.34888887944907765, 'gpt_neo_1.3b_0_2': 0.001312255859375, 'gpt_neo_1.3b_1_1': 0.765991824244793, 'gpt_neo_1.3b_1_2': 0.8388197422027588, 'gpt_neo_1.3b_2_1': 0.5600646295802106, 'gpt_neo_1.3b_2_2': 0.4050322461407632, 'gpt_neo_1.3b_3_1': 0.47312965989112854, 'gpt_neo_1.3b_3_2': 0.8555355519056321, 'gpt_neo_2.7b_0_1': 0.11727520595741225, 'gpt_neo_2.7b_0_2': 1.0, 'gpt_neo_2.7b_1_1': 0.15158963203430176, 'gpt_neo_2.7b_1_2': 0.454498291015625, 'gpt_neo_2.7b_2_1': 0.21875, 'gpt_neo_2.7b_2_2': 1.0, 'gpt_neo_2.7b_3_1': 1.0, 'gpt_neo_2.7b_3_2': 1.0, 'gpt_j_0_1': 0.453125, 'gpt_j_0_2': 0.5234670639038086, 'gpt_j_1_1': 0.75390625, 'gpt_j_1_2': 0.8145294189453125, 'gpt_j_2_1': 1.0, 'gpt_j_2_2': 0.453125, 'gpt_j_3_1': 1.0, 'gpt

In [14]:
cos_results = run_mcnemar_full("change_of_state")

('gpt', 1, 1, 'gpt_neo_2.7b', 0, 2, '0.61', '0.45', 0.011351591436778108, 14.0)
('gpt', 1, 1, 't0', 0, 1, '0.61', '0.47', 0.03961701489849968, 22.0)
('gpt', 1, 1, 't0', 3, 1, '0.61', '0.46', 0.0327657590988232, 23.0)
('gpt', 1, 1, 'random-no', 0, 0, '0.61', '0.47', 0.0463534743329112, 24.0)
('gpt', 1, 1, 'random-random', 0, 0, '0.61', '0.47', 0.0463534743329112, 24.0)
('gpt', 1, 2, 'gpt_neo_2.7b', 0, 2, '0.61', '0.45', 0.011351591436778108, 14.0)
('gpt', 1, 2, 't0', 0, 1, '0.61', '0.47', 0.03961701489849968, 22.0)
('gpt', 1, 2, 't0', 3, 1, '0.61', '0.46', 0.0327657590988232, 23.0)
('gpt', 1, 2, 'random-no', 0, 0, '0.61', '0.47', 0.0463534743329112, 24.0)
('gpt', 1, 2, 'random-random', 0, 0, '0.61', '0.47', 0.0463534743329112, 24.0)
('gpt', 3, 1, 'gpt_neo_1.3b', 2, 2, '0.59', '0.49', 0.03355243967962451, 12.0)
('gpt', 3, 1, 'gpt_neo_1.3b', 3, 2, '0.59', '0.43', 0.0241195447742939, 8.0)
('gpt', 3, 1, 'jurassic_jumbo', 1, 1, '0.59', '0.48', 0.010673840530216694, 8.0)
('gpt', 3, 1, 'jurass

In [30]:
# if False:
print(f"volition")
vol_df1 = get_df("volition")
print(vol_df1)
vol_df2 = get_df("volition", is_two=True)
print(vol_df2)

vol_results = run_mcnemar_full("volition")





volition
            model num_prompts  accuracy num_valid
0             gpt           2  0.696429       168
1    gpt-neo-1.3b           0  0.622222        45
2    gpt-neo-2.7b           1  0.568421        95
3           gpt-j           0  0.693694       111
4  jurassic-large           0  0.577922       154
5  jurassic-jumbo           1  0.410256       117
6              t0           0  0.595238       168
            model num_prompts  accuracy num_valid
0             gpt           3  0.767857       168
1    gpt-neo-1.3b           0 -1.000000         0
2    gpt-neo-2.7b           0  0.538462        65
3           gpt-j           0  0.565217        69
4  jurassic-large           0  0.574074       162
5  jurassic-jumbo           1  0.472603       146
6              t0           0  0.500000       168
('gpt', 0, 1, 'gpt_j', 0, 1, '0.54', '0.69', 0.047940324071666984, 21.0)
('gpt', 0, 2, 'gpt_j', 0, 1, '0.54', '0.69', 0.047940324071666984, 21.0)
('gpt', 1, 1, 't0', 1, 1, '0.57', '0.43', 0.0

In [34]:
df_final = combine_dfs(vol_df1, vol_df2, vol_results)
print(df_final)


{'gpt_0_1': 0.07254953219246177, 'gpt_0_2': 0.07254953219246177, 'gpt_1_1': 0.01918660866014698, 'gpt_1_2': 0.01918660866014698, 'gpt_2_1': 1.9671283608760123e-08, 'gpt_2_2': 1.9671283608760123e-08, 'gpt_3_1': 1.1934080595966506e-09, 'gpt_3_2': 1.1934080595966506e-09, 'gpt_neo_1.3b_0_1': 0.1351564508456704, 'gpt_neo_1.3b_0_2': 0.803619384765625, 'gpt_neo_1.3b_1_1': 0.043285250663757324, 'gpt_neo_1.3b_1_2': 0.0654296875, 'gpt_neo_1.3b_2_1': 0.1849333420395851, 'gpt_neo_1.3b_2_2': 0.7744140625, 'gpt_neo_1.3b_3_1': 0.143463134765625, 'gpt_neo_1.3b_3_2': 0.03515625, 'gpt_neo_2.7b_0_1': 1.0, 'gpt_neo_2.7b_0_2': 0.7283324808813632, 'gpt_neo_2.7b_1_1': 0.21805368187049962, 'gpt_neo_2.7b_1_2': 0.07075554598122835, 'gpt_neo_2.7b_2_1': 0.5078125, 'gpt_neo_2.7b_2_2': 1.0, 'gpt_neo_2.7b_3_1': 0.002599477767944336, 'gpt_neo_2.7b_3_2': 1.0, 'gpt_j_0_1': 2.568720337924735e-05, 'gpt_j_0_2': 0.3367836351899315, 'gpt_j_1_1': 1.0, 'gpt_j_1_2': 1.0, 'gpt_j_2_1': 1.0, 'gpt_j_2_2': 1.0, 'gpt_j_3_1': 1.0, 'g