In [1]:
import pickle
import copy
from experiments import *
from utils import *

## Definitions

In [2]:
# Long single-run scenario identifiers, named once so both dicts below share them.
_HELLASWAG = 'commonsense:dataset=hellaswag,method=multiple_choice_separate_original,'
_OPENBOOKQA = 'commonsense:dataset=openbookqa,method=multiple_choice_separate_calibrated,'
_MSMARCO_REGULAR = 'msmarco:track=regular,valid_topk=30,'
_MSMARCO_TREC = 'msmarco:track=trec,valid_topk=30,'
_NQ_CLOSEDBOOK = 'natural_qa:mode=closedbook,'
_NQ_OPENBOOK = 'natural_qa:mode=openbook_longans,'
_TRUTHFUL_QA = 'truthful_qa:task=mc_single,method=multiple_choice_joint,'

# Sub-task names for the scenarios that group several sub-scenarios under one key.
_civil_comments_demographics = ['LGBTQ', 'all', 'black', 'christian', 'female',
                                'male', 'muslim', 'other_religions', 'white']
_mmlu_subjects = ['abstract_algebra', 'college_chemistry', 'computer_security',
                  'econometrics', 'us_foreign_policy']
_raft_subsets = ['ade_corpus_v2', 'banking_77', 'neurips_impact_statement_risks',
                 'one_stop_english', 'overruling', 'semiconductor_org_types',
                 'systematic_review_inclusion', 'tai_safety_research',
                 'terms_of_service', 'tweet_eval_hate', 'twitter_complaints']

# Maps each scenario label to the list of sub-scenario identifiers it groups
# (the identifiers are used further down as keys into data['data']).
# The '_MSMARCO_TREC' scenario is not included here, although
# scenarios_metrics below still lists a metric for it.
scenarios = {
    'cnn': ['summarization_cnndm:temperature=0.3,device=cuda,'],
    'xsum': ['summarization_xsum:temperature=0.3,device=cuda,'],
    'boolq:': ['boolq:'],
    'civil_comments': [f'civil_comments:demographic={group},'
                       for group in _civil_comments_demographics],
    _HELLASWAG: [_HELLASWAG],
    _OPENBOOKQA: [_OPENBOOKQA],
    'imdb:': ['imdb:'],
    'mmlu': [f'mmlu:subject={subject},method=multiple_choice_joint,'
             for subject in _mmlu_subjects],
    _MSMARCO_REGULAR: [_MSMARCO_REGULAR],
    'narrative_qa:': ['narrative_qa:'],
    _NQ_CLOSEDBOOK: [_NQ_CLOSEDBOOK],
    _NQ_OPENBOOK: [_NQ_OPENBOOK],
    'quac:': ['quac:'],
    'raft': [f'raft:subset={subset},' for subset in _raft_subsets],
    _TRUTHFUL_QA: [_TRUTHFUL_QA],
}

# Name of the metric associated with each scenario label.
# 'cnn', 'xsum' and 'civil_comments' have no entry here.
scenarios_metrics = {
    'boolq:': 'em',
    _HELLASWAG: 'em',
    _OPENBOOKQA: 'em',
    'imdb:': 'em',
    'mmlu': 'em',
    _MSMARCO_REGULAR: 'RR@10',
    _MSMARCO_TREC: 'NDCG@10',
    'narrative_qa:': 'f1',
    _NQ_CLOSEDBOOK: 'f1',
    _NQ_OPENBOOK: 'f1',
    'quac:': 'f1',
    'raft': 'em',
    _TRUTHFUL_QA: 'em',
}

## Data

Loading data

In [3]:
# Read the pickled HELM data from disk; later cells index it via data['models'] and data['data'].
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open('data/helm.pickle', mode='rb') as data_file:
    data = pickle.load(data_file)

## Results

In [4]:
# Run configuration passed to evaluate_scenarios further down.
device = 'cuda'
iterations = 3
Ds = [5, 10, 15]  # presumably the candidate values tried in the "choosing optimal D" step -- TODO confirm

# Groups of model row indices, one group per provider family.
set_of_rows = [
    list(range(0, 4)),    # ai21
    list(range(5, 12)),   # cohere
    [4, 12, 13],          # anthropic+microsoft
    list(range(14, 23)),  # openai
    list(range(23, 28)),  # together
]
set_of_rows

[[0, 1, 2, 3],
 [5, 6, 7, 8, 9, 10, 11],
 [4, 12, 13],
 [14, 15, 16, 17, 18, 19, 20, 21, 22],
 [23, 24, 25, 26, 27]]

In [5]:
# Show the model names; their list positions appear to be the row indices grouped by provider in set_of_rows.
data['models']

['ai21_j1-grande',
 'ai21_j1-grande-v2-beta',
 'ai21_j1-jumbo',
 'ai21_j1-large',
 'anthropic_stanford-online-all-v4-s3',
 'cohere_command-medium-beta',
 'cohere_command-xlarge-beta',
 'cohere_large-20220720',
 'cohere_medium-20220720',
 'cohere_medium-20221108',
 'cohere_xlarge-20220609',
 'cohere_xlarge-20221108',
 'microsoft_TNLGv2_530B',
 'microsoft_TNLGv2_7B',
 'openai_ada',
 'openai_babbage',
 'openai_curie',
 'openai_davinci',
 'openai_text-ada-001',
 'openai_text-babbage-001',
 'openai_text-curie-001',
 'openai_text-davinci-002',
 'openai_text-davinci-003',
 'together_bloom',
 'together_gpt-j-6b',
 'together_gpt-neox-20b',
 'together_opt-175b',
 'together_opt-66b']

### Predicting accuracy

Full (one IRT model for all scenarios)

In [None]:
# 'full' setting: every scenario is evaluated at once (one IRT model for all scenarios).
scenario_name = 'full'
chosen_scenarios = list(scenarios)

# Switches for the sampling strategies; 'disc_sampling' is turned off for this run.
sampling = {
    'random_sampling': True,
    'anchor_sampling': True,
    'anchor-irt_sampling': True,
    'disc_sampling': False,
}

# Long-running call (the recorded progress bars show tens of minutes per model group).
results_full, accs_full = evaluate_scenarios(
    data, scenario_name, chosen_scenarios, scenarios,
    set_of_rows, Ds, iterations, device,
    bench='irt_helm', sampling=sampling,
)


Evaluating models [0, 1, 2, 3]

i) choosing optimal D


100%|████████████████████████████████████████████| 3/3 [11:49<00:00, 236.58s/it]


- opt D= 5 errors= [0.044579911983804656, 0.0453168327810513, 0.047692893828327226] 


ii) choosing optimal lambdas


100%|████████████████████████████████████████| 15/15 [00:00<00:00, 54424.36it/s]


{'random_gpirt': {'cnn': {10: 0.5810980065561022, 25: 0.776185364418146, 50: 0.8739913974827889, 75: 0.9123110788518177, 100: 0.9327592417518723}, 'xsum': {10: 0.12214799049569504, 25: 0.2580834196106841, 50: 0.4102802971372891, 75: 0.5106629684828556, 100: 0.5818422025325208}, 'boolq:': {10: 0.10161491521841513, 25: 0.22043765726512518, 50: 0.36124361773481034, 75: 0.45896613338189834, 100: 0.5307552785238266}, 'civil_comments': {10: 0.03457890877432671, 25: 0.0821844966189413, 50: 0.15188629457492606, 75: 0.2117485876802738, 100: 0.26371751324808634}, 'commonsense:dataset=hellaswag,method=multiple_choice_separate_original,': {10: 0.21291920659048225, 25: 0.40344593416830005, 50: 0.5749361971786783, 75: 0.6698451765235517, 100: 0.7301072871505678}, 'commonsense:dataset=openbookqa,method=multiple_choice_separate_calibrated,': {10: 0.03920765730939187, 25: 0.09257468829234465, 50: 0.1694615284142004, 75: 0.23433675987525454, 100: 0.28981120677649247}, 'imdb:': {10: 0.2857569388647762, 2

100%|████████████████████████████████████████████| 5/5 [21:58<00:00, 263.69s/it]



iv) running random eval


100%|████████████████████████████████████████████| 5/5 [11:00<00:00, 132.07s/it]



v) running anchor points with IRT embeddings


100%|████████████████████████████████████████████| 5/5 [20:34<00:00, 246.84s/it]



Evaluating models [5, 6, 7, 8, 9, 10, 11]

i) choosing optimal D


100%|████████████████████████████████████████████| 3/3 [07:42<00:00, 154.23s/it]


- opt D= 5 errors= [0.05381872300941188, 0.05182277303935221, 0.0543264667069326] 


ii) choosing optimal lambdas


100%|████████████████████████████████████████| 15/15 [00:00<00:00, 68237.05it/s]


{'random_gpirt': {'cnn': {10: 0.6800790037019013, 25: 0.8416325599618499, 50: 0.9140070372987809, 75: 0.940979577879858, 100: 0.9550717625246664}, 'xsum': {10: 0.14586503589811217, 25: 0.29919865575231325, 50: 0.46058954021786525, 75: 0.561559983113336, 100: 0.6306899064184214}, 'boolq:': {10: 0.1425382279775365, 25: 0.2935767132258238, 50: 0.4538991931815538, 75: 0.5549117842038082, 100: 0.6243888094996332}, 'civil_comments': {10: 0.03905972098434126, 25: 0.09224472276087652, 50: 0.1689085254222309, 75: 0.23363160332824376, 100: 0.2890021276236617}, 'commonsense:dataset=hellaswag,method=multiple_choice_separate_original,': {10: 0.19321110116989337, 25: 0.3744933454437555, 50: 0.5449183827410241, 75: 0.6423605406403432, 100: 0.705433230426347}, 'commonsense:dataset=openbookqa,method=multiple_choice_separate_calibrated,': {10: 0.022525857650832985, 25: 0.05447403266873823, 50: 0.10331981818626952, 75: 0.1473667731738924, 100: 0.18728897366516153}, 'imdb:': {10: 0.2761478371514502, 25: 0

100%|████████████████████████████████████████████| 5/5 [31:33<00:00, 378.67s/it]



iv) running random eval


100%|████████████████████████████████████████████| 5/5 [19:00<00:00, 228.08s/it]



v) running anchor points with IRT embeddings


100%|████████████████████████████████████████████| 5/5 [28:47<00:00, 345.49s/it]



Evaluating models [4, 12, 13]

i) choosing optimal D


100%|████████████████████████████████████████████| 3/3 [05:34<00:00, 111.42s/it]


- opt D= 10 errors= [0.04014776870666264, 0.03758734187511835, 0.03701439713524953] 


ii) choosing optimal lambdas


100%|████████████████████████████████████████| 15/15 [00:00<00:00, 12818.78it/s]


{'random_gpirt': {'cnn': {10: 0.2715794190775654, 25: 0.4824239312122899, 50: 0.6508582613311908, 75: 0.7365821147348108, 100: 0.788508955101164}, 'xsum': {10: 0.2833470942934764, 25: 0.49709296492559607, 50: 0.6640776178522768, 75: 0.747813367075591, 100: 0.7981329845771992}, 'boolq:': {10: 0.14899051660003776, 25: 0.30443859597020734, 50: 0.4667733642820862, 75: 0.5676727798031007, 100: 0.6364628314757391}, 'civil_comments': {10: 0.008348902948679956, 25: 0.020614099695106605, 50: 0.04039548287891528, 75: 0.05939360758913104, 100: 0.07765409124448619}, 'commonsense:dataset=hellaswag,method=multiple_choice_separate_original,': {10: 0.0270920533280044, 25: 0.06508519592184368, 50: 0.12221594323355832, 75: 0.1727665042144695, 100: 0.2178118105886194}, 'commonsense:dataset=openbookqa,method=multiple_choice_separate_calibrated,': {10: 0.017040616643927916, 25: 0.04153974724732871, 50: 0.07976603361919415, 75: 0.11506010627606876, 100: 0.14774688429831753}, 'imdb:': {10: 0.3651840890700465

100%|████████████████████████████████████████████| 5/5 [18:18<00:00, 219.63s/it]



iv) running random eval


100%|█████████████████████████████████████████████| 5/5 [07:39<00:00, 91.93s/it]



v) running anchor points with IRT embeddings


100%|████████████████████████████████████████████| 5/5 [17:22<00:00, 208.50s/it]



Evaluating models [14, 15, 16, 17, 18, 19, 20, 21, 22]

i) choosing optimal D


100%|████████████████████████████████████████████| 3/3 [05:47<00:00, 115.95s/it]


- opt D= 5 errors= [0.045810498090451464, 0.04502040451789275, 0.046899226917154604] 


ii) choosing optimal lambdas


100%|████████████████████████████████████████| 15/15 [00:00<00:00, 87624.74it/s]


{'random_gpirt': {'cnn': {10: 0.7893264856506694, 25: 0.9035373144834618, 50: 0.9493245103300146, 75: 0.9656358671333084, 100: 0.9740035641056983}, 'xsum': {10: 0.7145113195727499, 25: 0.8622003906464604, 50: 0.9260017288978751, 75: 0.9494202136852479, 100: 0.9615793329819754}, 'boolq:': {10: 0.10702135504182779, 25: 0.23054373347519364, 50: 0.3747022185454754, 75: 0.4733674171260726, 100: 0.5451394687380875}, 'civil_comments': {10: 0.0040268472962810995, 25: 0.010006675211174428, 50: 0.0198150674778109, 75: 0.029431012467722242, 100: 0.03886011907397521}, 'commonsense:dataset=hellaswag,method=multiple_choice_separate_original,': {10: 0.041670195611746445, 25: 0.09804703061222278, 50: 0.17858439188630393, 75: 0.24591802716214067, 100: 0.30304896809380394}, 'commonsense:dataset=openbookqa,method=multiple_choice_separate_calibrated,': {10: 0.01387779422219853, 25: 0.03398698889825317, 50: 0.06573968388996347, 75: 0.09547139613376172, 100: 0.12336912077818626}, 'imdb:': {10: 0.27637074250

100%|████████████████████████████████████████████| 5/5 [36:42<00:00, 440.44s/it]



iv) running random eval


100%|████████████████████████████████████████████| 5/5 [24:04<00:00, 288.98s/it]



v) running anchor points with IRT embeddings


 20%|████████▊                                   | 1/5 [06:36<26:27, 396.81s/it]

In [None]:
# Persist both outputs of the full evaluation so later cells can reload them
# without re-running the long evaluation.
for out_path, payload in (('results/results_full_helm.pickle', results_full),
                          ('results/accs_full_helm.pickle', accs_full)):
    with open(out_path, 'wb') as out_file:
        pickle.dump(payload, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the saved evaluation results from disk.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open('results/results_full_helm.pickle', mode='rb') as results_file:
    results_full = pickle.load(results_file)

In [None]:
# Plot the 'anchor-irt_*' method variants with both plotting helpers.
methods = [
    'anchor-irt_naive',
    'anchor-irt_cirt',
    'anchor-irt_pirt',
    'anchor-irt_gpirt',
]
plot_results(results_full, scenarios.keys(), methods=methods)
plot_agg_results(results_full, scenarios.keys(), methods=methods)

In [None]:
# Plot the 'anchor_*' method variants with both plotting helpers.
methods = [
    'anchor_naive',
    'anchor_cirt',
    'anchor_pirt',
    'anchor_gpirt',
]
plot_results(results_full, scenarios.keys(), methods=methods)
plot_agg_results(results_full, scenarios.keys(), methods=methods)

In [None]:
# Plot the 'random_*' method variants with both plotting helpers.
methods = [
    'random_naive',
    'random_cirt',
    'random_pirt',
    'random_gpirt',
]
plot_results(results_full, scenarios.keys(), methods=methods)
plot_agg_results(results_full, scenarios.keys(), methods=methods)

In [None]:
import pickle

# Load the saved full-evaluation results under the name `results` for the scatter plots below.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open('results/results_full_helm.pickle', mode='rb') as results_file:
    results = pickle.load(results_file)

In [None]:
# One scatter plot per scenario: mean correctness ('acc') against the
# 'random_gpirt' error stored under key 100 of each results entry.
# NOTE(review): `acc` follows the column order of the correctness arrays while
# `err` follows the iteration order of results.keys(); confirm that both
# enumerate the models in the same order, otherwise the points are mispaired.
for sce, sub_scenarios in scenarios.items():
    # Stack the per-entry error arrays and average each one along axis 1.
    stacked_errors = np.stack(
        [results[m][100]['random_gpirt'][sce] for m in results.keys()]
    )
    err = stacked_errors.mean(axis=1)

    # Stack the correctness arrays of every sub-scenario and average over rows.
    stacked_correctness = np.vstack(
        [data['data'][s]['correctness'] for s in sub_scenarios]
    )
    acc = stacked_correctness.mean(axis=0)

    plt.plot(acc, err, 'bo')
    plt.xlabel('acc')
    plt.ylabel('error')
    plt.title(sce)
    # Save before show(), which may release the current figure.
    plt.savefig(
        f'plots/scenario-{sce}.png',
        bbox_inches='tight',
        dpi=300,
        transparent=False,
    )
    plt.show()