In [1]:
import pickle
import copy
from experiments import *
from utils import *

## Definitions

In [2]:
# Maps each scenario name to the list of run-spec strings that belong to it.
# Scenarios with several sub-scenarios (civil_comments, mmlu, raft) differ only
# in one field of the run spec, so their lists are generated from that field.
# Insertion order is kept as-is because later cells take list(scenarios.keys()).
scenarios = {
    'cnn': ['summarization_cnndm:temperature=0.3,device=cuda,'],
    'xsum': ['summarization_xsum:temperature=0.3,device=cuda,'],
    'boolq:': ['boolq:'],
    'civil_comments': [
        f'civil_comments:demographic={group},'
        for group in ('LGBTQ', 'all', 'black', 'christian', 'female',
                      'male', 'muslim', 'other_religions', 'white')
    ],
    'commonsense:dataset=hellaswag,method=multiple_choice_separate_original,': [
        'commonsense:dataset=hellaswag,method=multiple_choice_separate_original,'
    ],
    'commonsense:dataset=openbookqa,method=multiple_choice_separate_calibrated,': [
        'commonsense:dataset=openbookqa,method=multiple_choice_separate_calibrated,'
    ],
    'imdb:': ['imdb:'],
    'mmlu': [
        f'mmlu:subject={subject},method=multiple_choice_joint,'
        for subject in ('abstract_algebra', 'college_chemistry', 'computer_security',
                        'econometrics', 'us_foreign_policy')
    ],
    'msmarco:track=regular,valid_topk=30,': ['msmarco:track=regular,valid_topk=30,'],
    # The 'msmarco:track=trec,valid_topk=30,' scenario is intentionally left out here.
    'narrative_qa:': ['narrative_qa:'],
    'natural_qa:mode=closedbook,': ['natural_qa:mode=closedbook,'],
    'natural_qa:mode=openbook_longans,': ['natural_qa:mode=openbook_longans,'],
    'quac:': ['quac:'],
    'raft': [
        f'raft:subset={subset},'
        for subset in ('ade_corpus_v2', 'banking_77', 'neurips_impact_statement_risks',
                       'one_stop_english', 'overruling', 'semiconductor_org_types',
                       'systematic_review_inclusion', 'tai_safety_research',
                       'terms_of_service', 'tweet_eval_hate', 'twitter_complaints')
    ],
    'truthful_qa:task=mc_single,method=multiple_choice_joint,': [
        'truthful_qa:task=mc_single,method=multiple_choice_joint,'
    ],
}

# Maps a scenario name to the short name of its metric. Consecutive scenarios
# sharing a metric are grouped; the resulting key order equals the original one.
scenarios_metrics = {
    **dict.fromkeys(
        ['boolq:',
         'commonsense:dataset=hellaswag,method=multiple_choice_separate_original,',
         'commonsense:dataset=openbookqa,method=multiple_choice_separate_calibrated,',
         'imdb:',
         'mmlu'],
        'em',
    ),
    'msmarco:track=regular,valid_topk=30,': 'RR@10',
    'msmarco:track=trec,valid_topk=30,': 'NDCG@10',
    **dict.fromkeys(
        ['narrative_qa:',
         'natural_qa:mode=closedbook,',
         'natural_qa:mode=openbook_longans,',
         'quac:'],
        'f1',
    ),
    **dict.fromkeys(
        ['raft',
         'truthful_qa:task=mc_single,method=multiple_choice_joint,'],
        'em',
    ),
}

## Data

Loading data

In [3]:
# Read the pickled HELM data into `data`.
# NOTE(review): pickle.load executes arbitrary code from the file; only load trusted files.
with open('data/helm.pickle', 'rb') as helm_file:
    data = pickle.load(helm_file)

## Results

In [4]:
# Run configuration handed to evaluate_scenarios further down.
device = 'cuda'
iterations = 10
Ds = list(range(5, 25, 5))  # 5, 10, 15, 20

# Groups of row indices, one group per model provider (indices follow the
# order of data['models']); contiguous runs are spelled as ranges.
set_of_rows = [
    list(range(0, 4)),    # ai21
    list(range(5, 12)),   # cohere
    [4, 12, 13],          # anthropic + microsoft
    list(range(14, 23)),  # openai
    list(range(23, 28)),  # together
]
set_of_rows

[[0, 1, 2, 3],
 [5, 6, 7, 8, 9, 10, 11],
 [4, 12, 13],
 [14, 15, 16, 17, 18, 19, 20, 21, 22],
 [23, 24, 25, 26, 27]]

In [5]:
# Display the list stored under data['models'] (model names, in row order — presumably the
# order the indices in set_of_rows refer to; confirm against evaluate_scenarios).
data['models']

['ai21_j1-grande',
 'ai21_j1-grande-v2-beta',
 'ai21_j1-jumbo',
 'ai21_j1-large',
 'anthropic_stanford-online-all-v4-s3',
 'cohere_command-medium-beta',
 'cohere_command-xlarge-beta',
 'cohere_large-20220720',
 'cohere_medium-20220720',
 'cohere_medium-20221108',
 'cohere_xlarge-20220609',
 'cohere_xlarge-20221108',
 'microsoft_TNLGv2_530B',
 'microsoft_TNLGv2_7B',
 'openai_ada',
 'openai_babbage',
 'openai_curie',
 'openai_davinci',
 'openai_text-ada-001',
 'openai_text-babbage-001',
 'openai_text-curie-001',
 'openai_text-davinci-002',
 'openai_text-davinci-003',
 'together_bloom',
 'together_gpt-j-6b',
 'together_gpt-neox-20b',
 'together_opt-175b',
 'together_opt-66b']

### Predicting accuracy

Full (one IRT model for all scenarios)

In [None]:
# Evaluate every scenario at once under the name 'full'.
scenario_name = 'full'
chosen_scenarios = [*scenarios]

# Sampling switches passed to evaluate_scenarios: only 'anchor-irt_sampling' is enabled.
sampling = {
    strategy: strategy == 'anchor-irt_sampling'
    for strategy in ('random_sampling', 'anchor_sampling',
                     'anchor-irt_sampling', 'disc_sampling')
}

results_full, accs_full = evaluate_scenarios(
    data, scenario_name, chosen_scenarios, scenarios, set_of_rows,
    Ds, iterations, device,
    bench='irt_helm', sampling=sampling,
)


Evaluating models [0, 1, 2, 3]

i) choosing optimal D


100%|████████████████████████████████████████████| 4/4 [12:18<00:00, 184.54s/it]


- opt D= 5 errors= [0.044579911983804656, 0.0453168327810513, 0.047692893828327226, 0.049759814361327995] 


ii) choosing optimal lambdas


100%|████████████████████████████████████████| 15/15 [00:00<00:00, 10540.22it/s]


{'random_gpirt': {'cnn': {10: 0.6106500153682252, 25: 0.7710186221163557, 50: 0.8389766601558348, 75: 0.8755169245268296, 100: 0.9143715646295693}, 'xsum': {10: 0.09864735668411334, 25: 0.27348288919988994, 50: 0.38082260331151807, 75: 0.4623254879248749, 100: 0.5845693496878107}, 'boolq:': {10: 0.09335207393175736, 25: 0.2281357754643307, 50: 0.35042482226473454, 75: 0.5616991102464602, 100: 0.6345438206318922}, 'civil_comments': {10: 0.06043252750157128, 25: 0.05845086140679586, 50: 0.1786826212729647, 75: 0.19488823789360563, 100: 0.28262542556346976}, 'commonsense:dataset=hellaswag,method=multiple_choice_separate_original,': {10: 0.31853410497191076, 25: 0.49227117762582373, 50: 0.6283794731110073, 75: 0.7121305377685969, 100: 0.7645432827468794}, 'commonsense:dataset=openbookqa,method=multiple_choice_separate_calibrated,': {10: 0.037441018432384975, 25: 0.11296227042386832, 50: 0.2064591320881373, 75: 0.2877500866062527, 100: 0.4378412715932999}, 'imdb:': {10: 0.3455834566196065, 

  0%|                                                     | 0/5 [00:00<?, ?it/s]

In [None]:
# Persist both outputs of the full evaluation, one pickle file per object.
for out_path, payload in (('results/results_full_helm.pickle', results_full),
                          ('results/accs_full_helm.pickle', accs_full)):
    with open(out_path, 'wb') as out_file:
        pickle.dump(payload, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Plot per-scenario and aggregated results for the 'anchor-irt_*' methods.
methods = [f'anchor-irt_{estimator}' for estimator in ('naive', 'cirt', 'pirt', 'gpirt')]
plot_results(results_full, scenarios.keys(), methods=methods)
plot_agg_results(results_full, scenarios.keys(), methods=methods)

In [None]:
# Plot per-scenario and aggregated results for the 'anchor_*' methods.
methods = [f'anchor_{estimator}' for estimator in ('naive', 'cirt', 'pirt', 'gpirt')]
plot_results(results_full, scenarios.keys(), methods=methods)
plot_agg_results(results_full, scenarios.keys(), methods=methods)

In [None]:
# Plot per-scenario and aggregated results for the 'random_*' methods.
methods = [f'random_{estimator}' for estimator in ('naive', 'cirt', 'pirt', 'gpirt')]
plot_results(results_full, scenarios.keys(), methods=methods)
plot_agg_results(results_full, scenarios.keys(), methods=methods)

In [None]:
import pickle

# Reload the results pickle written by the full evaluation into `results`.
# NOTE(review): pickle.load executes arbitrary code from the file; only load trusted files.
with open('results/results_full_helm.pickle', 'rb') as saved_results:
    results = pickle.load(saved_results)

In [None]:
# For every scenario, draw a scatter plot of 'random_pirt' results against mean
# correctness, save it under plots/, and show it.
# Fix: the second axis label used plt.xlabel again, which overwrote 'acc' with
# 'error' on the x-axis and left the y-axis unlabelled; it is now plt.ylabel.
for sce in scenarios.keys():
    # y: the 'random_pirt' entry for this scenario under key 100, stacked over the
    # top-level keys of `results` and averaged along axis 1
    # (presumably the error averaged over repetitions — TODO confirm the layout).
    y=np.stack([results[m][100]['random_pirt'][sce] for m in results.keys()]).mean(axis=1)
    # x: 'correctness' arrays of all sub-scenarios stacked row-wise and averaged
    # along axis 0 (presumably one accuracy value per model — TODO confirm).
    x=np.vstack([data['data'][s]['correctness'] for s in scenarios[sce]]).mean(axis=0)
    plt.plot(x,y,'bo')
    plt.xlabel('acc')
    plt.ylabel('error')
    plt.title(sce)
    plt.savefig(f'plots/scenario-{sce}.png', bbox_inches='tight', dpi=300, transparent=False)
    plt.show()