In [1]:
import json
import os
import pickle
import numpy as np
from pathlib import Path

In [2]:
# Directory that holds the run outputs; the later cells treat every
# sub-directory of it as one run.
path = '/llmthonskdir/felipe/helm/lite/v1.0.0'

# Scenario name -> list of sub-scenario prefixes that are aggregated under it.
# Each prefix is the leading part of a run-directory name (everything before
# 'model='), trailing separator (',' or ':') included.
helm_lite_scenarios = {
    # Scenarios made of a single sub-scenario.
    'commonsense:dataset=openbookqa,method=multiple_choice_joint,': [
        'commonsense:dataset=openbookqa,method=multiple_choice_joint,',
    ],
    'gsm:': [
        'gsm:',
    ],
    'med_qa:': [
        'med_qa:',
    ],
    # LegalBench: five subsets.
    'legalbench': [
        'legalbench:subset=abercrombie,',
        'legalbench:subset=corporate_lobbying,',
        'legalbench:subset=function_of_decision_section,',
        'legalbench:subset=proa,',
        'legalbench:subset=international_citizenship_questions,',
    ],
    # MATH: seven subjects, all with the same level / prompting settings.
    'math': [
        'math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,',
        'math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,',
        'math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,',
        'math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,',
        'math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,',
        'math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,',
        'math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,',
    ],
    # MMLU: five subjects.
    'mmlu': [
        'mmlu:subject=abstract_algebra,method=multiple_choice_joint,',
        'mmlu:subject=college_chemistry,method=multiple_choice_joint,',
        'mmlu:subject=computer_security,method=multiple_choice_joint,',
        'mmlu:subject=econometrics,method=multiple_choice_joint,',
        'mmlu:subject=us_foreign_policy,method=multiple_choice_joint,',
    ],
    'narrative_qa:': [
        'narrative_qa:',
    ],
    'natural_qa:mode=closedbook,': [
        'natural_qa:mode=closedbook,',
    ],
    'natural_qa:mode=openbook_longans,': [
        'natural_qa:mode=openbook_longans,',
    ],
    # WMT14: five language pairs, all translating into English.
    'wmt_14': [
        'wmt_14:language_pair=cs-en,',
        'wmt_14:language_pair=de-en,',
        'wmt_14:language_pair=fr-en,',
        'wmt_14:language_pair=hi-en,',
        'wmt_14:language_pair=ru-en,',
    ],
}

In [3]:
# Collect the names of all run directories located directly under `path`.
# Path.iterdir() yields entries in arbitrary, filesystem-dependent order, so the
# names are sorted: the processing order of the runs (and the key order of every
# dictionary built from them) is then the same on every machine and every re-run.
directory = Path(path)
runs = sorted(item.name for item in directory.iterdir() if item.is_dir())

In [4]:
# data2[subscenario][model] -> list with one score per prediction of that run.
data2 = {}

# Sub-scenarios whose name contains one of these substrings only keep the
# predictions that belong to instances of the 'test' split.
TEST_SPLIT_ONLY_SCENARIOS = ('med_qa', 'mmlu', 'narrative_qa', 'wmt_14')

for run in runs:
    # A run directory is named '<subscenario>model=<model>'. str.find() returns
    # -1 when the marker is absent, which would silently slice nonsense names
    # out of `run`, so an unexpected directory is reported explicitly instead.
    marker = run.find('model=')
    if marker < 0:
        raise ValueError(f"run directory {run!r} does not contain 'model='")
    subscenario = run[:marker]
    model = run[marker + len('model='):]

    # Ids of the instances in the 'test' split. A set makes the membership
    # test in the filter below O(1) instead of a linear scan per prediction.
    with open(path+f'/{run}/instances.json', encoding='utf-8') as f:
        valid_ids = {d['id'] for d in json.load(f) if d['split']=='test'}

    # Per-instance predictions together with their statistics.
    with open(path+f'/{run}/display_predictions.json', encoding='utf-8') as f:
        predictions = json.load(f)

    # The score that is kept is the LAST entry of the first prediction's
    # 'stats' dictionary; the same key is then read from every prediction.
    metric = list(predictions[0]['stats'].keys())[-1]

    # Create the per-sub-scenario dictionary on first use.
    scores_by_model = data2.setdefault(subscenario, {})

    if any(s in subscenario for s in TEST_SPLIT_ONLY_SCENARIOS):
        scores_by_model[model] = [d['stats'][metric] for d in predictions if d['instance_id'] in valid_ids]
    else:
        # All other sub-scenarios keep every prediction, whatever its split.
        scores_by_model[model] = [d['stats'][metric] for d in predictions]

In [5]:
# Assemble the structure that gets saved:
#   data['models']                   -> sorted list of model names
#   data['data'][sub]['correctness'] -> array built from the per-model score
#                                       lists of sub-scenario `sub`
data = {}
data['data'] = {}

# Union of the model names over all sub-scenarios, as plain Python strings.
# Built as a set union rather than np.unique over a list of key lists, which
# only works when every sub-scenario has exactly the same number of models
# (a ragged list cannot be converted to a regular array) and which yields
# numpy string scalars instead of str.
data['models'] = sorted({model for models in data2.values() for model in models})

for sub in data2:
    data['data'][sub] = {}
    # One score list per model, stacked in the order of data['models'] and then
    # transposed, so that column j belongs to data['models'][j].
    # NOTE: assumes every model has a run for `sub` (otherwise KeyError) with
    # the same number of scores (otherwise the array is not rectangular).
    data['data'][sub]['correctness'] = np.array([data2[sub][model] for model in data['models']]).T

In [6]:
# Serialize the assembled `data` dictionary to 'helm_lite.pickle' (relative to
# the current working directory) with the highest available pickle protocol.
with Path('helm_lite.pickle').open('wb') as pickle_file:
    pickle.dump(data, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
# Sanity check: for every scenario print its name, the number of sub-scenarios
# it aggregates, and the per-model average scores.
for scenario, subscenarios in helm_lite_scenarios.items():
    print(scenario)
    print(len(subscenarios))

    # Macro-average: first the mean over the rows of each sub-scenario (one
    # value per model), then the unweighted mean across sub-scenarios, so each
    # sub-scenario counts equally regardless of how many rows it has.
    per_subscenario_means = np.vstack(
        [data['data'][sub]['correctness'].mean(axis=0) for sub in subscenarios]
    )
    scenario_means = per_subscenario_means.mean(axis=0)

    # The values are sorted in ascending order, so their position no longer
    # corresponds to the order of data['models'].
    print(np.round(np.sort(scenario_means), 3))
    print('\n')

commonsense:dataset=openbookqa,method=multiple_choice_joint,
1
[0.26  0.272 0.284 0.286 0.398 0.544 0.614 0.634 0.662 0.688 0.754 0.774
 0.776 0.796 0.8   0.828 0.838 0.838 0.844 0.862 0.868 0.872 0.878 0.878
 0.908 0.92  0.938 0.938 0.95  0.96 ]


gsm:
1
[0.028 0.055 0.075 0.137 0.149 0.154 0.159 0.239 0.266 0.267 0.375 0.377
 0.452 0.479 0.489 0.501 0.567 0.583 0.604 0.61  0.615 0.622 0.648 0.668
 0.721 0.735 0.784 0.831 0.831 0.932]


med_qa:
1
[0.254 0.26  0.276 0.276 0.312 0.39  0.392 0.392 0.419 0.431 0.445 0.497
 0.507 0.525 0.525 0.531 0.547 0.559 0.598 0.618 0.618 0.622 0.644 0.652
 0.652 0.656 0.684 0.684 0.815 0.817]


legalbench
5
[0.332 0.346 0.397 0.421 0.442 0.452 0.468 0.48  0.502 0.519 0.528 0.533
 0.578 0.58  0.58  0.586 0.591 0.618 0.622 0.626 0.629 0.63  0.643 0.643
 0.644 0.645 0.673 0.677 0.709 0.713]


math
7
[0.026 0.04  0.044 0.064 0.078 0.097 0.098 0.102 0.103 0.126 0.128 0.236
 0.257 0.297 0.323 0.375 0.421 0.428 0.449 0.494 0.499 0.54  0.58  0.603
 0.632 0.6