In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import pickle

def load_core_scenarios(
    agg_res_paths=["data/agg_data_v0.2.2_v3"],
    scenarios_to_take=[],
    blacklist=[],
    lite=False,
):
    """Load the aggregated core-scenario results into a single DataFrame.

    Every ``*.csv`` file found directly inside each directory of
    ``agg_res_paths`` is read and the frames are concatenated. Two columns are
    then derived from the ``data_augmentations`` column (treated as a string):

    * ``scenario``    -- see ``extract_scenario_name`` below;
    * ``subscenario`` -- the text before the first ``"model"`` and after the
      last ``"/"``. The separator that preceded ``"model"`` is kept, which is
      why the notebook output shows names such as ``boolq:``.

    Finally a ``balanced_score`` column is computed per scenario with
    ``get_balanced_score_col``.

    Parameters
    ----------
    agg_res_paths : list of str
        Directories to scan for ``.csv`` files. Only read, never mutated.
    scenarios_to_take : list of str
        If non-empty, keep only the files whose path contains at least one of
        these substrings. Only read, never mutated.
    blacklist : list of str
        If non-empty, drop the files whose path contains any of these
        substrings. Only read, never mutated.
    lite : bool
        If true, randomly keep at most 3 files (sampled before the
        ``scenarios_to_take`` / ``blacklist`` filters are applied).

    Returns
    -------
    pandas.DataFrame
        Columns ``model``, ``unique_id_inc_seed``, ``balanced_score``,
        ``scenario``, ``subscenario``, ``instance_id`` and ``score_name``.

    Raises
    ------
    ValueError
        If no ``.csv`` file is left to load after filtering.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result (and any later keep-first de-duplication)
    # reproducible across machines.
    paths = [
        f"{agg_res_path}/{filename}"
        for agg_res_path in agg_res_paths
        for filename in sorted(os.listdir(agg_res_path))
        if filename.endswith(".csv")
    ]
    if lite:
        import random

        print("sampling a subset of subscenarios")
        # random.sample raises when k exceeds the population size.
        paths = random.sample(paths, k=min(3, len(paths)))
    if scenarios_to_take:
        paths = [
            path
            for path in paths
            if any(scenario in path for scenario in scenarios_to_take)
        ]
        print(f"only a part of scenarios taken: {scenarios_to_take}")
    if blacklist:
        paths = [
            path
            for path in paths
            if not any(scenario in path for scenario in blacklist)
        ]
    if not paths:
        # pd.concat would raise a ValueError here as well, but without saying
        # what was actually missing.
        raise ValueError(
            f"no .csv result files left to load from {agg_res_paths} after filtering"
        )

    frames = [pd.read_csv(path) for path in tqdm(paths, desc="Getting core scenarions")]
    df = pd.concat(frames)

    def extract_scenario_name(x):
        """Reduce a ``data_augmentations`` string to its scenario name."""
        # Drop everything up to (and including) a known version directory.
        x = x.split("v0.0.2/")[-1].split("v0.2.2/")[-1].split("v0.2.3/")[-1]
        if "msmarco" in x:
            x = x.split(",")[0]
        elif "natural_qa" in x:
            x = x.split(",model")[0].replace("_longans", "")
        elif "hellaswag" in x:
            x = "hellaswag"
        elif "openbookqa" in x:
            x = "openbookqa"
        else:
            x = x.split(":")[0]
        return x

    df["scenario"] = [extract_scenario_name(x) for x in df["data_augmentations"]]
    df["subscenario"] = df["data_augmentations"].apply(
        lambda x: x.split("model")[0].split("/")[-1]
    )

    balanced_frames = []
    for _, group_df in df.groupby("scenario"):
        # Work on a copy so the new column is never written into a view of df.
        group_df = group_df.copy()
        group_df["balanced_score"] = get_balanced_score_col(group_df)
        balanced_frames.append(group_df)
    return pd.concat(balanced_frames)[
        [
            "model",
            "unique_id_inc_seed",
            "balanced_score",
            "scenario",
            "subscenario",
            "instance_id",
            "score_name",
        ]
    ]

    
def get_balanced_score_col(df, column_to_balance="sub_split"):
    """Return the ``score`` values of ``df`` as a list, re-weighted if needed.

    Balancing happens only when ``column_to_balance`` exists in ``df`` and
    holds more than one distinct value. In that case each row's score is
    multiplied by ``1 - n_group / n_total``, where ``n_group`` is the number
    of rows sharing that row's ``column_to_balance`` value and ``n_total`` is
    ``len(df)``; rows of smaller groups therefore get larger weights.
    Otherwise the scores are returned unchanged.

    The input frame is left untouched: the weights live in a local Series
    instead of being written back into ``df`` as a ``weight`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``score`` column; when balancing applies, a ``scenario``
        column is read as well.
    column_to_balance : str
        Name of the column defining the groups to balance.

    Returns
    -------
    list
        One (possibly re-weighted) score per row of ``df``, in row order.

    Raises
    ------
    AssertionError
        If balancing would apply to a frame whose first scenario is not
        ``"civil_comments"`` (the only scenario expected to need it).
    """
    needs_balancing = (
        column_to_balance in df.columns
        and len(df[column_to_balance].unique()) > 1
    )
    if not needs_balancing:
        return df["score"].values.tolist()

    first_scenario = df.scenario.unique()[0]
    assert (
        first_scenario == "civil_comments"
    ), f"we are balancing more than civil comments? df.scenario.unique()[0]={first_scenario}"

    # Size of the group each row belongs to, aligned row-by-row with df.
    group_sizes = df[column_to_balance].map(df[column_to_balance].value_counts())
    weight = 1 - group_sizes / len(df)
    return df["score"].mul(weight).values.tolist()

data1: scores restricted to the models evaluated on every subscenario (saved to data/helm.pickle)

In [2]:
# Load all aggregated results and keep a single row per
# (model, subscenario, instance) triple -- the first one encountered.
dedupe_keys = ['model', 'subscenario', 'instance_id']
df = load_core_scenarios().drop_duplicates(subset=dedupe_keys)
df.shape

Getting core scenarions: 100%|██████████████████| 38/38 [00:08<00:00,  4.31it/s]


(742594, 7)

In [3]:
# One entry per subscenario: the distinct models that have rows for it.
models = [
    list(df.loc[df.subscenario == sub].model.unique())
    for sub in list(df.subscenario.unique())
]

In [4]:
list_of_lists = models

# Models that appear in every subscenario: intersect the first list with all
# the remaining ones in a single call.
intersection = set(list_of_lists[0]).intersection(*list_of_lists[1:])

# Alphabetical order, stored as a plain list of numpy strings.
intersection = list(np.sort(list(intersection)))

len(intersection)

28

In [5]:
data = {'models': intersection, 'data': {}}

for sub in list(np.sort(df.subscenario.unique())):
    sub_df = df.loc[df.subscenario == sub]
    # Restrict to the models shared by every subscenario.
    shared_df = sub_df.loc[sub_df.model.isin(intersection)]
    # Pivot to one row per model / one column per instance, then transpose so
    # that rows are instances and columns are models.
    scores = np.array(
        shared_df.pivot_table(index='model', columns='instance_id', values='balanced_score')
    ).T
    k1 = scores.shape[0]
    # Keep only the instances for which no model's score is NaN.
    complete = scores[~np.isnan(scores).any(axis=1)]
    k2 = complete.shape[0]

    data['data'][sub] = {
        'score_name': sub_df['score_name'].unique()[0],
        'correctness': complete,
    }

    # Last number: fraction of instances that survived the NaN filter.
    print(sub, complete.shape, k2/k1)

boolq: (1000, 28) 1.0
civil_comments:demographic=LGBTQ, (1000, 28) 1.0
civil_comments:demographic=all, (1000, 28) 1.0
civil_comments:demographic=black, (1000, 28) 1.0
civil_comments:demographic=christian, (1000, 28) 1.0
civil_comments:demographic=female, (1000, 28) 1.0
civil_comments:demographic=male, (1000, 28) 1.0
civil_comments:demographic=muslim, (1000, 28) 1.0
civil_comments:demographic=other_religions, (1000, 28) 1.0
civil_comments:demographic=white, (1000, 28) 1.0
commonsense:dataset=hellaswag,method=multiple_choice_separate_original, (1000, 28) 1.0
commonsense:dataset=openbookqa,method=multiple_choice_separate_calibrated, (500, 28) 1.0
imdb: (989, 28) 0.9782393669634025
mmlu:subject=abstract_algebra,method=multiple_choice_joint, (111, 28) 1.0
mmlu:subject=college_chemistry,method=multiple_choice_joint, (108, 28) 1.0
mmlu:subject=computer_security,method=multiple_choice_joint, (111, 28) 1.0
mmlu:subject=econometrics,method=multiple_choice_joint, (126, 28) 1.0
mmlu:subject=us_for

In [6]:
# Persist the common-model dataset built above.
with open('data/helm.pickle', 'wb') as out_file:
    pickle.dump(data, out_file, protocol=pickle.HIGHEST_PROTOCOL)

data2: scores for all models available in each subscenario (saved to data/helm_all_models.pickle)

In [7]:
# Reload the aggregated results from scratch and keep a single row per
# (model, subscenario, instance) triple -- the first one encountered.
dedupe_keys = ['model', 'subscenario', 'instance_id']
df = load_core_scenarios().drop_duplicates(subset=dedupe_keys)
df.shape

Getting core scenarions: 100%|██████████████████| 38/38 [00:03<00:00,  9.80it/s]


(742594, 7)

In [8]:
data = {'data': {}}

for sub in list(np.sort(df.subscenario.unique())):
    sub_df = df.loc[df.subscenario == sub]
    # Every model that has rows for this subscenario, in alphabetical order.
    model_names = list(np.sort(list(sub_df.model.unique())))
    # Pivot to one row per model / one column per instance, then transpose so
    # that rows are instances and columns are models.
    # NOTE(review): 'models' is computed independently of the pivot; the two
    # presumably line up because pivot_table sorts its index -- confirm that
    # no model is dropped by the pivot.
    scores = np.array(
        sub_df.pivot_table(index='model', columns='instance_id', values='balanced_score')
    ).T
    k1 = scores.shape[0]
    # Keep only the instances for which no model's score is NaN.
    complete = scores[~np.isnan(scores).any(axis=1)]
    k2 = complete.shape[0]

    data['data'][sub] = {
        'models': model_names,
        'score_name': sub_df['score_name'].unique()[0],
        'correctness': complete,
    }

    # Last number: fraction of instances that survived the NaN filter.
    print(sub, len(model_names), complete.shape, k2/k1)

boolq: 36 (1000, 36) 1.0
civil_comments:demographic=LGBTQ, 37 (1000, 37) 1.0
civil_comments:demographic=all, 37 (1000, 37) 1.0
civil_comments:demographic=black, 37 (1000, 37) 1.0
civil_comments:demographic=christian, 37 (1000, 37) 1.0
civil_comments:demographic=female, 37 (1000, 37) 1.0
civil_comments:demographic=male, 37 (1000, 37) 1.0
civil_comments:demographic=muslim, 37 (1000, 37) 1.0
civil_comments:demographic=other_religions, 37 (1000, 37) 1.0
civil_comments:demographic=white, 37 (1000, 37) 1.0
commonsense:dataset=hellaswag,method=multiple_choice_separate_original, 29 (1000, 29) 1.0
commonsense:dataset=openbookqa,method=multiple_choice_separate_calibrated, 29 (500, 29) 1.0
imdb: 37 (989, 37) 0.9782393669634025
mmlu:subject=abstract_algebra,method=multiple_choice_joint, 37 (111, 37) 1.0
mmlu:subject=college_chemistry,method=multiple_choice_joint, 37 (108, 37) 1.0
mmlu:subject=computer_security,method=multiple_choice_joint, 37 (111, 37) 1.0
mmlu:subject=econometrics,method=multiple

In [9]:
# Persist the per-subscenario, all-models dataset built above.
with open('data/helm_all_models.pickle', 'wb') as out_file:
    pickle.dump(data, out_file, protocol=pickle.HIGHEST_PROTOCOL)