In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import re
from fuzzywuzzy import fuzz
from datasets import load_dataset

def search(s, s_list):
    scores = [fuzz.token_sort_ratio(s, s_try) for s_try in s_list]
    return [s_list[np.argmax(scores)], np.max(scores)]
    
def standard_name(s, slash=True):
    if not slash:
        return s.lower().replace("-hf","")
    else:
        if "/" in s:
            return s.split("/")[1].lower().replace("-hf","")
        else:
            return s.lower().replace("-hf","")

def consolidate_columns(df):
    # Get all columns with '_x' suffix
    x_columns = [col for col in df.columns if col.endswith('_x')]
    
    # Iterate over each '_x' column
    for x_col in x_columns:
        # Derive the corresponding '_y' column name
        y_col = x_col[:-2] + '_y'
        
        # Consolidate columns
        df[x_col[:-2]] = df[x_col].combine_first(df[y_col])
        
        # Drop the '_x' and '_y' columns
        df.drop([x_col, y_col], axis=1, inplace=True)
    
    return df
    
def remove_params(s):
    pattern = r'\d+(\.\d+)?[BbMm](\d)?'
    cleaned_str = re.sub(pattern, '', s)
    if cleaned_str[-1]=='-':
        cleaned_str = cleaned_str[:-1]
    cleaned_str = cleaned_str.replace("--","-")
    return cleaned_str
    
def are_strings_equivalent(str1, str2):
    cleaned_str1 = remove_params(str1)
    cleaned_str2 = remove_params(str2)
    bool1 = cleaned_str1 == cleaned_str2
    return bool1 

def get_families(data, min=2):
    if type(data)==list:
        models = np.unique(data).tolist()
    else:
        models = np.unique(list(data.Model)).tolist()
    D = (np.array([[are_strings_equivalent(m1, m2) for m1 in models] for m2 in tqdm(models)]))
    
    families = []
    while len(models)>0:
        indices = [j for j,bool in enumerate(D[0]) if bool]
        D = np.delete(D, indices, axis=0)
        D = np.delete(D, indices, axis=1)
        families.append(np.array(models)[indices].tolist())
    
        for m in np.array(models)[indices].tolist():
            models.remove(m)
    
    families = [f for f in families if len(f)>=min]
    #families_instruct = [f for f in families if 'chat' in f[0].lower() or 'instruct' in f[0].lower() or '-it' in f[0][-4:].lower()]
    #families_base = [f for f in families if f not in families_instruct]

    families = [np.sort(f).tolist() for f in families]
    #families_instruct = [np.sort(f).tolist() for f in families if data.loc[data.Model==f[0]]['T'].iloc[0]=='ðŸ’¬']
    #families_base = [np.sort(f).tolist() for f in families if data.loc[data.Model==f[0]]['T'].iloc[0]=='ðŸŸ¢']
    return families#, families_base, families_instruct

def get_family_name(strings):
    # Start with the shortest string in the list
    shortest_string = min(strings, key=len)
    length = len(shortest_string)
    
    # Iterate over all possible substrings of the shortest string
    for sub_len in range(length, 0, -1):  # Start with the longest substrings
        for i in range(length - sub_len + 1):
            substring = shortest_string[i:i + sub_len]
            # Check if this substring is in all other strings
            if all(substring in string for string in strings):
                return substring
    
    return ""  # Return an empty string if no common substring is found


## Gathering model names linked to a family from the old and new open llm leaderboards

In [158]:
lb_new = pd.read_csv("open-llm-leaderboard_new.csv")
new_families = get_families(lb_new)

  0%|          | 0/707 [00:00<?, ?it/s]

In [3]:
lb_old = pd.read_csv("open-llm-leaderboard_old.csv")
old_families = get_families(lb_old)

  0%|          | 0/6811 [00:00<?, ?it/s]


KeyboardInterrupt



In [4]:
merged_list = [item for sublist in new_families for item in sublist]
[print(m) for m in np.sort(merged_list).tolist()]

01-ai/Yi-1.5-34B
01-ai/Yi-1.5-34B-32K
01-ai/Yi-1.5-34B-Chat
01-ai/Yi-1.5-34B-Chat-16K
01-ai/Yi-1.5-6B
01-ai/Yi-1.5-6B-Chat
01-ai/Yi-1.5-9B
01-ai/Yi-1.5-9B-32K
01-ai/Yi-1.5-9B-Chat
01-ai/Yi-1.5-9B-Chat-16K
01-ai/Yi-34B
01-ai/Yi-34B-200K
01-ai/Yi-34B-Chat
01-ai/Yi-6B
01-ai/Yi-6B-200K
01-ai/Yi-6B-Chat
01-ai/Yi-9B
01-ai/Yi-9B-200K
Azure99/blossom-v5.1-34b
Azure99/blossom-v5.1-9b
BEE-spoke-data/smol_llama-101M-GQA
BEE-spoke-data/smol_llama-220M-GQA
CohereForAI/aya-23-35B
CohereForAI/aya-23-8B
EleutherAI/gpt-neo-1.3B
EleutherAI/gpt-neo-2.7B
EleutherAI/pythia-12b
EleutherAI/pythia-160m
EleutherAI/pythia-2.8b
EleutherAI/pythia-410m
EleutherAI/pythia-6.9b
HuggingFaceTB/SmolLM-1.7B
HuggingFaceTB/SmolLM-1.7B-Instruct
HuggingFaceTB/SmolLM-135M
HuggingFaceTB/SmolLM-135M-Instruct
HuggingFaceTB/SmolLM-360M
HuggingFaceTB/SmolLM-360M-Instruct
NeverSleep/Lumimaid-v0.2-12B
NeverSleep/Lumimaid-v0.2-8B
NousResearch/Yarn-Llama-2-13b-128k
NousResearch/Yarn-Llama-2-7b-128k
OpenBuddy/openbuddy-zero-3b-v21.2-32

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [5]:
merged_list = [item for sublist in old_families for item in sublist]
[print(m) for m in np.sort(merged_list).tolist()]

0-hero/Matter-0.2-32B
0-hero/Matter-0.2-7B
01-ai/Yi-1.5-34B
01-ai/Yi-1.5-34B-32K
01-ai/Yi-1.5-34B-Chat
01-ai/Yi-1.5-34B-Chat-16K
01-ai/Yi-1.5-6B
01-ai/Yi-1.5-6B-Chat
01-ai/Yi-1.5-9B
01-ai/Yi-1.5-9B-32K
01-ai/Yi-1.5-9B-Chat
01-ai/Yi-1.5-9B-Chat-16K
01-ai/Yi-34B
01-ai/Yi-34B-200K
01-ai/Yi-6B
01-ai/Yi-6B-200K
01-ai/Yi-9B
01-ai/Yi-9B-200K
0x7194633/fialka-13B-v3
0x7194633/fialka-7B-v3
922-CA/monika-ddlc-7b-v1
922-CA/monika-ddlc-8b-v1
AI-Sweden-Models/gpt-sw3-1.3b
AI-Sweden-Models/gpt-sw3-1.3b-instruct
AI-Sweden-Models/gpt-sw3-126m
AI-Sweden-Models/gpt-sw3-126m-instruct
AI-Sweden-Models/gpt-sw3-20b
AI-Sweden-Models/gpt-sw3-20b-instruct
AI-Sweden-Models/gpt-sw3-356m
AI-Sweden-Models/gpt-sw3-356m-instruct
AI-Sweden-Models/gpt-sw3-40b
AI-Sweden-Models/gpt-sw3-6.7b
AIGym/deepseek-coder-1.3b-chat
AIGym/deepseek-coder-1.3b-chat-and-function-calling
AIGym/deepseek-coder-6.7b-chat
AIGym/deepseek-coder-6.7b-chat-and-function-calling
Aspik101/trurl-2-13b-pl-instruct_unload
Aspik101/trurl-2-7b-pl-inst

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

## Compiling the data

### Loading dataset containing scores for subtasks

In [2]:
subscenario_scores = pd.read_csv('subscenario_scores.csv')

### Processing new LB

In [3]:
with open('new_lb_annotated.txt', 'r') as file:
    new_lb_annotated = file.readlines()
new_lb_annotated = [line.strip() for line in new_lb_annotated]
new_lb_annotated = [m[:-2] for m in new_lb_annotated if m[-2:] in [' Y',' ?']]
new_lb_annotated = [re.sub(r"\s*\[.*?\]", "", m) for m in new_lb_annotated]

new_lb = pd.read_csv("open-llm-leaderboard_new.csv")
new_lb = new_lb.iloc[[np.argmax(np.array(new_lb.Model)==m) for m in new_lb_annotated]].loc[:,['Model','#Params (B)','Upload To Hub Date','IFEval Raw','BBH Raw','MATH Lvl 5 Raw','GPQA Raw','MUSR Raw','MMLU-PRO Raw']]
new_lb.columns = new_lb.columns.str.replace(' Raw', '', regex=False)
new_lb = new_lb.reset_index(drop=True)

new_families_names = [remove_params(m) for m in new_lb_annotated]
instruct = []
for i,f in enumerate(new_families_names):
    f_new = f.replace('-Chat','').replace('-chat','')
    f_new = f_new.replace('-Instruct','').replace('-instruct','')
    f_new = f_new.replace('pankajmathur/orca_mini_v3_','meta-llama/Llama-2-hf')
    f_new = f_new.replace('pankajmathur/orca_mini_v7_','Qwen/Qwen2')
    f_new = f_new.replace('microsoft/Orca-2','meta-llama/Llama-2-hf')
    f_new = f_new.replace('teknium/OpenHermes','meta-llama/Llama-2-hf')
    f_new = f_new.replace('lmsys/vicuna','huggyllama/llama')
    f_new = f_new.replace('databricks/dolly-v2','EleutherAI/pythia')
    f_new = f_new.replace('WizardLMTeam/WizardLM-V1.0','meta-llama/Llama-2-hf')
    f_new = f_new.replace('Azure99/blossom-v5.1','01-ai/Yi-1.5')
    f_new = f_new.replace('VAGOsolutions/SauerkrautLM-Gemma','google/gemma')
    f_new = f_new.replace('VAGOsolutions/Llama-3-SauerkrautLM','meta-llama/Meta-Llama-3')
    f_new = f_new.replace('cognitivecomputations/dolphin-2.9.1-yi-1.5','01-ai/Yi-1.5')
    f_new = f_new.replace('cognitivecomputations/dolphin-2.9.2-qwen2','Qwen/Qwen2')
    f_new = f_new.replace('gemma-it','gemma')
    f_new = f_new.replace('gemma-1.1-it','gemma-1.1')
    f_new = f_new.replace('gemma-2-it','gemma-2')
    new_families_names[i] = f_new
    if f!=f_new:
        instruct.append(True)
    else:
        instruct.append(False)
        
new_lb['Family'] = new_families_names
new_lb['Instruct'] = instruct
new_lb =  consolidate_columns(new_lb.merge(subscenario_scores, on='Model', how='left'))
new_lb = new_lb.loc[:,['Model', 'Family', 'Instruct', '#Params (B)', 'date', 'IFEval', 'BBH', 'MATH Lvl 5', 'GPQA', 'MUSR', 'MMLU-PRO',
                       'bbh_boolean_expressions','bbh_causal_judgement','bbh_date_understanding','bbh_disambiguation_qa',
                       'bbh_formal_fallacies','bbh_geometric_shapes','bbh_hyperbaton','bbh_logical_deduction_five_objects',
                       'bbh_logical_deduction_seven_objects','bbh_logical_deduction_three_objects','bbh_movie_recommendation',
                       'bbh_navigate','bbh_object_counting','bbh_penguins_in_a_table','bbh_reasoning_about_colored_objects',
                       'bbh_ruin_names','bbh_salient_translation_error_detection','bbh_snarks','bbh_sports_understanding',
                       'bbh_temporal_sequences','bbh_tracking_shuffled_objects_five_objects','bbh_tracking_shuffled_objects_seven_objects',
                       'bbh_tracking_shuffled_objects_three_objects','bbh_web_of_lies','gpqa_diamond','gpqa_extended','gpqa_main',
                       'math_algebra_hard','math_counting_and_prob_hard','math_geometry_hard','math_intermediate_algebra_hard',
                       'math_num_theory_hard','math_prealgebra_hard','math_precalculus_hard','musr_murder_mysteries',
                       'musr_object_placements','musr_team_allocation']]
new_lb['Family'] = [standard_name(m) for m in new_lb.Family]
new_lb['Family'] = new_lb['Family'].replace({'gpt-neo':'gpt-neo/j','bloom-1':'bloom'}) #to match Tatsu data


### Processing old LB

In [4]:
old_lb = pd.read_csv("open-llm-leaderboard_old.csv")
old_lb = old_lb.sort_values(by=['date']).reset_index(drop=True)
old_lb =  consolidate_columns(old_lb.merge(subscenario_scores, on='Model', how='left'))
old_lb = old_lb.drop_duplicates(subset=['Model'], keep='last')
old_lb = old_lb.loc[:,['Model', '#Params (B)', 'ARC', 'HellaSwag', 'MMLU', 'TruthfulQA', 'Winogrande', 'GSM8K',
                       'hendrycksTest-abstract_algebra','hendrycksTest-anatomy','hendrycksTest-astronomy','hendrycksTest-business_ethics',
                       'hendrycksTest-clinical_knowledge','hendrycksTest-college_biology','hendrycksTest-college_chemistry','hendrycksTest-college_computer_science',
                       'hendrycksTest-college_mathematics','hendrycksTest-college_medicine','hendrycksTest-college_physics',
                       'hendrycksTest-computer_security','hendrycksTest-conceptual_physics','hendrycksTest-econometrics',
                       'hendrycksTest-electrical_engineering','hendrycksTest-elementary_mathematics','hendrycksTest-formal_logic',
                       'hendrycksTest-global_facts','hendrycksTest-high_school_biology','hendrycksTest-high_school_chemistry',
                       'hendrycksTest-high_school_computer_science','hendrycksTest-high_school_european_history','hendrycksTest-high_school_geography',
                       'hendrycksTest-high_school_government_and_politics','hendrycksTest-high_school_macroeconomics','hendrycksTest-high_school_mathematics',
                       'hendrycksTest-high_school_microeconomics','hendrycksTest-high_school_physics','hendrycksTest-high_school_psychology',
                       'hendrycksTest-high_school_statistics','hendrycksTest-high_school_us_history','hendrycksTest-high_school_world_history',
                       'hendrycksTest-human_aging','hendrycksTest-human_sexuality','hendrycksTest-international_law','hendrycksTest-jurisprudence',
                       'hendrycksTest-logical_fallacies','hendrycksTest-machine_learning','hendrycksTest-management','hendrycksTest-marketing',
                       'hendrycksTest-medical_genetics','hendrycksTest-miscellaneous','hendrycksTest-moral_disputes','hendrycksTest-moral_scenarios',
                       'hendrycksTest-nutrition','hendrycksTest-philosophy','hendrycksTest-prehistory','hendrycksTest-professional_accounting',
                       'hendrycksTest-professional_law','hendrycksTest-professional_medicine','hendrycksTest-professional_psychology',
                       'hendrycksTest-public_relations','hendrycksTest-security_studies','hendrycksTest-sociology','hendrycksTest-us_foreign_policy',
                       'hendrycksTest-virology','hendrycksTest-world_religions']]
old_lb.loc[:,['ARC', 'HellaSwag', 'MMLU', 'TruthfulQA', 'Winogrande', 'GSM8K']]=old_lb.loc[:,['ARC', 'HellaSwag', 'MMLU', 'TruthfulQA', 'Winogrande', 'GSM8K']]/100
old_lb.shape

(6811, 65)

### Processing Tatsu data

In [5]:
tatsu_base = pd.read_csv('base_llm_benchmark_eval.csv')
tatsu_base['Instruct'] = False
tatsu_base['Model Family'] = [standard_name(m,slash=False) for m in tatsu_base['Model Family']]
tatsu_base = tatsu_base.rename(columns={'ARC-C': 'ARC','Model Family':'Family','Model Size (B)':'#Params (B)','Winograd':'Winogrande'})
tatsu_inst = pd.read_csv('instruct_llm_benchmark_eval.csv')
tatsu_inst['Instruct'] = True
tatsu_inst['Model Family'] = [standard_name(m,slash=False) for m in tatsu_inst['Model Family']]
tatsu_inst = tatsu_inst.rename(columns={'ARC-C': 'ARC','Model Family':'Family','Model Size (B)':'#Params (B)','Winograd':'Winogrande'})
tatsu = pd.concat((tatsu_base, tatsu_inst), axis=0).reset_index(drop=True)

Comparing model names from Tatsu data and new lb data (we do not worry about the old cause tatsu data was build from the old lb)

In [6]:
interdata = [[m1]+search(m1, list(new_lb.Model)) for m1 in list(tatsu.Model)]
interdata = pd.DataFrame(interdata).sort_values(by=[2], ascending=False)
print(interdata.to_string())

                                                  0                                             1    2
72                            bigcode/starcoder2-7b                         bigcode/starcoder2-7b  100
73                            bigcode/starcoder2-3b                         bigcode/starcoder2-3b  100
8                        meta-llama/Meta-Llama-3-8B                    meta-llama/Meta-Llama-3-8B  100
10                                 Qwen/Qwen1.5-32B                              Qwen/Qwen1.5-32B  100
11                                 Qwen/Qwen1.5-14B                              Qwen/Qwen1.5-14B  100
12                                  Qwen/Qwen1.5-7B                               Qwen/Qwen1.5-7B  100
13                                  Qwen/Qwen1.5-4B                               Qwen/Qwen1.5-4B  100
20                      mistralai/Mixtral-8x7B-v0.1                   mistralai/Mixtral-8x7B-v0.1  100
14                                Qwen/Qwen1.5-1.8B                      

In [7]:
#it seems that only the chat llama-2 family and dolly is not matching
tatsu = tatsu.replace({'dolly-v2-12b': 'databricks/dolly-v2-12b',
                       'llama-2-7b-chat': 'meta-llama/Llama-2-7b-chat-hf',
                       'llama-2-13b-chat': 'meta-llama/Llama-2-13b-chat-hf',
                       'llama-2-70b-chat': 'meta-llama/Llama-2-70b-chat-hf'})

### Merging LB and Tatsu data

In [8]:
cons_lb = consolidate_columns((new_lb.merge(tatsu, on='Model', how='outer')))
cons_lb =  consolidate_columns(cons_lb.merge(old_lb, on='Model', how='left'))
cons_lb.shape

(219, 117)

In [9]:
cons_lb.loc[cons_lb.Family=='mistral-instruct','Family'] = 'mistral'

Getting more instruct models from the old lb (we do not care about the new lb, cause we have already filtered all models of interest from that lb). 

In [10]:
old_lb_instruct = old_lb.loc[['-instruct' in m or '-Instruct' in m or '-chat' in m or '-Chat' in m or '-it' in m or '-It' in m for m in old_lb.Model]].reset_index(drop=True)
old_lb_instruct['Model2'] = [m.replace('-instruct','').replace('-chat','').replace('-it','').replace('-Instruct','').replace('-Chat','').replace('-It','') for m in old_lb_instruct.Model]
old_lb_instruct = old_lb_instruct.loc[[m in list(cons_lb.Model) for m in old_lb_instruct['Model2']]]
old_lb_instruct = old_lb_instruct.drop(['Model2'], axis=1)
old_lb_instruct['Instruct'] = True
cons_lb = pd.concat((cons_lb,old_lb_instruct), axis=0)
cons_lb = cons_lb.drop_duplicates(subset=['Model'], keep='first')
cons_lb.shape

(232, 117)

We also include more models from the 'rwkv-raven' family (missing from Tatsu data)

more_data = old_lb.loc[['rwkv-raven' in m for m in old_lb.Model]].reset_index(drop=True)
more_data['Instruct'] = False
cons_lb = pd.concat((cons_lb,more_data), axis=0)
cons_lb = cons_lb.drop_duplicates(subset=['Model'], keep='first').reset_index(drop=True)
cons_lb.shape

cons_lb.loc[['rwkv-raven' in m for m in cons_lb.Model],'Family'] = 'rwkv-raven' 

In [11]:
cons_lb = cons_lb.loc[['rwkv-raven' not in m for m in cons_lb.Model]] # we delete 'rwkv-raven' cause there is only one model from this family

Filling family for instruct models

In [12]:
for m in cons_lb.loc[cons_lb.Instruct].Model:
    model_name = m.replace('-instruct','').replace('-chat','').replace('-it','').replace('-Instruct','').replace('-Chat','').replace('-It','')
    for var in ['Family']:
        if cons_lb.loc[cons_lb.Model==m, var].isnull().iloc[0]:
            if not list(cons_lb.loc[cons_lb.Model==model_name, var])==[]:
                cons_lb.loc[cons_lb.Model==m, var] = cons_lb.loc[cons_lb.Model==model_name, var].iloc[0]

Getting model size from the names (the data from the lb can be misleading sometimes)

In [13]:
ind = ~np.array([re.search(r'\d+(\.\d+)?[BbMm]', m) is None for m in list(cons_lb.Model)]) 
sizes = [re.search(r'\d+(\.\d+)?[BbMm]', m).group() for m in list(cons_lb.loc[ind].Model)]
for i in range(len(sizes)):
    if sizes[i][-1].lower()=='b':
        sizes[i] = float(sizes[i][:-1])
    else:
        sizes[i] = float(sizes[i][:-1])/1000
cons_lb.loc[ind,['#Params (B)']] = sizes

Standardizing model names

In [14]:
cons_lb['Model'] = [standard_name(m) for m in cons_lb.Model]
np.unique(cons_lb['Family'])

array(['amber', 'bloom', 'claude-1', 'claude-2', 'claude-instant',
       'codegen', 'codellama', 'codellama-instruct', 'decilm',
       'deepseek-coder', 'deepseek-llm', 'deepseek-llm-chat',
       'deepseek-moe', 'falcon', 'gemma', 'gemma-1.1', 'gemma-2', 'gpt-2',
       'gpt-3.5-turbo', 'gpt-4', 'gpt-neo/j', 'guanaco', 'internlm2',
       'koala', 'lemur-chat', 'llama', 'llama-2', 'llama-v1.3',
       'meta-llama-3', 'meta-llama-3.1', 'mistral', 'mixtral-8x-v0.1',
       'mpt', 'oasst-sft', 'olmo', 'openchat', 'openllama', 'openllamav2',
       'opt', 'phi', 'pythia', 'qwen', 'qwen1.5', 'qwen2',
       'recurrentgemma', 'redpajama-incite-base', 'rwkv', 'smollm',
       'stablelm', 'starcoder', 'starcoder2', 'vicuna', 'wizardlm',
       'xglm', 'yi', 'yi-1.5', 'yi-200k'], dtype=object)

### Merging HumanEval

In [15]:
humaneval = [['GPT-4-Turbo (April 2024)',90.2],
['GPT-4 (May 2023)',88.4],
['DeepSeek-Coder-V2-Instruct',85.4],
['GPT-4-Turbo (Nov 2023)',85.4],
['CodeQwen1.5-7B-Chat',83.5],
['claude-3-opus (Mar 2024)',82.9],
['DeepSeek-Coder-33B-instruct',81.1],
['WizardCoder-33B-V1.1',79.9],
['OpenCodeInterpreter-DS-33B',79.3],
['Llama3-70B-instruct',77.4],
['OpenCodeInterpreter-DS-6.7B',77.4],
['speechless-codellama-34B-v2.0',77.4],
['GPT-3.5-Turbo (Nov 2023)',76.8],
['Magicoder-S-DS-6.7B',76.8],
['claude-3-haiku (Mar 2024)',76.8],
['Mixtral-8x22B-Instruct-v0.1',76.2],
['Artigenz-Coder-DS-6.7B',75.6],
['DeepSeek-Coder-7B-instruct-v1.5',75.6],
['XwinCoder-34B',75.6],
['WaveCoder-Ultra-6.7B',75],
['databricks/dbrx-instruct',75],
['DeepSeek-Coder-6.7B-instruct',74.4],
['code-millenials-34B',74.4],
['starchat2-15b-v0.1',73.8],
['GPT-3.5 (May 2023)',73.2],
['WizardCoder-Python-34B-V1.0',73.2],
['OpenChat-3.5-7B-0106',72.6],
['CodeLlama-70B-Instruct',72],
['WhiteRabbitNeo-33B-v1',72],
['Phind-CodeLlama-34B-v2',71.3],
['speechless-coder-ds-6.7B',71.3],
['Magicoder-S-CL-7B',70.7],
['claude-3-sonnet (Mar 2024)',70.7],
['Mistral Large (Mar 2024)',69.5],
['claude-2 (Mar 2024)',69.5],
['Qwen1.5-72B-Chat',68.3],
['Gemini Pro 1.5',68.3],
['starcoder2-15b-instruct-v0.1',67.7],
['speechless-starcoder2-15b',67.1],
['DeepSeek-Coder-1.3B-instruct',65.9],
['Code-290k-6.7B-Instruct',64.6],
['Phi-3-mini-4k-instruct',64.6],
['Command-R+',64],
['dolphin-2.6-mixtral-8x7b',64],
['Gemini Pro 1.0',63.4],
['Llama3-8B-instruct',61.6],
['codegemma-7b-it',60.4],
['claude-instant-1 (Mar 2024)',57.3],
['WizardCoder-15B-V1.0',56.7],
['Code-13B',56.1],
['speechless-starcoder2-7b',56.1],
['CodeLlama-70B',55.5],
['Code-33B',54.9],
['speechless-coding-7B-16k-tora',54.9],
['OpenHermes-2.5-Code-290k-13B',54.3],
['CodeLlama-34B',51.8],
['CodeQwen1.5-7B',51.8],
['DeepSeek-Coder-33B-base',51.2],
['WizardCoder-Python-7B-V1.0',50.6],
['phi-2-2.7B',49.4],
['Mistral-codealpaca-7B',48.2],
['speechless-code-mistral-7B-v1.0',48.2],
['DeepSeek-Coder-6.7B-base',47.6],
['MistralHermes-CodePro-7B-v1',47.6],
['StarCoder2-15B',46.3],
['Mixtral-8x7B-Instruct-v0.1',45.1],
['codegemma-7b',44.5],
['SOLAR-10.7B-Instruct-v1.0',43.3],
['CodeLlama-13B',42.7],
['gemma-1.1-7b-it',42.7],
['Mistral-7B-Instruct-v0.2',42.1],
['xDAN-L1-Chat-RL-v1-7B',40.2],
['CodeLlama-7B',37.8],
['StarCoder2-7B',35.4],
['gemma-7b',35.4],
['StarCoder-15B',34.1],
['Llama3-8B-base',33.5],
['CodeGen-16B',32.9],
['Python-Code-13B',32.9],
['CodeT5+-16B',31.7],
['StarCoder2-3B',31.7],
['Zephyr Î²-7B',30],
['CodeGen-6B',29.3],
['CodeT5+-6B',29.3],
['stable-code-3B',29.3],
['DeepSeek-Coder-1.3B-base',28.7],
['Mistral-7B',28.7],
['gemma-7b-it',28.7],
['codegemma-2b',26.8],
['CodeT5+-2B',25],
['gemma-2b',25],
['CodeGen-2B',24.4],
['StarCoderBase-7B',24.4],
['gemma-1.1-2b-it',22.6],
['CodeGen2-16B',19.5],
['CodeGen2-7B',18.3],
['StarCoderBase-3B',17.7],
['gemma-2b-it',17.7],
['Vicuna-13B',17.1],
['CodeGen2-3B',15.9],
['InCoder-6.7B',15.9],
['SantaCoder-1.1B',14.6],
['StarCoderBase-1B',14.6],
['GPT-J-6B',12.2],
['InCoder-1.3B',12.2],
['Vicuna-7B',11.6],
['CodeGen2-1B',11],
['GPT-Neo-2.7B',7.9],
['PolyCoder-2.7B',6.1],
['StableLM-7B',2.4],
['zyte-1B',2.4]]

In [16]:
humaneval = pd.DataFrame(np.array(humaneval), columns = ['Model','HumanEval'])
humaneval['Model'] = [m.lower() for m in humaneval['Model']]
humaneval['HumanEval'] = humaneval['HumanEval'].astype(float)/100

In [17]:
interdata = [[m1]+search(m1, list(cons_lb.Model)) for m1 in list(humaneval.Model)]
interdata = pd.DataFrame(interdata).sort_values(by=[2], ascending=False)
print(interdata.to_string())

                                   0                            1    2
55                     codellama-34b                codellama-34b  100
57           deepseek-coder-33b-base      deepseek-coder-33b-base  100
62          deepseek-coder-6.7b-base     deepseek-coder-6.7b-base  100
64                    starcoder2-15b               starcoder2-15b  100
65        mixtral-8x7b-instruct-v0.1   mixtral-8x7b-instruct-v0.1  100
96                  starcoderbase-3b             starcoderbase-3b  100
97                       gemma-2b-it                  gemma-2b-it  100
15       mixtral-8x22b-instruct-v0.1  mixtral-8x22b-instruct-v0.1  100
51                     codellama-70b                codellama-70b  100
85          deepseek-coder-1.3b-base     deepseek-coder-1.3b-base  100
87                       gemma-7b-it                  gemma-7b-it  100
90                          gemma-2b                     gemma-2b  100
27            codellama-70b-instruct       codellama-70b-instruct  100
72    

In [18]:
names_map = {'codegen-16b':'codegen-16b-nl',
             'codegen-2b':'codegen-2b-nl',
             'codegen-6b':'codegen-6b-nl',
             'codegen2-16b':'codegen2-16b',
             'codegen2-1b':'codegen2-1b',
             'codegen2-3b':'codegen2-3b',
             'codegen2-7b':'codegen2-7b',
             'llama3-70b-instruct':'meta-llama-3-70b-instruct',
             'llama3-8b-base':'meta-llama-3-8b',
             'llama3-10b-base':'meta-llama-3-70b',
             'llama3-8b-instruct':'meta-llama-3-8b-instruct',
             'mistral-7b':'mistral-7b-v0.1',
             'vicuna-13b-v1.1':'vicuna-13b',
             'vicuna-7b-v1.1':'vicuna-7b'}
humaneval = humaneval.replace(names_map)

In [19]:
cons_lb = consolidate_columns(humaneval.merge(cons_lb, on='Model', how='right'))
cons_lb.shape

(231, 117)

### Merging training tokens info

Some models do not have training tokens info (we needed to fill by hand)

In [20]:
training_tokens = pd.read_csv('training_tokens.csv').drop(['Family','Parameters (b)'], axis=1)
training_tokens.columns = ['Model', 'Pretraining Data Size (T)']
cons_lb = consolidate_columns(cons_lb.merge(training_tokens, on='Model', how='left'))

Filling some values

In [21]:
instruct_models = np.array(cons_lb.loc[cons_lb.Instruct].Model)

for m in instruct_models:
    family,size=tuple(cons_lb.loc[cons_lb.Model==m,['Family','#Params (B)']].iloc[0])
    if m=='dolly-v2-3b':size = 2.8
    if m=='dolly-v2-7b':size = 6.9
        
    for var in ['Pretraining Data Size (T)','#Params (B)']:
        ind = np.array(cons_lb.Family==family)*np.array(cons_lb['#Params (B)']==size)*np.array(cons_lb['Instruct']==False)
        if np.sum(ind)>0:
            cons_lb.loc[cons_lb.Model==m,var] = cons_lb.loc[ind].loc[:,var].iloc[0]
        else:
            cons_lb.loc[cons_lb.Model==m,var] = cons_lb.loc[cons_lb.Model==m,var]
cons_lb['FLOPs (1E21)'] = 6*cons_lb['#Params (B)']*cons_lb['Pretraining Data Size (T)']

In [22]:
np.unique(cons_lb['Family'])

array(['amber', 'bloom', 'claude-1', 'claude-2', 'claude-instant',
       'codegen', 'codellama', 'codellama-instruct', 'decilm',
       'deepseek-coder', 'deepseek-llm', 'deepseek-llm-chat',
       'deepseek-moe', 'falcon', 'gemma', 'gemma-1.1', 'gemma-2', 'gpt-2',
       'gpt-3.5-turbo', 'gpt-4', 'gpt-neo/j', 'guanaco', 'internlm2',
       'koala', 'lemur-chat', 'llama', 'llama-2', 'llama-v1.3',
       'meta-llama-3', 'meta-llama-3.1', 'mistral', 'mixtral-8x-v0.1',
       'mpt', 'oasst-sft', 'olmo', 'openchat', 'openllama', 'openllamav2',
       'opt', 'phi', 'pythia', 'qwen', 'qwen1.5', 'qwen2',
       'recurrentgemma', 'redpajama-incite-base', 'rwkv', 'smollm',
       'stablelm', 'starcoder', 'starcoder2', 'vicuna', 'wizardlm',
       'xglm', 'yi', 'yi-1.5', 'yi-200k'], dtype=object)

### Filtering families
Leaving models families that have at least two base models

In [23]:
ind = (1-(cons_lb.Instruct)).astype(bool)
families, counts = np.unique(cons_lb.loc[ind].Family, return_counts=True)
families = [f for (f,c) in zip(families, counts) if c > 1]
cons_lb = cons_lb.loc[[f in families for f in list(cons_lb.Family)]]

In [24]:
cons_lb.Instruct

0      False
1       True
2      False
3       True
4      False
       ...  
225     True
226     True
228     True
229     True
230     True
Name: Instruct, Length: 197, dtype: object

In [25]:
cons_lb.loc[['dedup' in m for m in cons_lb.Model],'Family'] = 'pythia-deduped'

In [26]:
vars = ['Model','Family','Instruct','date','#Params (B)','Pretraining Data Size (T)','FLOPs (1E21)']
cons_lb = cons_lb.loc[:,vars+[c for c in cons_lb.columns if c not in vars]]
cons_lb.to_csv('data_v1.csv')

V2

In [27]:
cons_lb = cons_lb.sort_values(by=['Model','#Params (B)'])
benchs = ['IFEval',
         'BBH',
         'MATH Lvl 5',
         'GPQA',
         'MUSR',
         'MMLU-PRO'] + ['MMLU',
         'ARC',
         'HellaSwag',
         'Winogrande',
         'TruthfulQA',
         'GSM8K',
         'HumanEval']

diffs = []
for model in ['pythia-12b','pythia-160m','pythia-2.8b','pythia-410m','pythia-6.9b']:
    diffs.append(np.abs(np.array(cons_lb.loc[cons_lb.Model==model,benchs])-np.array(cons_lb.loc[cons_lb.Model==model+'-deduped',benchs])))
diffs = np.vstack(diffs)
np.nanmean(diffs,0)

  np.nanmean(diffs,0)


array([       nan,        nan,        nan,        nan,        nan,
              nan, 0.00826668, 0.01478111, 0.00976551, 0.01235135,
       0.0050058 , 0.0022627 ,        nan])

In [28]:
benchs = ['IFEval',
 'BBH',
 'MATH Lvl 5',
 'GPQA',
 'MUSR',
 'MMLU-PRO',
 'bbh_boolean_expressions',
 'bbh_causal_judgement',
 'bbh_date_understanding',
 'bbh_disambiguation_qa',
 'bbh_formal_fallacies',
 'bbh_geometric_shapes',
 'bbh_hyperbaton',
 'bbh_logical_deduction_five_objects',
 'bbh_logical_deduction_seven_objects',
 'bbh_logical_deduction_three_objects',
 'bbh_movie_recommendation',
 'bbh_navigate',
 'bbh_object_counting',
 'bbh_penguins_in_a_table',
 'bbh_reasoning_about_colored_objects',
 'bbh_ruin_names',
 'bbh_salient_translation_error_detection',
 'bbh_snarks',
 'bbh_sports_understanding',
 'bbh_temporal_sequences',
 'bbh_tracking_shuffled_objects_five_objects',
 'bbh_tracking_shuffled_objects_seven_objects',
 'bbh_tracking_shuffled_objects_three_objects',
 'bbh_web_of_lies',
 'gpqa_diamond',
 'gpqa_extended',
 'gpqa_main',
 'math_algebra_hard',
 'math_counting_and_prob_hard',
 'math_geometry_hard',
 'math_intermediate_algebra_hard',
 'math_num_theory_hard',
 'math_prealgebra_hard',
 'math_precalculus_hard',
 'musr_murder_mysteries',
 'musr_object_placements',
 'musr_team_allocation',
 'XWinograd',
 'Arena-Elo',
 'MTBench',
 'hendrycksTest-abstract_algebra',
 'hendrycksTest-anatomy',
 'hendrycksTest-astronomy',
 'hendrycksTest-business_ethics',
 'hendrycksTest-clinical_knowledge',
 'hendrycksTest-college_biology',
 'hendrycksTest-college_chemistry',
 'hendrycksTest-college_computer_science',
 'hendrycksTest-college_mathematics',
 'hendrycksTest-college_medicine',
 'hendrycksTest-college_physics',
 'hendrycksTest-computer_security',
 'hendrycksTest-conceptual_physics',
 'hendrycksTest-econometrics',
 'hendrycksTest-electrical_engineering',
 'hendrycksTest-elementary_mathematics',
 'hendrycksTest-formal_logic',
 'hendrycksTest-global_facts',
 'hendrycksTest-high_school_biology',
 'hendrycksTest-high_school_chemistry',
 'hendrycksTest-high_school_computer_science',
 'hendrycksTest-high_school_european_history',
 'hendrycksTest-high_school_geography',
 'hendrycksTest-high_school_government_and_politics',
 'hendrycksTest-high_school_macroeconomics',
 'hendrycksTest-high_school_mathematics',
 'hendrycksTest-high_school_microeconomics',
 'hendrycksTest-high_school_physics',
 'hendrycksTest-high_school_psychology',
 'hendrycksTest-high_school_statistics',
 'hendrycksTest-high_school_us_history',
 'hendrycksTest-high_school_world_history',
 'hendrycksTest-human_aging',
 'hendrycksTest-human_sexuality',
 'hendrycksTest-international_law',
 'hendrycksTest-jurisprudence',
 'hendrycksTest-logical_fallacies',
 'hendrycksTest-machine_learning',
 'hendrycksTest-management',
 'hendrycksTest-marketing',
 'hendrycksTest-medical_genetics',
 'hendrycksTest-miscellaneous',
 'hendrycksTest-moral_disputes',
 'hendrycksTest-moral_scenarios',
 'hendrycksTest-nutrition',
 'hendrycksTest-philosophy',
 'hendrycksTest-prehistory',
 'hendrycksTest-professional_accounting',
 'hendrycksTest-professional_law',
 'hendrycksTest-professional_medicine',
 'hendrycksTest-professional_psychology',
 'hendrycksTest-public_relations',
 'hendrycksTest-security_studies',
 'hendrycksTest-sociology',
 'hendrycksTest-us_foreign_policy',
 'hendrycksTest-virology',
 'hendrycksTest-world_religions',
 'MMLU',
 'ARC',
 'HellaSwag',
 'Winogrande',
 'TruthfulQA',
 'GSM8K',
 'HumanEval']

In [29]:
for model in ['pythia-12b','pythia-160m','pythia-2.8b','pythia-410m','pythia-6.9b']:
    for bench in benchs:
        x = cons_lb.loc[cons_lb.Model==model,[bench]].iloc[0,0]
        y = cons_lb.loc[cons_lb.Model==model+'-deduped',[bench]].iloc[0,0]
        if np.isnan(x):
            cons_lb.loc[cons_lb.Model==model,[bench]] = y
    cons_lb = cons_lb.loc[cons_lb.Model!=model+'-deduped']
for model in ['pythia-1.4b-deduped','pythia-1b-deduped','pythia-70m-deduped']:
    cons_lb.loc[cons_lb.Model==model,['Family']] = cons_lb.loc[cons_lb.Model==model,['Family']].iloc[0,0].replace('-deduped','')
    cons_lb.loc[cons_lb.Model==model,['Model']] = cons_lb.loc[cons_lb.Model==model,['Model']].iloc[0,0].replace('-deduped','')
    

models_to_delete = ['blossom-v5.1-34b',
 'blossom-v5.1-9b',
 'meta-llama-3.1-70b',
 'meta-llama-3.1-70b-instruct',
 'meta-llama-3.1-8b',
 'meta-llama-3.1-8b-instruct',
 'falcon-rw-1b',
 'sauerkrautlm-gemma-2b',
 'sauerkrautlm-gemma-7b',
 'openhermes-13b',
 'openhermes-7b',
 'orca-2-13b',
 'orca-2-7b',
 'orca_mini_v3_13b',
 'orca_mini_v3_70b',
 'orca_mini_v3_7b',
 'wizardlm-13b-v1.0',
 'wizardlm-70b-v1.0',
 'llama-3-sauerkrautlm-70b-instruct',
 'llama-3-sauerkrautlm-8b-instruct',
 'mpt-30b-chat',
 'mpt-7b-chat',
 'dolphin-2.9.2-qwen2-72b',
 'dolphin-2.9.2-qwen2-7b',
 'dolphin-2.9.1-yi-1.5-34b',
 'dolphin-2.9.1-yi-1.5-9b',
 'orca_mini_v7_7b',
 'orca_mini_v7_72b']


In [30]:
models_to_delete = ['meta-llama-3.1-70b',
 'meta-llama-3.1-70b-instruct',
 'meta-llama-3.1-8b',
 'meta-llama-3.1-8b-instruct']


In [31]:
for model in models_to_delete:
    cons_lb = cons_lb.loc[cons_lb.Model!=model]
cons_lb.shape

(188, 117)

In [32]:
cons_lb = cons_lb.sort_values(by=['Model','#Params (B)'])
cons_lb = cons_lb.reset_index(drop=True)

In [33]:
cons_lb['Family2'] = [remove_params(s) for s in cons_lb.Model]
cons_lb['Family2']

0        bloom
1        bloom
2        bloom
3        bloom
4        bloom
        ...   
183    yi-chat
184         yi
185    yi-200k
186    yi-chat
187         yi
Name: Family2, Length: 188, dtype: object

In [34]:
for f in ['gpt-j', 'gpt-neo', 'gpt-neox']:
    cons_lb.loc[cons_lb['Family2']==f,'Family2'] = 'gpt-j-neo-neox'

In [35]:
np.unique(cons_lb['Family2'])

array(['bloom', 'blossom-v5.1', 'codegen-nl', 'codellama',
       'codellama-instruct', 'deepseek-coder-base', 'dolly-v2',
       'dolphin-2.9.1-yi-1.5', 'dolphin-2.9.2-qwen2', 'falcon',
       'falcon-instruct', 'falcon-rw', 'gemma', 'gemma-2', 'gemma-2-it',
       'gemma-it', 'gpt-j-neo-neox', 'gpt2', 'gpt2-large', 'internlm2',
       'llama', 'llama-2', 'llama-2-chat',
       'llama-3-sauerkrautlm-instruct', 'meta-llama-3',
       'meta-llama-3-instruct', 'mixtral-8x-instruct-v0.1',
       'mixtral-8x-v0.1', 'mpt', 'mpt-chat', 'mpt-instruct', 'olmo',
       'open_llama_', 'open_llama__v2', 'openhermes', 'opt', 'orca-2',
       'orca_mini_v3_', 'orca_mini_v7_', 'phi-1_5', 'phi-2', 'pythia',
       'qwen', 'qwen1.5', 'qwen1.5-chat', 'qwen2', 'qwen2-instruct',
       'recurrentgemma', 'recurrentgemma-it',
       'redpajama-incite-base-v0.1', 'redpajama-incite-base-v1',
       'rwkv-4-pile', 'sauerkrautlm-gemma', 'smollm', 'smollm-instruct',
       'stablelm-2-1_', 'stablelm-2-1_-chat',

In [36]:
np.unique(cons_lb['Family'])

array(['bloom', 'codegen', 'codellama', 'deepseek-coder', 'falcon',
       'gemma', 'gemma-2', 'gpt-2', 'gpt-neo/j', 'internlm2', 'llama',
       'llama-2', 'meta-llama-3', 'mixtral-8x-v0.1', 'mpt', 'olmo',
       'openllama', 'openllamav2', 'opt', 'phi', 'pythia', 'qwen',
       'qwen1.5', 'qwen2', 'recurrentgemma', 'redpajama-incite-base',
       'rwkv', 'smollm', 'stablelm', 'starcoder', 'starcoder2', 'xglm',
       'yi', 'yi-1.5', 'yi-200k'], dtype=object)

In [37]:
cons_lb.to_csv('data_v2.csv')

In [47]:
test_families_list = [['bloom'],
                           ['codegen-nl'],
                           ['codellama'],
                           ['deepseek-coder-base'],
                           ['pythia','dolly-v2'],
                           ['falcon'],
                           ['gemma', 'gemma-it','sauerkrautlm-gemma'],
                           ['gpt-j-neo-neox'], 
                           ['internlm2'],
                           ['meta-llama-3', 'meta-llama-3-instruct'],
                           ['mpt', 'mpt-chat','mpt-instruct'],
                           ['olmo'],
                           ['opt'],
                           ['qwen2'],
                           ['rwkv-4-pile'],
                           #['rwkv-raven'],
                           ['starcoder2'],
                           ['stablelm-base-alpha'],
                           ['xglm'],
                           ['yi-1.5', 'yi-1.5-chat','dolphin-2.9.1-yi-1.5'],
                           ['bloom'],
                           ['pythia','dolly-v2'],
                           ['falcon','falcon-instruct'],
                           ['gemma-2', 'gemma-2-it'],
                           ['gpt-j-neo-neox'], 
                           ['meta-llama-3', 'meta-llama-3-instruct','llama-3-sauerkrautlm-instruct'],
                           ['olmo'],
                           ['opt'],
                           ['qwen2','qwen2-instruct','dolphin-2.9.2-qwen2'],
                           ['starcoder2'],
                           ['smollm', 'smollm-instruct'],
                           ['yi-1.5', 'yi-1.5-chat','dolphin-2.9.1-yi-1.5'],
                           ['bloom'],
                           ['pythia','dolly-v2'],
                           ['falcon'],
                           ['gemma', 'gemma-it', 'sauerkrautlm-gemma'],
                           ['gpt-j-neo-neox'], 
                           ['meta-llama-3', 'meta-llama-3-instruct'],
                           ['olmo'],
                           ['opt'],
                           ['qwen2'],
                           ['starcoder2'],
                           ['yi-1.5', 'yi-1.5-chat','dolphin-2.9.1-yi-1.5']]
test_families_list = [item for sublist in test_families_list for item in sublist]

In [77]:
cons_lb = pd.read_csv('data_v2.csv')

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [124]:
cons_lb = pd.read_csv('data_v2.csv')
cons_lb = cons_lb.loc[np.array([not bool(np.isnan(x)) for x in cons_lb.loc[:,'FLOPs (1E21)']])]
cons_lb['Leaderboard1'] = [bool(not np.isnan(x)) for x in cons_lb['MMLU']]
cons_lb['Leaderboard2'] = [bool(not np.isnan(x)) for x in cons_lb['IFEval']]
cons_lb['HumanEval'] = [bool(not np.isnan(x)) for x in cons_lb['HumanEval']]
cons_lb['OriginalFamily'] = cons_lb['Family']
cons_lb['Family'] = cons_lb['Family2']
cons_lb['TestFamily'] = [x in test_families_list for x in cons_lb['Family']]
cons_lb = cons_lb.loc[:,['Model','Family','OriginalFamily','TestFamily','Leaderboard1','Leaderboard2','HumanEval']]

unique_families, counts_families = np.unique(cons_lb.Family, return_counts=True)
avail_families = unique_families[counts_families>=2]
cons_lb = cons_lb.loc[[f in avail_families for f in cons_lb.Family]]
cons_lb = cons_lb.reset_index(drop=True)

np.mean(cons_lb['Leaderboard1'].astype(int)+cons_lb['Leaderboard2'].astype(int)>0)

np.float64(1.0)

In [127]:
np.unique(cons_lb['Family']).shape

(53,)

In [128]:
np.unique(cons_lb['OriginalFamily']).shape

(30,)

In [129]:
for v in ['Leaderboard1', 'Leaderboard2', 'HumanEval']:
    cons_lb[v] = cons_lb[v].astype(int)

d = cons_lb.loc[:,['OriginalFamily','Leaderboard1', 'Leaderboard2', 'HumanEval']].groupby('OriginalFamily').mean()

In [130]:
(np.array(d)>0).sum(0)

array([28, 17, 18])

In [137]:
(np.array(d['Leaderboard1']+d['Leaderboard2'])>1).sum(0)

np.int64(15)

In [106]:
print(cons_lb.to_latex())

\begin{tabular}{llllrrrr}
\toprule
 & Model & Family & OriginalFamily & TestFamily & Leaderboard1 & Leaderboard2 & HumanEval \\
\midrule
0 & bloom & bloom & bloom & True & True & False & True \\
1 & bloom-1b1 & bloom & bloom & True & True & True & True \\
2 & bloom-3b & bloom & bloom & True & True & True & True \\
3 & bloom-560m & bloom & bloom & True & True & True & True \\
4 & bloom-7b1 & bloom & bloom & True & True & True & True \\
5 & blossom-v5.1-34b & blossom-v5.1 & yi-1.5 & False & True & True & False \\
6 & blossom-v5.1-9b & blossom-v5.1 & yi-1.5 & False & False & True & False \\
7 & codegen-16b-nl & codegen-nl & codegen & True & True & False & True \\
8 & codegen-6b-nl & codegen-nl & codegen & True & True & False & True \\
9 & codellama-13b & codellama & codellama & True & True & False & True \\
10 & codellama-34b & codellama & codellama & True & True & False & True \\
11 & codellama-70b & codellama & codellama & True & True & False & True \\
12 & codellama-7b & codellama & co

In [107]:
EQ_dat = {'meta-llama-3-70b-instruct': 82.13, 'yi-1.5-34b-chat': 72.93, 
          'qwen1.5-32b-chat': 75.59, 'meta-llama-3-8b-instruct': 68.88, 
          'yi-34b-chat': 71.62, 'yi-1.5-9b-chat':70.37, 
          'qwen1.5-14b-chat': 74.99, 'llama-2-70b-chat': 73.59, 
          'yi-1.5-6b-chat': 59.45,'qwen1.5-7b-chat': 54.41, 'gemma-7b-it': 61.72,
            'llama-2-13b-chat': 49.12, 'llama-2-7b-chat': 36.32, 'qwen1.5-4b-chat': 28.75, 'qwen1.5-1.8b-chat': 24.12  }

In [110]:
np.unique(list(EQ_dat.keys()))

array(['gemma-7b-it', 'llama-2-13b-chat', 'llama-2-70b-chat',
       'llama-2-7b-chat', 'meta-llama-3-70b-instruct',
       'meta-llama-3-8b-instruct', 'qwen1.5-1.8b-chat',
       'qwen1.5-14b-chat', 'qwen1.5-32b-chat', 'qwen1.5-4b-chat',
       'qwen1.5-7b-chat', 'yi-1.5-34b-chat', 'yi-1.5-6b-chat',
       'yi-1.5-9b-chat', 'yi-34b-chat'], dtype='<U25')

Exploring the data 1

In [187]:
ind = np.array(cons_lb.Instruct)
ind = (1-ind).astype(bool)

In [188]:
vars1 = ['Model', 'Family', 'Instruct','Pretraining Data Size (T)','#Params (B)',
        'ARC', 'HellaSwag', 'TruthfulQA', 'GSM8K', 'Winogrande']
np.sum(np.unique(cons_lb.loc[ind,vars].dropna().Family, return_counts=True)[1]>1), np.unique(cons_lb.loc[ind,vars].dropna().Family, return_counts=True)

(17,
 (array(['bloom', 'falcon', 'gemma', 'gemma-2', 'gpt-neo/j', 'llama',
         'llama-2', 'meta-llama-3', 'olmo', 'opt', 'pythia', 'qwen1.5',
         'qwen2', 'recurrentgemma', 'smollm', 'starcoder2', 'yi', 'yi-1.5'],
        dtype=object),
  array([4, 2, 2, 2, 2, 3, 3, 2, 2, 2, 5, 6, 4, 1, 3, 3, 3, 3])))

In [189]:
vars2 = ['Model', 'Family', 'Instruct','Pretraining Data Size (T)','#Params (B)',
        'IFEval', 'BBH', 'MATH Lvl 5', 'GPQA', 'MUSR', 'MMLU-PRO']
np.sum(np.unique(cons_lb.loc[ind,vars].dropna().Family, return_counts=True)[1]>1), np.unique(cons_lb.loc[ind,vars].dropna().Family, return_counts=True)

(17,
 (array(['bloom', 'falcon', 'gemma', 'gemma-2', 'gpt-neo/j', 'llama',
         'llama-2', 'meta-llama-3', 'olmo', 'opt', 'pythia', 'qwen1.5',
         'qwen2', 'recurrentgemma', 'smollm', 'starcoder2', 'yi', 'yi-1.5'],
        dtype=object),
  array([4, 2, 2, 2, 2, 3, 3, 2, 2, 2, 5, 6, 4, 1, 3, 3, 3, 3])))

In [190]:
vars3 = vars2 + ['ARC', 'HellaSwag', 'TruthfulQA', 'GSM8K', 'Winogrande','MMLU']
np.sum(np.unique(cons_lb.loc[ind,vars].dropna().Family, return_counts=True)[1]>1), np.unique(cons_lb.loc[ind,vars].dropna().Family, return_counts=True)

(17,
 (array(['bloom', 'falcon', 'gemma', 'gemma-2', 'gpt-neo/j', 'llama',
         'llama-2', 'meta-llama-3', 'olmo', 'opt', 'pythia', 'qwen1.5',
         'qwen2', 'recurrentgemma', 'smollm', 'starcoder2', 'yi', 'yi-1.5'],
        dtype=object),
  array([4, 2, 2, 2, 2, 3, 3, 2, 2, 2, 5, 6, 4, 1, 3, 3, 3, 3])))

In [191]:
vars4 = vars3 + ['HumanEval']
np.sum(np.unique(cons_lb.loc[ind,vars].dropna().Family, return_counts=True)[1]>1), np.unique(cons_lb.loc[ind,vars].dropna().Family, return_counts=True)

(17,
 (array(['bloom', 'falcon', 'gemma', 'gemma-2', 'gpt-neo/j', 'llama',
         'llama-2', 'meta-llama-3', 'olmo', 'opt', 'pythia', 'qwen1.5',
         'qwen2', 'recurrentgemma', 'smollm', 'starcoder2', 'yi', 'yi-1.5'],
        dtype=object),
  array([4, 2, 2, 2, 2, 3, 3, 2, 2, 2, 5, 6, 4, 1, 3, 3, 3, 3])))

Exploring the data 2

In [192]:
ind = np.array(cons_lb.Instruct)
vars1

['Model',
 'Family',
 'Instruct',
 'Pretraining Data Size (T)',
 '#Params (B)',
 'ARC',
 'HellaSwag',
 'TruthfulQA',
 'GSM8K',
 'Winogrande']

In [193]:
cons_lb.loc[ind,vars1].dropna().sort_values(by=['Family','#Params (B)'])

Unnamed: 0,Model,Family,Instruct,Pretraining Data Size (T),#Params (B),ARC,HellaSwag,TruthfulQA,GSM8K,Winogrande
14,codellama-70b-instruct,codellama,True,3.02,70.0,0.5503,0.7724,0.5044,0.4625,0.7451
31,falcon-7b-instruct,falcon,True,1.5,7.0,0.4582,0.7078,0.4407,0.0462,0.6803
38,gemma-2b-it,gemma,True,6.0,2.0,0.4394,0.627,0.4582,0.0546,0.6093
149,sauerkrautlm-gemma-2b,gemma,True,6.0,2.0,0.4872,0.7141,0.3577,0.2676,0.6796
40,gemma-7b-it,gemma,True,6.0,7.0,0.5145,0.7196,0.4729,0.2919,0.6796
150,sauerkrautlm-gemma-7b,gemma,True,6.0,7.0,0.5998,0.8191,0.61,0.6368,0.7664
56,llama-2-7b-chat,llama-2,True,2.0,7.0,0.52901,0.785501,0.455704,0.073541,0.717443
84,openhermes-7b,llama-2,True,2.0,7.0,0.5614,0.7832,0.45,0.05,0.7451
94,orca-2-7b,llama-2,True,2.0,7.0,0.541,0.7619,0.5245,0.1471,0.7348
97,orca_mini_v3_7b,llama-2,True,2.0,7.0,0.5691,0.7964,0.5051,0.0713,0.7427


In [194]:
cons_lb.loc[ind,vars2].dropna().sort_values(by=['Family','#Params (B)'])

Unnamed: 0,Model,Family,Instruct,Pretraining Data Size (T),#Params (B),IFEval,BBH,MATH Lvl 5,GPQA,MUSR,MMLU-PRO
31,falcon-7b-instruct,falcon,True,1.5,7.0,0.2,0.32,0.01,0.25,0.36,0.12
29,falcon-40b-instruct,falcon,True,1.0,40.0,0.25,0.41,0.02,0.25,0.38,0.23
38,gemma-2b-it,gemma,True,6.0,2.0,0.27,0.32,0.0,0.28,0.33,0.14
149,sauerkrautlm-gemma-2b,gemma,True,6.0,2.0,0.25,0.34,0.02,0.26,0.37,0.15
40,gemma-7b-it,gemma,True,6.0,7.0,0.39,0.36,0.02,0.28,0.43,0.17
150,sauerkrautlm-gemma-7b,gemma,True,6.0,7.0,0.34,0.42,0.05,0.29,0.36,0.3
34,gemma-2-2b-it,gemma-2,True,8.0,2.0,0.57,0.42,0.0,0.27,0.39,0.25
36,gemma-2-9b-it,gemma-2,True,13.0,9.0,0.75,0.6,0.0,0.35,0.41,0.39
56,llama-2-7b-chat,llama-2,True,2.0,7.0,0.4,0.31,0.01,0.25,0.37,0.17
84,openhermes-7b,llama-2,True,2.0,7.0,0.18,0.36,0.01,0.27,0.43,0.19


In [195]:
cons_lb.loc[ind,vars3].dropna().sort_values(by=['Family','#Params (B)'])

Unnamed: 0,Model,Family,Instruct,Pretraining Data Size (T),#Params (B),IFEval,BBH,MATH Lvl 5,GPQA,MUSR,MMLU-PRO,ARC,HellaSwag,TruthfulQA,GSM8K,Winogrande,MMLU
31,falcon-7b-instruct,falcon,True,1.5,7.0,0.2,0.32,0.01,0.25,0.36,0.12,0.4582,0.7078,0.4407,0.0462,0.6803,0.2566
38,gemma-2b-it,gemma,True,6.0,2.0,0.27,0.32,0.0,0.28,0.33,0.14,0.4394,0.627,0.4582,0.0546,0.6093,0.3765
149,sauerkrautlm-gemma-2b,gemma,True,6.0,2.0,0.25,0.34,0.02,0.26,0.37,0.15,0.4872,0.7141,0.3577,0.2676,0.6796,0.429
40,gemma-7b-it,gemma,True,6.0,7.0,0.39,0.36,0.02,0.28,0.43,0.17,0.5145,0.7196,0.4729,0.2919,0.6796,0.5352
150,sauerkrautlm-gemma-7b,gemma,True,6.0,7.0,0.34,0.42,0.05,0.29,0.36,0.3,0.5998,0.8191,0.61,0.6368,0.7664,0.6376
56,llama-2-7b-chat,llama-2,True,2.0,7.0,0.4,0.31,0.01,0.25,0.37,0.17,0.52901,0.785501,0.455704,0.073541,0.717443,0.470594
84,openhermes-7b,llama-2,True,2.0,7.0,0.18,0.36,0.01,0.27,0.43,0.19,0.5614,0.7832,0.45,0.05,0.7451,0.4862
94,orca-2-7b,llama-2,True,2.0,7.0,0.22,0.45,0.01,0.26,0.5,0.23,0.541,0.7619,0.5245,0.1471,0.7348,0.5637
97,orca_mini_v3_7b,llama-2,True,2.0,7.0,0.28,0.41,0.0,0.25,0.5,0.21,0.5691,0.7964,0.5051,0.0713,0.7427,0.5237
52,llama-2-13b-chat,llama-2,True,2.0,13.0,0.4,0.33,0.01,0.23,0.4,0.19,0.590444,0.819359,0.441179,0.152388,0.745067,0.541181


In [196]:
cons_lb.loc[ind,vars4].dropna().sort_values(by=['Family','#Params (B)'])

Unnamed: 0,Model,Family,Instruct,Pretraining Data Size (T),#Params (B),IFEval,BBH,MATH Lvl 5,GPQA,MUSR,MMLU-PRO,ARC,HellaSwag,TruthfulQA,GSM8K,Winogrande,MMLU,HumanEval
38,gemma-2b-it,gemma,True,6.0,2.0,0.27,0.32,0.0,0.28,0.33,0.14,0.4394,0.627,0.4582,0.0546,0.6093,0.3765,0.177
40,gemma-7b-it,gemma,True,6.0,7.0,0.39,0.36,0.02,0.28,0.43,0.17,0.5145,0.7196,0.4729,0.2919,0.6796,0.5352,0.287
56,llama-2-7b-chat,llama-2,True,2.0,7.0,0.4,0.31,0.01,0.25,0.37,0.17,0.52901,0.785501,0.455704,0.073541,0.717443,0.470594,0.121951
52,llama-2-13b-chat,llama-2,True,2.0,13.0,0.4,0.33,0.01,0.23,0.4,0.19,0.590444,0.819359,0.441179,0.152388,0.745067,0.541181,0.182927
54,llama-2-70b-chat,llama-2,True,2.0,70.0,0.5,0.3,0.01,0.26,0.37,0.24,0.645904,0.858793,0.528047,0.266869,0.805051,0.634535,0.317073
65,meta-llama-3-8b-instruct,meta-llama-3,True,15.0,8.0,0.74,0.5,0.09,0.26,0.36,0.37,0.6075,0.7855,0.5165,0.6869,0.7451,0.6707,0.616
63,meta-llama-3-70b-instruct,meta-llama-3,True,15.0,70.0,0.81,0.65,0.23,0.29,0.42,0.52,0.7142,0.8569,0.6181,0.8544,0.8287,0.8006,0.774
20,dolly-v2-12b,pythia,True,0.25,12.0,0.24,0.33,0.01,0.24,0.37,0.11,0.424061,0.725254,0.338271,0.01213,0.608524,0.258084,0.0


# Inlcuding Llama 3.1 data

In [748]:
llama_data = pd.DataFrame({'models':['meta-llama/Meta-Llama-3.1-8B',
                                     'meta-llama/Meta-Llama-3.1-8B-Instruct',
                                     'meta-llama/Meta-Llama-3.1-70B',
                                     'meta-llama/Meta-Llama-3.1-70B-Instruct',
                                     'meta-llama/Meta-Llama-3.1-405B',
                                     'meta-llama/Meta-Llama-3.1-405B-Instruct']})

metrics = {'BIG-Bench Hard':'average/em',
           'Winogrande':'acc_char',
           'MMLU': 'macro_avg/acc_char',
           'MMLU-Pro': 'macro_avg/em',
           'ARC-C': 'acc_char'}

metrics_instruct = {'MMLU': 'macro_avg/acc', #'macro_avg/acc_char',
                   'MMLU-Pro': 'micro_avg/acc',
                   'ARC-C': 'acc',
                   'MATH-HARD':'final_em',
                   'GSM8K':'em_maj1@1',
                    'GPQA':'acc',
                    'IFEval Strict':'startend_total',
                    'HumanEval':'pass@1'}

benchs = np.unique(list(metrics.keys())+list(metrics_instruct.keys())).tolist()
for b in benchs:
    llama_data[b]=None

for m in list(llama_data.models):
    eval_data = load_dataset(f"{m}-evals", f'{m.replace('meta-llama/','')}-evals__metrics', split="latest", use_auth_token="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")
    eval_data = eval_data.to_pandas()

    for b in benchs:
        if b in np.unique(eval_data.benchmark_label).tolist():
            if 'Instruct' in m:
                llama_data.loc[llama_data.models==m, b] = eval_data.loc[np.array(eval_data.benchmark_label==b)*np.array(eval_data.metric_tag==metrics_instruct[b])].metric_value_computed.iloc[0]
            else:
                llama_data.loc[llama_data.models==m, b] = eval_data.loc[np.array(eval_data.benchmark_label==b)*np.array(eval_data.metric_tag==metrics[b])].metric_value_computed.iloc[0]



In [749]:
llama_data

Unnamed: 0,models,ARC-C,BIG-Bench Hard,GPQA,GSM8K,HumanEval,IFEval Strict,MATH-HARD,MMLU,MMLU-Pro,Winogrande
0,meta-llama/Meta-Llama-3.1-8B,79.7,64.2,,,,,,66.7,37.1,60.5
1,meta-llama/Meta-Llama-3.1-8B-Instruct,83.4,,32.8,84.5,72.6,12.4,25.4,69.4,47.0,
2,meta-llama/Meta-Llama-3.1-70B,92.9,81.6,,,,,,79.3,53.8,83.3
3,meta-llama/Meta-Llama-3.1-70B-Instruct,94.8,,46.7,95.1,80.5,12.4,43.8,83.6,65.1,
4,meta-llama/Meta-Llama-3.1-405B,96.1,85.9,,,,,,85.2,61.6,86.7
5,meta-llama/Meta-Llama-3.1-405B-Instruct,96.9,,51.1,96.8,89.0,12.4,53.4,87.3,72.2,


In [756]:
cons_lb.loc[cons_lb.Family=='meta-llama-3.1',['Model']+['BBH', 'GPQA', 'GSM8K','HumanEval', 'IFEval','MATH Lvl 5', 'MMLU', 'MMLU-PRO', 'Winogrande']]

Unnamed: 0,Model,BBH,GPQA,GSM8K,HumanEval,IFEval,MATH Lvl 5,MMLU,MMLU-PRO,Winogrande
82,meta-llama-3.1-70b,0.63,0.39,,,0.17,0.17,,0.47,
83,meta-llama-3.1-70b-instruct,0.68,0.32,,,0.84,0.03,,0.53,
84,meta-llama-3.1-8b,0.47,0.3,,,0.13,0.05,,0.32,
85,meta-llama-3.1-8b-instruct,0.5,0.27,,,0.77,0.16,,0.37,
