In [1]:
import pandas as pd
import numpy as np
import pickle
from matplotlib.colors import ListedColormap
import statsmodels.api as sm
import statsmodels.formula.api as smf
from irt import *
# Root directory of the input files; the analysis loop appends 'scaling_laws/...'
# file names to this string, so the trailing slash is required.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_path = '/llmthonskdir/felipe/download_openllmlb/'

def filter(s):
    """Normalize a model identifier so names from different sources can be compared.

    Steps, in order:
      1. If the string contains "/", keep only the second "/"-separated segment.
      2. If the result contains "__", keep only the second "__"-separated segment.
      3. Lower-case it and strip every "-hf", "_" and "-".

    Parameters
    ----------
    s : str
        Raw model name.

    Returns
    -------
    str
        The normalized name.

    Notes
    -----
    This function shadows the builtin ``filter``; the name is kept because the
    rest of the notebook calls it under this name.
    """
    # Explicit length checks replace the former bare `except:` clauses, which
    # also swallowed unrelated errors (e.g. KeyboardInterrupt).
    slash_parts = s.split("/")
    if len(slash_parts) > 1:
        s = slash_parts[1]
    dunder_parts = s.split("__")
    if len(dunder_parts) > 1:
        s = dunder_parts[1]
    return s.lower().replace("-hf", "").replace("_", "").replace("-", "")
    
def search(s, s_list):
    """Find the entry of `s_list` whose normalized name best matches `s`.

    Both `s` and every candidate are passed through `filter` before being
    scored with `fuzz.token_sort_ratio`.

    Returns a two-element list: [best-scoring candidate, its score]. When
    several candidates tie, the first one wins (np.argmax semantics).
    """
    scores = []
    for candidate in s_list:
        scores.append(fuzz.token_sort_ratio(filter(s), filter(candidate)))
    best = np.argmax(scores)
    return [s_list[best], np.max(scores)]

# Scenario keys iterated by the analysis loop and used to index the leaderboard
# data: five stand-alone benchmarks followed by the `hendrycksTest` subject subsets.
# NOTE(review): the trailing number in each key looks like a few-shot count — confirm.
scenarios = ['harness_arc_challenge_25', 'harness_gsm8k_5', 'harness_hellaswag_10', 'harness_truthfulqa_mc_0', 'harness_winogrande_5', 'harness_hendrycksTest_abstract_algebra_5', 'harness_hendrycksTest_anatomy_5', 'harness_hendrycksTest_astronomy_5', 'harness_hendrycksTest_business_ethics_5', 'harness_hendrycksTest_clinical_knowledge_5', 'harness_hendrycksTest_college_biology_5', 'harness_hendrycksTest_college_chemistry_5', 'harness_hendrycksTest_college_computer_science_5', 'harness_hendrycksTest_college_mathematics_5', 'harness_hendrycksTest_college_medicine_5', 'harness_hendrycksTest_college_physics_5', 'harness_hendrycksTest_computer_security_5', 'harness_hendrycksTest_conceptual_physics_5', 'harness_hendrycksTest_econometrics_5', 'harness_hendrycksTest_electrical_engineering_5', 'harness_hendrycksTest_elementary_mathematics_5', 'harness_hendrycksTest_formal_logic_5', 'harness_hendrycksTest_global_facts_5', 'harness_hendrycksTest_high_school_biology_5', 'harness_hendrycksTest_high_school_chemistry_5', 'harness_hendrycksTest_high_school_computer_science_5', 'harness_hendrycksTest_high_school_european_history_5', 'harness_hendrycksTest_high_school_geography_5', 'harness_hendrycksTest_high_school_government_and_politics_5', 'harness_hendrycksTest_high_school_macroeconomics_5', 'harness_hendrycksTest_high_school_mathematics_5', 'harness_hendrycksTest_high_school_microeconomics_5', 'harness_hendrycksTest_high_school_physics_5', 'harness_hendrycksTest_high_school_psychology_5', 'harness_hendrycksTest_high_school_statistics_5', 'harness_hendrycksTest_high_school_us_history_5', 'harness_hendrycksTest_high_school_world_history_5', 'harness_hendrycksTest_human_aging_5', 'harness_hendrycksTest_human_sexuality_5', 'harness_hendrycksTest_international_law_5', 'harness_hendrycksTest_jurisprudence_5', 'harness_hendrycksTest_logical_fallacies_5', 'harness_hendrycksTest_machine_learning_5', 'harness_hendrycksTest_management_5', 'harness_hendrycksTest_marketing_5', 
'harness_hendrycksTest_medical_genetics_5', 'harness_hendrycksTest_miscellaneous_5', 'harness_hendrycksTest_moral_disputes_5', 'harness_hendrycksTest_moral_scenarios_5', 'harness_hendrycksTest_nutrition_5', 'harness_hendrycksTest_philosophy_5', 'harness_hendrycksTest_prehistory_5', 'harness_hendrycksTest_professional_accounting_5', 'harness_hendrycksTest_professional_law_5', 'harness_hendrycksTest_professional_medicine_5', 'harness_hendrycksTest_professional_psychology_5', 'harness_hendrycksTest_public_relations_5', 'harness_hendrycksTest_security_studies_5', 'harness_hendrycksTest_sociology_5', 'harness_hendrycksTest_us_foreign_policy_5', 'harness_hendrycksTest_virology_5', 'harness_hendrycksTest_world_religions_5']

In [2]:
# Plot configuration and default selections for interactive use.
# NOTE(review): `plt` is not imported explicitly in this notebook's import cell;
# presumably it is brought in by `from irt import *` — confirm.
n_fam = 17  # number of colors kept from the palette (one per model family)
fam = 0     # index of the family currently selected
scenario = 'harness_arc_challenge_25'

# Keep only the first `n_fam` entries of 'tab20' so each family gets its own color.
cmap = ListedColormap(plt.get_cmap('tab20').colors[0:n_fam])

In [None]:
# Leave-one-family-out check of the ability-vs-compute fit, one figure per scenario.
#
# For every scenario and every model family:
#   1. fit a 1-D IRT model on the correctness rows of all models outside the family;
#   2. estimate abilities (thetas) of the held-out family with the fitted items;
#   3. regress thetas on logFLOPs with per-family intercepts on the training families;
#   4. anchor the held-out family's intercept on its lowest-logFLOPs model and plot
#      true vs. predicted thetas for the family's remaining models.
#
# NOTE(review): `tqdm`, `plt` and `IRT` are not imported explicitly in the visible
# import cell; presumably they come from `from irt import *` — confirm.

### Inputs (independent of scenario and held-out family, so read exactly once)
# SECURITY: pickle.load can execute arbitrary code — only open trusted pickle files.
with open('lb.pickle', 'rb') as handle:
    lb_data1 = pickle.load(handle)
with open(data_path+'scaling_laws/old_leaderboard_processed_20240630.pickle', 'rb') as handle:
    lb_data2 = pickle.load(handle)
scaling_base = pd.read_csv(data_path+'scaling_laws/base_llm_benchmark_eval.csv')
scaling_inst = pd.read_csv(data_path+'scaling_laws/instruct_llm_benchmark_eval.csv')
scaling_raw = pd.concat((scaling_base, scaling_inst))

models_lb_tb_all = [filter(m) for m in lb_data1['models']]
models_scaling_raw = [filter(m) for m in scaling_raw.Model]

for scenario in scenarios:
    ### Model lists: models of lb_data2 first, then lb_data1 models not already present
    models_lb_scaling = [filter(m) for m in lb_data2[scenario]['models']]
    known_models = set(models_lb_scaling)
    new_rows = [i for i, m in enumerate(models_lb_tb_all) if m not in known_models]
    models_lb_tb = [models_lb_tb_all[i] for i in new_rows]
    models_lb = models_lb_scaling + models_lb_tb
    models_lb_arr = np.array(models_lb)

    # Stacked correctness matrix, rows aligned with `models_lb`. The lb_data1 matrix
    # is transposed before row selection, and it is no longer written back into
    # `lb_data1`, which is what allows the files to be loaded only once.
    Y = np.vstack((lb_data2[scenario]['correctness'],
                   lb_data1['data'][scenario]['correctness'].T[new_rows]))

    ### Scaling table restricted to models present in lb_data2 for this scenario
    scaling_data = scaling_raw.loc[[m in known_models for m in models_scaling_raw]].copy()
    scaling_data['logFLOPs'] = np.log2(scaling_data['FLOPs (1E21)'].to_numpy())
    scaling_data = scaling_data.loc[:, ['Model', 'Model Family', 'logFLOPs', 'Model Size (B)']]
    scaling_data = scaling_data.rename(columns={'Model': 'models', 'Model Family': "family"})
    scaling_data = scaling_data.sort_values(by=['family', 'logFLOPs']).reset_index(drop=True)
    scaling_data = scaling_data.loc[scaling_data['logFLOPs'].notna()]
    # A family needs at least two models: one anchors the intercept, the rest are scored.
    scaling_data = scaling_data.groupby('family').filter(lambda x: len(x) > 1)

    families = np.unique(scaling_data.family).tolist()
    models_scaling = [filter(m) for m in scaling_data.models]
    models_scaling_arr = np.array(models_scaling)

    # Iterate over the families actually available instead of a hard-coded 17.
    for fam in tqdm(range(len(families))):
        ### Train/test split of the correctness rows
        test_fam = [families[fam]]
        train_fam = [f for f in families if f not in test_fam]
        test_names = set(filter(m) for m in scaling_data.loc[scaling_data.family == test_fam[0]].models)
        ind = [i for i, m in enumerate(models_lb) if m not in test_names]
        train_rows = set(ind)
        test_rows = [i for i in range(Y.shape[0]) if i not in train_rows]

        irt = IRT([1])
        irt.fit(Y[ind], tol=1e-2, verbose=False)

        ### Abilities, aligned with the rows of `scaling_data`
        thetas = np.full(len(models_scaling), np.nan)
        models_test = models_lb_arr[test_rows].tolist()
        held_out = set(models_test)
        models_train = [m for m in models_scaling if m not in held_out]
        models_lb_train = models_lb_arr[ind]
        # np.argmax on the boolean mask gives the first row whose name matches.
        theta_train = irt.Theta.squeeze()[[np.argmax(models_lb_train == m) for m in models_train]]
        theta_test = irt.fit_theta(Y[test_rows], list(range(Y.shape[1]))).squeeze()
        thetas[[np.argmax(models_scaling_arr == m) for m in models_train]] = theta_train
        thetas[[np.argmax(models_scaling_arr == m) for m in models_test]] = theta_test
        scaling_data['thetas'] = thetas

        ### Regression on the training families
        test_ind = np.array([f in test_fam for f in scaling_data.family])
        train_ind = ~test_ind
        test_data = scaling_data.loc[test_ind]
        train_data = scaling_data.loc[train_ind]

        mod = smf.ols(formula='thetas ~ logFLOPs + family - 1', data=train_data)
        mod = mod.fit()

        # The held-out family has no fitted intercept; recover it from the family's
        # first row (lowest logFLOPs after sorting) and score only the other rows.
        slope = mod.params['logFLOPs']
        test_logflops = np.array(test_data.logFLOPs)
        test_thetas = np.array(test_data.thetas)
        intercept = test_thetas[0] - slope*test_logflops[0]
        preds = (intercept + slope*test_logflops).tolist()[1:]
        trues = test_thetas.tolist()[1:]
        plt.plot(trues, preds, 'o', label=test_fam[0], color=cmap(fam))
    plt.title(scenario)
    plt.xlabel('true theta')
    plt.ylabel('predicted theta')
    if families:
        # The per-family labels passed to plt.plot are only shown once a legend is drawn.
        plt.legend(fontsize='x-small')
    plt.show()

  0%|          | 0/17 [00:00<?, ?it/s]

In [43]:
# Number of entries in `families`, i.e. the model families kept for whichever
# scenario the analysis loop processed last before this cell was run.
len(families)

17