In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm

from methods import *
from experiments import *

# Seed value for the notebook.
# NOTE(review): not referenced in the experiment cell directly below —
# presumably consumed further down or meant to be passed on; confirm.
random_seed = 42

# Ascending cut points applied to the 'Pretraining Data Size (T)' column.
# The experiment cell uses them to build four indicator columns:
#   T < 0.5 -> T_verylow, [0.5, 1.5) -> T_low, [1.5, 3) -> T_mid, T >= 3 -> T_high
threshs = [0.5, 1.5, 3]

# Benchmark columns of the evaluation CSV; each one is used in turn as the target.
benchs = [
    'MMLU',
    'ARC-C',
    'HellaSwag',
    'Winograd',
    'TruthfulQA',
    'GSM8K',
    'XWinograd',
    'HumanEval',
]

In [2]:
def _prepare_benchmark_frame(raw_data, bench, filter_families, bad_families, threshs):
    """Build the modelling table for a single benchmark.

    Parameters
    ----------
    raw_data : pd.DataFrame
        Evaluation table as read from the CSV. Must contain the columns
        'Model Family', 'Pretraining Data Size (T)', 'Model Size (B)',
        'FLOPs (1E21)' and ``bench``. It is not modified.
    bench : str
        Name of the benchmark column to expose as the target ``Y``.
    filter_families : None or bool
        ``None`` keeps every family, ``True`` drops the families listed in
        ``bad_families``, ``False`` keeps only those families.
    bad_families : list of str
        Family names used by the filter above.
    threshs : sequence of three numbers
        Ascending cut points on ``T`` for the four indicator columns.

    Returns
    -------
    pd.DataFrame
        Rows with a non-missing ``Y``, sorted by (family, S) with a fresh
        0..n-1 index and columns, in order: family, Y, T, S, F, logT, logS,
        logF, logS*logT, T_verylow, T_low, T_mid, T_high.

    Raises
    ------
    KeyError
        If ``bench`` or one of the required columns is missing.
    """
    # Insertion order of this mapping fixes the output column order.
    column_map = {
        'Model Family': 'family',
        bench: 'Y',
        'Pretraining Data Size (T)': 'T',
        'Model Size (B)': 'S',
        'FLOPs (1E21)': 'F',
    }
    # Selecting + renaming yields a new frame, so the column assignments
    # below never write into a view of ``raw_data``.
    frame = raw_data[list(column_map)].rename(columns=column_map)

    if filter_families is not None:
        in_bad = frame['family'].isin(bad_families)
        frame = frame.loc[~in_bad if filter_families else in_bad]

    frame = frame.loc[frame['Y'].notna()]
    frame = frame.sort_values(by=['family', 'S']).reset_index(drop=True)

    # Log-scale features and their interaction term.
    frame['logT'] = np.log(frame['T'])
    frame['logS'] = np.log(frame['S'])
    frame['logF'] = np.log(frame['F'])
    frame['logS*logT'] = frame['logS'] * frame['logT']

    # One-hot encoding of the bucket that T falls into. A missing T compares
    # False everywhere and therefore gets 0 in all four columns.
    t_values = frame['T'].to_numpy()
    cut_low, cut_mid, cut_high = threshs
    bucket_masks = {
        'T_verylow': t_values < cut_low,
        'T_low': (t_values >= cut_low) & (t_values < cut_mid),
        'T_mid': (t_values >= cut_mid) & (t_values < cut_high),
        'T_high': t_values >= cut_high,
    }
    for column, mask in bucket_masks.items():
        frame[column] = mask.astype('int64')

    return frame


# Create the output folder up front: discovering that it is missing only at
# the first np.save would throw away several minutes of parallel compute.
os.makedirs('results', exist_ok=True)

### Loading data (loop-invariant, so done once instead of once per benchmark)
raw_data = pd.read_csv('data/base_llm_benchmark_eval.csv')
raw_data = raw_data.loc[raw_data['FLOPs (1E21)'].notna()]

# Fitted on every family that has a FLOPs value. It is not used in this cell;
# kept because later cells may rely on the name being defined.
fam_encoder = LabelEncoder()
fam_encoder.fit(raw_data['Model Family'])

bad_families = ['XGLM','OPT','Codegen','GPT-Neo/J','Pythia','BLOOM']

for filter_families in [None, True, False]:
    # Start from an empty dict for every filter setting. Sharing one dict
    # across settings made the intermediate saves of the 2nd and 3rd setting
    # contain stale entries computed under the previous setting.
    results = {}

    for bench in tqdm(benchs):
        ### Preparing data
        data = _prepare_benchmark_frame(raw_data, bench, filter_families, bad_families, threshs)

        ### Running the experiment for every family with at least two rows
        family_counts = data['family'].value_counts()
        avail_families = [
            fam for fam in np.unique(data['family']) if family_counts[fam] >= 2
        ]

        # One get_results call per eligible family.
        # NOTE(review): test_family is presumably the held-out family —
        # confirm against experiments.get_results.
        results[bench] = Parallel(n_jobs=-1, verbose=10)(
            delayed(get_results)(data, test_family) for test_family in avail_families
        )

        ### Saving results
        # Checkpoint after every benchmark. np.save pickles the dict, so
        # readers need np.load(..., allow_pickle=True).item().
        np.save(f"results/results_filter-{filter_families}.npy", results)

  0%|          | 0/8 [00:00<?, ?it/s]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  25 | elapsed:  3.6min remaining: 19.1min
[Parallel(n_jobs=-1)]: Done   7 out of  25 | elapsed:  3.7min remaining:  9.4min
[Parallel(n_jobs=-1)]: Done  10 out of  25 | elapsed:  3.7min remaining:  5.6min
[Parallel(n_jobs=-1)]: Done  13 out of  25 | elapsed:  3.8min remaining:  3.5min
[Parallel(n_jobs=-1)]: Done  16 out of  25 | elapsed:  3.8min remaining:  2.1min
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:  3.8min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done  22 out of  25 | elapsed:  3.8min remaining:   31.4s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  3.9min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   31.2s
[Parallel(n_jobs=-1)]: Done   3 out of  24 | elapsed:   32.4s remaining:  3.8min
[Parallel(n_jobs=-1)]: Done   6 out of  24 | elapsed:   33.3s rem

  0%|          | 0/8 [00:00<?, ?it/s]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  19 | elapsed:  1.0min remaining:  8.8min
[Parallel(n_jobs=-1)]: Done   4 out of  19 | elapsed:  1.1min remaining:  3.9min
[Parallel(n_jobs=-1)]: Done   6 out of  19 | elapsed:  1.2min remaining:  2.5min
[Parallel(n_jobs=-1)]: Done   8 out of  19 | elapsed:  1.2min remaining:  1.6min
[Parallel(n_jobs=-1)]: Done  10 out of  19 | elapsed:  1.2min remaining:  1.1min
[Parallel(n_jobs=-1)]: Done  12 out of  19 | elapsed:  1.2min remaining:   42.3s
[Parallel(n_jobs=-1)]: Done  14 out of  19 | elapsed:  1.2min remaining:   26.0s
[Parallel(n_jobs=-1)]: Done  16 out of  19 | elapsed:  1.2min remaining:   13.9s
[Parallel(n_jobs=-1)]: Done  19 out of  19 | elapsed:  1.3min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  18 | elapsed:   24.8s remaining:  2.1min
[Parallel(n_jobs=-1)]: Done   5 out of  18 | e

  0%|          | 0/8 [00:00<?, ?it/s]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   6 | elapsed:   12.1s remaining:   24.3s
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:   13.2s remaining:   13.2s
[Parallel(n_jobs=-1)]: Done   4 out of   6 | elapsed:   13.5s remaining:    6.8s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   14.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   6 | elapsed:   14.8s remaining:   29.5s
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:   15.1s remaining:   15.1s
[Parallel(n_jobs=-1)]: Done   4 out of   6 | elapsed:   15.4s remaining:    7.7s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   17.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   6 | elapsed:   15.7s remaining:   31.3s
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:   16.4