In [1]:
import pickle
import copy
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Benchmarks for which a tiny version is assembled by the loop below.
benchs = ["lb", "mmlu", "helm_lite", "alpaca"]
# Key used to index the per-size entries of the stored results and samples.
number_item = 100
# Base name of the sampling/estimation method; suffixes such as "_naive" and
# "_gpirt" are appended to it when looking up stored results.
method = "anchor-irt"
# Filled in per benchmark: bench name -> dict of selected examples and parameters.
tinyBenchmarks = dict()

In [3]:
for bench in benchs:
    # Stored sampling output for this benchmark (selected items, weights, IRT parameters).
    with open(f'results/samples_{bench}_iterations-5.pickle', 'rb') as handle:
        sample_data = pickle.load(handle)

    scenarios = list(sample_data['scenarios_position'].keys())

    # Stored estimation errors for this benchmark.
    with open(f'results/results_{bench}_split-iid_iterations-5.pickle', 'rb') as handle:
        results = pickle.load(handle)

    # For every scenario, average the "<method>_naive" errors over the top-level keys
    # of `results`, then average across scenarios.
    # NOTE(review): the remaining axis is presumably the sampling iteration (the printed
    # arrays have 5 entries and the argmin is used as an iteration index below) - confirm.
    error_key = method + "_naive"
    scenario_errors = []
    for scenario in scenarios:
        stacked = [results[key][number_item][error_key][scenario] for key in results]
        scenario_errors.append(np.mean(stacked, axis=0))
    avg_error = np.mean(scenario_errors, axis=0)

    print(bench, np.min(avg_error), avg_error)
    # Index of the entry with the lowest average error; its items/weights are kept.
    best_it = np.argmin(avg_error)

    # Per-scenario lambda stored under "<method>_gpirt" for this number of items.
    optimal_lambdas = {
        scenario: sample_data['opt_lambds'][method + "_gpirt"][scenario][number_item]
        for scenario in scenarios
    }

    selected = sample_data['seen_items'][method][number_item][best_it]
    weights = sample_data['item_weights'][method][number_item][best_it]
    tinyBenchmarks[bench] = dict(
        seen_examples=selected,
        examples_weights=weights,
        irt_parameters={"A": sample_data["A"], "B": sample_data["B"]},
        scenarios_position=sample_data['scenarios_position'],
        subscenarios_position=sample_data['subscenarios_position'],
        optimal_lambdas=optimal_lambdas,
    )

lb 0.01906167653388267 [0.01906168 0.01998235 0.02033831 0.0202809  0.02015149]
mmlu 0.024398970156813124 [0.02728616 0.02829303 0.03106703 0.02439897 0.03167772]
helm_lite 0.0242250825876071 [0.02748594 0.02689979 0.02783222 0.02589602 0.02422508]
alpaca 0.01119042421102733 [0.0116694  0.01212778 0.01119042 0.01195137 0.01160708]


In [4]:
# Persist the assembled tiny benchmarks so they can be reloaded without
# re-reading the per-benchmark result files.
with open('tinyBenchmarks.pkl', 'wb') as handle:
    pickle.dump(tinyBenchmarks, handle)

Checking individual performances of different benchmarks

In [5]:
# Display the scenario names stored for the 'lb' entry (keys of its scenarios_position mapping).
tinyBenchmarks['lb']['scenarios_position'].keys()

dict_keys(['truthfulqa', 'gsm8k', 'winogrande', 'arc', 'hellaswag', 'mmlu'])

In [6]:
!pip install --ignore-installed git+https://github.com/felipemaiapolo/tinyBenchmarks

Collecting git+https://github.com/felipemaiapolo/tinyBenchmarks
  Cloning https://github.com/felipemaiapolo/tinyBenchmarks to /tmp/pip-req-build-42evdu31
  Running command git clone --filter=blob:none --quiet https://github.com/felipemaiapolo/tinyBenchmarks /tmp/pip-req-build-42evdu31
  Resolved https://github.com/felipemaiapolo/tinyBenchmarks to commit 7bd5ac75fee01d7ac0b55f9b8198abb443f6a316
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting numpy (from tinyBenchmarks==1.0.0)
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting scipy (from tinyBenchmarks==1.0.0)
  Using cached scipy-1.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting requests (from tinyBenchmarks==1.0.0)
  Using cached requests-2.31.0-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests->tinyBenchmarks==1.0.0)
  Using cached charset_normalizer-3.3.2-cp311-cp311-manylinu

In [7]:
import tinyBenchmarks as tb
import multiprocessing as mp
from run_experiment import get_data
from utils import prepare_and_split_data

def array_to_markdown_table(avg, std, headers, row_names):
    """
    Render averages and standard deviations as a Markdown table.

    Every cell is written as "avg (std)" with both numbers formatted to three
    decimal places. The first column holds the row name and has an empty header.

    Parameters:
        avg: 2-D indexable (e.g. np.array or nested lists) of averages, one row
            per entry of `row_names` and one column per entry of `headers`.
        std: 2-D indexable laid out like `avg`, holding the matching standard
            deviations; it is indexed with the same [row][column] positions.
        headers (list): A list of strings for the column headers.
        row_names (list): A list of strings for the row names.

    Returns:
        str: A string formatted as a Markdown table, each line ending in "\\n".
            Rows are emitted for as many entries as `row_names` and `avg` have
            in common.
    """
    # Header row; the leading "||" leaves the row-name column untitled.
    header_line = "|| " + " | ".join(headers) + " |\n"
    # Separator row: one "--" cell for the row-name column plus one per header.
    separator_line = "|--|-" + "-|-" * (len(headers) - 1) + "-|\n"

    lines = [header_line, separator_line]
    for i, (row_name, row_values) in enumerate(zip(row_names, avg)):
        row_std = std[i]
        # Pair each average with the standard deviation at the same column.
        cells = [f"{value:.3f} ({row_std[j]:.3f})" for j, value in enumerate(row_values)]
        lines.append(f"| {row_name} | " + " | ".join(cells) + " |\n")
    return "".join(lines)

In [8]:
# Settings that do not change between benchmarks, hoisted out of the loop.
# The slicing below assumes `seen_examples` stores `number_of_examples` items per
# scenario, in the order of the `scenarios_position` keys.
number_of_examples = 100
lb_scenarios = ['truthfulqa', 'gsm8k', 'winogrande', 'arc', 'hellaswag']
benchs = ['lb', 'mmlu', 'helm_lite', 'alpaca']

for bench in ['lb','mmlu','alpaca','truthfulqa', 'gsm8k', 'winogrande', 'arc', 'hellaswag']:
    # Individual scenarios listed in `lb_scenarios` are evaluated with the 'lb' data.
    bench_name = 'lb' if bench in lb_scenarios else bench

    data, scenarios, set_of_rows = get_data(bench_name, split='iid')
    test_rows = set_of_rows[0]
    scores_train, scores_test, balance_weights, scenarios_position, subscenarios_position = prepare_and_split_data(scenarios, scenarios, data, test_rows)

    # Use the examples and scenario layout stored in the tiny benchmark; this
    # replaces the `scenarios_position` returned by prepare_and_split_data above.
    seen_examples = tinyBenchmarks[bench_name]['seen_examples']
    scenarios_position = tinyBenchmarks[bench_name]['scenarios_position']

    if bench not in benchs:
        # Single scenario: keep only that scenario's slice of the selected examples.
        scenarios = [bench]
        ind_scenario = number_of_examples * list(scenarios_position.keys()).index(bench)
        seen_examples = seen_examples[ind_scenario:ind_scenario+number_of_examples]
    else:
        scenarios = list(scenarios_position.keys())

    # One tb.evaluate call per test-set row of `scores_test`. The context manager
    # releases the worker processes even if a worker raises; starmap blocks until
    # all results are back, so nothing is still pending when the pool is torn down.
    with mp.Pool(mp.cpu_count()) as pool:
        accs = pool.starmap(tb.evaluate, [(scores_test[llm,seen_examples], bench) for llm in range(scores_test.shape[0])])

    # estimates: (rows of scores_test, scenarios, 3 estimators); true_accs gets a
    # trailing axis of length 1 so it broadcasts against the estimator axis.
    estimates = np.array([[[acc[scenario]['irt'], acc[scenario]['pirt'], acc[scenario]['gpirt']] for scenario in scenarios] for acc in accs])
    true_accs = np.array([(balance_weights*scores_test)[:,scenarios_position[scenario]].mean(axis=1) for scenario in scenarios]).T[:,:,None]
    errors = np.abs(estimates-true_accs)

    # Mean and standard deviation of the absolute error over the rows of scores_test.
    markdown_table = array_to_markdown_table(errors.mean(axis=0), errors.std(axis=0), ["IRT", "p-IRT", "gp-IRT"], scenarios)
    print(markdown_table)

99 395
|| IRT | p-IRT | gp-IRT |
|--|--|--|--|
| truthfulqa | 0.013 (0.010) | 0.016 (0.013) | 0.011 (0.009) |
| gsm8k | 0.022 (0.017) | 0.022 (0.017) | 0.020 (0.015) |
| winogrande | 0.022 (0.017) | 0.011 (0.013) | 0.011 (0.011) |
| arc | 0.022 (0.018) | 0.012 (0.010) | 0.010 (0.009) |
| hellaswag | 0.013 (0.016) | 0.011 (0.020) | 0.011 (0.018) |
| mmlu | 0.024 (0.018) | 0.017 (0.017) | 0.015 (0.015) |

99 395
|| IRT | p-IRT | gp-IRT |
|--|--|--|--|
| mmlu | 0.024 (0.017) | 0.016 (0.015) | 0.016 (0.015) |

25 100
|| IRT | p-IRT | gp-IRT |
|--|--|--|--|
| alpaca_v2 | 0.012 (0.015) | 0.020 (0.021) | 0.016 (0.016) |

99 395
|| IRT | p-IRT | gp-IRT |
|--|--|--|--|
| truthfulqa | 0.013 (0.010) | 0.010 (0.009) | 0.011 (0.009) |

99 395
|| IRT | p-IRT | gp-IRT |
|--|--|--|--|
| gsm8k | 0.022 (0.017) | 0.029 (0.022) | 0.020 (0.017) |

99 395
|| IRT | p-IRT | gp-IRT |
|--|--|--|--|
| winogrande | 0.022 (0.017) | 0.016 (0.014) | 0.015 (0.013) |

99 395
|| IRT | p-IRT | gp-IRT |
|--|--|--|--|
| a