In [1]:
import pickle
import copy
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Benchmarks for which a reduced ("tiny") version is assembled below.
benchs = ["lb", "mmlu", "helm_lite", "alpaca"]

# Key used to index the pickled results/samples
# (presumably the number of anchor examples per scenario -- confirm against the producer).
number_item = 100

# Name of the anchor-selection method as stored in the pickled dictionaries.
method = "anchor-irt"

# Filled in by the loop below: one entry per benchmark.
tinyBenchmarks = dict()

In [4]:
for bench in benchs:
    # Sampled items, their weights, the IRT parameters and the scenario layout.
    with open(f'results/samples_{bench}_iterations-5.pickle', 'rb') as handle:
        sample_data = pickle.load(handle)

    # Estimation errors recorded for the same samples.
    with open(f'results/results_{bench}_split-iid_iterations-5.pickle', 'rb') as handle:
        results = pickle.load(handle)

    scenarios = list(sample_data['scenarios_position'].keys())

    # For each scenario, average the "<method>_naive" errors over the keys of
    # `results`; then average those per-scenario values over scenarios.
    error_key = method+"_naive"
    per_scenario_error = []
    for scenario in scenarios:
        errors = [results[it][number_item][error_key][scenario] for it in results.keys()]
        per_scenario_error.append(np.mean(errors, axis=0))
    avg_error = np.mean(per_scenario_error, axis=0)

    print(bench,np.min(avg_error),avg_error)

    # Position of the smallest average error; used below to pick the matching
    # set of seen items and item weights.
    best_it = np.argmin(avg_error)

    # Constant placeholder for every scenario. The tuned alternative was
    # sample_data['opt_lambds'][method+"_gpirt"][scenario][number_item].
    optimal_lambdas = dict.fromkeys(scenarios, .5)

    selected_items = sample_data['seen_items'][method][number_item][best_it]
    selected_weights = sample_data['item_weights'][method][number_item][best_it]
    tinyBenchmarks[bench] = {
        'seen_examples': selected_items,
        'examples_weights': selected_weights,
        'irt_parameters': {"A": sample_data["A"], "B": sample_data["B"]},
        'scenarios_position': sample_data['scenarios_position'],
        'subscenarios_position': sample_data['subscenarios_position'],
        'optimal_lambdas': optimal_lambdas,
    }

lb 0.018127779895138887 [0.02047152 0.02068844 0.01857669 0.01812778 0.02016862]
mmlu 0.024082251076550446 [0.026144   0.02591576 0.02805555 0.02576776 0.02408225]
helm_lite 0.025932711015329447 [0.02657301 0.03014362 0.0282301  0.02593271 0.02686508]
alpaca 0.011602607838921739 [0.01365071 0.01160261 0.01216845 0.01365583 0.01164006]


In [5]:
# Persist the assembled dictionary so that it can be loaded back with pickle.load.
with open('tinyBenchmarks.pkl', 'wb') as pkl_file:
    pickle.dump(tinyBenchmarks, pkl_file)

In [3]:
### Parameters
# Benchmark to evaluate: one of 'lb', 'mmlu', 'helm_lite', 'alpaca'.
bench = 'lb'
# Dummy 0/1 responses: for bench='lb' there are 6 scenarios with 100 examples each.
y_input = np.random.binomial(n=1, p=.5, size=600)

### Evaluation
tb.evaluate(y_input, bench)

{'harness_truthfulqa_mc_0': {'irt': 0.5348837209302325,
  'pirt': 0.5212710930243036,
  'gpirt': 0.528077406977268},
 'gsm8k': {'irt': 0.6125852918877936,
  'pirt': 0.5515895737660961,
  'gpirt': 0.5820874328269449},
 'winogrande': {'irt': 0.5374901341752171,
  'pirt': 0.4784850637507187,
  'gpirt': 0.5079875989629679},
 'arc': {'irt': 0.4863481228668942,
  'pirt': 0.4756461693373306,
  'gpirt': 0.4809971461021124},
 'hellaswag': {'irt': 0.4303923521210914,
  'pirt': 0.4847432751892539,
  'gpirt': 0.45756781365517263},
 'mmlu': {'irt': 0.4306957893341643,
  'pirt': 0.49958991605315767,
  'gpirt': 0.46514285269366096}}

{'harness_truthfulqa_mc_0': {'irt': 0.5483476132190942,
  'pirt': 0.5216756041366227,
  'gpirt': 0.5350116086778585},
 'gsm8k': {'irt': 0.5132676269901439,
  'pirt': 0.5328183759663551,
  'gpirt': 0.5230430014782494},
 'winogrande': {'irt': 0.4301499605367009,
  'pirt': 0.4792754277690377,
  'gpirt': 0.4547126941528693},
 'arc': {'irt': 0.5520477815699659,
  'pirt': 0.5066457168990404,
  'gpirt': 0.5293467492345032},
 'hellaswag': {'irt': 0.5338577972515436,
  'pirt': 0.5108037778592825,
  'gpirt': 0.5223307875554131},
 'mmlu': {'irt': 0.5377958382081949,
  'pirt': 0.5393624918280722,
  'gpirt': 0.5385791650181335}}

In [1]:
import pickle
import numpy as np
from scipy.optimize import minimize

### Utility functions
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), applied element-wise by NumPy."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def item_curve(theta, a, b):
    """Evaluate the item response curve for ability `theta` and item parameters `a`, `b`.

    Computes ``a*theta - b`` element-wise, clips every term to [-30, 30],
    sums the clipped terms along axis 1 and passes the sum through `sigmoid`.
    """
    per_dimension = a*theta - b
    clipped = np.clip(per_dimension, -30, 30)
    return sigmoid(clipped.sum(axis=1))

def fit_theta(responses_test, seen_items, A, B, theta_init=None, eps=1e-10, optimizer="BFGS"):
    """Fit the ability vector that best explains the responses on `seen_items`.

    Minimises the negative Bernoulli log-likelihood of the observed responses
    under the probabilities returned by `item_curve`.

    Parameters
    ----------
    responses_test : numpy.ndarray
        Response vector indexed by item; only the entries at `seen_items` are used.
    seen_items : sequence of int
        Indices of the items with observed responses (used along axis 2 of `A`/`B`).
    A, B : numpy.ndarray
        Item parameters, indexed as ``A[:, :, items]``; ``A.shape[1]`` is the
        dimension ``D`` of the ability vector.
    theta_init : array-like, optional
        Starting point for the optimiser; any shape with exactly ``D`` elements.
        Defaults to the zero vector.
    eps : float
        Small constant added inside the logarithms to avoid ``log(0)``.
    optimizer : str
        Passed as ``method`` to `scipy.optimize.minimize`.

    Returns
    -------
    numpy.ndarray
        The fitted ability vector with shape ``(1, D, 1)``.

    Raises
    ------
    ValueError
        If `theta_init` is given but does not contain exactly ``D`` elements.
    """
    D = A.shape[1]

    # These slices do not depend on the optimisation variable, so take them
    # once instead of on every objective evaluation.
    A_seen = A[:, :, seen_items]
    B_seen = B[:, :, seen_items]
    y_seen = responses_test[seen_items]

    # Honour the caller-supplied starting point (it used to be silently ignored).
    if theta_init is None:
        x0 = np.zeros(D)
    else:
        x0 = np.asarray(theta_init, dtype=float).reshape(-1)
        if x0.shape[0] != D:
            raise ValueError(f"theta_init must contain {D} elements, got {x0.shape[0]}.")

    # Define the negative log likelihood function
    def neg_log_like(x):
        P = item_curve(x.reshape(1, D, 1), A_seen, B_seen).squeeze()
        log_likelihood = np.sum(y_seen * np.log(P + eps) + (1 - y_seen) * np.log(1 - P + eps))
        return -log_likelihood

    # Use the minimize function to find the ability parameters that minimize the negative log likelihood
    optimal_theta = minimize(neg_log_like, x0, method = optimizer).x[None,:,None]
    return optimal_theta

### Evaluation function
def evaluate(y_input, bench):
    """Estimate per-scenario performance from responses on the selected examples.

    Parameters
    ----------
    y_input : numpy.ndarray
        One-dimensional array of responses; entry ``i`` is assigned to the
        ``i``-th index stored in ``seen_examples`` for `bench`.
    bench : str
        One of ``'lb'``, ``'mmlu'``, ``'helm_lite'``, ``'alpaca'``.

    Returns
    -------
    dict
        ``{scenario: {'irt': ..., 'pirt': ..., 'gpirt': ...}}`` where

        * ``'irt'`` is the sum of the seen responses weighted by ``examples_weights``;
        * ``'pirt'`` mixes the balanced mean of the seen responses with the
          balanced mean of the IRT predictions on the unseen items, using the
          weight ``number_of_examples / N_sce`` for the seen part;
        * ``'gpirt'`` mixes ``'irt'`` and ``'pirt'`` with ``optimal_lambdas``.

    Raises
    ------
    AssertionError
        If `y_input` is not one-dimensional or `bench` is not a known benchmark.

    Notes
    -----
    Reads ``tinyBenchmarks.pkl`` from the current working directory.
    """
    assert len(y_input.shape)==1, "y_input must be a unidimensional numpy array."
    assert bench in ['lb', 'mmlu', 'helm_lite', 'alpaca']

    ### Loading and creating important objects
    number_of_examples = 100
    # NOTE: pickle.load can execute arbitrary code; only load a trusted file here.
    with open('tinyBenchmarks.pkl', 'rb') as handle:
        tinyBenchmarks = pickle.load(handle)

    seen_examples = tinyBenchmarks[bench]['seen_examples']
    examples_weights = tinyBenchmarks[bench]['examples_weights']
    irt_parameters = tinyBenchmarks[bench]['irt_parameters']
    A, B = irt_parameters['A'], irt_parameters['B']
    optimal_lambdas = tinyBenchmarks[bench]['optimal_lambdas']
    scenarios_position = tinyBenchmarks[bench]['scenarios_position']
    subscenarios_position = tinyBenchmarks[bench]['subscenarios_position']

    # Total number of items = largest item index over all scenarios, plus one.
    N = np.max([np.max(x) for x in scenarios_position.values()])+1

    # Weight items so that every sub-scenario contributes equally to the
    # mean of its scenario: N_sce / (n_sub * n_i) for a sub-scenario of size n_i.
    balance_weights = np.ones(N)
    for scenario in scenarios_position.keys():
        N_sce = len(scenarios_position[scenario])
        n_sub = len(subscenarios_position[scenario])
        for sub in subscenarios_position[scenario].keys():
            n_i = len(subscenarios_position[scenario][sub])
            # Must run once per sub-scenario: outside this loop only the last
            # sub-scenario would be re-weighted.
            balance_weights[subscenarios_position[scenario][sub]] = N_sce/(n_sub*n_i)

    ### Creating vector y and estimating theta
    y = np.zeros(N)
    for i, j in enumerate(seen_examples):
        y[j] = y_input[i]

    ### Getting estimates
    theta = fit_theta(y, seen_examples, A, B)
    estimates = {}

    # Set membership keeps the filters below linear instead of quadratic.
    seen_lookup = set(seen_examples)
    unseen_examples = [i for i in range(N) if i not in seen_lookup]

    # Balanced IRT predictions for every item; identical for all scenarios,
    # so compute them once.
    weighted_predictions = (balance_weights*item_curve(theta.reshape(1, A.shape[1], 1), A, B))[0]

    for scenario in scenarios_position.keys():

        N_sce = len(scenarios_position[scenario])
        scenario_lookup = set(scenarios_position[scenario])
        seen_examples_sce = [s for s in seen_examples if s in scenario_lookup]
        unseen_examples_sce = [s for s in unseen_examples if s in scenario_lookup]

        data_part_IRTp = ((balance_weights*y)[seen_examples_sce]).mean()
        irt_part = weighted_predictions[unseen_examples_sce].mean()
        IRTp_lambd = number_of_examples/N_sce
        IRT = (examples_weights[scenario]*y[seen_examples_sce]).sum()
        IRTp = IRTp_lambd * data_part_IRTp + (1 - IRTp_lambd) * irt_part
        IRTpp = optimal_lambdas[scenario]*IRT + (1-optimal_lambdas[scenario])*IRTp

        estimates[scenario] = {}
        estimates[scenario]['irt'] = IRT
        estimates[scenario]['pirt'] = IRTp
        estimates[scenario]['gpirt'] = IRTpp

    return estimates