In [1]:

from scipy.stats import pearsonr
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

def tpr(x, y):
    """Return ``sum(x * y) / sum(y)``.

    For 0/1 indicator arrays (``x`` = predictions, ``y`` = labels — assumed
    from the function name) this is the true-positive rate: the fraction of
    positive labels that were also predicted positive.
    """
    hits = (x * y).sum()
    positives = y.sum()
    return hits / positives

def tnr(x, y):
    """Return ``sum((1 - x) * (1 - y)) / sum(1 - y)``.

    For 0/1 indicator arrays (``x`` = predictions, ``y`` = labels — assumed
    from the function name) this is the true-negative rate: the fraction of
    negative labels that were also predicted negative.
    """
    predicted_negative = 1 - x
    actual_negative = 1 - y
    return (predicted_negative * actual_negative).sum() / actual_negative.sum()

# Identifiers of the models to evaluate; each is used as the file-name stem
# passed to get_all_predictions().
models = [
    "anthropic_claude-3-5-sonnet-20240620",
    "openai_gpt-4-0613",
    "google_gemini-1.5-pro-001",
    "meta_llama-3-70b",
    "meta_llama-2-7b",
]

def get_all_targets():
    """Load and return the array stored in ``Data/targets_mmlu.npy``.

    The path is relative to the current working directory.
    """
    targets_path = "Data/targets_mmlu.npy"
    return np.load(targets_path)

def get_all_predictions(model):
    """Load ``Data/<model>_predictions_mmlu.npy`` for the given model name.

    The short alias ``"llama3.1"`` is translated to the on-disk stem
    ``"llama_405B_numeric"``; any other name is used verbatim. The path is
    relative to the current working directory.
    """
    file_stem = "llama_405B_numeric" if model == "llama3.1" else model
    return np.load("Data/" + file_stem + "_predictions_mmlu.npy")

In [2]:
def _optimal_lambda(real_acc, proxy):
    """Return the coefficient ``Cov(real_acc, proxy) / Var(proxy)``.

    Both moments use the population normalisation (``ddof=0``) so the ratio is
    the exact minimiser of ``(lam * proxy - real_acc).var()``. ``np.cov``
    defaults to the N-1 divisor while ``np.var`` defaults to N, so mixing the
    two defaults would inflate the coefficient by N / (N - 1).

    A constant proxy (zero variance) carries no information, so 0.0 is
    returned instead of dividing by zero.
    """
    proxy_var = np.var(proxy)
    if proxy_var == 0:
        return 0.0
    return np.cov(real_acc, proxy, ddof=0)[0, 1] / proxy_var


# Predictions of the reference model; combined with each evaluated model's
# predictions below to build a per-row proxy score.
preds = get_all_predictions("llama3.1")

targets = get_all_targets()
for model in models:
    print(model)
    gpt = get_all_predictions(model)

    # Row-wise inner product of this model's predictions with the reference
    # model's predictions (presumably a per-question agreement score — confirm
    # against how the .npy files were produced).
    scores = (gpt * preds).sum(1)

    # Row-wise inner product of this model's predictions with the targets
    # (presumably per-question correctness if targets are one-hot — confirm).
    real_acc = (targets * gpt).sum(1)

    print("baseline var", real_acc.var())

    # Variance of the residual when the proxy is used with coefficient 1.
    print("ppi var", (scores - real_acc).var())

    lam = _optimal_lambda(real_acc, scores)

    print("ppi++ var", (lam * scores - real_acc).var())

    # Discretise the proxy to the nearest multiple of 0.2.
    coarse = np.round(scores / 2, decimals=1) * 2

    print("ppi_coarse var", (coarse - real_acc).var())

    lam_coarse = _optimal_lambda(real_acc, coarse)

    print("ppi++ var coarse", (lam_coarse * coarse - real_acc).var())

    # Single pass over the strata defined by the distinct coarse values:
    #   * coarse_calibrated replaces each coarse value by the stratum mean of real_acc
    #   * total_variance accumulates sum_k p_k * var_k   (proportional allocation)
    #   * optimized_variance accumulates sum_k p_k * std_k, squared afterwards
    #     (Neyman allocation)
    coarse_calibrated = coarse.copy()
    total_variance = 0.0
    optimized_variance = 0.0
    for value in np.unique(coarse):
        in_stratum = coarse == value
        stratum_acc = real_acc[in_stratum]
        prob = in_stratum.mean()

        coarse_calibrated[in_stratum] = stratum_acc.mean()
        total_variance += prob * stratum_acc.var()
        optimized_variance += prob * stratum_acc.std()

    optimized_variance = optimized_variance ** 2

    print("basic strat var", (coarse_calibrated - real_acc).var())

    print("basic strat var alt", (total_variance))

    print("neyman strat var alt", (optimized_variance))
    

anthropic_claude-3-5-sonnet-20240620
baseline var 0.11159607733456192
ppi var 0.0914719110337836
ppi++ var 0.08252020258803268
ppi_coarse var 0.09495730218342471
ppi++ var coarse 0.08314227996516704
basic strat var 0.08311946985592207
basic strat var alt 0.08311946985592207
neyman strat var alt 0.062255709586008325
openai_gpt-4-0613
baseline var 0.13417131558505752
ppi var 0.08743322710140392
ppi++ var 0.08425228616462092
ppi_coarse var 0.09064579188553938
ppi++ var coarse 0.08545455954805033
basic strat var 0.08536377357098263
basic strat var alt 0.08536377357098261
neyman strat var alt 0.06413683779532607
google_gemini-1.5-pro-001
baseline var 0.14175047141725372
ppi var 0.08632067074230355
ppi++ var 0.08320801589950304
ppi_coarse var 0.08923108875501803
ppi++ var coarse 0.08414464128462343
basic strat var 0.08367091755536868
basic strat var alt 0.08367091755536867
neyman strat var alt 0.06325588583662496
meta_llama-3-70b
baseline var 0.1677557921794873
ppi var 0.0820214832851304
ppi