In [None]:
import os
import json
import numpy as np 
import pandas as pd 
from tqdm import tqdm 

from gpqa.gpqa_utils import * 

from math500.math_utils import * 
from math500.parser import *
from math500.grader import * 

from mmlu_pro.mmlu_utils import * 

from hotpotqa.hotpotqa_utils import *

from drop.drop_utils import *

from utils import * 

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Benchmarks evaluated per (task, model) pair.
tasks = [
    "math500",
    "gpqa",
    "drop",
    "hotpotqa",
    "musr_location",
    "musr_efficiently",
]

# Prompting regimes.
shots = ["few"]

# Model identifiers, used as directory names when building result paths.
models = ["gpt-4o-mini", "gpt-4o", "llama"]

# MMLU-Pro subject splits, used as directory names when building result paths.
subjects = [
    "business",
    "law",
    "psychology",
    "biology",
    "chemistry",
    "history",
    "other",
    "health",
    "economics",
    "math",
    "physics",
    "computer science",
    "philosophy",
    "engineering",
]

In [None]:
def _read_jsonl(path):
    """Parse a JSON-lines file into a list of records.

    Returns ``None`` (after printing a notice) when ``path`` does not exist.
    Lines that fail to parse are reported and skipped, so the returned list
    can be shorter than the file.
    """
    if not os.path.exists(path):
        print(f"File not found: {path}")
        return None

    records = []
    with open(path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(path)
                print(f"Error parsing line {line_no}: {e}")
                print(line[:100])
    return records


def get_value(likelihood_file, baseline_few_file, baseline_few_zero_file):
    """Collect per-problem, per-candidate scores from three result files.

    Parameters
    ----------
    likelihood_file : str
        Path to a JSON file holding a list of sequences. It is transposed with
        ``zip(*...)`` so that each group gathers the entries sharing the same
        inner position (one group per problem).
    baseline_few_file : str
        Path to a JSON-lines file; each record has a ``'results'`` list whose
        items carry a ``'ce_losses'`` list. Feeds the ``'forward'`` scores.
    baseline_few_zero_file : str
        Same layout as ``baseline_few_file``. Feeds the ``'direct'`` scores.

    Returns
    -------
    dict or None
        ``None`` if any of the three files is missing. Otherwise a dict with
        keys ``'forward'``, ``'direct'``, ``'no_replace'``, ``'backward'`` and
        ``'is_correct'``, each a ``float32`` array whose first axis indexes
        problems and whose second axis indexes candidates.

    Notes
    -----
    ``calculate_diffs`` is defined outside this cell; it is presumably what
    fills in ``'no_replace_ce_diff'`` and ``'replace_ce_diff'`` on each
    likelihood entry in place -- TODO confirm.

    Records are paired purely by position. A line skipped because it failed
    to parse therefore shifts every later record in that file relative to the
    other sources.
    """
    few_res = _read_jsonl(baseline_few_file)
    if few_res is None:
        return None

    few_zero_res = _read_jsonl(baseline_few_zero_file)
    if few_zero_res is None:
        return None

    if not os.path.exists(likelihood_file):
        print(f"Error: {likelihood_file} not found.")
        return None

    with open(likelihood_file, "r", encoding="utf-8") as f:
        likelihoods = json.load(f)

    # Transpose so that each group holds all entries for one problem.
    problem_groups = list(zip(*likelihoods))

    # The fill loop below zips the three sources and so stops at the shortest
    # one. Size the output to match; sizing by len(few_res) alone would leave
    # empty trailing rows and make the float32 conversion fail on a ragged list.
    num_examples = min(len(few_res), len(few_zero_res), len(problem_groups))
    if not (len(few_res) == len(few_zero_res) == len(problem_groups)):
        print(
            f"Warning: record counts differ (few={len(few_res)}, "
            f"few_zero={len(few_zero_res)}, likelihood={len(problem_groups)}); "
            f"using the first {num_examples} problems."
        )

    score_keys = ("forward", "direct", "no_replace", "backward", "is_correct")
    aggregated_results = {
        key: [[] for _ in range(num_examples)] for key in score_keys
    }

    for i, (few, few_zero, problem_likelihoods) in enumerate(
        zip(few_res, few_zero_res, problem_groups)
    ):
        problem_list = list(problem_likelihoods)

        for cl in problem_list:
            calculate_diffs(cl)

        for j, (few_result, few_zero_result) in enumerate(
            zip(few['results'], few_zero['results'])
        ):
            # Indexing (rather than zipping) keeps a missing likelihood entry
            # a loud IndexError instead of a silent truncation.
            candidate = problem_list[j]

            aggregated_results["forward"][i].append(np.mean(few_result["ce_losses"]))
            aggregated_results["no_replace"][i].append(candidate['no_replace_ce_diff'])
            aggregated_results["backward"][i].append(candidate['replace_ce_diff'])
            aggregated_results["is_correct"][i].append(candidate['is_correct'])
            aggregated_results["direct"][i].append(np.mean(few_zero_result["ce_losses"]))

    for key in aggregated_results:
        aggregated_results[key] = np.array(aggregated_results[key], dtype=np.float32)

    return aggregated_results




In [4]:
# results[task][model] holds whatever get_value returns for that pair.
results = {}

for task in tasks:
    task_results = results.setdefault(task, {})
    for model in models:
        task_results[model] = get_value(
            likelihood_file=f"likelihood/{task}/{model}/few/all_likelihoods.json",
            baseline_few_file=f"baselines/baseline/{task}/{model}/{task}_few_few.jsonl",
            baseline_few_zero_file=f"baselines/baseline/{task}/{model}/{task}_few_zero.jsonl",
        )

In [None]:
# mmlu_results[model][subject] holds whatever get_value returns for that split.
mmlu_results = {}

for model in models:
    per_subject = {}
    mmlu_results[model] = per_subject
    for subject in subjects:
        per_subject[subject] = get_value(
            likelihood_file=f"likelihood/mmlu_pro/{model}/{subject}/all_likelihoods.json",
            baseline_few_file=f"baselines/baseline/mmlu_pro/{model}/{subject}/mmlu_pro_few_few.jsonl",
            baseline_few_zero_file=f"baselines/baseline/mmlu_pro/{model}/{subject}/mmlu_pro_few_zero.jsonl",
        )

In [None]:
# Score arrays to evaluate; for each one the candidate with the lowest value
# (argmin over the last axis) is taken as the prediction.
metrics    = ['direct']
benchmarks = [
    "math500",
    "mmlu_pro",
    "gpqa",
    "drop",
    "hotpotqa",
    "musr_location",
    "musr_efficiently"
]

for model in models:
    cols = benchmarks + ['Avg']
    df = pd.DataFrame(index=metrics, columns=cols, dtype=object)

    # --- MMLU-Pro: mean of the per-subject accuracies -----------------------
    metric_values = {m: [] for m in metrics}
    if 'mmlu_pro' in benchmarks:
        for subject, res in mmlu_results.get(model, {}).items():
            if res is None:
                print(f"[WARNING] {model}/{subject} is None. Skipping.")
                continue
            is_corr = res['is_correct']      # shape (N, C)
            N       = is_corr.shape[0]
            for m in metrics:
                arr = res.get(m)
                if arr is None:
                    continue
                chosen = arr.argmin(axis=-1)
                acc    = 100 * is_corr[np.arange(N), chosen].mean()
                # Keep full precision per subject; rounding each subject first
                # and then rounding the mean again would compound the error.
                metric_values[m].append(acc)
        avg_metrics = {
            m: round(np.mean(vals), 1) if vals else np.nan
            for m, vals in metric_values.items()
        }
        for m in metrics:
            df.at[m, 'mmlu_pro'] = avg_metrics[m]

    # --- All other benchmarks ----------------------------------------------
    for bm in [b for b in benchmarks if b != 'mmlu_pro']:
        res = results.get(bm, {}).get(model)
        if res is None:
            df[bm] = np.nan
            continue

        is_corr = res['is_correct']
        N       = is_corr.shape[0]

        for m in metrics:
            arr = res.get(m)
            if arr is None:
                df.at[m, bm] = np.nan
                continue

            chosen = arr.argmin(axis=-1)

            if bm in ['drop', 'hotpotqa'] and is_corr.ndim == 3:
                # The last axis carries two scores per candidate: index 0 is
                # reported as EM and index 1 as F1.
                em = 100 * is_corr[np.arange(N), chosen, 0].mean()
                f1 = 100 * is_corr[np.arange(N), chosen, 1].mean()
                df.at[m, bm] = f"EM: {em:.1f}, F1: {f1:.1f}"
            else:
                acc = 100 * is_corr[np.arange(N), chosen].mean()
                df.at[m, bm] = round(acc, 1)

    # --- Average across benchmarks (EM stands in for the EM/F1 cells) -------
    for m in metrics:
        em_vals = []
        for bm in benchmarks:
            cell = df.at[m, bm]
            if pd.isna(cell):
                continue
            if isinstance(cell, str) and cell.startswith("EM:"):
                em_val = float(cell.split()[1].rstrip(','))
                em_vals.append(em_val)
            else:
                try:
                    em_vals.append(float(cell))
                except (TypeError, ValueError):
                    # Leave an unparseable cell out of the average, but say so
                    # instead of hiding it.
                    print(f"[WARNING] {model}/{bm}: cannot average cell {cell!r}")
        df.at[m, 'Avg'] = round(np.mean(em_vals), 1) if em_vals else np.nan

    print(f"===== Model: {model} =====")
    display(df)


In [None]:
def compute_temps(beta, check):
    """Blend the direct score with two corrected forward scores.

    Parameters
    ----------
    beta : float
        Mixing weight: ``1 - beta`` goes to ``check['direct']`` and ``beta``
        goes to the corrected forward term.
    check : mapping
        Must provide ``'backward'``, ``'no_replace'``, ``'direct'`` and
        ``'forward'``; the values only need to support arithmetic with each
        other and with ``beta``.

    Returns
    -------
    dict
        ``'no_replace_referi'``: ``(1 - beta) * direct + beta * (forward - no_replace)``
        ``'referi'``:            ``(1 - beta) * direct + beta * (forward - backward)``
    """
    backward, no_replace, direct, forward = (
        check[key] for key in ('backward', 'no_replace', 'direct', 'forward')
    )

    def blend(correction):
        # Same expression for both variants; only the subtracted term differs.
        return (1 - beta) * direct + beta * (forward - correction)

    return {
        'no_replace_referi': blend(no_replace),
        'referi': blend(backward),
    }


In [None]:
# Mixing weights tried for every method; the best-scoring one is reported.
beta_list    = [0.5, 0.75, 1.0]

methods = ['referi', 'no_replace_referi']

ordered_benchmarks = [
    "math500",
    "mmlu_pro",
    "gpqa",
    "drop",
    "hotpotqa",
    "musr_location",
    "musr_efficiently"
]


def compute_avg(df):
    """Fill the 'Avg' column with the mean of each row's leading numbers.

    Cells are strings of the form ``"<score> (<beta>)"`` or
    ``"EM: <score>(<beta>), F1: <score>(<beta>)"``; the EM score represents
    the latter. The parser is chosen from the cell's content rather than the
    benchmark name, so a plain-accuracy cell in a drop/hotpotqa column still
    counts toward the average. NaN cells are skipped.
    """
    for idx in df.index:
        vals = []
        for bm in ordered_benchmarks:
            cell = df.at[idx, bm]
            if pd.isna(cell):
                continue
            text = str(cell)
            try:
                if "EM:" in text:
                    number = text.split("EM:")[1].split("(")[0]
                else:
                    number = text.split()[0]
                vals.append(float(number))
            except (IndexError, ValueError):
                # Leave an unparseable cell out of the average, but say so.
                print(f"[WARNING] {idx}/{bm}: cannot average cell {cell!r}")
        df.at[idx, 'Avg'] = round(np.mean(vals), 1) if vals else np.nan


for model in models:
    print(f"===== MODEL: {model} =====")

    ce_rows = [m + "_BEST" for m in methods]
    df   = pd.DataFrame(index=ce_rows,
                           columns=ordered_benchmarks + ['Avg'],
                           dtype=object)

    for bm in ordered_benchmarks:
        if bm == 'mmlu_pro':
            # MMLU-Pro: pick the beta with the best mean per-subject accuracy.
            for m in methods:
                best_avg, best_beta = None, None
                for beta in beta_list:
                    vals = []
                    for subj, res in mmlu_results.get(model, {}).items():
                        if res is None:
                            continue
                        is_corr = res['is_correct']
                        arr     = compute_temps(beta, res).get(m)
                        if arr is None:
                            continue
                        chosen = np.argmin(arr, axis=-1)
                        acc    = 100 * is_corr[np.arange(len(chosen)), chosen].mean()
                        vals.append(acc)
                    if not vals:
                        continue
                    # Compared after rounding, so betas that tie to one decimal
                    # keep the earliest entry of beta_list.
                    avg_acc = round(np.mean(vals), 1)
                    if best_avg is None or avg_acc > best_avg:
                        best_avg, best_beta = avg_acc, beta
                if best_avg is not None:
                    # Print beta unformatted: "{:.1f}" would show 0.75 as 0.8.
                    df.at[f"{m}_BEST", 'mmlu_pro'] = f"{best_avg:.1f} ({best_beta})"
                else:
                    df.at[f"{m}_BEST", 'mmlu_pro'] = np.nan

        else:
            res = results.get(bm, {}).get(model)
            if res is None:
                df[bm] = np.nan
                continue

            is_corr = res['is_correct']
            N       = is_corr.shape[0]
            # Only use the EM/F1 layout when the array really has the extra
            # axis, matching the check in the baseline table cell.
            is_drop = bm in ['drop', 'hotpotqa'] and is_corr.ndim == 3

            for m in methods:
                best_score, best_beta, best_chosen = None, None, None

                for beta in beta_list:
                    arr    = compute_temps(beta, res)[m]
                    chosen = np.argmin(arr, axis=-1)

                    if is_drop:
                        # Selection is driven by EM (index 0 on the last axis).
                        score = 100 * is_corr[np.arange(N), chosen, 0].mean()
                    else:
                        score = 100 * is_corr[np.arange(N), chosen].mean()

                    if best_score is None or score > best_score:
                        best_score, best_beta, best_chosen = score, beta, chosen

                row = f"{m}_BEST"
                if best_score is None:
                    df.at[row, bm] = np.nan
                elif is_drop:
                    # F1 is reported for the same picks that gave the best EM.
                    best_f1 = 100 * is_corr[np.arange(N), best_chosen, 1].mean()
                    df.at[row, bm] = (
                        f"EM: {best_score:.1f}({best_beta}), "
                        f"F1: {best_f1:.1f}({best_beta})"
                    )
                else:
                    df.at[row, bm] = f"{best_score:.1f} ({best_beta})"

    compute_avg(df)

    print(">>> BEST only:")
    display(df)
    print()