In [None]:
import os
import sys
import json
from tqdm import tqdm 

from dotenv import load_dotenv
from openai import OpenAI

sys.path.append('../')  

from gpqa.gpqa_utils import * 

from math500.math_utils import * 
from math500.parser import *
from math500.grader import * 

from mmlu_pro.mmlu_utils import * 

from hotpotqa.hotpotqa_utils import *

from drop.drop_utils import *

from musr.musr import MuSRDataset

from utils import load_model_outputs


# Load settings from a local .env file (python-dotenv) before the API key is
# looked up below.
load_dotenv()

# Module-level OpenAI client used by generate_model_output(). `api_key` is
# None when OPENAI_API_KEY is absent from the environment.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

def load_data_and_fewshot(args):
    """Load the evaluation dataset and few-shot material for ``args.task``.

    Args:
        args: Object exposing a ``task`` attribute. Recognised values are
            "mmlu_pro", "math500", "gpqa", "hotpotqa", "drop",
            "musr_efficiently" and "musr_location".

    Returns:
        A ``(dataset, fewshot)`` tuple. For the two MuSR tasks ``fewshot`` is
        the placeholder ``1``. ``(None, None)`` is returned for any other
        task name.
    """
    task = args.task

    if task == "mmlu_pro":
        dataset, fewshot = load_mmlu_pro()

    elif task == "math500":
        with open("../data/math500/test.jsonl", 'r', encoding='utf-8') as f:
            dataset = [json.loads(line) for line in f]
        fewshot = load_prompt(num_shots=5)

    elif task == "gpqa":
        dataset = load_examples("../data/gpqa/gpqa_diamond.csv", seed=0)
        with open("../gpqa/chain_of_thought_examples.json", 'r') as f:
            fewshot = json.load(f)

    elif task == "hotpotqa":
        # Use a context manager so the dataset file handle is closed promptly.
        with open(f'../data/hotpotqa/{task}.json') as f:
            dataset = json.load(f)
        with open("../hotpotqa/react_prompt.json", 'r') as f:
            fewshot = json.load(f)

    elif task == "drop":
        dataset = pd.read_parquet("../data/drop/drop_sub.parquet", engine="pyarrow")
        dataset = dataset.to_dict(orient="records")

        # NOTE(review): the conversion is applied twice, as in the original;
        # confirm whether the second pass is actually required.
        dataset = convert_ndarray_to_list(dataset)
        dataset = convert_ndarray_to_list(dataset)

        with open("../drop/prompt.json", 'r') as f:
            fewshot = json.load(f)

    elif task == "musr_efficiently":
        dataset = MuSRDataset('../data/musr/team_allocation.json')
        fewshot = 1

    elif task == "musr_location":
        dataset = MuSRDataset('../data/musr/object_placements.json')
        fewshot = 1

    else:
        return None, None

    return dataset, fewshot

def construct_prompt(args, dataset, fewshot):
    """Build the judge ("pick the best candidate") chat prompts for a task.

    For every stored result record the user message repeats the task prompt
    (optionally with few-shot examples), states the question and then lists
    the model's candidate answers as ``candidate1``, ``candidate2``, ...

    Args:
        args: Object with ``task``, ``input_dir`` and ``model`` attributes.
            Candidate answers are read from files under
            ``{input_dir}/{task}/{model}/`` via ``load_model_outputs``.
        dataset: Dataset for the task (not used by the "math500" branch,
            which takes the question from the result records).
        fewshot: Few-shot material for the task, or ``None`` for zero-shot.
            The MuSR branch does not read it and always uses the examples
            imported from the ``musr`` package.

    Returns:
        A list of ``{"idx", "prompt", "entry"}`` dicts where ``prompt`` is a
        ``[system, user]`` message list, or ``None`` for an unknown task.
    """
    system_prompt = (
        "Your job is selecting the most accurate response among multiple candidates. "
        "You will receive a question and several candidate answers labeled candidate1, candidate2, etc. "
        "Please summarize the debate very briefly and then conclude which single candidate is the most plausible. "
        "Output exactly in this format:\n"
        "Summary: <brief summary>\n"
        "Conclusion: candidate<number>\n"
        "Remember to choose only one candidate as the final answer.\n"
    )
    before_fewshot = "The below examples are well-constructed gold question and answer pairs for the same task.\n\n"
    before_question = "Now, let’s select the most proper answer for the given question\n"

    output_res_path = f"{args.input_dir}/{args.task}/{args.model}"

    samples = []

    def _make_sample(idx, user_prompt, model_outputs, entry):
        # Append the labelled candidates and wrap the text as chat messages.
        for i, output in enumerate(model_outputs, start=1):
            user_prompt += f"candidate{i}: {output}\n"
        message = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        return {"idx": idx, "prompt": message, "entry": entry}

    def _qa_prefix():
        # "Q:/A:" few-shot prefix shared by hotpotqa and drop. In zero-shot
        # mode (fewshot is None) only the lead-in to the question is kept, so
        # the prefix is always defined.
        prefix = ""
        if fewshot is not None:
            prefix = before_fewshot
            for qa in fewshot:
                question = qa["Q"]
                answer = qa["A"]
                prefix += f"Q: {question}\nA: {answer}\n\n"
        return prefix + before_question

    if args.task == "math500":
        output_res_path = os.path.join(output_res_path, f"{args.task}_few.jsonl")
        res = load_model_outputs(output_res_path)

        # NOTE(review): this is a plain string, so the doubled braces are sent
        # literally ("\boxed{{}}"); kept as-is to leave the prompts unchanged.
        start_prompt = "Please reason step by step, and put your final answer within \\boxed{{}}.\n\n"

        # The part before the question is identical for every sample.
        prefix = start_prompt + before_fewshot
        if fewshot is not None:
            prefix += "\n\n".join([f"{q}\n\n{a}" for q, a in fewshot]) + "\n\n"
        prefix += before_question

        for idx, r in tqdm(enumerate(res)):
            entry = r.get('entry', {})
            question = entry.get('problem', '')
            model_outputs = r.get('model_outputs', [])

            user_prompt = prefix + f"Question: {question}\n"
            samples.append(_make_sample(idx, user_prompt, model_outputs, entry))

    elif args.task == "mmlu_pro":
        subjects = list(dataset.keys())
        for subject in tqdm(subjects):
            res_path = os.path.join(output_res_path, f"{subject}_result.jsonl")
            res = load_model_outputs(res_path)

            start_prompt = (
                "The following are multiple choice questions (with answers) about {}. Think step by"
                " step and then output the answer in the format of \"The answer is (X)\" at the end.\n\n"
            ).format(subject)

            start_prompt += before_fewshot

            if fewshot is not None:
                for each in fewshot[subject]:
                    start_prompt += format_example(each["question"], each["options"], each["cot_content"])

            start_prompt += before_question

            # The sample is only used to bound how many result records are
            # consumed (via zip); the entry itself comes from the record.
            random.seed(42)
            test_data = random.sample(dataset[subject], min(300, len(dataset[subject])))

            # idx restarts at 0 for every subject.
            for idx, (_, r) in enumerate(zip(test_data, res)):
                entry = r.get('entry', {})
                model_outputs = r.get('model_outputs', [])
                question = format_example(entry['question'], entry['options'])

                user_prompt = start_prompt + f"{question}\n"
                samples.append(_make_sample(idx, user_prompt, model_outputs, entry))

    elif args.task == "gpqa":
        def chain_of_thought_prompt(json_data, example: Example) -> str:
            """Creates a chain-of-thought prompt given a single example."""
            prompt = "Here are some example questions from experts. An explanation is given before the final answer. Answer the final question yourself, giving your reasoning beforehand.\n"
            prompt += generate_prompt_from_examples(json_data, with_explanations=True)
            prompt += before_question
            prompt += f"Question: {example.question}"
            prompt += f"\nChoices:\n(A) {example.choice1}\n(B) {example.choice2}\n(C) {example.choice3}\n(D) {example.choice4}"
            prompt += "\nGive step by step reasoning before you answer, and when you're ready to answer, please use the format \"The correct answer is (insert answer here)\":\n"
            return prompt

        output_res_path = os.path.join(output_res_path, f"{args.task}_few.jsonl")
        res = load_model_outputs(output_res_path)

        start_prompt = "You are a very intelligent assistant, who follows instructions directly.\n\n"
        start_prompt += before_fewshot
        for idx, (example, r) in enumerate(zip(dataset, res)):
            user_prompt = start_prompt

            # NOTE(review): the question text is only added inside
            # chain_of_thought_prompt, so a zero-shot prompt contains the
            # candidates but not the question. Confirm the intended format.
            if fewshot is not None:
                user_prompt += chain_of_thought_prompt(fewshot, example)

            entry = r.get('entry', {})
            model_outputs = r.get('model_outputs', [])
            samples.append(_make_sample(idx, user_prompt, model_outputs, entry))

    elif args.task == "hotpotqa":
        output_res_path = os.path.join(output_res_path, f"{args.task}_few.jsonl")
        res = load_model_outputs(output_res_path)

        fewshot_prompt = _qa_prefix()

        for idx, (data_entry, r) in enumerate(zip(dataset, res)):
            # The question comes from the dataset row; the stored entry and
            # the candidates come from the result record.
            user_prompt = fewshot_prompt + f"Q: {data_entry['question']}." + "\n\nEnd your answer with \"Answer <answer>\". Think step by step." + "\n\n"

            entry = r.get('entry', {})
            model_outputs = r.get('model_outputs', [])
            samples.append(_make_sample(idx, user_prompt, model_outputs, entry))

    elif args.task == "drop":
        output_res_path = os.path.join(output_res_path, f"{args.task}_few.jsonl")
        res = load_model_outputs(output_res_path)

        fewshot_prompt = _qa_prefix()

        for idx, (data_entry, r) in enumerate(zip(dataset, res)):
            user_prompt = fewshot_prompt + f"Q: {data_entry['passage']} {data_entry['question']}" + "\n\nEnd your answer with \"So the answer is <answer>\". Think step by step." + "\n\n"

            entry = r.get('entry', {})
            model_outputs = r.get('model_outputs', [])
            samples.append(_make_sample(idx, user_prompt, model_outputs, entry))

    elif args.task in ("musr_efficiently", "musr_location"):
        from musr.op_icl_fixed import op_fewshot, few_shot_op_instruction, test_op_instruction
        from musr.ta_icl_fixed import ta_fewshot, few_shot_ta_instruction, test_ta_instruction

        if args.task == "musr_location":
            few_shot_examples = op_fewshot
            few_instruction = few_shot_op_instruction
            test_instruction = test_op_instruction
        else:
            few_shot_examples = ta_fewshot
            few_instruction = few_shot_ta_instruction
            test_instruction = test_ta_instruction

        output_res_path = os.path.join(output_res_path, f"{args.task}_few.jsonl")
        res = load_model_outputs(output_res_path)

        # The worked examples are the same for every sample; build them once.
        fewshot_block = ""
        for (q, a) in few_shot_examples:
            fewshot_block += q + "\n\n" + few_instruction + "\n"
            fewshot_block += a + "\n\n"

        for idx, (entry, r) in enumerate(zip(dataset, res)):
            model_outputs = r.get('model_outputs', [])

            question = entry['question'].strip()
            context = entry['context'].strip()
            choices = entry['choices']['text']
            labels = ['A', 'B', 'C', 'D', 'E', 'F'][:len(choices)]
            choice_str = '\n'.join([f'{labels[j]}: {choices[j]}' for j in range(len(choices))])
            original_question_part = f"{context}\n\n{question}\n\n{choice_str}"

            user_prompt = entry['prompt_parts']['cot_system_prompt'] + "\n\n" + before_fewshot
            user_prompt += fewshot_block
            user_prompt += before_question
            user_prompt += original_question_part + "\n\n" + test_instruction + "\n"

            samples.append(_make_sample(idx, user_prompt, model_outputs, entry))

    else:
        return None

    return samples

def generate_model_output(model: str, prompt: list, temperature: float = 1.0, n: int = 1) -> list:
    """Request chat completions for ``prompt`` from the module-level client.

    Args:
        model: Name of the chat model to query.
        prompt: Chat messages (a list of ``{"role", "content"}`` dicts), passed
            through unchanged as ``messages``.
        temperature: Sampling temperature forwarded to the API.
        n: Number of completions to request. It was previously ignored and
            hard-coded to 1; the default keeps that behaviour.

    Returns:
        A list with the message content of every returned choice.
    """
    responses = client.chat.completions.create(
        model=model,
        messages=prompt,
        n=n,
        temperature=temperature,
    )
    return [choice.message.content for choice in responses.choices]

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def main(config):
    """Run the judge model over every prompt of one task and append results.

    Prompts are built with ``load_data_and_fewshot`` and ``construct_prompt``.
    One JSON line per sample is appended to
    ``{output_dir}/{task}/{model}/{task}_{shot_type}.jsonl``. Samples whose
    user prompt already appears in that file are skipped, so an interrupted
    run can be resumed by calling ``main`` again.

    Args:
        config: Object with ``output_dir``, ``task``, ``model``, ``shot_type``,
            ``num_examples`` (``-1`` means "all") and ``temperature``.
    """
    save_path = f"{config.output_dir}/{config.task}/{config.model}/{config.task}_{config.shot_type}.jsonl"
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    dataset, fewshot = load_data_and_fewshot(config)
    if config.shot_type == "zero":
        fewshot = None

    samples = construct_prompt(config, dataset, fewshot)
    # construct_prompt returns None for an unknown task; only slice a real list.
    if samples and config.num_examples != -1:
        samples = samples[:config.num_examples]

    if not samples:
        print(f"No samples found for Task: {config.task}, Shot: {config.shot_type}")
        return

    # Preview the first prompt so the format can be checked by eye.
    print(f"Model: {config.model} Task: {config.task}, Shot: {config.shot_type}")
    print(samples[0].keys())
    print("-" * 50)
    prompt = samples[0]["prompt"]
    for message in prompt:
        print(f"Role:\n{message['role']}")
        print(f"Content:\n{message['content']}")
        print("-" * 50)

    # User prompts that already have a stored answer (resume support).
    existing_data = set()
    if os.path.exists(save_path):
        with open(save_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():  # tolerate blank lines in the file
                    existing_data.add(json.loads(line)['prompt'][1]['content'])

    with open(save_path, "a", encoding='utf-8') as f:
        for sample in tqdm(samples, total=len(samples)):
            if sample['prompt'][1]['content'] in existing_data:
                continue
            try:
                model_outputs = generate_model_output(config.model, sample["prompt"], config.temperature)
                sample["prompt_output"] = model_outputs
                json.dump(sample, f)
                f.write("\n")
                # Flush so finished samples survive an interrupted run.
                f.flush()
            except Exception as e:
                # Stop at the first failure so records stay in sample order;
                # calling main() again resumes from the missing samples.
                print(f"Error processing sample {sample['idx']}: {e}")
                break

    print(f"Results saved to {save_path}")

In [None]:
class Config:
    """Mutable bag of run settings consumed by the notebook cells."""

    def __init__(self):
        defaults = {
            "model": "gpt-4o-mini",
            "task": "drop",  # "math500", "mmlu_pro", "gpqa", "drop", "hotpotqa"
            "shot_type": "few",
            "output_dir": "llm_as_judge",
            "input_dir": "../result",
            "num_examples": -1,
            "temperature": 0.0,
        }
        # Assign in declaration order so the instance attributes keep the
        # same order as before.
        for attr_name, attr_value in defaults.items():
            setattr(self, attr_name, attr_value)

In [None]:

# Tasks iterated by the generation loop and by the candidate-selection cell.
tasks = ['math500']
# Subject names iterated by the MMLU-Pro evaluation cell.
subjects = ['business', 'law', 'psychology', 'biology', 'chemistry', 'history', 'other', 'health', 'economics', 'math', 'physics', 'computer science', 'philosophy', 'engineering']

# Shot types and judge models for the generation loop.
shots = ["few"]
models = ['gpt-4o-mini']

# Shared settings object; the generation loop mutates it in place and later
# cells read `output_dir` and `shot_type` from it.
config = Config()

In [None]:

# Run the judge for every (model, task, shot) combination. The shared
# `config` object is updated in place before each call, so after the loop it
# holds the last combination that was run.
for model in models:
    for task in tasks:
        for shot in shots:
            config.model = model
            config.task = task
            config.shot_type = shot
            print("=" * 50)
            main(config)



500it [00:00, 66453.89it/s]




Model: gpt-4o-mini Task: math500, Shot: few
dict_keys(['idx', 'prompt', 'entry'])
--------------------------------------------------
Role:
system
Content:
Your job is selecting the most accurate response among multiple candidates. You will receive a question and several candidate answers labeled candidate1, candidate2, etc. Please summarize the debate very briefly and then conclude which single candidate is the most plausible. Output exactly in this format:
Summary: <brief summary>
Conclusion: candidate<number>
Remember to choose only one candidate as the final answer.

--------------------------------------------------
Role:
user
Content:
Please reason step by step, and put your final answer within \boxed{{}}.

The below examples are well-constructed gold question and answer pairs for the same task.

Kevin Kangaroo begins hopping on a number line at 0. He wants to get to 1, but he can hop only $\frac{1}{3}$ of the distance. Each hop tires him out so that he continues to hop $\frac{1}{

100%|██████████| 500/500 [00:00<00:00, 67003.80it/s]

Results saved to llm_prompt_with_fewshot/math500/gpt-4o-mini/math500_few.jsonl





In [None]:
def extract_user_number(response_text):
    """Return the candidate number named in a judge response, or -1.

    The first "Conclusion: candidate<N>" occurrence (case-insensitive,
    optionally wrapped in square brackets) wins. Failing that, the last
    number found in the final non-empty sentence or line is used. -1 is
    returned when neither yields a number.
    """
    explicit = re.search(
        r'\[?\s*Conclusion: candidate\s*(\d+)\s*\]?',
        response_text,
        flags=re.IGNORECASE,
    )
    if explicit is not None:
        return int(explicit.group(1))

    # Fallback: split on periods/newlines and keep the last non-blank piece.
    closing = None
    for piece in re.split(r'[.\n]', response_text.strip()):
        piece = piece.strip()
        if piece:
            closing = piece
    if closing is None:
        return -1

    digits = re.findall(r'(\d+)', closing)
    return int(digits[-1]) if digits else -1

In [None]:
# Rows describing, per (model, task), how often each candidate number was
# extracted from the judge responses.
overall_summary = []

for model in ['gpt-4o-mini', 'gpt-4o', 'llama']:
    for task in tasks:
        # MMLU-Pro is skipped in this loop.
        if task == 'mmlu_pro':
            continue
        # Judge responses: same path pattern that main() writes to.
        prompting_path = f"{config.output_dir}/{task}/{model}/{task}_{config.shot_type}.jsonl"
        llm_prompt = load_model_outputs(prompting_path)

        # Candidate answers the judge chose between.
        output_res_path = f"../result/{task}/{model}/{task}_{config.shot_type}.jsonl"
        res = load_model_outputs(output_res_path)


        # Pass 1: tally the extracted candidate numbers.
        user_counter = Counter()
        total_entries = 0
        for entry in tqdm(llm_prompt, desc=f"Processing {model}-{task}"):
            total_entries += 1
            # `prompt_output` may be a list of responses or a single value;
            # only the first response of a list is used.
            raw_output = entry.get("prompt_output")
            if isinstance(raw_output, list):
                prompt_output = raw_output[0] if raw_output else None
            else:
                prompt_output = raw_output

            if prompt_output is None:
                user_counter["None"] += 1
                continue
            extracted_user = extract_user_number(prompt_output)
            # NOTE(review): extract_user_number returns -1 (not None) when it
            # finds nothing, so this branch looks unreachable and failures are
            # tallied under the key -1 instead.
            if extracted_user is None:
                user_counter["None"] += 1
            else:
                user_counter[extracted_user] += 1


        # One summary row per distinct extracted value.
        for user, count in user_counter.items():
            percentage = (count / total_entries) * 100 if total_entries else 0
            overall_summary.append({
                "Model": model,
                "Task": task,
                "Extracted User": user,
                "Count": count,
                "Percentage": round(percentage, 2)
            })

        # Pass 2: attach the chosen candidate text to each judge record.
        # Records are paired by position, so llm_prompt must have at least as
        # many entries as res.
        for i in range(len(res)):
            raw_output = llm_prompt[i].get("prompt_output")
            if isinstance(raw_output, list):
                source_text = raw_output[0] if raw_output else None
            else:
                source_text = raw_output

            if source_text is None:
                answer_number = 1
            else:
                answer_number = extract_user_number(source_text)
                # Anything outside 1..5 falls back to the first candidate.
                if not isinstance(answer_number, int) or not (1 <= answer_number <= 5):
                    answer_number = 1
            model_outputs = res[i].get('model_outputs') or res[i].get('resps')[0]
            if len(model_outputs) < answer_number:
                print(f"Warning: Entry {i} has less than {answer_number} model outputs. Using default output.")
                chosen_output = model_outputs[0] if model_outputs else None
            else:
                chosen_output = model_outputs[answer_number-1]
            # NOTE(review): this only updates the in-memory record; nothing in
            # this cell writes llm_prompt back to disk, although later cells
            # read this key from the files.
            llm_prompt[i]['prompt_output_with_fewshot'] = chosen_output

Processing gpt-4o-mini-math500: 100%|██████████| 500/500 [00:00<00:00, 57296.10it/s]
Processing gpt-4o-mini-gpqa: 100%|██████████| 198/198 [00:00<00:00, 45728.33it/s]
Processing gpt-4o-mini-drop: 100%|██████████| 500/500 [00:00<00:00, 69133.08it/s]
Processing gpt-4o-mini-hotpotqa: 100%|██████████| 500/500 [00:00<00:00, 60426.21it/s]
Processing gpt-4o-mini-musr_efficiently: 100%|██████████| 250/250 [00:00<00:00, 44006.04it/s]
Processing gpt-4o-mini-musr_location: 100%|██████████| 256/256 [00:00<00:00, 54328.16it/s]
Processing gpt-4o-math500: 100%|██████████| 500/500 [00:00<00:00, 46460.90it/s]
Processing gpt-4o-gpqa: 100%|██████████| 198/198 [00:00<00:00, 39291.83it/s]
Processing gpt-4o-drop: 100%|██████████| 500/500 [00:00<00:00, 70744.57it/s]
Processing gpt-4o-hotpotqa: 100%|██████████| 500/500 [00:00<00:00, 58589.48it/s]
Processing gpt-4o-musr_efficiently: 100%|██████████| 250/250 [00:00<00:00, 40043.38it/s]
Processing gpt-4o-musr_location: 100%|██████████| 256/256 [00:00<00:00, 4574

In [None]:
# Collapse the per-candidate rows into two groups per (model, task): picks
# in the 1-5 range versus everything else.
df_overall = pd.DataFrame(overall_summary)


# Labels that cannot be parsed as numbers become NaN.
df_overall['ExtractedUser_Num'] = pd.to_numeric(
    df_overall['Extracted User'], errors='coerce'
)


# NaN fails the range comparison and therefore lands in 'Others'.
df_overall['Group'] = df_overall['ExtractedUser_Num'].apply(
    lambda x: '1-5' if 1 <= x <= 5 else 'Others'
)

for model in df_overall["Model"].unique():
    model_df = df_overall[df_overall["Model"] == model]


    # The percentages being summed were rounded to two decimals when the rows
    # were built, so a group total can differ slightly from 100.
    pivot_table = model_df.pivot_table(
        index="Group",
        columns="Task",
        values="Percentage",
        aggfunc="sum",
        fill_value=0
    )

    print(f"\nModel: {model}")
    display(pivot_table)



Model: gpt-4o-mini


Task,drop,gpqa,hotpotqa,math500,musr_efficiently,musr_location
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-5,100.0,95.96,100.0,100.0,100.0,100.0
Others,0.0,4.04,0.0,0.0,0.0,0.0



Model: gpt-4o


Task,drop,gpqa,hotpotqa,math500,musr_efficiently,musr_location
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-5,100.0,100.02,100.0,100.0,100.0,100.01



Model: llama


Task,drop,gpqa,hotpotqa,math500,musr_efficiently,musr_location
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1-5,100.0,100.01,100.0,99.8,84.4,94.92
Others,0.0,0.0,0.0,0.2,15.6,5.08


In [None]:
# Per-model table of the share of each extracted candidate number by task.
df_overall = pd.DataFrame(overall_summary)

for model in df_overall["Model"].unique():
    model_df = df_overall[df_overall["Model"] == model]

    # No aggfunc is given, so pandas' default aggregation applies; candidate
    # numbers missing for a task are shown as 0.
    pivot_table = model_df.pivot_table(
        index="Extracted User",
        columns="Task",
        values=["Percentage"],
        fill_value=0
    )

    print(f"\nModel: {model}")
    display(pivot_table)


Model: gpt-4o-mini


Unnamed: 0_level_0,Percentage,Percentage,Percentage,Percentage,Percentage,Percentage
Task,drop,gpqa,hotpotqa,math500,musr_efficiently,musr_location
Extracted User,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
-1,0.0,4.04,0.0,0.0,0.0,0.0
1,85.0,37.37,62.2,91.0,34.4,85.16
2,9.2,26.26,16.6,4.6,36.4,7.81
3,2.2,16.67,8.6,1.0,27.2,3.91
4,2.8,10.61,8.0,2.4,0.0,1.17
5,0.8,5.05,4.6,1.0,2.0,1.95



Model: gpt-4o


Unnamed: 0_level_0,Percentage,Percentage,Percentage,Percentage,Percentage,Percentage
Task,drop,gpqa,hotpotqa,math500,musr_efficiently,musr_location
Extracted User,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,93.8,70.71,69.2,79.2,30.0,62.5
2,3.4,15.66,10.4,13.0,35.6,24.22
3,1.8,6.57,8.2,5.0,21.6,5.08
4,0.2,2.53,6.2,1.4,3.6,3.52
5,0.8,4.55,6.0,1.4,9.2,4.69



Model: llama


Unnamed: 0_level_0,Percentage,Percentage,Percentage,Percentage,Percentage,Percentage
Task,drop,gpqa,hotpotqa,math500,musr_efficiently,musr_location
Extracted User,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
-1,0.0,0.0,0.0,0.0,15.6,5.08
1,57.0,22.73,32.0,38.8,23.6,26.95
2,16.2,14.65,19.6,8.8,11.6,5.47
3,6.2,13.64,17.6,14.8,5.6,21.88
4,8.4,18.18,14.6,13.6,11.6,1.56
5,12.2,30.81,16.2,23.8,32.0,39.06
10,0.0,0.0,0.0,0.2,0.0,0.0


In [None]:
# Rows describing, per (model, subject), how often each candidate number was
# extracted from the judge responses.
overall_summary_mmlu = []


def process_mmlu_pro(model: str):
    """Resolve the judge's pick for every MMLU-Pro sample of ``model``.

    Judge records are read from
    ``{config.output_dir}/mmlu_pro/{model}/mmlu_pro_{config.shot_type}.jsonl``
    and grouped by ``entry["category"]``. For each subject, each record is
    paired by position with the candidate answers in
    ``../result/mmlu_pro/{model}/{subject}_result.jsonl``. The chosen
    candidate text is stored under ``"prompt_output_with_fewshot"``, and the
    subject's records are written to
    ``{config.output_dir}/mmlu_pro/{model}/{subject}/mmlu_pro_{config.shot_type}.jsonl``.
    One summary row per extracted value is appended to the module-level
    ``overall_summary_mmlu`` list.

    Candidate numbers are 1-based. A missing, unparseable or out-of-range
    number falls back to candidate 1.

    Args:
        model: Model directory name, e.g. ``"gpt-4o"``.
    """
    task = "mmlu_pro"

    prompting_path = (
        f"{config.output_dir}/{task}/{model}/"
        f"{task}_{config.shot_type}.jsonl"
    )
    all_entries = load_model_outputs(prompting_path)

    grouped = defaultdict(list)
    for entry in all_entries:
        grouped[entry["entry"]["category"]].append(entry)

    # NOTE(review): llama records are deliberately left in file order, as in
    # the original; confirm why they must not be sorted by idx.
    if model != 'llama':
        for subj in grouped:
            grouped[subj].sort(key=lambda e: e["idx"])

    for subject, entries in grouped.items():
        output_res_path = f"../result/{task}/{model}/{subject}_result.jsonl"
        res = load_model_outputs(output_res_path)

        user_counter = Counter()
        total = len(entries)

        for idx, llm_prompt in enumerate(tqdm(entries, desc=f"{model}-{subject}")):
            # `prompt_output` may be a list of responses or a single value.
            raw_output = llm_prompt.get("prompt_output")
            if isinstance(raw_output, list):
                p_out = raw_output[0] if raw_output else None
            else:
                p_out = raw_output

            user_num = None if p_out is None else extract_user_number(p_out)
            user_counter[user_num if user_num is not None else "None"] += 1

            outs = res[idx].get("model_outputs") or res[idx].get("resps")[0]

            # Validate against the 1-based candidate labels. The previous
            # 0..4 check with fallback 0 made `outs[user_num - 1]` select the
            # last candidate for every invalid verdict.
            if not isinstance(user_num, int) or not (1 <= user_num <= len(outs)):
                user_num = 1
            llm_prompt["prompt_output_with_fewshot"] = outs[user_num - 1] if outs else None

        dir_path = f"{config.output_dir}/{task}/{model}/{subject}"
        file_name = f"{task}_{config.shot_type}.jsonl"
        save_path = os.path.join(dir_path, file_name)

        os.makedirs(dir_path, exist_ok=True)

        # Persist the records; the per-subject evaluation reads this file.
        with open(save_path, "w", encoding="utf-8") as f:
            for record in entries:
                f.write(json.dumps(record) + "\n")

        for u, cnt in user_counter.items():
            overall_summary_mmlu.append({
                "Model": model,
                "Task": task,
                "Subject": subject,
                "Extracted User": u,
                "Count": cnt,
                "Percentage": round(cnt / total * 100, 2)
            })


# Resolve the judge's picks on MMLU-Pro for each judged model.
for model in ['gpt-4o-mini', 'gpt-4o', 'llama']:
    process_mmlu_pro(model)

gpt-4o-mini-business: 100%|██████████| 300/300 [00:00<00:00, 56810.29it/s]
gpt-4o-mini-law: 100%|██████████| 300/300 [00:00<00:00, 48438.66it/s]
gpt-4o-mini-psychology: 100%|██████████| 300/300 [00:00<00:00, 57262.73it/s]
gpt-4o-mini-biology: 100%|██████████| 300/300 [00:00<00:00, 57039.49it/s]
gpt-4o-mini-chemistry: 100%|██████████| 300/300 [00:00<00:00, 50183.11it/s]
gpt-4o-mini-history: 100%|██████████| 300/300 [00:00<00:00, 54346.79it/s]
gpt-4o-mini-other: 100%|██████████| 300/300 [00:00<00:00, 57889.73it/s]
gpt-4o-mini-health: 100%|██████████| 300/300 [00:00<00:00, 58348.77it/s]
gpt-4o-mini-economics: 100%|██████████| 300/300 [00:00<00:00, 58208.41it/s]
gpt-4o-mini-math: 100%|██████████| 300/300 [00:00<00:00, 57839.17it/s]
gpt-4o-mini-physics: 100%|██████████| 300/300 [00:00<00:00, 53063.35it/s]
gpt-4o-mini-computer science: 100%|██████████| 300/300 [00:00<00:00, 55465.54it/s]
gpt-4o-mini-philosophy: 100%|██████████| 300/300 [00:00<00:00, 54794.08it/s]
gpt-4o-mini-engineering: 100

In [None]:
# Per-model table of extracted candidate numbers on MMLU-Pro.
df_mmlu = pd.DataFrame(overall_summary_mmlu)

for model in df_mmlu["Model"].unique():
    model_df = df_mmlu[df_mmlu["Model"] == model]

    # No aggfunc is given, so pandas' default aggregation (mean per the pandas
    # docs) combines the per-subject rows. A subject in which a value never
    # occurred contributes no row, so the figures need not add up to 100.
    pivot_table = model_df.pivot_table(
        index="Extracted User",
        columns="Task",
        values=["Percentage"],
        fill_value=0
    )

    print(f"\nModel: {model}")
    display(pivot_table)


Model: gpt-4o-mini


Unnamed: 0_level_0,Percentage
Task,mmlu_pro
Extracted User,Unnamed: 1_level_2
-1,1.563846
1,68.189286
2,12.595
3,8.31
4,4.737857
5,4.714286



Model: gpt-4o


Unnamed: 0_level_0,Percentage
Task,mmlu_pro
Extracted User,Unnamed: 1_level_2
1,82.667857
2,10.333571
3,3.786429
4,1.406667
5,2.308571



Model: llama


Unnamed: 0_level_0,Percentage
Task,mmlu_pro
Extracted User,Unnamed: 1_level_2
-1,1.388333
1,25.452143
2,12.119286
3,14.713571
4,16.451429
5,29.975714
6,0.33
7,1.0


In [14]:
# Record keys whose text is scored by the evaluation cells below.
keys_to_eval = ['prompt_output_with_fewshot']
# Models whose judged outputs are evaluated.
models = ['gpt-4o-mini', 'gpt-4o', 'llama']

In [None]:
from math500.math_utils import * 
from math500.parser import *
from math500.grader import * 

# MATH-500 accuracy of the selected outputs, per model.
for model in models:
    file_path = f"{config.output_dir}/math500/{model}/math500_{config.shot_type}.jsonl"
    data = load_model_outputs(file_path)

    # One list of per-sample results for every evaluated key.
    scores = {k: [] for k in keys_to_eval}

    for entry in data:
        idx = entry["idx"]

        _, gt = parse_ground_truth(entry['entry'], "math")
        # Not used further in this cell.
        model_outputs = entry.get('model_outputs', [])

        for key in keys_to_eval:
            # Records without the key are left out of the accuracy entirely.
            if key not in entry:
                continue


            pred = extract_answer(entry[key], "math")
            pred = strip_string(pred)


            try:
                # Grading cascade: direct comparison first, then
                # process_results on the raw text, then a comparison after
                # re-extracting from the already extracted prediction.
                result = math_equal_process((idx, pred, gt))

                if not result :
                    result = process_results(gt, [entry[key]])
                    if not result:
                        pred = extract_answer(pred, "math")
                        result = math_equal_process((None, pred, gt))

                scores[key].append(result)

            # Any grading failure counts as an incorrect answer.
            except TimeoutError:
                scores[key].append(False)
            except Exception as error:
                print(f"Error while processing {key} for idx={idx}: {error}")
                scores[key].append(False)

    print(f"\n===== Evaluation Results for model={model}, shot={config.shot_type} =====")
    for key in keys_to_eval:
        if len(scores[key]) == 0:
            print(f"{key} -> No data / Not found in entries")
            continue

        acc = sum(scores[key]) / len(scores[key])
        print(f"{key} -> Accuracy: {acc:.4f}")
    print("-" * 50)



===== Evaluation Results for model=gpt-4o-mini, shot=few =====
prompt_output_with_fewshot -> Accuracy: 0.7920
--------------------------------------------------

===== Evaluation Results for model=gpt-4o, shot=few =====
prompt_output_with_fewshot -> Accuracy: 0.8080
--------------------------------------------------

===== Evaluation Results for model=llama, shot=few =====
prompt_output_with_fewshot -> Accuracy: 0.4600
--------------------------------------------------


In [None]:
def extract_answer(text):
    """Return the option letter (A-J) following "answer is" in ``text``.

    When that phrase is not present the text is passed to ``extract_again``
    and its result is returned.

    NOTE(review): the MATH-500 cell calls ``extract_answer`` with two
    arguments, so defining this one-argument version presumably replaces
    that binding in the notebook namespace.
    """
    found = re.search(r"answer is \(?([A-J])\)?", text)
    if found is None:
        return extract_again(text)
    return found.group(1)


# MMLU-Pro accuracy of the selected outputs, pooled over all subjects.
for model in models:
    overall_scores = {k: 0 for k in keys_to_eval}
    overall_total_entries = 0

    for subject in subjects:
        file_path = f"{config.output_dir}/mmlu_pro/{model}/{subject}/mmlu_pro_few.jsonl"
        data = load_model_outputs(file_path)


        subject_scores = {k: 0 for k in keys_to_eval}
        total_data_len = len(data)
        # Every record counts toward the denominator, including records that
        # lack an evaluated key.
        overall_total_entries += total_data_len

        for entry in data:
            # Not used further in this cell.
            model_outputs = entry.get('model_outputs', [])
            # The gold label is stored under 'answer' or, failing that, 'gold'.
            answer = entry['entry'].get('answer') or entry['entry'].get('gold')


            for key in keys_to_eval:
                if key not in entry:
                    continue
                pred = extract_answer(entry[key])
                if pred == answer:
                    subject_scores[key] += 1


        for k in keys_to_eval:
            overall_scores[k] += subject_scores[k]


    print(f"\n=== Overall Results for model={model} ===")
    for key in keys_to_eval:
        if overall_total_entries == 0:
            acc = 0
        else:
            acc = overall_scores[key] / overall_total_entries
        print(f"{key} -> Accuracy: {acc:.4f}")
    print("-" * 50)



=== Overall Results for model=gpt-4o-mini ===
prompt_output_with_fewshot -> Accuracy: 0.6479
--------------------------------------------------

=== Overall Results for model=gpt-4o ===
prompt_output_with_fewshot -> Accuracy: 0.7657
--------------------------------------------------

=== Overall Results for model=llama ===
prompt_output_with_fewshot -> Accuracy: 0.4407
--------------------------------------------------


In [None]:
from gpqa.gpqa_utils import * 

# Reference examples; each one exposes the index of the correct choice.
examples = load_examples("../data/gpqa/gpqa_diamond.csv", seed=0)

# Score each model's GPQA outputs against the reference examples.
for model in models:
    file_path = f"{config.output_dir}/gpqa/{model}/gpqa_{config.shot_type}.jsonl"
    data = load_model_outputs(file_path)

    scores = {k: 0 for k in keys_to_eval}

    total_data_len = len(data)
    if total_data_len != len(examples):
        print("Warning: data length and examples length do not match!")

    # Outputs are paired with examples by position; zip stops at the shorter
    # sequence, while accuracy below is still divided by len(data).
    for entry, example in zip(data, examples):
        for key in keys_to_eval:
            if key not in entry:
                continue

            pred = parse_sampled_answer(entry[key])

            # A response that could not be parsed (None) counts as incorrect.
            if pred is not None and LETTER_TO_INDEX[pred] == example.correct_index:
                scores[key] += 1

    print(f"\n=== Results for model={model}, shot={config.shot_type} ===")
    for key in keys_to_eval:
        acc = scores[key] / total_data_len if total_data_len else 0
        print(f"{key} -> Accuracy: {acc:.4f}")

    print("-" * 50)


=== Results for model=gpt-4o-mini, shot=few ===
prompt_output_with_fewshot -> Accuracy: 0.3990
--------------------------------------------------

=== Results for model=gpt-4o, shot=few ===
prompt_output_with_fewshot -> Accuracy: 0.5101
--------------------------------------------------

=== Results for model=llama, shot=few ===
prompt_output_with_fewshot -> Accuracy: 0.2121
--------------------------------------------------


In [None]:
from drop.drop_utils import *

def get_max_em_f1(pred, golds):
    """Return the best (exact-match, F1) pair for `pred` over all gold answers.

    Every candidate in `golds` is scored with `get_metrics`; candidates whose
    first element is blank are ignored. Both maxima start at 0.0, which is
    what is returned when no candidate qualifies.
    """
    max_em, max_f1 = 0.0, 0.0
    for gold_answer in golds:
        exact_match, f1_score = get_metrics(pred, gold_answer)
        if gold_answer[0].strip():
            max_em = max(max_em, exact_match)
            max_f1 = max(max_f1, f1_score)
    return max_em, max_f1


# Gold answers are always taken from the gpt-4o output file, whichever model
# is being scored, so the file is loaded once instead of once per model.
entry_path = f"{config.output_dir}/drop/gpt-4o/drop_{config.shot_type}.jsonl"
entry_data = load_model_outputs(entry_path)

for model in models:
    print(model)

    file_path = f"{config.output_dir}/drop/{model}/drop_{config.shot_type}.jsonl"
    data = load_model_outputs(file_path)

    # Per-key lists of per-example scores, averaged when reporting.
    em_scores = {k: [] for k in keys_to_eval}
    f1_scores = {k: [] for k in keys_to_eval}

    for test_idx, entry in enumerate(data):
        # Reference entries are matched to model outputs by position.
        golds = get_answers(entry_data[test_idx]['entry'])

        for k in keys_to_eval:
            if k not in entry:
                continue

            # NOTE(review): `extract_answer` is whichever definition is bound
            # when this cell runs (other cells in this notebook define their
            # own versions) — confirm the DROP-appropriate one is in scope.
            pred = extract_answer(entry[k])
            em_val, f1_val = get_max_em_f1(pred, golds)
            em_scores[k].append(em_val)
            f1_scores[k].append(f1_val)

    print(f"\n===== Results for model={model} =====")
    for k in keys_to_eval:
        if not em_scores[k]:
            print(f"{k}: No entries found, skip.")
            continue

        em_mean = np.mean(em_scores[k])
        f1_mean = np.mean(f1_scores[k])
        print(f"{k} -> EM: {em_mean:.4f}, F1: {f1_mean:.4f}")

    print("-" * 50)

gpt-4o-mini

===== Results for model=gpt-4o-mini =====
prompt_output_with_fewshot -> EM: 0.7860, F1: 0.8563
--------------------------------------------------
gpt-4o

===== Results for model=gpt-4o =====
prompt_output_with_fewshot -> EM: 0.8240, F1: 0.9017
--------------------------------------------------
llama

===== Results for model=llama =====
prompt_output_with_fewshot -> EM: 0.6760, F1: 0.7402
--------------------------------------------------


In [None]:
from hotpotqa.hotpotqa_utils import *

def extract_answer(response_text):
    """Extract the final answer string from a model response.

    Two patterns are tried in order:
      1. a case-sensitive "Answer" followed by whitespace, capturing
         everything up to the end of the text;
      2. a case-insensitive "Answer" followed by colons/whitespace,
         capturing up to the first period, newline, or end of text.
    The captured text is stripped and any trailing periods/newlines are
    removed. If neither pattern matches, the stripped response is returned.
    """
    attempts = (
        (r"Answer\s+(.+)", re.DOTALL),
        (r"(?<!\w)Answer[:\s]+(.+?)(?:[.\n]|$)", re.IGNORECASE | re.DOTALL),
    )
    for pattern, flags in attempts:
        found = re.search(pattern, response_text, flags)
        if found:
            captured = found.group(1).strip()
            # Drop sentence-final punctuation left over after the capture.
            return re.sub(r"[.\n]+$", "", captured).strip()

    return response_text.strip()


# Load the HotpotQA BM25 evaluation data and the prompt file.
# Each file is opened in a `with` block so its handle is closed as soon as
# the JSON has been parsed.
with open('../data/hotpotqa/BM25/hotpotqa-bm25.json') as f:
    dataset = json.load(f)
with open("../hotpotqa/react_prompt.json", 'r') as f:
    fewshot = json.load(f)

# Score each model's HotpotQA outputs with exact-match and F1.
for model in models:
    print(model)
    file_path = f"{config.output_dir}/hotpotqa/{model}/hotpotqa_{config.shot_type}.jsonl"
    data = load_model_outputs(file_path)

    # Extracted answers per evaluated key, in file order.
    # NOTE(review): entries lacking a key are skipped, so preds[k] can be
    # shorter than `data`; confirm get_em_f1 still pairs predictions with the
    # right dataset rows in that case.
    preds = {
        k: [extract_answer(entry[k]) for entry in data if k in entry]
        for k in keys_to_eval
    }

    print(f"\n=== Results for model={model} ===")
    for k in keys_to_eval:
        if not preds[k]:
            print(f"{k}: No entries found, skip.")
            continue

        em_scores, f1_scores = get_em_f1(dataset, preds[k])
        em_mean = em_scores.mean()
        f1_mean = f1_scores.mean()
        print(f"{k} -> EM: {em_mean:.4f}, F1: {f1_mean:.4f}")

    print("-" * 50)

gpt-4o-mini

=== Results for model=gpt-4o-mini ===
prompt_output_with_fewshot -> EM: 0.3620, F1: 0.4836
--------------------------------------------------
gpt-4o

=== Results for model=gpt-4o ===
prompt_output_with_fewshot -> EM: 0.4660, F1: 0.6134
--------------------------------------------------
llama

=== Results for model=llama ===
prompt_output_with_fewshot -> EM: 0.2340, F1: 0.3121
--------------------------------------------------


In [23]:
from musr.musr import MuSRDataset

# Team-allocation data; `ta` is what the musr_efficiently evaluation cell
# indexes and calls evaluate_response on.
ta_path = '../data/musr/team_allocation.json'
ta = MuSRDataset(ta_path)

# Object-placements data; `op` is what the musr_location evaluation cell
# indexes and calls evaluate_response on.
op_path = '../data/musr/object_placements.json'
op = MuSRDataset(op_path)

In [None]:
# Score each model's musr_location outputs against the `op` dataset.
for model in models:
    print(model)

    file_path = f"{config.output_dir}/musr_location/{model}/musr_location_{config.shot_type}.jsonl"
    data = load_model_outputs(file_path)

    # One prediction slot per entry and key; None marks a missing key so the
    # lists stay index-aligned with `data`.
    preds = {k: [entry.get(k) for entry in data] for k in keys_to_eval}

    total_data_len = len(data)
    scores = {k: 0 for k in keys_to_eval}

    for i, entry in enumerate(data):
        # Entries without the original example are not scored, but they
        # still count in the denominator below.
        if 'entry' not in entry:
            continue

        for k in keys_to_eval:
            pred_answer = preds[k][i]
            if pred_answer is None:
                continue
            # The i-th output is evaluated against the i-th dataset example.
            metrics = op.evaluate_response([pred_answer], op[i])
            if metrics and metrics[0]['correct']:
                scores[k] += 1

    print(f"\n=== Results for model={model} ===")
    for k in keys_to_eval:
        if not preds[k]:
            print(f"{k}: No entries found, skip.")
            continue

        acc = scores[k] / total_data_len if total_data_len else 0
        print(f"{k} -> Accuracy: {acc:.4f}")

    print("-" * 50)

gpt-4o-mini

=== Results for model=gpt-4o-mini ===
prompt_output_with_fewshot -> Accuracy: 0.5859
--------------------------------------------------
gpt-4o

=== Results for model=gpt-4o ===
prompt_output_with_fewshot -> Accuracy: 0.7227
--------------------------------------------------
llama

=== Results for model=llama ===
prompt_output_with_fewshot -> Accuracy: 0.5508
--------------------------------------------------


In [None]:
# Score each model's musr_efficiently outputs against the `ta` dataset.
for model in models:
    print(model)

    file_path = f"{config.output_dir}/musr_efficiently/{model}/musr_efficiently_{config.shot_type}.jsonl"
    data = load_model_outputs(file_path)

    # One prediction slot per entry and key; None marks a missing key so the
    # lists stay index-aligned with `data`.
    preds = {k: [entry.get(k) for entry in data] for k in keys_to_eval}

    total_data_len = len(data)
    scores = {k: 0 for k in keys_to_eval}

    for i, entry in enumerate(data):
        # Entries without the original example are not scored, but they
        # still count in the denominator below.
        if 'entry' not in entry:
            continue

        for k in keys_to_eval:
            pred_answer = preds[k][i]
            if pred_answer is None:
                continue
            # The i-th output is evaluated against the i-th dataset example.
            metrics = ta.evaluate_response([pred_answer], ta[i])
            if metrics and metrics[0]['correct']:
                scores[k] += 1

    print(f"\n=== Results for model={model} ===")
    for k in keys_to_eval:
        if not preds[k]:
            print(f"{k}: No entries found, skip.")
            continue

        acc = scores[k] / total_data_len if total_data_len else 0
        print(f"{k} -> Accuracy: {acc:.4f}")

    print("-" * 50)

gpt-4o-mini

=== Results for model=gpt-4o-mini ===
prompt_output_with_fewshot -> Accuracy: 0.7720
--------------------------------------------------
gpt-4o

=== Results for model=gpt-4o ===
prompt_output_with_fewshot -> Accuracy: 0.8920
--------------------------------------------------
llama

=== Results for model=llama ===
prompt_output_with_fewshot -> Accuracy: 0.6640
--------------------------------------------------
