In [None]:
import os
import sys
import json
from tqdm import tqdm 

from dotenv import load_dotenv
from openai import OpenAI

sys.path.append('../')  

from gpqa.gpqa_utils import * 

from math500.math_utils import * 
from math500.parser import *
from math500.grader import * 

from mmlu_pro.mmlu_utils import * 

from hotpotqa.hotpotqa_utils import *

from drop.drop_utils import *

from musr.musr import MuSRDataset

from utils import load_model_outputs

load_dotenv()

client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

def load_data_and_fewshot(args):
    if args.task == "mmlu_pro":
        dataset, fewshot = load_mmlu_pro()

    elif args.task == "math500": 
        file_path = f"../data/math500/test.jsonl"
        with open(file_path, 'r', encoding='utf-8') as f:
            dataset = [json.loads(line) for line in f]
        fewshot = load_prompt(num_shots=5)
    
    elif args.task == "gpqa":
        dataset = load_examples("../data/gpqa/gpqa_diamond.csv", seed=0)
        with open("../gpqa/chain_of_thought_examples.json", 'r') as f:
            fewshot = json.load(f)

    elif args.task == "hotpotqa":
        dataset = json.load(open(f'../data/hotpotqa/BM25/{args.task}-bm25.json'))
        with open("../hotpotqa/react_prompt.json", 'r') as f:
            fewshot = json.load(f)

    elif args.task == "drop":
        # random.seed(0)
        dataset = pd.read_parquet("../data/drop/drop_sub.parquet", engine="pyarrow")
        dataset = dataset.to_dict(orient="records")  
        # dataset = random.sample(dataset, k=500)
        dataset = convert_ndarray_to_list(dataset)
        dataset = convert_ndarray_to_list(dataset)

        with open("../drop/prompt.json", 'r') as f:
            fewshot = json.load(f)

    elif args.task == "musr_efficiently":
        ta_path = '../data/musr/team_allocation.json'
        dataset = MuSRDataset(ta_path)
        fewshot = 1

    elif args.task == "musr_location":
        op_path = '../data/musr/object_placements.json'
        dataset = MuSRDataset(op_path)
        fewshot = 1
    else: 
        return None, None
    
    return dataset, fewshot

def construct_prompt(args, dataset, fewshot): 
    # {few-shot}
    # {start_prompt} {question} 
    # {response} 
    # end_prompt
    system_prompt = ""
    start_prompt = "I have generated the following responses to the question: "
    end_prompt = """\n\nEvaluate these responses.\nSelect the most consistent response based on majority consensus.\nStart your answer with "The most consistent response is Response X" (without quotes)."""

    if args.task != 'mmlu_pro': 
        output_res_path = f"../result/{args.task}/{args.model}/{args.task}_{args.shot_type}.jsonl"
        res = load_model_outputs(output_res_path)

    samples = []
    if args.task == "math500":
        for idx, r in tqdm(enumerate(res)):
            entry = r.get('entry', {})
            question = entry.get('problem', '')
            model_outputs = r.get('model_outputs', [])


            if fewshot != None: 
                user_prompt = "\n\n".join([f"{q}\n\n{a}" for q, a in fewshot]) + "\n\n" 

            user_prompt += start_prompt + f"{question}\n\n"

            for i, output in enumerate(model_outputs):
                user_prompt += f"Response {i}: {output}\n"

            user_prompt += end_prompt

            message = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
            sample = {"idx": idx, "prompt": message, "entry": entry}
            samples.append(sample)
            
        
    elif args.task == "mmlu_pro":
        from mmlu_pro.mmlu_utils import format_example
        def format_example_fixed(question, options, cot_content=""):
            example = "{}\nOptions: ".format(question)
            choice_map = "ABCDEFGHIJ"
            for i, opt in enumerate(options):
                example += "{}. {}\n".format(choice_map[i], opt)
            
            return example
        
        
        dataset, fewshot = load_mmlu_pro()
        subjects = list(dataset.keys())
        for subject in tqdm(subjects): 
            res_path = f"../result/{args.task}/{args.model}/{subject}_result.jsonl"
            res = load_model_outputs(res_path)

            for idx, r in tqdm(enumerate(res)):
                user_prompt = ""
                if fewshot != None:
                    for each in fewshot[subject]:
                        user_prompt += format_example(each["question"], each["options"], each["cot_content"])
                
                entry = r.get('entry', {})
                model_outputs = r.get('model_outputs', [])
                question = format_example_fixed(entry['question'], entry['options'])

                user_prompt += start_prompt + f"{question}\n\n"
                for i, output in enumerate(model_outputs):
                    user_prompt += f"Response {i}: {output}\n"

                user_prompt += end_prompt

                message = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ]
                sample = {"idx": idx, "prompt": message, "entry": entry}
                samples.append(sample)
    
    elif args.task == "gpqa": 
        def format_example(example) -> str:
            prompt = f"Question: {example[0]}"
            prompt += f"\nChoices:\n(A) {example[1]}\n(B) {example[2]}\n(C) {example[3]}\n(D) {example[4]}"
            return prompt
        
        for idx, (example, r) in enumerate(zip(dataset, res)):
            user_prompt = ""
            if fewshot != None: 
                user_prompt = chain_of_thought_prompt(fewshot, example)
            
            entry = r.get('entry', {})
            model_outputs = r.get('model_outputs', [])
            question = format_example(entry)

            user_prompt += start_prompt + f"{question}\n\n"
            for i, output in enumerate(model_outputs):
                user_prompt += f"Response {i}: {output}\n"

            user_prompt += end_prompt

            message = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
            sample = {"idx": idx, "prompt": message, "entry": entry}
            samples.append(sample)

    elif args.task == "hotpotqa": 
        
        for idx, (entry, r) in enumerate(zip(dataset, res)):
            user_prompt = ""
            if fewshot != None: 
                for qa in fewshot:
                    question = qa["Q"]
                    answer = qa["A"]
                    user_prompt += f"Q: {question}\nA: {answer}\n\n"

            entry = r.get('entry', {})
            model_outputs = r.get('model_outputs', [])

            question = entry['question']

            user_prompt += start_prompt + f"{question}\n\n"
            for i, output in enumerate(model_outputs):
                user_prompt += f"Response {i}: {output}\n"

            user_prompt += end_prompt

            message = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
            sample = {"idx": idx, "prompt": message, "entry": entry}
            samples.append(sample)

    elif args.task == "drop":
        for idx, (entry, r) in enumerate(zip(dataset, res)):
            user_prompt = ""
            if fewshot != None: 
                for qa in fewshot:
                    question = qa["Q"]
                    answer = qa["A"]
                    user_prompt += f"Q: {question}\nA: {answer}\n\n"


            entry = r.get('entry', {})
            model_outputs = r.get('model_outputs', [])

            user_prompt += start_prompt + f"{entry['passage']} {entry['question']}\n\n" 
            for i, output in enumerate(model_outputs):
                user_prompt += f"Response {i}: {output}\n"

            user_prompt += end_prompt
            
            message = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
            sample = {"idx": idx, "prompt": message, "entry": entry}
            samples.append(sample)

    elif args.task == "musr_efficiently" or args.task == "musr_location":
        from musr.op_icl_fixed import op_fewshot, few_shot_op_instruction, test_op_instruction
        from musr.ta_icl_fixed import ta_fewshot, few_shot_ta_instruction, test_ta_instruction

        if args.task == "musr_location":
            few_shot_examples = op_fewshot  
            few_instruction = few_shot_op_instruction
            test_instruction = test_op_instruction
        elif args.task == 'musr_efficiently':
            few_shot_examples = ta_fewshot
            few_instruction = few_shot_ta_instruction
            test_instruction = test_ta_instruction

        for idx, (entry, r) in enumerate(zip(dataset, res)):
            entry = r.get('entry', {})
            model_outputs = r.get('model_outputs', [])

            question = entry['question'].strip()
            context = entry['context'].strip()
            choices = entry['choices']['text']
            labels = ['A', 'B', 'C', 'D', 'E', 'F'][:len(choices)]
            choice_str = '\n'.join([f'{labels[idx]}: {choices[idx]}' for idx in range(len(choices))])
            original_question_part = f"{context}\n\n{question}\n\n{choice_str}"
            
            user_prompt = ""
            for (q, a) in few_shot_examples:
                user_prompt += q + "\n\n" + few_instruction + "\n"
                user_prompt += a + "\n\n"

            user_prompt += start_prompt + f"{original_question_part}\n\n" 
            for i, output in enumerate(model_outputs):
                user_prompt += f"Response {i}: {output}\n"

            user_prompt += end_prompt
            
            message = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
            sample = {"idx": idx, "prompt": message, "entry": entry}
            samples.append(sample)


    else: 
        return None
        
    return samples

def generate_model_output(model: str, prompt: str, temperature: float = 1.0, n: int = 1) -> str:        
    responses = client.chat.completions.create(
        model=model,
        messages=prompt,
        n=1,
        temperature=temperature,
    )
    outputs = [choice.message.content for choice in responses.choices]

    return outputs 

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def main(config):
    save_path = f"{config.output_dir}/{config.task}/{config.model}/{config.task}_{config.shot_type}.jsonl"
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
        
    dataset, fewshot = load_data_and_fewshot(config)
    if config.shot_type == "zero":
        fewshot = None
        
    samples = construct_prompt(config, dataset, fewshot)
    if config.num_examples != -1: 
        samples = samples[:config.num_examples]

    if samples:
        print(f"Model: {config.model} Task: {config.task}, Shot: {config.shot_type}")
        print(samples[0].keys())
        print("-" * 50)
        prompt = samples[0]["prompt"]
        for message in prompt:
            print(f"Role:\n{message['role']}")
            print(f"Content:\n{message['content']}")
            print("-" * 50)
    else:
        print(f"No samples found for Task: {config.task}, Shot: {config.shot_type}")

    
    if os.path.exists(save_path):
        with open(save_path, 'r', encoding='utf-8') as f:

            existing_data = {json.loads(line)['prompt'][1]['content'] for line in f}  

    else:
        existing_data = set()  


    if samples:

        with open(save_path, "a", encoding='utf-8') as f:  

            for sample in tqdm(samples, total=len(samples)):
                if sample['prompt'][1]['content'] in existing_data:  

                    continue
                try:

                    model_outputs = generate_model_output(config.model, sample["prompt"], config.temperature)
                    sample["prompt_output"] = model_outputs
                    json.dump(sample, f)
                    f.write("\n")
                except Exception as e:
                    print(f"Error processing sample {sample['idx']}: {e}")
                    break

        print(f"Results saved to {save_path}")

In [None]:
class Config:
    def __init__(self):
        self.model = "gpt-4o-mini"  
        self.task = "drop"  # "math500", "mmlu_pro", "gpqa", "drop", "hotpotqa"
        self.shot_type = "few" 
        self.output_dir = "usc_with_fewshot"
        self.input_dir = "../result"
        self.num_examples = -1
        self.temperature = 0.0

In [None]:
tasks = ['math500', 'mmlu_pro', 'gpqa', "drop", "hotpotqa", "musr_efficiently", "musr_location"]

subjects = ['business', 'law', 'psychology', 'biology', 'chemistry', 'history', 'other', 'health', 'economics', 'math', 'physics', 'computer science', 'philosophy', 'engineering']

shots = ["few"]
models = ['gpt-4o-mini', 'gpt-4o']


config = Config()

In [None]:

for model in models:
    for task in tasks:
        for shot in shots:
            config.model = model
            config.task = task
            config.shot_type = shot
            print("=" * 50)
            main(config)



500it [00:00, 79057.26it/s]




Model: gpt-4o-mini Task: math500, Shot: few
dict_keys(['idx', 'prompt', 'entry'])
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
Kevin Kangaroo begins hopping on a number line at 0. He wants to get to 1, but he can hop only $\frac{1}{3}$ of the distance. Each hop tires him out so that he continues to hop $\frac{1}{3}$ of the remaining distance. How far has he hopped after five hops? Express your answer as a common fraction.

Let's think step by step
Kevin hops $1/3$ of the remaining distance with every hop.
His first hop takes $1/3$ closer.
For his second hop, he has $2/3$ left to travel, so he hops forward $(2/3)(1/3)$.
For his third hop, he has $(2/3)^2$ left to travel, so he hops forward $(2/3)^2(1/3)$.
In general, Kevin hops forward $(2/3)^{k-1}(1/3)$ on his $k$th hop.
We want to find how far he has hopped after five hops.
This is a finite geometric series with first term $1/3$, common

100%|██████████| 500/500 [00:00<00:00, 102235.27it/s]

Results saved to usc_with_fewshot/math500/gpt-4o-mini/math500_few.jsonl



300it [00:00, 25514.35it/s]0<?, ?it/s]
300it [00:00, 24506.60it/s]
300it [00:00, 26659.84it/s]
300it [00:00, 27537.34it/s]0<00:00, 28.20it/s]
300it [00:00, 26820.09it/s]
300it [00:00, 22984.59it/s]
300it [00:00, 29366.39it/s]0<00:00, 25.99it/s]
300it [00:00, 26669.45it/s]
300it [00:00, 26926.84it/s]
300it [00:00, 27626.21it/s]
300it [00:00, 29815.21it/s]00<00:00, 27.70it/s]
300it [00:00, 25584.90it/s]
300it [00:00, 26486.44it/s]
300it [00:00, 26079.66it/s]00<00:00, 27.76it/s]
100%|██████████| 14/14 [00:00<00:00, 27.24it/s]


Model: gpt-4o-mini Task: mmlu_pro, Shot: few
dict_keys(['idx', 'prompt', 'entry'])
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
Question: In contrast to _______, _______ aim to reward favourable behaviour by companies. The success of such campaigns have been heightened through the use of ___________, which allow campaigns to facilitate the company in achieving _________ .
Options: A. Boycotts, Buyalls, Blockchain technology, Increased Sales
B. Buycotts, Boycotts, Digital technology, Decreased Sales
C. Boycotts, Buycotts, Digital technology, Decreased Sales
D. Buycotts, Boycotts, Blockchain technology, Charitable donations
E. Boycotts, Buyalls, Blockchain technology, Charitable donations
F. Boycotts, Buycotts, Digital technology, Increased Sales
G. Buycotts, Boycotts, Digital technology, Increased Sales
H. Boycotts, Buycotts, Physical technology, Increased Sales
I. Buycotts, Buyalls, Bloc

100%|██████████| 4200/4200 [00:00<00:00, 64481.22it/s]


Results saved to usc_with_fewshot/mmlu_pro/gpt-4o-mini/mmlu_pro_few.jsonl
Model: gpt-4o-mini Task: gpqa, Shot: few
dict_keys(['idx', 'prompt', 'entry'])
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
Here are some example questions from experts. An explanation is given before the final answer. Answer the final question yourself, giving your reasoning beforehand.
Question: In a given population, 1 out of every 400 people has a cancer caused by a completely recessive allele, b. Assuming the population is in Hardy-Weinberg equilibrium, which of the following is the expected proportion of individuals who carry the b allele but are not expected to develop the cancer?
Choices:
(A) 1/400
(B) 19/400
(C) 20/400
(D) 38/400
Let's think step by step: 
The expected proportion of individuals who carry the b allele but are not expected to develop the cancer equals to the frequency of heterozygous allele 

100%|██████████| 198/198 [00:00<00:00, 41517.38it/s]


Results saved to usc_with_fewshot/gpqa/gpt-4o-mini/gpqa_few.jsonl
Model: gpt-4o-mini Task: drop, Shot: few
dict_keys(['idx', 'prompt', 'entry'])
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
Q: Since the 1970s, U.S. governments have negotiated managed-trade agreements, such as the North American Free Trade Agreement in the 1990s, the Dominican Republic-Central America Free Trade Agreement in 2006, and a number of bilateral agreements. In Europe, six countries formed the European Coal and Steel Community in 1951 which became the European Economic Community in 1958. Two core objectives of the EEC were the development of a common market, subsequently renamed the single market, and establishing a customs union between its member states. How many years did the European Coal and Steel Community exist?
A: According to the passage, the European Coal and Steel Community was established in 1951 and

100%|██████████| 500/500 [00:00<00:00, 121489.51it/s]


Results saved to usc_with_fewshot/drop/gpt-4o-mini/drop_few.jsonl
Model: gpt-4o-mini Task: hotpotqa, Shot: few
dict_keys(['idx', 'prompt', 'entry'])
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
Q: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?
A: Let’s think step by step. The eastern sector of Colorado orogeny extends into the High Plains. High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft. Answer 1,800 to 7,000 ft

Q: Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who?
A: Let’s think step by step. Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon. Answer Richard Nixon

Q: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?
A: Let’s think s

100%|██████████| 500/500 [00:00<00:00, 175670.30it/s]

Results saved to usc_with_fewshot/hotpotqa/gpt-4o-mini/hotpotqa_few.jsonl





Model: gpt-4o-mini Task: musr_efficiently, Shot: few
dict_keys(['idx', 'prompt', 'entry'])
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
In the quaint community of Midvale, the local school stood as a beacon of enlightenment, nurturing the minds of the next generation. The teachers, the lifeblood of this institution, were tasked with the noble duty of education, while the unsung heroes—the maintenance crew—ensured the smooth functioning of the school's infrastructure. Amidst this, three town residents, Angela, Greg, and Travis, found themselves at a juncture of life where they were presented with the opportunity to serve in one of these crucial roles. The challenge now lay in the hands of the manager, who had to assign them to either teaching or maintenance, a decision that would set the course for their contributions to the school.

Angela was a fiercely independent woman, beset with a u

100%|██████████| 250/250 [00:00<00:00, 21595.19it/s]

Results saved to usc_with_fewshot/musr_efficiently/gpt-4o-mini/musr_efficiently_few.jsonl





Model: gpt-4o-mini Task: musr_location, Shot: few
dict_keys(['idx', 'prompt', 'entry'])
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
Petra, the dedicated housewife, felt a thrill at the thought of her surprise anniversary dinner for her husband, Daniel. She had been skillfully maneuvering around Daniel's eagerness to pitch in without disappointing him or giving up her surprise. 

Daniel, ever-the-observant-husband, noted Petra's unusual enthusiasm about the day's menu. Despite not knowing the details, he appreciated her effort and wanted to help—silently, he decided to deploy his best skill—patiently awaiting his moment to help, maybe when Petra asked for something from the pantry. Amidst the excitement, there was Clara, their maid—ever diligent and efficient, trying to keep the surroundings perfect for this special evening. 

Tucked away, under the counter, was Petra's secret recipe boo

100%|██████████| 256/256 [00:00<00:00, 24449.34it/s]


Results saved to usc_with_fewshot/musr_location/gpt-4o-mini/musr_location_few.jsonl


500it [00:00, 145091.46it/s]

Model: gpt-4o Task: math500, Shot: few
dict_keys(['idx', 'prompt', 'entry'])
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
Kevin Kangaroo begins hopping on a number line at 0. He wants to get to 1, but he can hop only $\frac{1}{3}$ of the distance. Each hop tires him out so that he continues to hop $\frac{1}{3}$ of the remaining distance. How far has he hopped after five hops? Express your answer as a common fraction.

Let's think step by step
Kevin hops $1/3$ of the remaining distance with every hop.
His first hop takes $1/3$ closer.
For his second hop, he has $2/3$ left to travel, so he hops forward $(2/3)(1/3)$.
For his third hop, he has $(2/3)^2$ left to travel, so he hops forward $(2/3)^2(1/3)$.
In general, Kevin hops forward $(2/3)^{k-1}(1/3)$ on his $k$th hop.
We want to find how far he has hopped after five hops.
This is a finite geometric series with first term $1/3$, common rati


100%|██████████| 500/500 [00:00<00:00, 123000.12it/s]


Results saved to usc_with_fewshot/math500/gpt-4o/math500_few.jsonl


300it [00:00, 28401.30it/s]0<?, ?it/s]
300it [00:00, 27805.08it/s]
300it [00:00, 28441.74it/s]
300it [00:00, 31084.27it/s]
300it [00:00, 29858.36it/s]0<00:00, 29.95it/s]
300it [00:00, 26230.25it/s]
300it [00:00, 30329.77it/s]
300it [00:00, 30787.65it/s]0<00:00, 28.85it/s]
300it [00:00, 28893.02it/s]
300it [00:00, 28678.35it/s]
300it [00:00, 32551.84it/s]00<00:00, 29.31it/s]
300it [00:00, 26679.63it/s]
300it [00:00, 28222.93it/s]
300it [00:00, 27494.62it/s]00<00:00, 29.44it/s]
100%|██████████| 14/14 [00:00<00:00, 28.98it/s]


Model: gpt-4o Task: mmlu_pro, Shot: few
dict_keys(['idx', 'prompt', 'entry'])
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
Question: In contrast to _______, _______ aim to reward favourable behaviour by companies. The success of such campaigns have been heightened through the use of ___________, which allow campaigns to facilitate the company in achieving _________ .
Options: A. Boycotts, Buyalls, Blockchain technology, Increased Sales
B. Buycotts, Boycotts, Digital technology, Decreased Sales
C. Boycotts, Buycotts, Digital technology, Decreased Sales
D. Buycotts, Boycotts, Blockchain technology, Charitable donations
E. Boycotts, Buyalls, Blockchain technology, Charitable donations
F. Boycotts, Buycotts, Digital technology, Increased Sales
G. Buycotts, Boycotts, Digital technology, Increased Sales
H. Boycotts, Buycotts, Physical technology, Increased Sales
I. Buycotts, Buyalls, Blockchai

100%|██████████| 4200/4200 [00:00<00:00, 63373.11it/s]


Results saved to usc_with_fewshot/mmlu_pro/gpt-4o/mmlu_pro_few.jsonl
Model: gpt-4o Task: gpqa, Shot: few
dict_keys(['idx', 'prompt', 'entry'])
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
Here are some example questions from experts. An explanation is given before the final answer. Answer the final question yourself, giving your reasoning beforehand.
Question: In a given population, 1 out of every 400 people has a cancer caused by a completely recessive allele, b. Assuming the population is in Hardy-Weinberg equilibrium, which of the following is the expected proportion of individuals who carry the b allele but are not expected to develop the cancer?
Choices:
(A) 1/400
(B) 19/400
(C) 20/400
(D) 38/400
Let's think step by step: 
The expected proportion of individuals who carry the b allele but are not expected to develop the cancer equals to the frequency of heterozygous allele in the giv

100%|██████████| 198/198 [00:00<00:00, 42301.97it/s]


Results saved to usc_with_fewshot/gpqa/gpt-4o/gpqa_few.jsonl
Model: gpt-4o Task: drop, Shot: few
dict_keys(['idx', 'prompt', 'entry'])
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
Q: Since the 1970s, U.S. governments have negotiated managed-trade agreements, such as the North American Free Trade Agreement in the 1990s, the Dominican Republic-Central America Free Trade Agreement in 2006, and a number of bilateral agreements. In Europe, six countries formed the European Coal and Steel Community in 1951 which became the European Economic Community in 1958. Two core objectives of the EEC were the development of a common market, subsequently renamed the single market, and establishing a customs union between its member states. How many years did the European Coal and Steel Community exist?
A: According to the passage, the European Coal and Steel Community was established in 1951 and became th

100%|██████████| 500/500 [00:00<00:00, 122325.71it/s]

Results saved to usc_with_fewshot/drop/gpt-4o/drop_few.jsonl
Model: gpt-4o Task: hotpotqa, Shot: few
dict_keys(['idx', 'prompt', 'entry'])
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
Q: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?
A: Let’s think step by step. The eastern sector of Colorado orogeny extends into the High Plains. High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft. Answer 1,800 to 7,000 ft

Q: Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who?
A: Let’s think step by step. Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon. Answer Richard Nixon

Q: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?
A: Let’s think step by ste


100%|██████████| 500/500 [00:00<00:00, 161879.74it/s]


Results saved to usc_with_fewshot/hotpotqa/gpt-4o/hotpotqa_few.jsonl
Model: gpt-4o Task: musr_efficiently, Shot: few
dict_keys(['idx', 'prompt', 'entry'])
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
In the quaint community of Midvale, the local school stood as a beacon of enlightenment, nurturing the minds of the next generation. The teachers, the lifeblood of this institution, were tasked with the noble duty of education, while the unsung heroes—the maintenance crew—ensured the smooth functioning of the school's infrastructure. Amidst this, three town residents, Angela, Greg, and Travis, found themselves at a juncture of life where they were presented with the opportunity to serve in one of these crucial roles. The challenge now lay in the hands of the manager, who had to assign them to either teaching or maintenance, a decision that would set the course for their contributions to the 

100%|██████████| 250/250 [00:00<00:00, 22209.01it/s]


Results saved to usc_with_fewshot/musr_efficiently/gpt-4o/musr_efficiently_few.jsonl
Model: gpt-4o Task: musr_location, Shot: few
dict_keys(['idx', 'prompt', 'entry'])
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
Petra, the dedicated housewife, felt a thrill at the thought of her surprise anniversary dinner for her husband, Daniel. She had been skillfully maneuvering around Daniel's eagerness to pitch in without disappointing him or giving up her surprise. 

Daniel, ever-the-observant-husband, noted Petra's unusual enthusiasm about the day's menu. Despite not knowing the details, he appreciated her effort and wanted to help—silently, he decided to deploy his best skill—patiently awaiting his moment to help, maybe when Petra asked for something from the pantry. Amidst the excitement, there was Clara, their maid—ever diligent and efficient, trying to keep the surroundings perfect for this 

100%|██████████| 256/256 [00:00<00:00, 24471.63it/s]

Results saved to usc_with_fewshot/musr_location/gpt-4o/musr_location_few.jsonl





In [None]:
def extract_user_number(response_text):
    primary_pattern = r'\[?\s*The most consistent response is Response\s*(\d+)\s*\]?'
    primary_matches = re.findall(primary_pattern, response_text, flags=re.IGNORECASE)
    if primary_matches:

        return int(primary_matches[0])
    


    sentences = re.split(r'[.\n]', response_text.strip())

    sentences = [s.strip() for s in sentences if s.strip()]
    if sentences:
        last_sentence = sentences[-1]
        fallback_matches = re.findall(r'(\d+)', last_sentence)
        if fallback_matches:

            return int(fallback_matches[-1])
    

    return -1

In [6]:
models = ['gpt-4o-mini', 'gpt-4o', 'llama']

In [None]:
overall_summary = []

for model in models:
    for task in tasks:
        if task == 'mmlu_pro':
            continue
        output_res_path = f"../result/{task}/{model}/{task}_{config.shot_type}.jsonl"
        res = load_model_outputs(output_res_path)
        
        usc_path = f"{config.output_dir}/{task}/{model}/{task}_{config.shot_type}.jsonl"
        usc = load_model_outputs(usc_path) 

                

        user_counter = Counter()
        total_entries = 0
        for entry in tqdm(usc, desc=f"Processing {model}-{task}"):
            total_entries += 1
            raw_output = entry.get("prompt_output")          

            if isinstance(raw_output, list):
                prompt_output = raw_output[0] if raw_output else None
            else:
                prompt_output = raw_output                   


            if prompt_output is None:
                user_counter["None"] += 1
                continue
            extracted_user = extract_user_number(prompt_output)
            if extracted_user is None:
                user_counter["None"] += 1
            else:
                user_counter[extracted_user] += 1


        for user, count in user_counter.items():
            percentage = (count / total_entries) * 100 if total_entries else 0
            overall_summary.append({
                "Model": model,
                "Task": task,
                "Extracted User": user,
                "Count": count,
                "Percentage": round(percentage, 2)
            })

        for i in range(len(res)):
            raw_output = usc[i].get("prompt_output")          

            if isinstance(raw_output, list):
                source_text = raw_output[0] if raw_output else None
            else:
                source_text = raw_output                  


            if source_text is None:
                answer_number = 0
            else:
                answer_number = extract_user_number(source_text)
                if not isinstance(answer_number, int) or not (0 <= answer_number <= 4):
                    answer_number = 0
            model_outputs = res[i].get('model_outputs') or res[i].get('resps')[0]
            if len(model_outputs) < answer_number:
                print(f"Warning: Entry {i} has less than {answer_number} model outputs. Using default output.")
                chosen_output = model_outputs[0] if model_outputs else None
            else:
                chosen_output = model_outputs[answer_number]
            usc[i]['usc_with_fewshot'] = chosen_output


        with open(usc_path, "w", encoding="utf-8") as out_file:
            for item in usc:
                out_file.write(json.dumps(item, ensure_ascii=False) + "\n")


Processing gpt-4o-mini-math500: 100%|██████████| 500/500 [00:00<00:00, 337162.70it/s]
Processing gpt-4o-mini-gpqa: 100%|██████████| 198/198 [00:00<00:00, 370746.51it/s]
Processing gpt-4o-mini-drop: 100%|██████████| 500/500 [00:00<00:00, 402678.96it/s]
Processing gpt-4o-mini-hotpotqa: 100%|██████████| 500/500 [00:00<00:00, 387858.70it/s]
Processing gpt-4o-mini-musr_efficiently: 100%|██████████| 250/250 [00:00<00:00, 319201.22it/s]
Processing gpt-4o-mini-musr_location: 100%|██████████| 256/256 [00:00<00:00, 363610.51it/s]
Processing gpt-4o-math500: 100%|██████████| 500/500 [00:00<00:00, 369998.59it/s]
Processing gpt-4o-gpqa: 100%|██████████| 198/198 [00:00<00:00, 361861.52it/s]
Processing gpt-4o-drop: 100%|██████████| 500/500 [00:00<00:00, 409680.02it/s]
Processing gpt-4o-hotpotqa: 100%|██████████| 500/500 [00:00<00:00, 383953.13it/s]
Processing gpt-4o-musr_efficiently: 100%|██████████| 250/250 [00:00<00:00, 366763.20it/s]
Processing gpt-4o-musr_location: 100%|██████████| 256/256 [00:00<

In [None]:
df_overall = pd.DataFrame(overall_summary)


df_overall['ExtractedUser_Num'] = pd.to_numeric(
    df_overall['Extracted User'], errors='coerce'
)


df_overall['Group'] = df_overall['ExtractedUser_Num'].apply(
    lambda x: '0-4' if 0 <= x <= 4 else 'Others'
)

for model in df_overall["Model"].unique():
    model_df = df_overall[df_overall["Model"] == model]
    

    pivot_table = model_df.pivot_table(
        index="Group",         
        columns="Task",         
        values="Percentage",    
        aggfunc="sum",          
        fill_value=0            
    )
    
    print(f"\nModel: {model}")
    display(pivot_table)



Model: gpt-4o-mini


Task,drop,gpqa,hotpotqa,math500,musr_efficiently,musr_location
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0-4,100.0,100.0,100.0,100.0,100.0,100.0



Model: gpt-4o


Task,drop,gpqa,hotpotqa,math500,musr_efficiently,musr_location
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0-4,100.0,100.01,100.0,100.0,100.0,100.0



Model: llama


Task,drop,gpqa,hotpotqa,math500,musr_efficiently,musr_location
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0-4,100.0,100.01,100.0,100.0,100.0,100.0


In [None]:
df_overall = pd.DataFrame(overall_summary)

for model in df_overall["Model"].unique():
    model_df = df_overall[df_overall["Model"] == model]
    
    pivot_table = model_df.pivot_table(
        index="Extracted User",       
        columns="Task",               
        values=["Percentage"], 
        fill_value=0           
    )
    
    print(f"\nModel: {model}")
    display(pivot_table)


Model: gpt-4o-mini


Unnamed: 0_level_0,Percentage,Percentage,Percentage,Percentage,Percentage,Percentage
Task,drop,gpqa,hotpotqa,math500,musr_efficiently,musr_location
Extracted User,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,82.8,65.66,65.4,89.2,60.8,92.58
1,13.8,25.25,23.0,8.6,33.2,7.42
2,1.2,3.03,5.4,0.8,6.0,0.0
3,0.8,1.01,3.2,0.2,0.0,0.0
4,1.4,5.05,3.0,1.2,0.0,0.0



Model: gpt-4o


Unnamed: 0_level_0,Percentage,Percentage,Percentage,Percentage,Percentage,Percentage
Task,drop,gpqa,hotpotqa,math500,musr_efficiently,musr_location
Extracted User,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,86.8,61.62,65.2,80.6,31.6,37.5
1,10.4,19.7,17.0,12.2,33.2,37.5
2,1.8,11.11,7.4,3.2,3.6,10.94
3,1.0,5.56,7.0,2.4,30.8,13.67
4,0.0,2.02,3.4,1.6,0.8,0.39



Model: llama


Unnamed: 0_level_0,Percentage,Percentage,Percentage,Percentage,Percentage,Percentage
Task,drop,gpqa,hotpotqa,math500,musr_efficiently,musr_location
Extracted User,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,61.4,11.62,29.2,47.8,20.4,3.52
1,26.4,26.77,36.6,37.0,32.4,1.17
2,5.6,12.63,13.4,3.4,14.4,18.36
3,4.2,33.33,13.4,10.2,18.8,1.56
4,2.4,15.66,7.4,1.6,14.0,75.39


In [None]:
overall_summary_mmlu = []

def process_mmlu_pro(model: str):
    task = "mmlu_pro"

    usc_path = (
        f"{config.output_dir}/{task}/{model}/"
        f"{task}_{config.shot_type}.jsonl"
    )
    all_entries = load_model_outputs(usc_path)

    grouped = defaultdict(list)  

    for entry in all_entries:
        subj = entry["entry"]["category"]          

        grouped[subj].append(entry)

    if model != 'llama': 
        for subj in grouped:
            grouped[subj].sort(key=lambda e: e["idx"])


    for subject, entries in grouped.items():
        output_res_path = f"../result/{task}/{model}/{subject}_result.jsonl"
        res = load_model_outputs(output_res_path)


        user_counter = Counter()
        total = len(entries)

        for idx, usc in enumerate(
            tqdm(entries, desc=f"{model}-{subject}")
        ):
            
            raw_output = usc.get("prompt_output")          
            if isinstance(raw_output, list):
                p_out = raw_output[0] if raw_output else None
            else:
                p_out = raw_output                   


            if p_out is None:
                user_num = None
            else:
                user_num = extract_user_number(p_out)


            key = user_num if user_num is not None else "None"
            user_counter[key] += 1


            if not isinstance(user_num, int) or not (0 <= user_num <= 4):
                user_num = 0  # fallback


            outs = res[idx].get("model_outputs") or res[idx].get("resps")[0]
            chosen = (
                outs[user_num - 1] if len(outs) >= user_num
                else (outs[0] if outs else None)
            )
            entries[idx]["usc_with_fewshot"] = chosen


        dir_path = f"{config.output_dir}/{task}/{model}/{subject}"
        file_name = f"{task}_{config.shot_type}.jsonl"
        save_path = os.path.join(dir_path, file_name)


        os.makedirs(dir_path, exist_ok=True)


        with open(save_path, "w", encoding="utf-8") as f:
            for item in entries:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")
                

        for u, cnt in user_counter.items():
            overall_summary_mmlu.append({
                "Model": model,
                "Task": task,
                "Subject": subject,
                "Extracted User": u,
                "Count": cnt,
                "Percentage": round(cnt / total * 100, 2)
            })


for model in ['gpt-4o-mini', 'gpt-4o', 'llama']:
    process_mmlu_pro(model)

gpt-4o-mini-business: 100%|██████████| 300/300 [00:00<00:00, 264180.39it/s]
gpt-4o-mini-law: 100%|██████████| 300/300 [00:00<00:00, 339803.19it/s]
gpt-4o-mini-psychology: 100%|██████████| 300/300 [00:00<00:00, 350303.79it/s]
gpt-4o-mini-biology: 100%|██████████| 300/300 [00:00<00:00, 363143.20it/s]
gpt-4o-mini-chemistry: 100%|██████████| 300/300 [00:00<00:00, 345969.54it/s]
gpt-4o-mini-history: 100%|██████████| 300/300 [00:00<00:00, 341834.07it/s]
gpt-4o-mini-other: 100%|██████████| 300/300 [00:00<00:00, 334385.12it/s]
gpt-4o-mini-health: 100%|██████████| 300/300 [00:00<00:00, 362411.06it/s]
gpt-4o-mini-economics: 100%|██████████| 300/300 [00:00<00:00, 340538.89it/s]
gpt-4o-mini-math: 100%|██████████| 300/300 [00:00<00:00, 333852.80it/s]
gpt-4o-mini-physics: 100%|██████████| 300/300 [00:00<00:00, 361577.93it/s]
gpt-4o-mini-computer science: 100%|██████████| 300/300 [00:00<00:00, 348653.70it/s]
gpt-4o-mini-philosophy: 100%|██████████| 300/300 [00:00<00:00, 333057.49it/s]
gpt-4o-mini-eng

In [None]:
df_mmlu = pd.DataFrame(overall_summary_mmlu)

for model in df_mmlu["Model"].unique():
    model_df = df_mmlu[df_mmlu["Model"] == model]
    

    pivot_table = model_df.pivot_table(
        index="Extracted User",      
        columns="Task",              
        values=["Percentage"], 
        fill_value=0           
    )
    
    print(f"\nModel: {model}")
    display(pivot_table)


Model: gpt-4o-mini


Unnamed: 0_level_0,Percentage
Task,mmlu_pro
Extracted User,Unnamed: 1_level_2
-1,0.33
0,73.691429
1,20.903571
2,1.595
3,1.237857
4,2.548571



Model: gpt-4o


Unnamed: 0_level_0,Percentage
Task,mmlu_pro
Extracted User,Unnamed: 1_level_2
0,72.427857
1,15.737857
2,8.524286
3,2.380714
4,0.999231



Model: llama


Unnamed: 0_level_0,Percentage
Task,mmlu_pro
Extracted User,Unnamed: 1_level_2
-1,0.33
0,18.952143
1,39.309286
2,12.476429
3,19.857143
4,9.382143


In [13]:
keys_to_eval = ['usc_with_fewshot']

In [None]:
from math500.math_utils import * 
from math500.parser import *
from math500.grader import * 

for model in models:
    usc_path = f"{config.output_dir}/math500/{model}/math500_{config.shot_type}.jsonl"

    data = load_model_outputs(usc_path)

    scores = {k: [] for k in keys_to_eval}

    for entry in data:
        idx = entry["idx"]
        

        _, gt = parse_ground_truth(entry['entry'], "math")


        model_outputs = entry.get('model_outputs', [])
        
        for key in keys_to_eval:
            if key not in entry:
                continue


            pred = extract_answer(entry[key], "math")
            pred = strip_string(pred)


            try:
                result = math_equal_process((idx, pred, gt))

                if not result :
                    result = process_results(gt, [entry[key]])
                    if not result:
                        pred = extract_answer(pred, "math")
                        result = math_equal_process((None, pred, gt))

                scores[key].append(result)

            except TimeoutError:
                scores[key].append(False)
            except Exception as error:
                print(f"Error while processing {key} for idx={idx}: {error}")
                scores[key].append(False)


    print(f"\n===== Evaluation Results for model={model}, shot={config.shot_type} =====")
    for key in keys_to_eval:

        if len(scores[key]) == 0:
            print(f"{key} -> No data / Not found in entries")
            continue

        acc = sum(scores[key]) / len(scores[key])
        print(f"{key} -> Accuracy: {acc:.4f}")
    print("-" * 50)



===== Evaluation Results for model=gpt-4o-mini, shot=few =====
usc_with_fewshot -> Accuracy: 0.7940
--------------------------------------------------

===== Evaluation Results for model=gpt-4o, shot=few =====
usc_with_fewshot -> Accuracy: 0.8020
--------------------------------------------------

===== Evaluation Results for model=llama, shot=few =====
usc_with_fewshot -> Accuracy: 0.4780
--------------------------------------------------


In [None]:
def extract_answer(text):
    pattern = r"answer is \(?([A-J])\)?"
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    else:
        return extract_again(text)


for model in models:
    overall_scores = {k: 0 for k in keys_to_eval}  
    overall_total_entries = 0

    for subject in subjects:
        file_path = f"{config.output_dir}/mmlu_pro/{model}/{subject}/mmlu_pro_few.jsonl"
        data = load_model_outputs(file_path)


        subject_scores = {k: 0 for k in keys_to_eval}
        total_data_len = len(data)
        overall_total_entries += total_data_len

        for entry in data:
            model_outputs = entry.get('model_outputs', [])
            answer = entry['entry'].get('answer') or entry['entry'].get('gold')


            for key in keys_to_eval:
                if key not in entry:
                    continue
                pred = extract_answer(entry[key])
                if pred == answer: 
                    subject_scores[key] += 1


        for k in keys_to_eval:
            overall_scores[k] += subject_scores[k]

           
    print(f"\n=== Overall Results for model={model} ===")
    for key in keys_to_eval:
        if overall_total_entries == 0:
            acc = 0
        else:
            acc = overall_scores[key] / overall_total_entries
        print(f"{key} -> Accuracy: {acc:.4f}")
    print("-" * 50)



=== Overall Results for model=gpt-4o-mini ===
usc_with_fewshot -> Accuracy: 0.6286
--------------------------------------------------

=== Overall Results for model=gpt-4o ===
usc_with_fewshot -> Accuracy: 0.7217
--------------------------------------------------

=== Overall Results for model=llama ===
usc_with_fewshot -> Accuracy: 0.3648
--------------------------------------------------


In [None]:
from gpqa.gpqa_utils import * 

examples = load_examples("../data/gpqa/gpqa_diamond.csv", seed=0)

for model in models:
    file_path = f"{config.output_dir}/gpqa/{model}/gpqa_{config.shot_type}.jsonl"
    data = load_model_outputs(file_path)

    scores = {k: 0 for k in keys_to_eval}


    total_data_len = len(data)
    if total_data_len != len(examples):
        print("Warning: data length and examples length do not match!")

    
    
    for entry, example in zip(data, examples):
        correct_index = example.correct_index  

        model_outputs = entry.get('model_outputs', [])
        
        for key in keys_to_eval:
            if key not in entry:
                continue

            pred = parse_sampled_answer(entry[key])
            
            if pred is None:
                is_correct = False
            else:
                is_correct = (LETTER_TO_INDEX[pred] == correct_index)

            scores[key] += int(is_correct)
    
    print(f"\n=== Results for model={model}, shot={config.shot_type} ===")
    for key in keys_to_eval:
        acc = scores[key] / total_data_len if total_data_len else 0
        print(f"{key} -> Accuracy: {acc:.4f}")

    print("-" * 50)


=== Results for model=gpt-4o-mini, shot=few ===
usc_with_fewshot -> Accuracy: 0.3889
--------------------------------------------------

=== Results for model=gpt-4o, shot=few ===
usc_with_fewshot -> Accuracy: 0.4747
--------------------------------------------------

=== Results for model=llama, shot=few ===
usc_with_fewshot -> Accuracy: 0.2828
--------------------------------------------------


In [None]:
from drop.drop_utils import *

for model in models: 
    print(model)
    file_path = f"{config.output_dir}/drop/{model}/drop_{config.shot_type}.jsonl"
    data = load_model_outputs(file_path)
    
    em_scores = {k: [] for k in keys_to_eval}
    f1_scores = {k: [] for k in keys_to_eval}

    def get_max_em_f1(pred, golds):
        max_em, max_f1 = 0.0, 0.0
        for gold_answer in golds:
            exact_match, f1_score = get_metrics(pred, gold_answer)
            if gold_answer[0].strip():
                max_em = max(max_em, exact_match)
                max_f1 = max(max_f1, f1_score)
        return max_em, max_f1

    for entry in data:
        golds = get_answers(entry['entry']) 

        model_outputs = entry.get('model_outputs', [])
        
        for k in keys_to_eval:
            if k not in entry:
                continue

            pred = extract_answer(entry[k])
            em_val, f1_val = get_max_em_f1(pred, golds)
            em_scores[k].append(em_val)
            f1_scores[k].append(f1_val)

    print(f"\n===== Results for model={model} =====")
    for k in keys_to_eval:
        if len(em_scores[k]) == 0:
            print(f"{k}: No entries found, skip.")
            continue

        em_mean = np.mean(em_scores[k])
        f1_mean = np.mean(f1_scores[k])
        print(f"{k} -> EM: {em_mean:.4f}, F1: {f1_mean:.4f}")

    print("-" * 50)

gpt-4o-mini

===== Results for model=gpt-4o-mini =====
usc_with_fewshot -> EM: 0.7780, F1: 0.8551
--------------------------------------------------
gpt-4o

===== Results for model=gpt-4o =====
usc_with_fewshot -> EM: 0.8260, F1: 0.9065
--------------------------------------------------
llama

===== Results for model=llama =====
usc_with_fewshot -> EM: 0.6900, F1: 0.7528
--------------------------------------------------


In [None]:
from hotpotqa.hotpotqa_utils import *

def extract_answer(response_text):
    match = re.search(r"Answer\s+(.+)", response_text, re.DOTALL)
    if match:
        answer = match.group(1).strip()

        answer = re.sub(r"[.\n]+$", "", answer).strip()
        return answer


    match = re.search(r"(?<!\w)Answer[:\s]+(.+?)(?:[.\n]|$)", response_text, re.IGNORECASE | re.DOTALL)
    if match:
        answer = match.group(1).strip()

        answer = re.sub(r"[.\n]+$", "", answer).strip()
        return answer
    return response_text.strip()


dataset = json.load(open(f'../data/hotpotqa/BM25/hotpotqa-bm25.json'))
with open("../hotpotqa/react_prompt.json", 'r') as f:
    fewshot = json.load(f)

for model in models:
    print(model)
    file_path = f"{config.output_dir}/hotpotqa/{model}/hotpotqa_{config.shot_type}.jsonl"
    data = load_model_outputs(file_path)
    preds = {k: [] for k in keys_to_eval}

    for entry in data:

        model_outputs = entry.get('model_outputs', [])
        
        for k in keys_to_eval:
            if k in entry:

                answer = extract_answer(entry[k])
                preds[k].append(answer)

    print(f"\n=== Results for model={model} ===")
    for k in keys_to_eval:
        if len(preds[k]) == 0:
            print(f"{k}: No entries found, skip.")
            continue

        em_scores, f1_scores = get_em_f1(dataset, preds[k])
        em_mean = em_scores.mean()
        f1_mean = f1_scores.mean()
        print(f"{k} -> EM: {em_mean:.4f}, F1: {f1_mean:.4f}")
    
    print("-" * 50)

gpt-4o-mini

=== Results for model=gpt-4o-mini ===
usc_with_fewshot -> EM: 0.3620, F1: 0.4826
--------------------------------------------------
gpt-4o



=== Results for model=gpt-4o ===
usc_with_fewshot -> EM: 0.4620, F1: 0.6066
--------------------------------------------------
llama

=== Results for model=llama ===
usc_with_fewshot -> EM: 0.2520, F1: 0.3234
--------------------------------------------------


In [19]:
from musr.musr import MuSRDataset

ta_path = '../data/musr/team_allocation.json'
ta = MuSRDataset(ta_path)

op_path = '../data/musr/object_placements.json'
op = MuSRDataset(op_path)

In [None]:
for model in models:
    print(model)

    file_path = f"{config.output_dir}/musr_location/{model}/musr_location_{config.shot_type}.jsonl"
    data = load_model_outputs(file_path)
    
    
    preds = {k: [] for k in keys_to_eval}

    for test_idx, entry in enumerate(data):
        model_outputs = entry.get('model_outputs', [])
        for k in keys_to_eval:
            if k in entry:  
                preds[k].append(entry[k])
            else:
                preds[k].append(None)

    total_data_len = len(data)
    
    scores = {k: 0 for k in keys_to_eval}
    
    for i, entry in enumerate(data):
        if 'entry' not in entry:
            continue
        
        for k in keys_to_eval:
            pred_answer = preds[k][i]
            if pred_answer is not None:
                metrics = op.evaluate_response([pred_answer], op[i])
                if metrics and metrics[0]['correct']:
                    scores[k] += 1

    print(f"\n=== Results for model={model} ===")
    for k in keys_to_eval:
        if len(preds[k]) == 0:
            print(f"{k}: No entries found, skip.")
            continue

        acc = scores[k] / total_data_len if total_data_len else 0
        print(f"{k} -> Accuracy: {acc:.4f}")

    print("-" * 50)

gpt-4o-mini

=== Results for model=gpt-4o-mini ===
usc_with_fewshot -> Accuracy: 0.5898
--------------------------------------------------
gpt-4o

=== Results for model=gpt-4o ===
usc_with_fewshot -> Accuracy: 0.7109
--------------------------------------------------
llama

=== Results for model=llama ===
usc_with_fewshot -> Accuracy: 0.5391
--------------------------------------------------


In [None]:
for model in models:
    print(model)

    file_path = f"{config.output_dir}/musr_efficiently/{model}/musr_efficiently_{config.shot_type}.jsonl"
    data = load_model_outputs(file_path)
    
    
    preds = {k: [] for k in keys_to_eval}

    for test_idx, entry in enumerate(data):
        model_outputs = entry.get('model_outputs', [])
        for k in keys_to_eval:
            if k in entry:  
                preds[k].append(entry[k])
            else:
                preds[k].append(None)

    total_data_len = len(data)
    
    scores = {k: 0 for k in keys_to_eval}
    
    for i, entry in enumerate(data):
        if 'entry' not in entry:
            continue
        
        for k in keys_to_eval:
            pred_answer = preds[k][i]
            if pred_answer is not None:
                metrics = ta.evaluate_response([pred_answer], ta[i])
                if metrics and metrics[0]['correct']:
                    scores[k] += 1

    print(f"\n=== Results for model={model} ===")
    for k in keys_to_eval:
        if len(preds[k]) == 0:
            print(f"{k}: No entries found, skip.")
            continue

        acc = scores[k] / total_data_len if total_data_len else 0
        print(f"{k} -> Accuracy: {acc:.4f}")

    print("-" * 50)

gpt-4o-mini



=== Results for model=gpt-4o-mini ===
usc_with_fewshot -> Accuracy: 0.7720
--------------------------------------------------
gpt-4o

=== Results for model=gpt-4o ===
usc_with_fewshot -> Accuracy: 0.8960
--------------------------------------------------
llama

=== Results for model=llama ===
usc_with_fewshot -> Accuracy: 0.7000
--------------------------------------------------


In [29]:
(80.2 + 72.2 + 47.5 + 82.6 + 46.2 + + 71.1 + 89.6)/7

69.91428571428571

In [22]:
(47.8 + 36.5 + 28.3 + 69.0 + + 25.2 + + 53.9 + 70.0) / 7

47.24285714285714