In [None]:
import os
import json
from tqdm import tqdm 

from dotenv import load_dotenv
from openai import OpenAI

from gpqa.gpqa_utils import * 

from math500.math_utils import * 
from math500.parser import *
from math500.grader import * 

from mmlu_pro.mmlu_utils import * 

from hotpotqa.hotpotqa_utils import *

from drop.drop_utils import *

from musr.musr import MuSRDataset

# Load variables from a local .env file into the environment before the
# client below looks up its key.
load_dotenv()

# Module-wide OpenAI client; `os.environ.get` yields None when the variable
# is not set.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def _read_json(path):
    """Parse the JSON document at *path*, closing the file afterwards."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def load_data_and_fewshot(args):
    """Load the evaluation dataset and few-shot material for ``args.task``.

    Args:
        args: Object exposing a ``task`` attribute that names the benchmark
            (``mmlu_pro``, ``math500``, ``gpqa``, ``hotpotqa``, ``drop``,
            ``musr_efficiently`` or ``musr_location``).

    Returns:
        A ``(dataset, fewshot)`` pair, or ``(None, None)`` when the task name
        is not recognised. For the two MuSR tasks ``fewshot`` is the
        placeholder ``1``; it only has to be non-None.
    """
    task = args.task

    if task == "mmlu_pro":
        dataset, fewshot = load_mmlu_pro()

    elif task == "math500":
        with open("data/math500/test.jsonl", 'r', encoding='utf-8') as f:
            dataset = [json.loads(line) for line in f]
        fewshot = load_prompt(num_shots=5)

    elif task == "gpqa":
        dataset = load_examples("data/gpqa/gpqa_diamond.csv", seed=0)
        fewshot = _read_json("gpqa/chain_of_thought_examples.json")

    elif task == "hotpotqa":
        # Read through the helper so the data file handle is closed as well.
        dataset = _read_json(f'data/hotpotqa/{task}.json')
        fewshot = _read_json("hotpotqa/react_prompt.json")

    elif task == "drop":
        # Imported here so this branch does not rely on a star import
        # happening to re-export `pd`.
        import pandas as pd

        dataset = pd.read_parquet("data/drop/drop_sub.parquet", engine="pyarrow")
        dataset = dataset.to_dict(orient="records")
        # NOTE(review): the conversion is applied twice, exactly as before.
        # Presumably one pass suffices if the helper recurses - confirm
        # against drop_utils before dropping the second call.
        dataset = convert_ndarray_to_list(dataset)
        dataset = convert_ndarray_to_list(dataset)
        fewshot = _read_json("drop/prompt.json")

    elif task == "musr_efficiently":
        dataset = MuSRDataset('data/musr/team_allocation.json')
        fewshot = 1

    elif task == "musr_location":
        dataset = MuSRDataset('data/musr/object_placements.json')
        fewshot = 1

    else:
        return None, None

    return dataset, fewshot

def _qa_fewshot_prefix(fewshot):
    """Render Q/A demonstrations as a prompt prefix ("" when zero-shot)."""
    if fewshot is None:
        return ""
    return "".join(f"Q: {qa['Q']}\nA: {qa['A']}\n\n" for qa in fewshot)


def _chat(system_prompt, user_prompt):
    """Build a two-message chat: one system turn followed by one user turn."""
    return [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]


def construct_prompt(args, dataset, fewshot):
    """Turn a loaded dataset into chat-format prompts for ``args.task``.

    Args:
        args: Object exposing a ``task`` attribute that names the benchmark.
        dataset: Task data as returned by ``load_data_and_fewshot``.
        fewshot: Few-shot material for the task, or ``None`` for zero-shot.

    Returns:
        A list of dicts with keys ``idx``, ``prompt`` (list of chat messages)
        and ``entry`` (the source record); hotpotqa samples also carry
        ``answer``. Returns ``None`` when the task name is not recognised.
    """
    # Local import: keeps the function independent of whichever star import
    # happens to expose `random` at module level.
    import random

    task = args.task
    samples = []

    if task == "math500":
        # Plain string, not a format template: doubled braces would be sent
        # to the model literally as "\boxed{{}}".
        system_prompt = "Please reason step by step, and put your final answer within \\boxed{}.\n"
        prefix = ""
        if fewshot is not None:
            prefix = "\n\n".join(f"{q}\n\n{a}" for q, a in fewshot) + "\n\n"

        for idx, entry in tqdm(enumerate(dataset)):
            user_prompt = prefix + entry['problem'] + "\n"
            samples.append({"idx": idx, "prompt": _chat(system_prompt, user_prompt), "entry": entry})

    elif task == "mmlu_pro":
        for subject in tqdm(list(dataset.keys())):
            user_prompt = (
                "The following are multiple choice questions (with answers) about {}. Think step by"
                " step and then output the answer in the format of \"The answer is (X)\" at the end.\n\n"
            ).format(subject)
            if fewshot is not None:
                for each in fewshot[subject]:
                    user_prompt += format_example(each["question"], each["options"], each["cot_content"])

            # A private generator seeded with 42 draws the same subset as
            # `random.seed(42); random.sample(...)` did, without reseeding
            # the process-wide generator as a side effect.
            pool = dataset[subject]
            test_data = random.Random(42).sample(pool, min(300, len(pool)))

            for entry in test_data:
                input_text = format_example(entry['question'], entry['options'])
                message = [{"role": "user", "content": user_prompt + input_text}]
                samples.append({"idx": entry['question_id'], "prompt": message, "entry": entry})

    elif task == "gpqa":
        system_prompt = "You are a very intelligent assistant, who follows instructions directly.\n\n"

        for example_id, example in tqdm(enumerate(dataset)):
            if fewshot is not None:
                user_prompt = chain_of_thought_prompt(fewshot, example)
            else:
                user_prompt = zero_shot_chain_of_thought_prompt(example)
            samples.append({"idx": example_id, "prompt": _chat(system_prompt, user_prompt), "entry": example})

    elif task == "hotpotqa":
        system_prompt = ""
        prefix = _qa_fewshot_prefix(fewshot)
        instruction = "\n\nEnd your answer with \"Answer <answer>\". Think step by step." + "\n\nA: "

        for idx, entry in tqdm(enumerate(dataset)):
            user_prompt = prefix + f"Q: {entry['question']}." + instruction
            samples.append({
                "idx": idx,
                "prompt": _chat(system_prompt, user_prompt),
                "entry": entry,
                "answer": entry['answers'],
            })

    elif task == "drop":
        system_prompt = ""
        prefix = _qa_fewshot_prefix(fewshot)
        instruction = "\n\nEnd your answer with \"So the answer is <answer>\". Think step by step." + "\n\nA: "

        for idx, entry in tqdm(enumerate(dataset)):
            # NOTE(review): passage and question are joined with no
            # separator, as before; left untouched so prompts stay
            # comparable with earlier runs - confirm whether a space is wanted.
            user_prompt = prefix + f"Q: {entry['passage']}{entry['question']}" + instruction
            samples.append({"idx": idx, "prompt": _chat(system_prompt, user_prompt), "entry": entry})

    elif task in ("musr_efficiently", "musr_location"):
        # Each MuSR entry already holds ready-made messages for both modes.
        key = 'fs_cot_messages' if fewshot is not None else 'zs_cot_messages'
        for idx, entry in tqdm(enumerate(dataset)):
            samples.append({"idx": idx, "prompt": entry[key], "entry": entry})

    else:
        return None

    return samples

def generate_model_output(model: str, prompt: list, temperature: float = 1.0, n: int = 5) -> list:
    """Request ``n`` chat completions for one prompt and return their texts.

    Args:
        model: Model name forwarded to the chat-completions call.
        prompt: Chat messages (a list of ``{"role": ..., "content": ...}``
            dicts); forwarded unchanged as ``messages``.
        temperature: Sampling temperature forwarded to the API.
        n: Number of completions requested in the single call.

    Returns:
        A list holding ``message.content`` of every returned choice, in the
        order the API returned them.
    """
    response = client.chat.completions.create(
        model=model,
        messages=prompt,
        n=n,
        temperature=temperature,
    )
    return [choice.message.content for choice in response.choices]

In [None]:
def main(config):
    """Run one (model, task, shot) evaluation and append results as JSONL.

    The output file is
    ``{output_dir}/{task}/{model}/{task}_{shot_type}.jsonl``. Samples whose
    ``idx`` already appears in that file are skipped, so an interrupted run
    can be resumed. The loop stops at the first sample that raises.

    Args:
        config: Object exposing ``model``, ``task``, ``shot_type``,
            ``output_dir``, ``num_examples`` (``-1`` keeps every sample),
            ``temperature`` and ``n``.
    """
    save_path = f"{config.output_dir}/{config.task}/{config.model}/{config.task}_{config.shot_type}.jsonl"
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    dataset, fewshot = load_data_and_fewshot(config)
    if config.shot_type == "zero":
        fewshot = None

    samples = construct_prompt(config, dataset, fewshot)
    # construct_prompt returns None for an unknown task; slicing None would
    # raise TypeError, so only truncate when there is something to truncate.
    if samples and config.num_examples != -1:
        samples = samples[:config.num_examples]

    if not samples:
        print(f"No samples found for Task: {config.task}, Shot: {config.shot_type}, Temperature: {config.temperature}, N: {config.n}")
        return

    # Preview the first prompt so the format can be checked by eye.
    print(f"Model: {config.model} Task: {config.task}, Shot: {config.shot_type}, Temperature: {config.temperature}, N: {config.n}")
    print("-" * 50)
    for message in samples[0]["prompt"]:
        print(f"Role:\n{message['role']}")
        print(f"Content:\n{message['content']}")
        print("-" * 50)

    # Collect the ids already written by an earlier run; blank lines are
    # ignored rather than fed to the JSON parser.
    existing_data = set()
    if os.path.exists(save_path):
        with open(save_path, 'r', encoding='utf-8') as f:
            existing_data = {json.loads(line)['idx'] for line in f if line.strip()}

    first_output_shown = False
    with open(save_path, "a", encoding='utf-8') as f:
        for sample in tqdm(samples, total=len(samples)):
            if sample['idx'] in existing_data:
                continue
            try:
                model_outputs = generate_model_output(config.model, sample["prompt"], config.temperature, config.n)
                sample["model_outputs"] = model_outputs
                if not first_output_shown:
                    print(model_outputs[0])
                    print("-" * 50)
                    first_output_shown = True
                # Serialise completely before touching the file: json.dump
                # streams, so a failure midway would leave a partial line
                # that breaks the resume scan on the next run.
                f.write(json.dumps(sample) + "\n")
                # Push the record out now so a later crash cannot lose it.
                f.flush()
            except Exception as e:
                # Best effort: report the failure and stop this run; records
                # written so far stay in the file.
                print(f"Error processing sample {sample['idx']}: {e}")
                break
    print(f"Results saved to {save_path}")

In [None]:
class Config:
    """Settings for one evaluation run.

    Every setting is a keyword-only constructor argument whose default is the
    value that used to be hard-coded, so ``Config()`` yields the same object
    as before.

    Args:
        model: Model name sent to the API and used in the output path.
        task: Benchmark name; selects the dataset and the prompt format.
        shot_type: ``"zero"`` disables few-shot examples; any other value
            keeps them.
        output_dir: Root directory for result files.
        num_examples: How many samples to keep; ``-1`` keeps all of them.
        temperature: Sampling temperature.
        n: Number of completions requested per prompt.
        seed: Stored on the instance; not read by the code visible in this
            file.
    """

    def __init__(
        self,
        *,
        model: str = "gpt-4o",
        task: str = "drop",
        shot_type: str = "few",
        output_dir: str = "result",
        num_examples: int = 1,
        temperature: float = 1,
        n: int = 5,
        seed: int = 42,
    ):
        self.model = model
        self.task = task
        self.shot_type = shot_type
        self.output_dir = output_dir
        self.num_examples = num_examples
        self.temperature = temperature
        self.n = n
        self.seed = seed

    def __repr__(self):
        fields = ", ".join(f"{name}={value!r}" for name, value in vars(self).items())
        return f"{type(self).__name__}({fields})"
        
# Benchmarks, prompting modes and models to sweep over.
tasks = [
    'math500',
    'mmlu_pro',
    'gpqa',
    'drop',
    'hotpotqa',
    'musr_efficiently',
    'musr_location',
]
shots = ["few", "zero"]
models = ['gpt-4o-mini']

# A single settings object built with the defaults.
config = Config()

In [None]:
# Visit every (model, task, shot) combination - models outermost, shots
# innermost - updating the shared `config` in place before each run.
for model, task, shot in ((m, t, s) for m in models for t in tasks for s in shots):
    config.model, config.task, config.shot_type = model, task, shot
    config.n = 5
    print("=" * 50)
    main(config)



500it [00:00, 267630.42it/s]


Model: gpt-4o-mini Task: math500, Shot: few, Temperature: 1, N: 5
--------------------------------------------------
Role:
system
Content:
Please reason step by step, and put your final answer within \boxed{{}}.

--------------------------------------------------
Role:
user
Content:
Kevin Kangaroo begins hopping on a number line at 0. He wants to get to 1, but he can hop only $\frac{1}{3}$ of the distance. Each hop tires him out so that he continues to hop $\frac{1}{3}$ of the remaining distance. How far has he hopped after five hops? Express your answer as a common fraction.

Let's think step by step
Kevin hops $1/3$ of the remaining distance with every hop.
His first hop takes $1/3$ closer.
For his second hop, he has $2/3$ left to travel, so he hops forward $(2/3)(1/3)$.
For his third hop, he has $(2/3)^2$ left to travel, so he hops forward $(2/3)^2(1/3)$.
In general, Kevin hops forward $(2/3)^{k-1}(1/3)$ on his $k$th hop.
We want to find how far he has hopped after five hops.
This i

500it [00:00, 982272.60it/s]

Model: gpt-4o-mini Task: math500, Shot: zero, Temperature: 1, N: 5
--------------------------------------------------
Role:
system
Content:
Please reason step by step, and put your final answer within \boxed{{}}.

--------------------------------------------------
Role:
user
Content:
Convert the point $(0,3)$ in rectangular coordinates to polar coordinates.  Enter your answer in the form $(r,\theta),$ where $r > 0$ and $0 \le \theta < 2 \pi.$

--------------------------------------------------



100%|██████████| 14/14 [00:00<00:00, 302.94it/s]


Model: gpt-4o-mini Task: mmlu_pro, Shot: few, Temperature: 1, N: 5
--------------------------------------------------
Role:
user
Content:
The following are multiple choice questions (with answers) about business. Think step by step and then output the answer in the format of "The answer is (X)" at the end.

Question: In contrast to _______, _______ aim to reward favourable behaviour by companies. The success of such campaigns have been heightened through the use of ___________, which allow campaigns to facilitate the company in achieving _________ .
Options: A. Boycotts, Buyalls, Blockchain technology, Increased Sales
B. Buycotts, Boycotts, Digital technology, Decreased Sales
C. Boycotts, Buycotts, Digital technology, Decreased Sales
D. Buycotts, Boycotts, Blockchain technology, Charitable donations
E. Boycotts, Buyalls, Blockchain technology, Charitable donations
F. Boycotts, Buycotts, Digital technology, Increased Sales
G. Buycotts, Boycotts, Digital technology, Increased Sales
H. Bo

100%|██████████| 14/14 [00:00<00:00, 72.15it/s]


Model: gpt-4o-mini Task: mmlu_pro, Shot: zero, Temperature: 1, N: 5
--------------------------------------------------
Role:
user
Content:
The following are multiple choice questions (with answers) about business. Think step by step and then output the answer in the format of "The answer is (X)" at the end.

Question: Mr. Okada owns a home worth $28,750 in a town where homes are assessed at 38% of their market value. If Mr. Okada pays property taxes at the rate of $8.42 per $100, and 47% of his property taxes is spent on education, how much of his taxes goes toward education?
Options: A. $2,187.50
B. $765.23
C. $850.19
D. $432.35
E. $300.58
F. $28,750
G. $10,925
H. $1,150.47
I. $919.89
J. $500.00
Answer: Let's think step by step.


--------------------------------------------------


198it [00:00, 60684.85it/s]


Model: gpt-4o-mini Task: gpqa, Shot: few, Temperature: 1, N: 5
--------------------------------------------------
Role:
system
Content:
You are a very intelligent assistant, who follows instructions directly.


--------------------------------------------------
Role:
user
Content:
Here are some example questions from experts. An explanation is given before the final answer. Answer the final question yourself, giving your reasoning beforehand.
Question: In a given population, 1 out of every 400 people has a cancer caused by a completely recessive allele, b. Assuming the population is in Hardy-Weinberg equilibrium, which of the following is the expected proportion of individuals who carry the b allele but are not expected to develop the cancer?
Choices:
(A) 1/400
(B) 19/400
(C) 20/400
(D) 38/400
Let's think step by step: 
The expected proportion of individuals who carry the b allele but are not expected to develop the cancer equals to the frequency of heterozygous allele in the given pop

198it [00:00, 349525.33it/s]


Model: gpt-4o-mini Task: gpqa, Shot: zero, Temperature: 1, N: 5
--------------------------------------------------
Role:
system
Content:
You are a very intelligent assistant, who follows instructions directly.


--------------------------------------------------
Role:
user
Content:
What is the correct answer to this question: Two quantum states with energies E1 and E2 have a lifetime of 10^-9 sec and 10^-8 sec, respectively. We want to clearly distinguish these two energy levels. Which one of the following options could be their energy difference so that they can be clearly resolved?


Choices:
(A) 10^-9 eV
(B) 10^-11 eV
(C) 10^-8 eV

(D) 10^-4 eV
Give step by step reasoning before you answer, and when you're ready to answer, please use the format "The correct answer is (insert answer here)":

--------------------------------------------------


500it [00:00, 390676.60it/s]


Model: gpt-4o-mini Task: drop, Shot: few, Temperature: 1, N: 5
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
Q: Since the 1970s, U.S. governments have negotiated managed-trade agreements, such as the North American Free Trade Agreement in the 1990s, the Dominican Republic-Central America Free Trade Agreement in 2006, and a number of bilateral agreements. In Europe, six countries formed the European Coal and Steel Community in 1951 which became the European Economic Community in 1958. Two core objectives of the EEC were the development of a common market, subsequently renamed the single market, and establishing a customs union between its member states. How many years did the European Coal and Steel Community exist?
A: According to the passage, the European Coal and Steel Community was established in 1951 and became the EEC in 1958. 1958 - 1951 = 7. So the answer is 7.

Q: In the county, t

500it [00:00, 633963.72it/s]


Model: gpt-4o-mini Task: drop, Shot: zero, Temperature: 1, N: 5
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
Q: As of the census of 2000, there were 49,129 people, 18,878 households, and 13,629 families residing in the county.  The population density was 88 people per square mile (34/km2).  There were 21,779 housing units at an average density of 39 per square mile (15/km2).  The racial makeup of the county was 74.4% Race (United States Census), 20.4% Race (United States Census) or Race (United States Census), 0.60% Race (United States Census), 1.1% Race (United States Census), 0.15% Race (United States Census), 1.3% from Race (United States Census), and 2.2% from two or more races.  3.4% of the population were Race (United States Census) or Race (United States Census) of any race. 2.85% of the population reported speaking Spanish language at home, while 1.51% speak German language.How m

500it [00:00, 479130.00it/s]


Model: gpt-4o-mini Task: hotpotqa, Shot: few, Temperature: 1, N: 5
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
Q: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?
A: Let’s think step by step. The eastern sector of Colorado orogeny extends into the High Plains. High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft. Answer 1,800 to 7,000 ft

Q: Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who?
A: Let’s think step by step. Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon. Answer Richard Nixon

Q: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?
A: Let’s think step by step. Adam Clayton Powell (film) is a documentary about an African-American

500it [00:00, 720423.22it/s]


Model: gpt-4o-mini Task: hotpotqa, Shot: zero, Temperature: 1, N: 5
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
Q: What is the nationality of the foreign born victim of Singapore's caning punishment before Oliver Fricker experienced the same?.

End your answer with "Answer <answer>". Think step by step.

A: 
--------------------------------------------------


250it [00:00, 223196.25it/s]


Model: gpt-4o-mini Task: musr_efficiently, Shot: few, Temperature: 1, N: 5
--------------------------------------------------
Role:
system
Content:
You are a helpful AI assistant that will answer reasoning questions. You may reason over the question but you will always say at the end "Answer: <Your Answer Letter Choice>". You must only pick one answer and you must end your response with "Answer: <Your Answer Letter Choice>" everytime!
--------------------------------------------------
Role:
user
Content:

In the quaint community of Midvale, the local school stood as a beacon of enlightenment, nurturing the minds of the next generation. The teachers, the lifeblood of this institution, were tasked with the noble duty of education, while the unsung heroes—the maintenance crew—ensured the smooth functioning of the school's infrastructure. Amidst this, three town residents, Angela, Greg, and Travis, found themselves at a juncture of life where they were presented with the opportunity to ser

250it [00:00, 249660.95it/s]


Model: gpt-4o-mini Task: musr_efficiently, Shot: zero, Temperature: 1, N: 5
--------------------------------------------------
Role:
system
Content:
You are a helpful AI assistant that will answer reasoning questions. You may reason over the question but you will always say at the end "Answer: <Your Answer Letter Choice>". You must only pick one answer and you must end your response with "Answer: <Your Answer Letter Choice>" everytime!
--------------------------------------------------
Role:
user
Content:
Amidst the vibrant chaos of the Redwood Zoo, nestled in the heart of the city's sprawling jungle, the task of assigning roles was a crucial cog in the machinery of its operation. As the manager, the responsibility of allocating Olivia, Alex, and Mia to the positions of Animal Caretaker and Exhibit Cleaner presented an intriguing conundrum. Each individual, with their distinct personalities and skill sets, added a layer of complexity to this assignment puzzle.

Let's begin with Alex, t

256it [00:00, 143337.58it/s]


Model: gpt-4o-mini Task: musr_location, Shot: few, Temperature: 1, N: 5
--------------------------------------------------
Role:
system
Content:
You are a helpful AI assistant that will answer reasoning questions. You may reason over the question but you will always say at the end "Answer: <Your Answer Letter Choice>". You must only pick one answer and you must end your response with "Answer: <Your Answer Letter Choice>" everytime!
--------------------------------------------------
Role:
user
Content:
Petra, the dedicated housewife, felt a thrill at the thought of her surprise anniversary dinner for her husband, Daniel. She had been skillfully maneuvering around Daniel's eagerness to pitch in without disappointing him or giving up her surprise. 

Daniel, ever-the-observant-husband, noted Petra's unusual enthusiasm about the day's menu. Despite not knowing the details, he appreciated her effort and wanted to help—silently, he decided to deploy his best skill—patiently awaiting his momen

256it [00:00, 258670.64it/s]

Model: gpt-4o-mini Task: musr_location, Shot: zero, Temperature: 1, N: 5
--------------------------------------------------
Role:
system
Content:
You are a helpful AI assistant that will answer reasoning questions. You may reason over the question but you will always say at the end "Answer: <Your Answer Letter Choice>". You must only pick one answer and you must end your response with "Answer: <Your Answer Letter Choice>" everytime!
--------------------------------------------------
Role:
user
Content:
In the heart of the bustling studio, Ricky, Emma, and Danny readied themselves for a day of creating magic. Ricky, holding the helm as the gifted singer-songwriter, was poised for perfection, his precious notebook of lyrics awaiting its call to duty on the producer's desk. Emma, their dutiful and talented producer, was just as eager to breathe life into Ricky's lyrics. She was cognizant of the notebook's place at her desk, awaiting the melodies they would cultivate together.

Across the 


