In [None]:
import os
import json
from tqdm import tqdm 
import sys

from dotenv import load_dotenv
from openai import OpenAI

sys.path.append('../')  

from gpqa.gpqa_utils import * 

from math500.math_utils import * 
from math500.parser import *
from math500.grader import * 

from mmlu_pro.mmlu_utils import * 

from hotpotqa.hotpotqa_utils import *

from drop.drop_utils import *


from musr.musr import MuSRDataset

from musr.op_icl_fixed import op_fewshot, few_shot_op_instruction, test_op_instruction
from musr.ta_icl_fixed import ta_fewshot, few_shot_ta_instruction, test_ta_instruction


load_dotenv()

client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def load_data_and_fewshot(args):
    if args.task == "mmlu_pro":
        dataset, fewshot = load_mmlu_pro()
    elif args.task == "math500": 
        file_path = f"../data/{args.task}/test.jsonl"
        with open(file_path, 'r', encoding='utf-8') as f:
            dataset = [json.loads(line) for line in f]
        fewshot = load_prompt(num_shots=5)
    elif args.task == "gpqa":
        dataset = load_examples("../data/gpqa/gpqa_diamond.csv", seed=0)
        with open("../gpqa/chain_of_thought_examples.json", 'r') as f:
            fewshot = json.load(f)
    elif args.task == "hotpotqa":
        dataset = json.load(open(f'../data/hotpotqa/{args.task}.json'))
        with open("../hotpotqa/react_prompt.json", 'r') as f:
            fewshot = json.load(f)
    elif args.task == "drop":
        dataset = pd.read_parquet("../data/drop/drop_sub.parquet", engine="pyarrow")
        dataset = dataset.to_dict(orient="records")  
        dataset = convert_ndarray_to_list(dataset)
        dataset = convert_ndarray_to_list(dataset)

        with open("../drop/prompt.json", 'r') as f:
            fewshot = json.load(f)
    elif args.task == "musr_location":
        op_path = '../data/musr/object_placements.json'
        dataset = MuSRDataset(op_path)
        fewshot = op_fewshot
        
    elif args.task == 'musr_efficiently': 
        ta_path = '../data/musr/team_allocation.json'
        dataset = MuSRDataset(ta_path)
        fewshot = ta_fewshot
    else: 
        return None, None
    
    return dataset, fewshot

In [3]:
def generate_model_output(model: str, prompt: str, temperature: float = 1.0, n: int = 15) -> str:        
    responses = client.chat.completions.create(
        model=model,
        messages=prompt,
        n=n,
        temperature=temperature,
    )

    outputs = [choice.message.content for choice in responses.choices]
    return outputs

In [4]:
def construct_prompt(args, dataset, fewshot): 
    samples = []
    system_prompt = ""
    
    if args.task == "math500":
        system_prompt = "Please reason step by step, and put your final answer within \\boxed{{}}.\n"

        user_prompt = ""
        if fewshot != None: 
            for q, a in fewshot: 
                user_prompt = q + '\n'
                message = [{"role": "system","content": system_prompt},{"role": "user","content": user_prompt}]
                sample = {"question": q, "answer": a, "prompt": message, "subject": "math500"}
                samples.append(sample)
        
    elif args.task == "mmlu_pro":
        def format_example(question, options):
            example = "{}\nOptions: ".format(question)
            choice_map = "ABCDEFGHIJ"
            for i, opt in enumerate(options):
                example += "{}. {}\n".format(choice_map[i], opt)
            return example
        
        subjects = list(dataset.keys())
        for subject in tqdm(subjects): 
            if fewshot != None:
                for each in fewshot[subject]:
                    user_prompt = "The following are multiple choice questions (with answers) about {}. Think step by" \
                        " step and then output the answer in the format of \"The answer is (X)\" at the end.\n\n" \
                    .format(subject)
                    question = format_example(each["question"], each["options"])
                    answer = each["cot_content"]
                    if answer.startswith("A: "):
                        answer = answer[3:]
                    message = [{"role": "system","content": system_prompt},{"role": "user","content": user_prompt + "Question: " + question + "Answer: " + "Let's think step by step." + "\n\n"}]
                    sample = {"question": question, "answer": answer, "prompt": message, "subject": subject}
                    samples.append(sample)

    elif args.task == "gpqa": 
        system_prompt = "You are a very intelligent assistant, who follows instructions directly.\n\n"

        if fewshot != None: 
            for q in fewshot["questions"]:
                question = f"What is the correct answer to this question: {q['question']}\nChoices:\n"
                for choice, value in q["choices"].items():
                    question += f'({choice}) {value}\n'
                user_prompt = "\nGive step by step reasoning before you answer, and when you're ready to answer, please use the format \"The correct answer is (insert answer here)\":\n"
                answer = f"Let's think step by step: \n{q['explanation']}\n"
                answer += f'The correct answer is ({q["correct_answer"]})\n'
                message = [{"role": "system","content": system_prompt},{"role": "user","content": question + user_prompt}]
                sample = {"question": question, "answer": answer, "prompt": message, "subject": "gpqa"}
                samples.append(sample)
    
    elif args.task == "hotpotqa": 
        system_prompt = ""

        if fewshot != None: 
            for qa in fewshot:
                question = qa["Q"]
                answer = qa["A"]
                user_prompt = f"Q: {question}."
                user_prompt += "\n\nEnd your answer with \"Answer <answer>\". Think step by step." + "\n\nA: " 
                message = [{"role": "system","content": system_prompt},{"role": "user","content": user_prompt}]

                sample = {"question": question, "answer": answer, "prompt": message, "subject": "hotpotqa"}
                samples.append(sample)

    elif args.task == "drop": 
        system_prompt = ""

        if fewshot != None: 
            for qa in fewshot:
                question = qa["Q"]
                answer = qa["A"]

                user_prompt = f"Q: {question}" + "\n\nEnd your answer with \"So the answer is <answer>\". Think step by step." + "\n\nA: " 
                message = [{"role": "system","content": system_prompt},{"role": "user","content": user_prompt}]

                sample = {"question": question, "answer": answer, "prompt": message, "subject": "drop"}
                samples.append(sample)
    
    elif args.task == "musr_location":
        system_prompt = dataset[0]['prompt_parts']['cot_system_prompt']

        few_instruction = few_shot_op_instruction 
        for (q, a) in fewshot:
            user_prompt = f"{q}\n\n{few_instruction}\n\n"
            message = [{"role": "system","content": system_prompt},{"role": "user","content": user_prompt}]

            sample = {"question": q, "answer": a, "prompt": message, "subject": "musr_location"}
            samples.append(sample)

    elif args.task == "musr_efficiently": 
        system_prompt = dataset[0]['prompt_parts']['cot_system_prompt']

        few_instruction = few_shot_ta_instruction 
        for (q, a) in fewshot:
            user_prompt = f"{q}\n\n{few_instruction}\n\n"

            message = [{"role": "system","content": system_prompt},{"role": "user","content": user_prompt}]

            sample = {"question": q, "answer": a, "prompt": message, "subject": "musr_efficiently"}
            samples.append(sample)

    else: 
        return None

    return samples

In [None]:
class Config:
    def __init__(self):
        self.model = "gpt-4o-mini"  
        self.task = "drop"  # "math500", "mmlu_pro", "gpqa", "drop", "hotpotqa"
        self.shot_type = "few"  
        self.output_dir = "leap"
        self.num_examples = -1


tasks = ['math500']
shots = ["few"]

models = ['gpt-4o-mini']
subjects = ['business', 'law', 'psychology', 'biology', 'chemistry', 'history', 'other', 'health', 'economics', 'math', 'physics', 'computer science', 'philosophy', 'engineering']

config = Config()
        

In [None]:
def main(config):

    save_path = f"{config.output_dir}/{config.task}/{config.model}/{config.task}_mistakes.jsonl"
    os.makedirs(os.path.dirname(save_path), exist_ok=True)


    dataset, fewshot = load_data_and_fewshot(config)
    if config.shot_type == "zero":
        fewshot = None
        
    samples = construct_prompt(config, dataset, fewshot)
    if samples:
        print(f"Model: {config.model} Task: {config.task}, Shot: {config.shot_type}")
        print("-" * 50)
        prompt = samples[0]["prompt"]
        print(f"Question: {samples[0]['question']}")
        print(f"Answer: {samples[0]['answer']}")

        for message in prompt:
            print(f"Role:\n{message['role']}")
            print(f"Content:\n{message['content']}")
            print("-" * 50)
    else:
        print(f"No samples found for Task: {config.task}, Shot: {config.shot_type}")

    
    if os.path.exists(save_path):
        with open(save_path, 'r', encoding='utf-8') as f:

            existing_data = {json.loads(line)['question'] for line in f}  
    else:
        existing_data = set() 

    if samples:

        with open(save_path, "a", encoding='utf-8') as f:  
            for sample in tqdm(samples, total=len(samples)):
                if sample['question'] in existing_data:  
                    continue
                try:

                    model_outputs = generate_model_output(config.model, sample["prompt"])
                    sample["model_outputs"] = model_outputs


                    json.dump(sample, f)
                    f.write("\n")
                except Exception as e:
                    print(f"Error processing sample {sample['question']}: {e}")
                    break

        print(f"Results saved to {save_path}")

In [None]:
for model in models:
    for task in tasks:
        for shot in shots:
            config.model = model
            config.task = task
            config.shot_type = shot
            main(config)


In [72]:
import os
import json
from math500.math_utils import *
from math500.parser import *
from math500.grader import *

for model in models: 
    file_path = f"{config.output_dir}/math500/{model}/math500_mistakes.jsonl"
    wrong_file_path = f"{config.output_dir}/math500/{model}/math500_wrong_predictions.jsonl"

    scores = [[] for _ in range(15)]

    wrong_entries = []  

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = [json.loads(line) for line in f]

        for idx, entry in enumerate(data):
            # Parse ground truth once for this entry
            gt = extract_answer(entry['answer'], "math")
            gt = strip_string(gt)

            for i, mo in enumerate(entry['model_outputs']):
                # Extract and clean predicted answer
                pred = extract_answer(mo, "math")
                pred = strip_string(pred)

                try:
                    result = math_equal_process((idx, pred, gt))
                    if not result:
                        result = process_results(gt, [mo])

                        if not result :
                            wrong_entries.append({
                                "idx": idx,
                                "entry": entry,
                                "gt": gt,
                                "model_index": i,
                                "model_output": mo,
                                "pred": pred
                            })
                            break
                    # Record the success/failure in scores
                    scores[i].append(result)

                except TimeoutError:
                    scores[i].append(False)
                except Exception as error:
                    print(f"Error encountered: {error}")
                    exit()

        if wrong_entries:
            with open(wrong_file_path, 'w', encoding='utf-8') as wf:
                for we in wrong_entries:
                    wf.write(json.dumps(we, ensure_ascii=False) + "\n")
            print(f"Saved wrong predictions to {wrong_file_path}")
        else:
            print("No wrong entries found.")


Saved wrong predictions to leap/math500/gpt-4o-mini/math500_wrong_predictions.jsonl
Saved wrong predictions to leap/math500/gpt-4o/math500_wrong_predictions.jsonl


In [None]:
from collections import defaultdict

def extract_answer(text):
    pattern = r"answer is \(?([A-J])\)?"
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    else:
        return extract_again(text)

for model in models:
    file_path = f"{config.output_dir}/mmlu_pro/{model}/mmlu_pro_mistakes.jsonl"
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]
    

    wrong_entries_by_subject = defaultdict(list)


    for idx, entry in enumerate(data):
        subject = entry['subject']
        gt = extract_answer(entry['answer'])
        
        for i, mo in enumerate(entry['model_outputs']):
            pred = extract_answer(mo)
            if pred != gt:
                wrong_entries_by_subject[subject].append({
                    "idx": idx,
                    "entry": entry,
                    "gt": gt,
                    "model_index": i,
                    "model_output": mo,
                    "pred": pred
                })
                break
    

    for subject, wrong_entries in wrong_entries_by_subject.items():
        wrong_file_path = f"{config.output_dir}/mmlu_pro/{model}/{subject}/mmlu_pro_wrong_predictions.jsonl"
        os.makedirs(os.path.dirname(wrong_file_path), exist_ok=True)
        
        with open(wrong_file_path, 'w', encoding='utf-8') as wf:
            for we in wrong_entries:
                wf.write(json.dumps(we, ensure_ascii=False) + "\n")
    print(f"Saved wrong predictions to {wrong_file_path}")
    
  


Saved wrong predictions to leap/mmlu_pro/gpt-4o-mini/engineering/mmlu_pro_wrong_predictions.jsonl
Saved wrong predictions to leap/mmlu_pro/gpt-4o/engineering/mmlu_pro_wrong_predictions.jsonl


In [74]:
from gpqa.gpqa_utils import *

for model in models: 
    file_path = f"{config.output_dir}/gpqa/{model}/gpqa_mistakes.jsonl"
    wrong_file_path = f"{config.output_dir}/gpqa/{model}/gpqa_wrong_predictions.jsonl"

    wrong_entries = []  

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]

    for idx, entry in enumerate(data):
        gt = parse_sampled_answer(entry['answer'])

        for i, mo in enumerate(entry['model_outputs']):
            pred = parse_sampled_answer(mo)
            if pred != gt: 
                wrong_entries.append({
                    "idx": idx,
                    "entry": entry,
                    "gt": gt,
                    "model_index": i,
                    "model_output": mo,
                    "pred": pred
                })
                break

    if wrong_entries:
        with open(wrong_file_path, 'w', encoding='utf-8') as wf:
            for we in wrong_entries:
                wf.write(json.dumps(we, ensure_ascii=False) + "\n")
        print(f"Saved wrong predictions to {wrong_file_path}")
    else:
        print("No wrong entries found.")


Saved wrong predictions to leap/gpqa/gpt-4o-mini/gpqa_wrong_predictions.jsonl
Saved wrong predictions to leap/gpqa/gpt-4o/gpqa_wrong_predictions.jsonl


In [75]:
from hotpotqa.hotpotqa_utils import *

for model in models: 
    file_path = f"{config.output_dir}/hotpotqa/{model}/hotpotqa_mistakes.jsonl"
    wrong_file_path = f"{config.output_dir}/hotpotqa/{model}/hotpotqa_wrong_predictions.jsonl"

    wrong_entries = []  

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]

    for idx, entry in enumerate(data):
        gt = extract_answer(entry['answer'])
        gt = normalize_answer(gt)
        for i, mo in enumerate(entry['model_outputs']):
            pred = extract_answer(mo)
            pred = normalize_answer(pred)
            if pred != gt: 
                wrong_entries.append({
                    "idx": idx,
                    "entry": entry,
                    "gt": gt,
                    "model_index": i,
                    "model_output": mo,
                    "pred": pred
                })
                break

    if wrong_entries:
        with open(wrong_file_path, 'w', encoding='utf-8') as wf:
            for we in wrong_entries:
                wf.write(json.dumps(we, ensure_ascii=False) + "\n")
        print(f"Saved wrong predictions to {wrong_file_path}")
    else:
        print("No wrong entries found.")


Saved wrong predictions to leap/hotpotqa/gpt-4o-mini/hotpotqa_wrong_predictions.jsonl
Saved wrong predictions to leap/hotpotqa/gpt-4o/hotpotqa_wrong_predictions.jsonl


In [77]:
from drop.drop_utils import *
from drop.drop_utils import _normalize

for model in models: 
    file_path = f"{config.output_dir}/drop/{model}/drop_mistakes.jsonl"
    wrong_file_path = f"{config.output_dir}/drop/{model}/drop_wrong_predictions.jsonl"

    wrong_entries = []  

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]

    for idx, entry in enumerate(data):
        gt = extract_answer(entry['answer'])

        for i, mo in enumerate(entry['model_outputs']):
            pred = extract_answer(mo)
            pred = _normalize(pred)

            em, _ = get_metrics(pred, gt)
            if not em: 
                wrong_entries.append({
                    "idx": idx,
                    "entry": entry,
                    "gt": gt,
                    "model_index": i,
                    "model_output": mo,
                    "pred": pred
                })
                break

    if wrong_entries:
        with open(wrong_file_path, 'w', encoding='utf-8') as wf:
            for we in wrong_entries:
                wf.write(json.dumps(we, ensure_ascii=False) + "\n")
        print(f"Saved wrong predictions to {wrong_file_path}")
    else:
        print("No wrong entries found.")


Saved wrong predictions to leap/drop/gpt-4o-mini/drop_wrong_predictions.jsonl
Saved wrong predictions to leap/drop/gpt-4o/drop_wrong_predictions.jsonl


In [78]:
op_path = '../data/musr/object_placements.json'
op = MuSRDataset(op_path)

for model in models: 
    file_path = f"{config.output_dir}/musr_location/{model}/musr_location_mistakes.jsonl"
    wrong_file_path = f"{config.output_dir}/musr_location/{model}/musr_location_wrong_predictions.jsonl"

    wrong_entries = []  

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]

    for idx, entry in enumerate(data):
        gt = op.evaluate_response([entry['answer']], op[0])[0]['model_answer']
        

        for i, mo in enumerate(entry['model_outputs']):
            pred = op.evaluate_response([mo], op[0])[0]['model_answer']
            
            if pred != gt: 
                wrong_entries.append({
                    "idx": idx,
                    "entry": entry,
                    "gt": gt,
                    "model_index": i,
                    "model_output": mo,
                    "pred": pred
                })
                break

    if wrong_entries:
        with open(wrong_file_path, 'w', encoding='utf-8') as wf:
            for we in wrong_entries:
                wf.write(json.dumps(we, ensure_ascii=False) + "\n")
        print(f"Saved wrong predictions to {wrong_file_path}")
    else:
        print("No wrong entries found.")


Saved wrong predictions to leap/musr_location/gpt-4o-mini/musr_location_wrong_predictions.jsonl
Saved wrong predictions to leap/musr_location/gpt-4o/musr_location_wrong_predictions.jsonl


In [79]:
ta_path = '../data/musr/team_allocation.json'
ta = MuSRDataset(ta_path)

for model in models: 
    file_path = f"{config.output_dir}/musr_efficiently/{model}/musr_efficiently_mistakes.jsonl"
    wrong_file_path = f"{config.output_dir}/musr_efficiently/{model}/musr_efficiently_wrong_predictions.jsonl"

    wrong_entries = []  

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    
    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]

    for idx, entry in enumerate(data):
        gt = ta.evaluate_response([entry['answer']], ta[0])[0]['model_answer']
        

        for i, mo in enumerate(entry['model_outputs']):
            pred = ta.evaluate_response([mo], ta[0])[0]['model_answer']
            
            if pred != gt: 
                wrong_entries.append({
                    "idx": idx,
                    "entry": entry,
                    "gt": gt,
                    "model_index": i,
                    "model_output": mo,
                    "pred": pred
                })
                break

    if wrong_entries:
        with open(wrong_file_path, 'w', encoding='utf-8') as wf:
            for we in wrong_entries:
                wf.write(json.dumps(we, ensure_ascii=False) + "\n")
        print(f"Saved wrong predictions to {wrong_file_path}")
    else:
        print("No wrong entries found.")


Saved wrong predictions to leap/musr_efficiently/gpt-4o-mini/musr_efficiently_wrong_predictions.jsonl
Saved wrong predictions to leap/musr_efficiently/gpt-4o/musr_efficiently_wrong_predictions.jsonl


In [8]:
def construct_low_level_principle(question, response, generated_answer, correct_reasoning, correct_answer):
    prompt = (
        f"Question: {question}\n"
        f"Generated Reasoning: {response}\n\n"
        f"Generated Answer: {generated_answer}\n\n"
        f"Correct Reasoning: {correct_reasoning}\n\n"
        f"Correct Answer: {correct_answer}\n\n"
        "Instruction: Conduct a thorough analysis of the generated answer in comparison "
        "to the correct answer. Also observe how the generated reasoning differs from the "
        "correct reasoning. Identify any discrepancies, misunderstandings, or errors. "
        "Provide clear insights, principles, or guidelines that can be derived from this "
        "analysis to improve future responses. We are not focused on this one data point, "
        "but rather on the general principle.\n\n"
        "Reasoning: <discuss why the generated answer is wrong>\n"
        "Insights: <what principle should be looked at carefully to improve the performance "
        "in the future>\n"
    )
    return prompt

def get_low_level_principle(model, path): 
    with open(path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]

    low_principles = []
    for idx, entry in enumerate(data): 
        question = entry['entry']['question']
        response = entry['model_output']
        generated_answer = entry['pred']
        correct_reasoning = entry['entry']['answer']
        correct_answer = entry['gt'] 

        prompt = construct_low_level_principle(question, response, generated_answer, correct_reasoning, correct_answer)
        
        message = [{"role": "user","content": prompt}]

        low_principle = generate_model_output(model, message, temperature=0, n=1)
        low_principles.append(low_principle)
        if idx == 0: 
            print(prompt)
            print("="*30)
            print(low_principle)
            print("="*30)

    return low_principles

In [None]:

for model in models:
    for task in tasks:
        for shot in shots:
            if task == 'mmlu_pro': 
                for subject in subjects:
                    path = f"leap/{task}/{model}/{subject}/{task}_wrong_predictions.jsonl"
                    save_path = f"leap/{task}/{model}/{subject}/{task}_low_level_principle.json"

                    if not os.path.exists(path):
                        print(f"[WARN] File not found: {path}")
                        continue
                    
                    with open(path, 'r', encoding='utf-8') as f:
                        wrong = [json.loads(line) for line in f]

                    if os.path.exists(save_path):
                        print(save_path)
                        with open(save_path, 'r', encoding='utf-8') as f:
                            low_level = json.load(f)
                        if len(low_level) != 0: 
                            print(len(wrong), len(low_level))
                            continue
                    

                    os.makedirs(os.path.dirname(save_path), exist_ok=True)
                    

                    low_principles = get_low_level_principle(model, path)
                    
                    with open(save_path, 'w', encoding='utf-8') as wf:
                        json.dump(low_principles, wf, ensure_ascii=False, indent=2)
                    
                    print(f"[*] Saved {len(low_principles)} low-level principles to {save_path}")
            
            else:
                path = f"leap/{task}/{model}/{task}_wrong_predictions.jsonl"
                save_path = f"leap/{task}/{model}/{task}_low_level_principle.json"

                if not os.path.exists(path):
                        print(f"[WARN] File not found: {path}")
                        continue
                    
                with open(path, 'r', encoding='utf-8') as f:
                    wrong = [json.loads(line) for line in f]

                if os.path.exists(save_path):
                    print(save_path)
                    with open(save_path, 'r', encoding='utf-8') as f:
                        low_level = json.load(f)
                    if len(low_level) != 0: 
                        print(len(wrong), len(low_level))
                        continue

                os.makedirs(os.path.dirname(save_path), exist_ok=True)


                low_principles = get_low_level_principle(model, path)

                with open(save_path, 'w', encoding='utf-8') as wf:
                    json.dump(low_principles, wf, ensure_ascii=False, indent=2)

                print(f"[*] Saved {len(low_principles)} low-level principles to {save_path}")


[WARN] File not found: leap/math500/gpt-4o-mini/math500_wrong_predictions.jsonl


In [None]:
import re
import json
from typing import List

def extract_from_insight(text: str) -> str:

    match = re.search(r'Insight.*', text, re.IGNORECASE | re.DOTALL)
    if match:
        return match.group(0).strip()
    else:
        return text.strip()

def construct_high_level_prompt(low_principles: List[List[str]]) -> str:


    joined_principles = "\n".join(
        f"{i+1}. {extract_from_insight(lp[0])}"
        for i, lp in enumerate(low_principles)
    )
    
    prompt = (
        f"Low-level principles:\n{joined_principles}\n\n"
        "Create a list of *unique* and insightful principles to improve future responses based on the analysis above.\n"
        "Focus on capturing the essence of the feedback while eliminating redundancies.\n"
        "Ensure that each point is clear, concise, and directly derived from the introspection results.\n"
        "Create a numbered list of principles. Leave specific details in place.\n"
        "Limit to at most 8 principles.\n\n"
        "List of Principles:"
    )
    return prompt


def get_high_level_principle(model, path_to_low_level):
    with open(path_to_low_level, 'r', encoding='utf-8') as f:

        data = json.load(f)

    low_principles = data
    
    if not low_principles:
        print("[WARN] No low-level principles found in file.")
        return None
    

    prompt = construct_high_level_prompt(low_principles)
    messages = [{"role": "user", "content": prompt}]

    print("=== Prompt Preview ===")
    print(prompt)
    print("="*30)
    
    high_principle = generate_model_output(model, messages, temperature=0, n=1)
    
    print("=== Model Output ===")
    print(high_principle)
    print("="*30)
    
    return high_principle



In [None]:

for model in models:
    for task in tasks:
        for shot in shots:
            if task == 'mmlu_pro': 
                for subject in subjects:
                    path = f"leap/{task}/{model}/{subject}/{task}_wrong_predictions.jsonl"
                    low_level_path = f"leap/{task}/{model}/{subject}/{task}_low_level_principle.json"
                    high_level_path = f"leap/{task}/{model}/{subject}/{task}_high_level_principle.json"

                    if not os.path.exists(low_level_path):
                        print(f"[WARN] File not found: {low_level_path}")
                        continue
                    

                    os.makedirs(os.path.dirname(high_level_path), exist_ok=True)
                    

                    high_principles = get_high_level_principle(model, low_level_path)
                    
                    with open(high_level_path, 'w', encoding='utf-8') as wf:
                        json.dump(high_principles, wf, ensure_ascii=False, indent=2)
                    
                    print(f"[*] Saved {len(high_principles)} higt-level principles to {high_level_path}")
            
            else:
                path = f"leap/{task}/{model}/{task}_wrong_predictions.jsonl"
                low_level_path = f"leap/{task}/{model}/{task}_low_level_principle.json"
                high_level_path = f"leap/{task}/{model}/{task}_high_level_principle.json"

                if not os.path.exists(low_level_path):
                    print(f"[WARN] File not found: {low_level_path}")
                    continue

                os.makedirs(os.path.dirname(high_level_path), exist_ok=True)


                high_principles = get_high_level_principle(model, low_level_path)

                with open(high_level_path, 'w', encoding='utf-8') as wf:
                    json.dump(high_principles, wf, ensure_ascii=False, indent=2)

                print(f"[*] Saved {len(high_principles)} low-level principles to {high_level_path}")


=== Prompt Preview ===
Low-level principles:
1. Insights and Principles for Improvement

1. **Clarify the Scope of the Question:** When answering questions about historical entities or agreements, it is essential to clarify the specific time frame or context being asked about. In this case, understanding that the ECSC's identity changed in 1958 is crucial.

2. **Focus on Relevant Dates:** Always ensure that the dates used in calculations are relevant to the specific aspect of the question. In this case, the question was about the ECSC's existence, not its later developments.

3. **Contextual Awareness:** Develop a habit of considering the broader context of historical events or agreements. This includes understanding how entities evolve over time and how that affects their classification and duration.

4. **Double-Check Calculations Against Context:** Before finalizing an answer, it is beneficial to double-check that the calculations align with the context provided in the question. Thi

In [None]:
import random 

def load_data_and_fewshot(args):
    if args.task == "mmlu_pro":
        dataset, fewshot = load_mmlu_pro()
    elif args.task == "math500": 
        file_path = f"../data/math500/test.jsonl"
        with open(file_path, 'r', encoding='utf-8') as f:
            dataset = [json.loads(line) for line in f]
        fewshot = load_prompt(num_shots=5)
    elif args.task == "gpqa": 
        dataset = load_examples("../data/gpqa/gpqa_diamond.csv", seed=0)
        with open("../gpqa/chain_of_thought_examples.json", 'r') as f:
            fewshot = json.load(f)
    elif args.task == "hotpotqa":
        dataset = json.load(open(f'../data/hotpotqa/{args.task}.json'))
        with open("../hotpotqa/react_prompt.json", 'r') as f:
            fewshot = json.load(f)
    elif args.task == "drop":

        dataset = pd.read_parquet("../data/drop/drop_sub.parquet", engine="pyarrow")
        dataset = dataset.to_dict(orient="records")  

        dataset = convert_ndarray_to_list(dataset)
        dataset = convert_ndarray_to_list(dataset)

        with open("../drop/prompt.json", 'r') as f:
            fewshot = json.load(f)

    elif args.task == "musr_efficiently":
        ta_path = '../data/musr/team_allocation.json'
        dataset = MuSRDataset(ta_path)
        fewshot = 1

    elif args.task == "musr_location":
        op_path = '../data/musr/object_placements.json'
        dataset = MuSRDataset(op_path)
        fewshot = 1
    else: 
        return None, None
    
    return dataset, fewshot

def construct_prompt(args, dataset, fewshot): 
    if args.task != 'mmlu_pro': 
        high_level_path = f"leap/{task}/{model}/{task}_high_level_principle.json"
        if os.path.exists(high_level_path):
            with open(high_level_path, 'r', encoding='utf-8') as hf:
                high_principles = json.load(hf)
            high_principles = high_principles[0]

    samples = []
    if args.task == "math500":
        system_prompt = "Please reason step by step, and put your final answer within \\boxed{{}}.\n"

        for idx, entry in tqdm(enumerate(dataset)): 
            user_prompt = ""
            user_prompt += f"Please carefully note the following principles:\n\nPrinciples: {high_principles}\n\n"

            if fewshot != None: 
                user_prompt += "\n\n".join([f"{q}\n\n{a}" for q, a in fewshot]) + "\n\n" 
            user_prompt += entry['problem'] + "\n"
            message = [{"role": "system","content": system_prompt},{"role": "user","content": user_prompt}]
            sample = {"idx": idx,"prompt": message,"entry": entry}
            samples.append(sample)

        
    elif args.task == "mmlu_pro":
        subjects = list(dataset.keys())
        for subject in tqdm(subjects): 
            user_prompt = "The following are multiple choice questions (with answers) about {}. Think step by" \
                " step and then output the answer in the format of \"The answer is (X)\" at the end.\n\n" \
            .format(subject)

            high_level_path = f"leap/{task}/{model}/{subject}/{task}_high_level_principle.json"
            if os.path.exists(high_level_path):
                with open(high_level_path, 'r', encoding='utf-8') as hf:
                    high_principles = json.load(hf)

            user_prompt += f"Please carefully note the following principles:\n\nPrinciples: {high_principles[0]}\n\n"

            if fewshot != None:
                for each in fewshot[subject]:
                    user_prompt += format_example(each["question"], each["options"], each["cot_content"])
            
            random.seed(42)
            test_data = random.sample(dataset[subject], min(300, len(dataset[subject])))
    
            for entry in test_data:
                input_text = format_example(entry['question'], entry['options'])
                message = [{"role": "user", "content": user_prompt + input_text}]
                sample = {"idx": entry['question_id'],"prompt": message,"entry": entry}
                samples.append(sample)
    
    elif args.task == "gpqa": 
        system_prompt = "You are a very intelligent assistant, who follows instructions directly.\n\n"

        samples = []
        for example_id, example in tqdm(enumerate(dataset)):
            user_prompt = f"Please carefully note the following principles:\n\nPrinciples: {high_principles}\n\n"

            if fewshot != None: 
                user_prompt += chain_of_thought_prompt(fewshot, example)
            else:
                user_prompt += zero_shot_chain_of_thought_prompt(example)
            
            message = [{"role": "system","content": system_prompt},{"role": "user","content": user_prompt}]

            sample = {"idx": example_id,"prompt": message,"entry": example}
            samples.append(sample)
    
    elif args.task == "hotpotqa": 
        system_prompt = ""

        samples = []

        if fewshot != None: 
            fewshot_prompt = f"Please carefully note the following principles:\n\nPrinciples: {high_principles}\n\n"
            for qa in fewshot:
                question = qa["Q"]
                answer = qa["A"]

                fewshot_prompt += f"Q: {question}\nA: {answer}\n\n"

        for idx, entry in tqdm(enumerate(dataset)):
            if fewshot != None: 
                user_prompt = fewshot_prompt + f"Q: {entry['question']}." + "\n\nEnd your answer with \"Answer <answer>\". Think step by step." + "\n\nA: " 
            else:
                user_prompt = f"Q: {entry['question']}." + "\n\nEnd your answer with \"Answer <answer>\". Think step by step." + "\n\nA: " 
            
            message = [{"role": "system","content": system_prompt},{"role": "user","content": user_prompt}]

            sample = {"idx": idx,"prompt": message,"entry": entry, "answer": entry['answers']}
            samples.append(sample)

    elif args.task == "drop": 
        system_prompt = ""

        samples = []
        if fewshot != None: 
            fewshot_prompt = f"Please carefully note the following principles:\n\nPrinciples: {high_principles}\n\n"
            for qa in fewshot:
                question = qa["Q"]
                answer = qa["A"]

                fewshot_prompt += f"Q: {question}\nA: {answer}\n\n"

        for idx, entry in tqdm(enumerate(dataset)):
            if fewshot != None: 
                user_prompt = fewshot_prompt + f"Q: {entry['passage']}{entry['question']}" + "\n\nEnd your answer with \"So the answer is <answer>\". Think step by step." + "\n\nA: " 
            else:
                user_prompt = f"Q: {entry['passage']}{entry['question']}" + "\n\nEnd your answer with \"So the answer is <answer>\". Think step by step." + "\n\nA: " 
            
            message = [{"role": "system","content": system_prompt},{"role": "user","content": user_prompt}]

            sample = {"idx": idx,"prompt": message,"entry": entry}
            samples.append(sample)

    elif args.task == "musr_efficiently" or args.task == "musr_location":
        for idx, entry in tqdm(enumerate(dataset)):
            if fewshot != None:
                user_prompt = f"Please carefully note the following principles:\n\nPrinciples: {high_principles}\n\n"
                system_prompt = entry['fs_cot_messages'][0]['content']
                user_prompt = user_prompt + entry['fs_cot_messages'][1]['content']
                message = [{"role": "system","content": system_prompt},{"role": "user","content": user_prompt}]

            else:
                message = entry['zs_cot_messages']
            sample = {"idx": idx, "prompt": message, "entry": entry}
            samples.append(sample)
    else: 
        return None

    return samples

def generate_model_output(model: str, prompt: str, temperature: float = 1.0, n: int = 5) -> str:        
    responses = client.chat.completions.create(
        model=model,
        messages=prompt,
        n=n,
        temperature=temperature,
    )

    outputs = [choice.message.content for choice in responses.choices]

    return outputs

In [None]:
def main(config):
    save_path = f"{config.output_dir}/{config.task}/{config.model}/{config.task}_{config.shot_type}.jsonl"
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    dataset, fewshot = load_data_and_fewshot(config)
    if config.shot_type == "zero":
        fewshot = None
        

    samples = construct_prompt(config, dataset, fewshot)
    if config.num_examples != -1: 
        samples = samples[:config.num_examples] 

    if samples:
        print(f"Model: {config.model} Task: {config.task}, Shot: {config.shot_type}")
        print("-" * 50)
        prompt = samples[0]["prompt"]
        for message in prompt:
            print(f"Role:\n{message['role']}")
            print(f"Content:\n{message['content']}")
            print("-" * 50)
    else:
        print(f"No samples found for Task: {config.task}, Shot: {config.shot_type}")

   
    if os.path.exists(save_path):
        with open(save_path, 'r', encoding='utf-8') as f:

            existing_data = {json.loads(line)['idx'] for line in f}  
    else:
        existing_data = set()  

    if samples:

        with open(save_path, "a", encoding='utf-8') as f:  
            for sample in tqdm(samples, total=len(samples)):
                if sample['idx'] in existing_data:  
                    continue
                try:

                    model_outputs = generate_model_output(config.model, sample["prompt"], config.temperature)
                    sample["leap_output"] = model_outputs
                    json.dump(sample, f)
                    f.write("\n")
                except Exception as e:
                    print(f"Error processing sample {sample['idx']}: {e}")
                    break

        print(f"Results saved to {save_path}")


In [None]:
class Config:
    def __init__(self):
        self.model = "gpt-4o-mini"  
        self.task = "drop"  # "math500", "mmlu_pro", "gpqa", "drop", "hotpotqa"
        self.shot_type = "few"  
        self.output_dir = "leap_baseline"
        self.num_examples = -1
        self.temperature = 1
        self.n = 1
        

In [None]:
tasks = ["math500", "mmlu_pro", "gpqa", "drop", "hotpotqa", "musr_location", 'musr_efficiently']
shots = ["few"]
models = ['gpt-4o-mini', 'gpt-4o']

config = Config()

In [None]:
for model in models:
    for task in tasks:
        for shot in shots:
            config.model = model
            config.task = task
            config.shot_type = shot
            config.n = 1 
            print("=" * 50)
            main(config)



500it [00:00, 305796.44it/s]


Model: gpt-4o-mini Task: math500, Shot: few
--------------------------------------------------
Role:
system
Content:
Please reason step by step, and put your final answer within \boxed{{}}.

--------------------------------------------------
Role:
user
Content:
Please carefully note the following principles:

Principles: 1. **Meticulous Verification**: Always verify each step in algebraic processes to prevent errors that can lead to incorrect conclusions.

2. **Deep Understanding of Conics**: Cultivate a strong grasp of the properties and characteristics of various conic sections to ensure accurate identification and analysis.

3. **Geometric Clarity**: Clearly identify and articulate the geometric properties of shapes derived from equations to facilitate correct application of relevant formulas.

4. **Contextual Review**: Before finalizing answers, review results in the context of the problem to ensure alignment with expected outcomes.

5. **Step-by-Step Documentation**: Document each

100%|██████████| 500/500 [00:00<00:00, 3226387.69it/s]

Results saved to leap_baseline/math500/gpt-4o-mini/math500_few.jsonl



100%|██████████| 14/14 [00:00<00:00, 27.06it/s]


Model: gpt-4o-mini Task: mmlu_pro, Shot: few
--------------------------------------------------
Role:
user
Content:
The following are multiple choice questions (with answers) about business. Think step by step and then output the answer in the format of "The answer is (X)" at the end.

Please carefully note the following principles:

Principles: 1. **Maintain Structural Integrity**: Ensure that the original structure and order of terms in statements are preserved when formulating responses to enhance clarity and correctness.

2. **Clarify and Differentiate Key Terms**: Clearly define and distinguish between relevant terms to avoid confusion and ensure accurate application in context.

3. **Prioritize Contextual Relevance**: Focus on the specific context of questions to guide the selection of the most appropriate answers, rather than relying solely on general knowledge.

4. **Encourage Critical Evaluation**: Promote a thorough analysis of options by considering their specific characteri

100%|██████████| 4200/4200 [00:00<00:00, 1637486.22it/s]


Results saved to leap_baseline/mmlu_pro/gpt-4o-mini/mmlu_pro_few.jsonl


198it [00:00, 22073.52it/s]


Model: gpt-4o-mini Task: gpqa, Shot: few
--------------------------------------------------
Role:
system
Content:
You are a very intelligent assistant, who follows instructions directly.


--------------------------------------------------
Role:
user
Content:
Please carefully note the following principles:

Principles: 1. **Comprehensive Reaction Identification**: Always identify all species and potential reactions in a given scenario, including secondary reactions that may affect outcomes.

2. **Precise Stoichiometric Calculations**: Ensure accurate calculations of moles, concentrations, and volumes, paying attention to the stoichiometry of each reaction involved.

3. **Clarity in Conceptual Explanation**: Clearly articulate complex concepts, such as equivalence points and inheritance patterns, to enhance understanding and avoid confusion.

4. **Contextual Relevance**: Consider the specific context and terminology of the scientific field when analyzing statements or questions to ensur

100%|██████████| 198/198 [00:00<00:00, 308496.36it/s]


Results saved to leap_baseline/gpqa/gpt-4o-mini/gpqa_few.jsonl


500it [00:00, 121977.08it/s]


Model: gpt-4o-mini Task: drop, Shot: few
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
Please carefully note the following principles:

Principles: 1. **Define the Question's Context:** Always clarify the specific time frame or context of the question to ensure accurate responses.

2. **Prioritize Relevant Dates:** Use dates that are directly related to the question's focus to avoid confusion and maintain accuracy.

3. **Understand Historical Evolution:** Recognize how entities change over time and how this impacts their classification and duration.

4. **Verify Contextual Alignment:** Double-check that calculations and interpretations align with the context provided in the question.

5. **Emphasize Clarity in Communication:** Strive for clear and concise language to enhance understanding and avoid ambiguity.

6. **Encourage Critical Thinking:** Foster a habit of questioning assumptions a

100%|██████████| 500/500 [00:00<00:00, 1879168.46it/s]


Results saved to leap_baseline/drop/gpt-4o-mini/drop_few.jsonl


500it [00:00, 386429.33it/s]


Model: gpt-4o-mini Task: hotpotqa, Shot: few
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
Please carefully note the following principles:

Principles: 1. **Contextual Understanding**: Develop a comprehensive grasp of the specific regions, historical figures, or subjects being discussed to enhance the accuracy and relevance of responses.

2. **Data Verification**: Prioritize fact-checking against reliable sources to ensure the accuracy of historical data, geographical features, or any other critical information.

3. **Clarity and Completeness**: Use precise language and full names or terms to avoid ambiguity, ensuring that responses are clear and thorough.

4. **Structured Reasoning**: Organize reasoning in a logical manner, breaking down complex topics into clear, manageable steps to enhance understanding.

5. **Attention to Detail**: Focus on small but significant details that can impac

100%|██████████| 500/500 [00:00<00:00, 1736052.98it/s]


Results saved to leap_baseline/hotpotqa/gpt-4o-mini/hotpotqa_few.jsonl


256it [00:00, 42822.92it/s]


Model: gpt-4o-mini Task: musr_location, Shot: few
--------------------------------------------------
Role:
system
Content:
You are a helpful AI assistant that will answer reasoning questions. You may reason over the question but you will always say at the end "Answer: <Your Answer Letter Choice>". You must only pick one answer and you must end your response with "Answer: <Your Answer Letter Choice>" everytime!
--------------------------------------------------
Role:
user
Content:
Please carefully note the following principles:

Principles: 1. **Contextual Awareness**: Always consider the context and relationships between characters, as their observations and interactions can significantly influence beliefs about object locations.

2. **Sequential Tracking**: Maintain a clear sequence of events, ensuring that each character's actions and observations are accounted for in relation to the objects in question.

3. **Attention to Detail**: Focus on small details within the narrative that ma

100%|██████████| 256/256 [00:00<00:00, 1148386.98it/s]


Results saved to leap_baseline/musr_location/gpt-4o-mini/musr_location_few.jsonl


250it [00:00, 51042.98it/s]


Model: gpt-4o-mini Task: musr_efficiently, Shot: few
--------------------------------------------------
Role:
system
Content:
You are a helpful AI assistant that will answer reasoning questions. You may reason over the question but you will always say at the end "Answer: <Your Answer Letter Choice>". You must only pick one answer and you must end your response with "Answer: <Your Answer Letter Choice>" everytime!
--------------------------------------------------
Role:
user
Content:
Please carefully note the following principles:

Principles: 1. **Comprehensive Role Analysis**: Conduct a detailed evaluation of candidates' skills, strengths, and weaknesses to ensure alignment with specific role requirements.

2. **Prioritize Team Dynamics**: Assess interpersonal relationships and past teamwork experiences to enhance compatibility and overall team performance.

3. **Holistic Skill Evaluation**: Consider both hard and soft skills in assessments, focusing on how individual capabilities can

100%|██████████| 250/250 [00:00<00:00, 1413175.20it/s]


Results saved to leap_baseline/musr_efficiently/gpt-4o-mini/musr_efficiently_few.jsonl


500it [00:00, 304111.37it/s]


Model: gpt-4o Task: math500, Shot: few
--------------------------------------------------
Role:
system
Content:
Please reason step by step, and put your final answer within \boxed{{}}.

--------------------------------------------------
Role:
user
Content:
Please carefully note the following principles:

Principles: 1. **Understand Inverse Function Properties:**
   - Grasp the geometric and algebraic implications of functions that are their own inverses, focusing on symmetry and asymptotic behavior to simplify problem-solving.

2. **Leverage Simplification and Symmetry:**
   - Utilize the asymptotic behavior and symmetry in rational functions to gain insights into relationships between coefficients and simplify the problem.

3. **Prioritize Core Problem Properties:**
   - Concentrate on the essential properties and conditions of the problem, avoiding unnecessary algebraic manipulations that do not aid in finding the solution.

4. **Verify Solutions Thoroughly:**
   - Ensure the final s

100%|██████████| 500/500 [00:00<00:00, 1760832.91it/s]


Results saved to leap_baseline/math500/gpt-4o/math500_few.jsonl


100%|██████████| 14/14 [00:00<00:00, 401.88it/s]


Model: gpt-4o Task: mmlu_pro, Shot: few
--------------------------------------------------
Role:
user
Content:
The following are multiple choice questions (with answers) about business. Think step by step and then output the answer in the format of "The answer is (X)" at the end.

Please carefully note the following principles:

Principles: 1. **Attention to Detail**: Ensure that the final answer selection aligns precisely with the reasoning process, avoiding mislabeling and ensuring consistency between reasoning and conclusion.

2. **Verification and Consistency**: Double-check the alignment between reasoning and the final answer choice, ensuring consistency and accuracy in the selection process.

3. **Clarification of Definitions**: Develop a clear understanding of key terms and concepts, ensuring accurate application in context, particularly when terms may have multiple interpretations.

4. **Contextual Application**: Consider the specific context in which concepts operate, recogniz

100%|██████████| 4200/4200 [10:19:32<00:00,  8.85s/it]  


Results saved to leap_baseline/mmlu_pro/gpt-4o/mmlu_pro_few.jsonl


198it [00:00, 69922.72it/s]


Model: gpt-4o Task: gpqa, Shot: few
--------------------------------------------------
Role:
system
Content:
You are a very intelligent assistant, who follows instructions directly.


--------------------------------------------------
Role:
user
Content:
Please carefully note the following principles:

Principles: 1. **Comprehensive Reaction Analysis:** Always evaluate all potential reactions in a system, especially in complex scenarios with multiple analytes, to ensure no reaction is overlooked.

2. **Redox Potential Awareness:** Utilize knowledge of redox potentials to predict reaction sequences and prioritize reactions based on their likelihood and thermodynamic favorability.

3. **Identification of Multiple Equivalence Points:** Recognize that titrations involving multiple analytes may have several equivalence points, each corresponding to the complete reaction of a different analyte.

4. **Contextual Interpretation:** Consider the specific context in which terms and phrases are us

100%|██████████| 198/198 [42:17<00:00, 12.82s/it] 


Results saved to leap_baseline/gpqa/gpt-4o/gpqa_few.jsonl


500it [00:00, 397489.01it/s]


Model: gpt-4o Task: drop, Shot: few
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
Please carefully note the following principles:

Principles: 1. **Historical Accuracy:** Ensure a thorough understanding of historical events and timelines, verifying facts to avoid misunderstandings and incorrect conclusions.

2. **Fact Verification:** Always cross-check historical facts and timelines to maintain accuracy and prevent errors in reasoning.

3. **Logical Clarity:** Present reasoning in a clear, logical manner that directly follows from verified facts, avoiding any misinterpretations.

4. **Consistency with Records:** Align information with established historical records, understanding the distinct roles and timelines of different organizations and agreements.

5. **Clarity and Conciseness:** Deliver answers clearly and concisely, avoiding unnecessary repetition or extraneous information.

6. *

100%|██████████| 500/500 [25:59<00:00,  3.12s/it]


Results saved to leap_baseline/drop/gpt-4o/drop_few.jsonl


500it [00:00, 318861.49it/s]


Model: gpt-4o Task: hotpotqa, Shot: few
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
Please carefully note the following principles:

Principles: 1. **Geographical and Geological Accuracy**: Ensure precise identification of geographical areas and geological terms to avoid incorrect conclusions, especially when discussing elevation and regional characteristics.

2. **Clarity and Conciseness**: Provide clear and concise answers that directly address the core question, avoiding unnecessary details that could confuse the reader.

3. **Comprehensive Analysis**: Conduct thorough analyses of all relevant aspects of individuals' careers or fields, recognizing multifaceted roles and contributions.

4. **Cross-Verification**: Validate information with multiple sources or datasets to ensure accuracy, particularly when dealing with complex or interdisciplinary topics.

5. **Attention to Detail in Pr

100%|██████████| 500/500 [19:25<00:00,  2.33s/it]


Results saved to leap_baseline/hotpotqa/gpt-4o/hotpotqa_few.jsonl


256it [00:00, 72525.62it/s]


Model: gpt-4o Task: musr_location, Shot: few
--------------------------------------------------
Role:
system
Content:
You are a helpful AI assistant that will answer reasoning questions. You may reason over the question but you will always say at the end "Answer: <Your Answer Letter Choice>". You must only pick one answer and you must end your response with "Answer: <Your Answer Letter Choice>" everytime!
--------------------------------------------------
Role:
user
Content:
Please carefully note the following principles:

Principles: 1. **Observation and Awareness:** Track characters' direct observations and awareness of changes. If a character does not witness an object being moved, they will likely believe it remains in its last known location.

2. **Character Focus:** Consider what each character is focused on at different points in the story. This helps determine what they are likely to notice or miss, affecting their beliefs about object locations.

3. **Narrative Clarity:** Ensu

100%|██████████| 256/256 [26:08<00:00,  6.13s/it]


Results saved to leap_baseline/musr_location/gpt-4o/musr_location_few.jsonl


250it [00:00, 75764.16it/s]


Model: gpt-4o Task: musr_efficiently, Shot: few
--------------------------------------------------
Role:
system
Content:
You are a helpful AI assistant that will answer reasoning questions. You may reason over the question but you will always say at the end "Answer: <Your Answer Letter Choice>". You must only pick one answer and you must end your response with "Answer: <Your Answer Letter Choice>" everytime!
--------------------------------------------------
Role:
user
Content:
Please carefully note the following principles:

Principles: 1. **Skill and Suitability Assessment:** Accurately evaluate individual skills and limitations to assign roles that maximize strengths and minimize weaknesses.

2. **Interpersonal Dynamics:** Consider how individuals collaborate, leveraging successful past partnerships to enhance team performance.

3. **Contextual Understanding:** Develop a deep understanding of the context and nuances of each team member's background and interactions for informed deci

100%|██████████| 250/250 [49:06<00:00, 11.79s/it]

Results saved to leap_baseline/musr_efficiently/gpt-4o/musr_efficiently_few.jsonl





In [None]:
from math500.math_utils import * 
from math500.parser import *
from math500.grader import * 

models = ['gpt-4o-mini', 'gpt-4o', 'llama']

for model in models: 
    file_path = f"{config.output_dir}/math500/{model}/math500_few.jsonl"
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = [json.loads(line) for line in f]

        scores = [0,0,0,0,0]
        for entry in data:
            idx = entry.get('idx', None)
            
            temp = entry.get('entry') or entry.get('doc')
            if model == 'llama': 
                temp['solution'] = temp['input_correct_responses']
            _, gt = parse_ground_truth(temp, "math")
            
            model_outputs = entry.get('leap_output') or entry.get('resps')[0]
            

            for i, model_output in enumerate(model_outputs): 
            

                pred = extract_answer(model_output, "math")
                pred = strip_string(pred)


                result = math_equal_process((idx, pred, gt))

                if not result:
                    result = process_results(gt, [model_output])
                    if not result:
                        pred = extract_answer(pred, "math")
                        result = math_equal_process((None, pred, gt))
                scores[i] += int(result)
        total = 0
        for score in scores: 
            acc = 100 *(score / len(data))
            total += acc 
        total = (total / 5)
        print(f"{model} leap_output -> Accuracy: {total:.1f}")
    print("-" * 50)


gpt-4o-mini leap_output -> Accuracy: 74.5
--------------------------------------------------
gpt-4o leap_output -> Accuracy: 75.6
--------------------------------------------------
llama leap_output -> Accuracy: 42.3
--------------------------------------------------


In [None]:
def extract_answer(text):
    pattern = r"answer is \(?([A-J])\)?"
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    else:
        return extract_again(text)



for model in models:
    scores = [0,0,0,0,0]

    file_path = f"{config.output_dir}/mmlu_pro/{model}//mmlu_pro_few.jsonl"
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = [json.loads(line) for line in f]

    
    total_data_len = len(data)

    for entry in data:
        model_outputs = entry.get('leap_output') or entry.get('resps')[0]
        temp = entry.get('entry') or entry.get('doc')
        answer = temp.get('answer') or temp.get('gold')
        
        for i, mo in enumerate(model_outputs):
            pred = extract_answer(mo)
            
            if pred == answer: 
                scores[i] += 1

    total = 0 
    for score in scores: 
        acc = 100* (score / total_data_len)
        total += acc   
    total = total / 5
    print(f"{model} -> Accuracy: {acc:.1f}")
    print("-" * 50)


gpt-4o-mini -> Accuracy: 63.2
--------------------------------------------------
gpt-4o -> Accuracy: 74.0
--------------------------------------------------
llama -> Accuracy: 37.3
--------------------------------------------------


In [None]:
from gpqa.gpqa_utils import * 



examples = load_examples("../data/gpqa/gpqa_diamond.csv", seed=0)

for model in models:
    file_path = f"{config.output_dir}/gpqa/{model}/gpqa_{config.shot_type}.jsonl"
    
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    

    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]

    scores = [0,0,0,0,0]


    total_data_len = len(data)
    if total_data_len != len(examples):
        print("Warning: data length and examples length do not match!")
        
    for entry, example in zip(data, examples):
       
        correct_index = example.correct_index  

        model_outputs = entry.get('leap_output') or entry.get('resps')[0]

        for i, mo in enumerate(model_outputs):
            pred = parse_sampled_answer(mo)
            if pred is None:
                scores[i] += 0 

            else: 
                is_correct = (LETTER_TO_INDEX[pred] == correct_index)
                scores[i] += int(is_correct)

        
    total = 0 
    for score in scores: 
        acc = 100* (score / total_data_len)
        total += acc   
    total = total / 5
    print(f"{model} -> Accuracy: {acc:.1f}")
    print("-" * 50)


gpt-4o-mini -> Accuracy: 43.9
--------------------------------------------------
gpt-4o -> Accuracy: 45.5
--------------------------------------------------
llama -> Accuracy: 27.8
--------------------------------------------------


In [None]:
from drop.drop_utils import *

for model in models: 
    file_path = f"{config.output_dir}/drop/{model}/drop_{config.shot_type}.jsonl"
    
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = [json.loads(line) for line in f]

    em = [[] for _ in range(5)]
    f1 = [[] for _ in range(5)]


        
    def get_max_em_f1(pred, golds):
        max_em, max_f1 = 0.0, 0.0
        for gold_answer in golds:
            exact_match, f1_score = get_metrics(pred, gold_answer)

            if gold_answer[0].strip():
                max_em = max(max_em, exact_match)
                max_f1 = max(max_f1, f1_score)
        return max_em, max_f1


    for entry in data:
        temp = entry.get('entry') or entry.get('doc')
        golds = get_answers(temp)  

        model_outputs = entry.get('leap_output') or entry.get('resps')[0]
        for i, mo in enumerate(model_outputs):
            pred = extract_answer(mo)
            em_val, f1_val = get_max_em_f1(pred, golds)
            em[i].append(em_val)
            f1[i].append(f1_val)


    avg_em = 100 * np.mean([np.mean(em[i]) for i in range(5)])
    avg_f1 = 100 * np.mean([np.mean(f1[i]) for i in range(5)])
    print(f"{model} -> EM: {avg_em:.1f}, F1: {avg_f1:.1f}")
    print("-" * 50)

gpt-4o-mini -> EM: 75.8, F1: 83.0
--------------------------------------------------
gpt-4o -> EM: 81.5, F1: 89.8
--------------------------------------------------
llama -> EM: 58.2, F1: 64.1
--------------------------------------------------


In [None]:
from hotpotqa.hotpotqa_utils import *

def extract_answer(response_text):
    match = re.search(r"Answer\s+(.+)", response_text, re.DOTALL)
    if match:
        answer = match.group(1).strip()

        answer = re.sub(r"[.\n]+$", "", answer).strip()
        return answer

    match = re.search(r"(?<!\w)Answer[:\s]+(.+?)(?:[.\n]|$)", response_text, re.IGNORECASE | re.DOTALL)
    if match:
        answer = match.group(1).strip()

        answer = re.sub(r"[.\n]+$", "", answer).strip()
        return answer
    return response_text.strip()


dataset = json.load(open(f'../data/hotpotqa/hotpotqa.json'))

for model in models:
    file_path = f"{config.output_dir}/hotpotqa/{model}/hotpotqa_{config.shot_type}.jsonl"
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = [json.loads(line) for line in f]
    
        preds = [[] for i in range(5)]
        for entry in data:

            model_outputs = entry.get('leap_output') or entry.get('resps')[0]

            for i, mo in enumerate(model_outputs):
                pred = extract_answer(mo)
                preds[i].append(pred)

        em_scores = []
        f1_scores = []
        for i in range(5):
            em, f1 = get_em_f1(dataset, preds[i])
            em_scores.append(em.mean())
            f1_scores.append(f1.mean())


        avg_em = 100* (sum(em_scores) / len(em_scores))
        avg_f1 = 100* (sum(f1_scores) / len(f1_scores))
        print(f"{model} -> EM: {avg_em:.1f}, F1: {avg_f1:.1f}")
        
        print("-" * 50)

gpt-4o-mini -> EM: 34.0, F1: 45.1
--------------------------------------------------
gpt-4o -> EM: 45.1, F1: 58.4
--------------------------------------------------
llama -> EM: 19.9, F1: 26.8
--------------------------------------------------


In [197]:
from musr.musr import MuSRDataset

ta_path = '../data/musr/team_allocation.json'
ta = MuSRDataset(ta_path)

op_path = '../data/musr/object_placements.json'
op = MuSRDataset(op_path)

In [198]:
for model in models:
    file_path = f"{config.output_dir}/musr_location/{model}/musr_location_{config.shot_type}.jsonl"
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = [json.loads(line) for line in f]

        scores = [0,0,0,0,0]
        for test_idx, entry in enumerate(data):

            model_outputs = entry.get('leap_output') or entry.get('resps')[0]
            
            for i, mo in enumerate(model_outputs):
                scores[i] += op.evaluate_response([mo], op[test_idx])[0]['correct']

        total = 0 
        for score in scores: 
            acc = 100* (score / len(data))
            total += acc   
        total = total / 5
        print(f"{model} -> Accuracy: {acc:.1f}")
        print("-" * 50)

gpt-4o-mini -> Accuracy: 59.8
--------------------------------------------------
gpt-4o -> Accuracy: 66.8
--------------------------------------------------
llama -> Accuracy: 51.6
--------------------------------------------------


In [199]:
for model in models:
    file_path = f"{config.output_dir}/musr_efficiently/{model}/musr_efficiently_{config.shot_type}.jsonl"
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = [json.loads(line) for line in f]

        scores = [0,0,0,0,0]
        for test_idx, entry in enumerate(data):

            model_outputs = entry.get('leap_output') or entry.get('resps')[0]
            
            for i, mo in enumerate(model_outputs):
                scores[i] += ta.evaluate_response([mo], ta[test_idx])[0]['correct']

        total = 0 
        for score in scores: 
            acc = 100* (score / len(data))
            total += acc   
        total = total / 5
        print(f"{model} -> Accuracy: {acc:.1f}")
        print("-" * 50)

gpt-4o-mini -> Accuracy: 74.4
--------------------------------------------------
gpt-4o -> Accuracy: 87.2
--------------------------------------------------
llama -> Accuracy: 69.2
--------------------------------------------------
