In [None]:
import os
import json
from tqdm import tqdm 
import sys 

from dotenv import load_dotenv
from openai import OpenAI

sys.path.append('../')  

from gpqa.gpqa_utils import * 

from math500.math_utils import * 
from math500.parser import *
from math500.grader import * 

from mmlu_pro.mmlu_utils import * 

from hotpotqa.hotpotqa_utils import *

from drop.drop_utils import *

from utils import load_model_outputs

load_dotenv()

client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

def construct_prompt(args): 
    if args.task != 'mmlu_pro': 
        output_res_path = f"../result/{args.task}/{args.model}/{args.task}_{args.shot_type}.jsonl"
        res = load_model_outputs(output_res_path)
        
    system_prompt = ""
    start_prompt = "I have generated the following responses to the question: "
    end_prompt = """\n\nEvaluate these responses.\nSelect the most consistent response based on majority consensus.\nStart your answer with "The most consistent response is Response X" (without quotes)."""

    samples = []
    if args.task == "math500":
        for idx, r in tqdm(enumerate(res)):
            entry = r.get('entry', {})
            question = entry.get('problem', '')
            model_outputs = r.get('model_outputs', [])

            user_prompt = start_prompt + f"{question}\n\n"
            #
            model_outputs = model_outputs[::-1]
            #
            
            for i, output in enumerate(model_outputs):
                user_prompt += f"Response {i}: {output}\n"

            user_prompt += end_prompt

            message = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
            sample = {"idx": idx, "prompt": message, "entry": entry}
            samples.append(sample)
        
    elif args.task == "mmlu_pro":
        def format_example(question, options, cot_content=""):
            example = "{}\nOptions: ".format(question)
            choice_map = "ABCDEFGHIJ"
            for i, opt in enumerate(options):
                example += "{}. {}\n".format(choice_map[i], opt)
            
            return example
        
        dataset, fewshot = load_mmlu_pro()
        subjects = list(dataset.keys())
        for subject in tqdm(subjects): 
            res_path = f"../result/{args.task}/{args.model}/{subject}_result.jsonl"
            res = load_model_outputs(res_path)

            for idx, r in tqdm(enumerate(res)):
                entry = r.get('entry', {})
                model_outputs = r.get('model_outputs', [])
                question = format_example(entry['question'], entry['options'])

                user_prompt = start_prompt + f"{question}\n\n"
                for i, output in enumerate(model_outputs):
                    user_prompt += f"Response {i}: {output}\n"

                user_prompt += end_prompt

                message = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ]
                sample = {"idx": idx, "prompt": message, "entry": entry}
                samples.append(sample)
    
    elif args.task == "gpqa": 
        def format_example(example) -> str:
            prompt = f"Question: {example[0]}"
            prompt += f"\nChoices:\n(A) {example[1]}\n(B) {example[2]}\n(C) {example[3]}\n(D) {example[4]}"
            return prompt
        
        for idx, r in tqdm(enumerate(res)):
            entry = r.get('entry', {})
            model_outputs = r.get('model_outputs', [])
            question = format_example(entry)

            user_prompt = start_prompt + f"{question}\n\n"
            for i, output in enumerate(model_outputs):
                user_prompt += f"Response {i}: {output}\n"

            user_prompt += end_prompt

            message = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
            sample = {"idx": idx, "prompt": message, "entry": entry}
            samples.append(sample)

    elif args.task == "hotpotqa": 
        for idx, r in tqdm(enumerate(res)):
            entry = r.get('entry', {})
            model_outputs = r.get('model_outputs', [])
            question = entry['question']

            user_prompt = start_prompt + f"{question}\n\n"
            for i, output in enumerate(model_outputs):
                user_prompt += f"Response {i}: {output}\n"

            user_prompt += end_prompt

            message = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
            sample = {"idx": idx, "prompt": message, "entry": entry}
            samples.append(sample)

    elif args.task == "drop": 
         for idx, r in tqdm(enumerate(res)):
            entry = r.get('entry', {})
            model_outputs = r.get('model_outputs', [])
            
            user_prompt = start_prompt + f"{entry['passage']} {entry['question']}\n\n" 
            for i, output in enumerate(model_outputs):
                user_prompt += f"Response {i}: {output}\n"

            user_prompt += end_prompt
            
            message = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
            sample = {"idx": idx, "prompt": message, "entry": entry}
            samples.append(sample)

    elif args.task == "musr_location": 
        for idx, r in tqdm(enumerate(res)):
            entry = r.get('entry', {})
            model_outputs = r.get('model_outputs', [])

            question = entry['question'].strip()
            context = entry['context'].strip()
            choices = entry['choices']['text']
            
            labels = ['A', 'B', 'C', 'D', 'E', 'F'][:len(choices)]
            choice_str = '\n'.join([f'{labels[idx]}: {choices[idx]}' for idx in range(len(choices))])
            original_question_part = f"{context}\n\n{question}\n\n{choice_str}"
                        
            user_prompt = start_prompt + f"{original_question_part}\n\n" 
            for i, output in enumerate(model_outputs):
                user_prompt += f"Response {i}: {output}\n"

            user_prompt += end_prompt
            
            message = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
            sample = {"idx": idx, "prompt": message, "entry": entry}
            samples.append(sample)
        
    elif args.task == "musr_efficiently":
        for idx, r in tqdm(enumerate(res)):
            entry = r.get('entry', {})
            model_outputs = r.get('model_outputs', [])

            question = entry['question'].strip()
            context = entry['context'].strip()
            choices = entry['choices']['text']
            
            labels = ['A', 'B', 'C', 'D', 'E', 'F'][:len(choices)]
            choice_str = '\n'.join([f'{labels[idx]}: {choices[idx]}' for idx in range(len(choices))])
            original_question_part = f"{context}\n\n{question}\n\n{choice_str}"
                        
            user_prompt = start_prompt + f"{original_question_part}\n\n" 
            for i, output in enumerate(model_outputs):
                user_prompt += f"Response {i}: {output}\n"

            user_prompt += end_prompt
            
            message = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
            sample = {"idx": idx, "prompt": message, "entry": entry}
            samples.append(sample)
    else: 
        return None

    return samples

def generate_model_output(model: str, prompt: str, temperature: float = 1.0) -> str:        
    responses = client.chat.completions.create(
        model=model,
        messages=prompt,
        n=1,
        temperature=temperature,
    )

    outputs = [choice.message.content for choice in responses.choices]
    return outputs

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def main(config):
    save_path = f"{config.output_dir}/{config.task}/{config.model}/{config.task}_{config.shot_type}.jsonl"
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    samples = construct_prompt(config)
    if config.num_examples != -1: 
        samples = samples[:config.num_examples]

    if samples:
        print(f"Model: {config.model} Task: {config.task}, Shot: {config.shot_type}")
        print(samples[0].keys())
        print("-" * 50)
        prompt = samples[0]["prompt"]
        for message in prompt:
            print(f"Role:\n{message['role']}")
            print(f"Content:\n{message['content']}")
            print("-" * 50)
    else:
        print(f"No samples found for Task: {config.task}, Shot: {config.shot_type}")

    
    if os.path.exists(save_path):
        with open(save_path, 'r', encoding='utf-8') as f:

            existing_data = {json.loads(line)['prompt'][1]['content'] for line in f}  

    else:
        existing_data = set()  


    if samples:

        with open(save_path, "a", encoding='utf-8') as f:  

            for sample in tqdm(samples, total=len(samples)):
                if sample['prompt'][1]['content'] in existing_data:  

                    continue
                try:

                    model_outputs = generate_model_output(config.model, sample["prompt"], config.temperature)
                    sample["prompt_output"] = model_outputs
                    json.dump(sample, f)
                    f.write("\n")
                except Exception as e:
                    print(f"Error processing sample {sample['idx']}: {e}")
                    break

        print(f"Results saved to {save_path}")

In [None]:
class Config:
    def __init__(self):
        self.model = "gpt-4o-mini" 
        self.task = "drop"  # "math500", "mmlu_pro", "gpqa", "drop", "hotpotqa"
        self.shot_type = "few"  
        self.output_dir = "usc"
        self.num_examples = -1
        self.temperature = 0.0

In [None]:
tasks = ['math500', 'gpqa', "drop", "hotpotqa", "musr_location", 'musr_efficiently']
# tasks = ['math500']
subjects = ['business', 'law', 'psychology', 'biology', 'chemistry', 'history', 'other', 'health', 'economics', 'math', 'physics', 'computer science', 'philosophy', 'engineering']

shots = ["few"]
models = ['gpt-4o-mini']

config = Config()

In [None]:

for model in models:
    for task in tasks:
        for shot in shots:
            config.model = model
            config.task = task
            config.shot_type = shot
            print("=" * 50)
            main(config)



500it [00:00, 105474.63it/s]


Model: gpt-4o-mini Task: math500, Shot: few
dict_keys(['idx', 'prompt', 'entry'])
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
I have generated the following responses to the question: Convert the point $(0,3)$ in rectangular coordinates to polar coordinates.  Enter your answer in the form $(r,\theta),$ where $r > 0$ and $0 \le \theta < 2 \pi.$

Response 0: To convert the point \((0, 3)\) from rectangular coordinates to polar coordinates, we apply the formulas for conversion:

1. The radius \(r\) can be calculated using the formula:
   \[
   r = \sqrt{x^2 + y^2}
   \]
   where \(x = 0\) and \(y = 3\).

   Substituting the values, we get:
   \[
   r = \sqrt{0^2 + 3^2} = \sqrt{0 + 9} = \sqrt{9} = 3.
   \]

2. The angle \(\theta\) is determined using the formula:
   \[
   \theta = \tan^{-1}\left(\frac{y}{x}\right).
   \]
   However, since \(x = 0\), we cannot directly use this formula. When

100%|██████████| 500/500 [08:54<00:00,  1.07s/it]

Results saved to usc_reverse/math500/gpt-4o-mini/math500_few.jsonl





In [None]:
def extract_user_number(response_text):
    """
    주어진 응답 문자열에서 user 번호를 추출합니다.
    
    1. 먼저 '[user1]', 'user1' 등과 같이 user 뒤에 숫자가 붙은 패턴을 찾습니다.
    2. 만약 해당 패턴 매치에 실패하면, 마지막 문장을 추출한 후 그 문장 내의 마지막 숫자를 반환합니다.
    3. 모두 실패하면 None을 반환합니다.
    """

    primary_pattern = r'\[?\s*The most consistent response is Response\s*(\d+)\s*\]?'
    primary_matches = re.findall(primary_pattern, response_text, flags=re.IGNORECASE)
    if primary_matches:

        return int(primary_matches[0])
    


    sentences = re.split(r'[.\n]', response_text.strip())

    sentences = [s.strip() for s in sentences if s.strip()]
    if sentences:
        last_sentence = sentences[-1]
        fallback_matches = re.findall(r'(\d+)', last_sentence)
        if fallback_matches:

            return int(fallback_matches[-1])
    

    return -1

In [None]:
overall_summary = []

for model in models:
    for task in tasks:
        if task == 'mmlu_pro':
            continue
        usc_path = f"{config.output_dir}/{task}/{model}/{task}_{config.shot_type}.jsonl"
        usc = load_model_outputs(usc_path)

        output_res_path = f"../result/{task}/{model}/{task}_{config.shot_type}.jsonl"
        res = load_model_outputs(output_res_path)
                    

        user_counter = Counter()
        total_entries = 0
        for entry in tqdm(usc, desc=f"Processing {model}-{task}"):
            total_entries += 1
            raw_output = entry.get("prompt_output")          
            if isinstance(raw_output, list):
                prompt_output = raw_output[0] if raw_output else None
            else:
                prompt_output = raw_output                   

            if prompt_output is None:
                user_counter["None"] += 1
                continue
            extracted_user = extract_user_number(prompt_output)
            if extracted_user is None:
                user_counter["None"] += 1
            else:
                user_counter[extracted_user] += 1


        for user, count in user_counter.items():
            percentage = (count / total_entries) * 100 if total_entries else 0
            overall_summary.append({
                "Model": model,
                "Task": task,
                "Extracted User": user,
                "Count": count,
                "Percentage": round(percentage, 2)
            })

        for i in range(len(res)):
            raw_output = usc[i].get("prompt_output")          
            if isinstance(raw_output, list):
                source_text = raw_output[0] if raw_output else None
            else:
                source_text = raw_output                   

            if source_text is None:
                answer_number = 0
            else:
                answer_number = extract_user_number(source_text)
                if not isinstance(answer_number, int) or not (0 <= answer_number <= 4):
                    answer_number = 0
            model_outputs = res[i].get('model_outputs') or res[i].get('resps')[0]

            model_outputs = model_outputs[::-1]
 
            if len(model_outputs) <= answer_number:
                print(f"Warning: Entry {i} has less than {answer_number} model outputs. Using default output.")
                chosen_output = model_outputs[0] if model_outputs else None
            else:
                chosen_output = model_outputs[answer_number]
            usc[i]['usc'] = chosen_output







Processing gpt-4o-mini-math500: 100%|██████████| 500/500 [00:00<00:00, 209526.63it/s]


Processing gpt-4o-mini-gpqa: 100%|██████████| 198/198 [00:00<00:00, 352342.89it/s]
Processing gpt-4o-mini-drop: 100%|██████████| 500/500 [00:00<00:00, 417676.16it/s]
Processing gpt-4o-mini-hotpotqa: 100%|██████████| 500/500 [00:00<00:00, 371967.36it/s]
Processing gpt-4o-mini-musr_location: 100%|██████████| 256/256 [00:00<00:00, 340546.09it/s]
Processing gpt-4o-mini-musr_efficiently: 100%|██████████| 250/250 [00:00<00:00, 256626.53it/s]


In [None]:
df_overall = pd.DataFrame(overall_summary)


df_overall['ExtractedUser_Num'] = pd.to_numeric(
    df_overall['Extracted User'], errors='coerce'
)


df_overall['Group'] = df_overall['ExtractedUser_Num'].apply(
    lambda x: '0-4' if 0 <= x <= 4 else 'Others'
)

for model in df_overall["Model"].unique():
    model_df = df_overall[df_overall["Model"] == model]
    

    pivot_table = model_df.pivot_table(
        index="Group",         
        columns="Task",         
        values="Percentage",    
        aggfunc="sum",          
        fill_value=0            
    )
    
    print(f"\nModel: {model}")
    display(pivot_table)



Model: gpt-4o-mini


Task,drop,gpqa,hotpotqa,math500,musr_efficiently,musr_location
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0-4,100.0,100.0,100.0,100.0,90.4,100.0
Others,0.0,0.0,0.0,0.0,9.6,0.0


In [None]:
df_overall = pd.DataFrame(overall_summary)

for model in df_overall["Model"].unique():
    model_df = df_overall[df_overall["Model"] == model]
    
    
    pivot_table = model_df.pivot_table(
        index="Extracted User",       
        columns="Task",               
        values=["Percentage"], 
        fill_value=0           
    )
    
    print(f"\nModel: {model}")
    display(pivot_table)


Model: gpt-4o-mini


Unnamed: 0_level_0,Percentage,Percentage,Percentage,Percentage,Percentage,Percentage
Task,drop,gpqa,hotpotqa,math500,musr_efficiently,musr_location
Extracted User,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
-1,0.0,0.0,0.0,0.0,9.6,0.0
0,81.8,66.16,52.6,87.8,55.6,91.41
1,15.4,29.29,33.6,9.4,31.2,7.42
2,1.2,0.51,7.4,1.0,2.4,0.39
3,0.6,0.0,3.8,0.4,1.2,0.39
4,1.0,4.04,2.6,1.4,0.0,0.39


In [None]:
overall_summary_mmlu = []

def process_mmlu_pro(model: str):
    task = "mmlu_pro"

    usc_path = (
        f"{config.output_dir}/{task}/{model}/"
        f"{task}_{config.shot_type}.jsonl"
    )
    all_entries = load_model_outputs(usc_path)

    grouped = defaultdict(list)  
    for entry in all_entries:
        subj = entry["entry"]["category"]         
        grouped[subj].append(entry)

    if model != 'llama': 
        for subj in grouped:
            grouped[subj].sort(key=lambda e: e["idx"])


    for subject, entries in grouped.items():
        output_res_path = f"../result/{task}/{model}/{subject}_result.jsonl"
        res = load_model_outputs(output_res_path)


        user_counter = Counter()
        total = len(entries)

        for idx, usc in enumerate(
            tqdm(entries, desc=f"{model}-{subject}")
        ):
            
            raw_output = usc.get("prompt_output")          
            if isinstance(raw_output, list):
                p_out = raw_output[0] if raw_output else None
            else:
                p_out = raw_output                   

            if p_out is None:
                user_num = None
            else:
                user_num = extract_user_number(p_out)


            key = user_num if user_num is not None else "None"
            user_counter[key] += 1


            if not isinstance(user_num, int) or not (0 <= user_num <= 4):
                user_num = 0  # fallback


            outs = res[idx].get("model_outputs") or res[idx].get("resps")[0]
            chosen = (
                outs[user_num - 1] if len(outs) >= user_num
                else (outs[0] if outs else None)
            )
            entries[idx]["usc"] = chosen


        dir_path = f"{config.output_dir}/{task}/{model}/{subject}"
        file_name = f"{task}_{config.shot_type}.jsonl"
        save_path = os.path.join(dir_path, file_name)


        os.makedirs(dir_path, exist_ok=True)





                

        for u, cnt in user_counter.items():
            overall_summary_mmlu.append({
                "Model": model,
                "Task": task,
                "Subject": subject,
                "Extracted User": u,
                "Count": cnt,
                "Percentage": round(cnt / total * 100, 2)
            })


for model in ['gpt-4o-mini', 'gpt-4o', 'llama']:
    process_mmlu_pro(model)

gpt-4o-mini-business: 100%|██████████| 300/300 [00:00<00:00, 357266.10it/s]
gpt-4o-mini-law: 100%|██████████| 300/300 [00:00<00:00, 335008.31it/s]
gpt-4o-mini-psychology: 100%|██████████| 300/300 [00:00<00:00, 356962.04it/s]
gpt-4o-mini-biology: 100%|██████████| 300/300 [00:00<00:00, 362202.42it/s]
gpt-4o-mini-chemistry: 100%|██████████| 300/300 [00:00<00:00, 348460.59it/s]
gpt-4o-mini-history: 100%|██████████| 300/300 [00:00<00:00, 360335.40it/s]
gpt-4o-mini-other: 100%|██████████| 300/300 [00:00<00:00, 327339.02it/s]
gpt-4o-mini-health: 100%|██████████| 300/300 [00:00<00:00, 363562.90it/s]
gpt-4o-mini-economics: 100%|██████████| 300/300 [00:00<00:00, 292421.85it/s]
gpt-4o-mini-math: 100%|██████████| 300/300 [00:00<00:00, 355349.11it/s]
gpt-4o-mini-physics: 100%|██████████| 300/300 [00:00<00:00, 351183.70it/s]
gpt-4o-mini-computer science: 100%|██████████| 300/300 [00:00<00:00, 317029.78it/s]
gpt-4o-mini-philosophy: 100%|██████████| 300/300 [00:00<00:00, 369976.83it/s]
gpt-4o-mini-eng

In [None]:
df_mmlu = pd.DataFrame(overall_summary_mmlu)

for model in df_mmlu["Model"].unique():
    model_df = df_mmlu[df_mmlu["Model"] == model]
    
    pivot_table = model_df.pivot_table(
        index="Extracted User",       
        columns="Task",               
        values=["Percentage"], 
        fill_value=0           
    )
    
    print(f"\nModel: {model}")
    display(pivot_table)


Model: gpt-4o-mini


Unnamed: 0_level_0,Percentage
Task,mmlu_pro
Extracted User,Unnamed: 1_level_2
-1,0.33
0,73.214286
1,22.213571
2,1.095714
3,1.358462
4,2.142857



Model: gpt-4o


Unnamed: 0_level_0,Percentage
Task,mmlu_pro
Extracted User,Unnamed: 1_level_2
0,71.832143
1,16.857143
2,7.762857
3,2.595714
4,1.110833



Model: llama


Unnamed: 0_level_0,Percentage
Task,mmlu_pro
Extracted User,Unnamed: 1_level_2
-1,0.665
0,34.666429
1,31.307857
2,11.5
3,14.977857
4,7.451429


In [11]:
keys_to_eval = ['usc']
# models = ['gpt-4o-mini', 'gpt-4o', 'llama']

In [None]:
from math500.math_utils import * 
from math500.parser import *
from math500.grader import * 

for model in models:
    file_path = f"{config.output_dir}/math500/{model}/math500_{config.shot_type}.jsonl"
    data = load_model_outputs(file_path)

    scores = {k: [] for k in keys_to_eval}

    for entry in data:
        idx = entry["idx"]
        
        _, gt = parse_ground_truth(entry['entry'], "math")
        model_outputs = entry.get('model_outputs', [])
        
        for key in keys_to_eval:
            if key not in entry:
                continue


            pred = extract_answer(entry[key], "math")
            pred = strip_string(pred)


            try:
                result = math_equal_process((idx, pred, gt))

                if not result :
                    result = process_results(gt, [entry[key]])
                    if not result:
                        pred = extract_answer(pred, "math")
                        result = math_equal_process((None, pred, gt))

                scores[key].append(result)

            except TimeoutError:
                scores[key].append(False)
            except Exception as error:
                print(f"Error while processing {key} for idx={idx}: {error}")
                scores[key].append(False)

    print(f"\n===== Evaluation Results for model={model}, shot={config.shot_type} =====")
    for key in keys_to_eval:
        if len(scores[key]) == 0:
            print(f"{key} -> No data / Not found in entries")
            continue

        acc = sum(scores[key]) / len(scores[key])
        print(f"{key} -> Accuracy: {acc:.4f}")
    print("-" * 50)



===== Evaluation Results for model=gpt-4o-mini, shot=few =====
usc -> Accuracy: 0.7960
--------------------------------------------------


In [None]:
def extract_answer(text):
    pattern = r"answer is \(?([A-J])\)?"
    match = re.search(pattern, text)
    if match:
        return match.group(1)
    else:
        return extract_again(text)


for model in models:
    overall_scores = {k: 0 for k in keys_to_eval}  
    overall_total_entries = 0

    for subject in subjects:
        file_path = f"{config.output_dir}/mmlu_pro/{model}/{subject}/mmlu_pro_few.jsonl"
        data = load_model_outputs(file_path)


        subject_scores = {k: 0 for k in keys_to_eval}
        total_data_len = len(data)
        overall_total_entries += total_data_len

        for entry in data:
            model_outputs = entry.get('model_outputs', [])
            answer = entry['entry'].get('answer') or entry['entry'].get('gold')


            for key in keys_to_eval:
                if key not in entry:
                    continue
                pred = extract_answer(entry[key])
                if pred == answer: 
                    subject_scores[key] += 1


        for k in keys_to_eval:
            overall_scores[k] += subject_scores[k]

           
    print(f"\n=== Overall Results for model={model} ===")
    for key in keys_to_eval:
        if overall_total_entries == 0:
            acc = 0
        else:
            acc = overall_scores[key] / overall_total_entries
        print(f"{key} -> Accuracy: {acc:.4f}")
    print("-" * 50)



=== Overall Results for model=gpt-4o-mini ===
usc -> Accuracy: 0.6252
--------------------------------------------------

=== Overall Results for model=gpt-4o ===
usc -> Accuracy: 0.7210
--------------------------------------------------

=== Overall Results for model=llama ===
usc -> Accuracy: 0.3560
--------------------------------------------------


In [None]:
from gpqa.gpqa_utils import * 

examples = load_examples("../data/gpqa/gpqa_diamond.csv", seed=0)

for model in models:
    file_path = f"{config.output_dir}/gpqa/{model}/gpqa_{config.shot_type}.jsonl"
    data = load_model_outputs(file_path)

    scores = {k: 0 for k in keys_to_eval}


    total_data_len = len(data)
    if total_data_len != len(examples):
        print("Warning: data length and examples length do not match!")

    
    
    for entry, example in zip(data, examples):
        correct_index = example.correct_index  

        model_outputs = entry.get('model_outputs', [])
        
        for key in keys_to_eval:
            if key not in entry:
                continue

            pred = parse_sampled_answer(entry[key])
            
            if pred is None:
                is_correct = False
            else:
                is_correct = (LETTER_TO_INDEX[pred] == correct_index)

            scores[key] += int(is_correct)
    
    print(f"\n=== Results for model={model}, shot={config.shot_type} ===")
    for key in keys_to_eval:
        acc = scores[key] / total_data_len if total_data_len else 0
        print(f"{key} -> Accuracy: {acc:.4f}")

    print("-" * 50)

The history saving thread hit an unexpected error (DatabaseError('file is not a database')).History will not be written to the database.

=== Results for model=gpt-4o-mini, shot=few ===
usc -> Accuracy: 0.4242
--------------------------------------------------

=== Results for model=gpt-4o, shot=few ===
usc -> Accuracy: 0.5051
--------------------------------------------------

=== Results for model=llama, shot=few ===
usc -> Accuracy: 0.2879
--------------------------------------------------


In [None]:
from drop.drop_utils import *

for model in models: 
    print(model)
    file_path = f"{config.output_dir}/drop/{model}/drop_{config.shot_type}.jsonl"
    data = load_model_outputs(file_path)
    
    em_scores = {k: [] for k in keys_to_eval}
    f1_scores = {k: [] for k in keys_to_eval}

    def get_max_em_f1(pred, golds):
        max_em, max_f1 = 0.0, 0.0
        for gold_answer in golds:
            exact_match, f1_score = get_metrics(pred, gold_answer)
            if gold_answer[0].strip():
                max_em = max(max_em, exact_match)
                max_f1 = max(max_f1, f1_score)
        return max_em, max_f1

    for entry in data:
        golds = get_answers(entry['entry']) 

        model_outputs = entry.get('model_outputs', [])
        
        for k in keys_to_eval:
            if k not in entry:
                continue

            pred = extract_answer(entry[k])
            em_val, f1_val = get_max_em_f1(pred, golds)
            em_scores[k].append(em_val)
            f1_scores[k].append(f1_val)

    print(f"\n===== Results for model={model} =====")
    for k in keys_to_eval:
        if len(em_scores[k]) == 0:
            print(f"{k}: No entries found, skip.")
            continue

        em_mean = np.mean(em_scores[k])
        f1_mean = np.mean(f1_scores[k])
        print(f"{k} -> EM: {em_mean:.4f}, F1: {f1_mean:.4f}")

    print("-" * 50)

gpt-4o-mini

===== Results for model=gpt-4o-mini =====
usc -> EM: 0.7880, F1: 0.8579
--------------------------------------------------
gpt-4o

===== Results for model=gpt-4o =====
usc -> EM: 0.8200, F1: 0.9020
--------------------------------------------------
llama

===== Results for model=llama =====
usc -> EM: 0.6960, F1: 0.7579
--------------------------------------------------


In [None]:
from hotpotqa.hotpotqa_utils import *

def extract_answer(response_text):
    match = re.search(r"Answer\s+(.+)", response_text, re.DOTALL)
    if match:
        answer = match.group(1).strip()

        answer = re.sub(r"[.\n]+$", "", answer).strip()
        return answer


    match = re.search(r"(?<!\w)Answer[:\s]+(.+?)(?:[.\n]|$)", response_text, re.IGNORECASE | re.DOTALL)
    if match:
        answer = match.group(1).strip()

        answer = re.sub(r"[.\n]+$", "", answer).strip()
        return answer
    return response_text.strip()


dataset = json.load(open(f'../data/hotpotqa/BM25/hotpotqa-bm25.json'))
with open("../hotpotqa/react_prompt.json", 'r') as f:
    fewshot = json.load(f)

for model in models:
    print(model)
    file_path = f"{config.output_dir}/hotpotqa/{model}/hotpotqa_{config.shot_type}.jsonl"
    data = load_model_outputs(file_path)
    preds = {k: [] for k in keys_to_eval}

    for entry in data:

        model_outputs = entry.get('model_outputs', [])
        
        for k in keys_to_eval:
            if k in entry:

                answer = extract_answer(entry[k])
                preds[k].append(answer)

    print(f"\n=== Results for model={model} ===")
    for k in keys_to_eval:
        if len(preds[k]) == 0:
            print(f"{k}: No entries found, skip.")
            continue

        em_scores, f1_scores = get_em_f1(dataset, preds[k])
        em_mean = em_scores.mean()
        f1_mean = f1_scores.mean()
        print(f"{k} -> EM: {em_mean:.4f}, F1: {f1_mean:.4f}")
    
    print("-" * 50)

gpt-4o-mini

=== Results for model=gpt-4o-mini ===
usc -> EM: 0.3660, F1: 0.4818
--------------------------------------------------
gpt-4o

=== Results for model=gpt-4o ===
usc -> EM: 0.4580, F1: 0.6044
--------------------------------------------------
llama

=== Results for model=llama ===
usc -> EM: 0.2440, F1: 0.3247
--------------------------------------------------


In [58]:
from musr.musr import MuSRDataset

ta_path = '../data/musr/team_allocation.json'
ta = MuSRDataset(ta_path)

op_path = '../data/musr/object_placements.json'
op = MuSRDataset(op_path)

In [None]:
for model in models:
    print(model)

    file_path = f"{config.output_dir}/musr_location/{model}/musr_location_{config.shot_type}.jsonl"
    data = load_model_outputs(file_path)
    
    
    preds = {k: [] for k in keys_to_eval}


    for test_idx, entry in enumerate(data):
        model_outputs = entry.get('model_outputs', [])
        for k in keys_to_eval:
            if k in entry:  
                preds[k].append(entry[k])
            else:
                preds[k].append(None)

    total_data_len = len(data)
    
    scores = {k: 0 for k in keys_to_eval}
    
    for i, entry in enumerate(data):
        if 'entry' not in entry:
            continue
        
        for k in keys_to_eval:
            pred_answer = preds[k][i]
            if pred_answer is not None:
                metrics = op.evaluate_response([pred_answer], op[i])
                if metrics and metrics[0]['correct']:
                    scores[k] += 1


    print(f"\n=== Results for model={model} ===")
    for k in keys_to_eval:
        if len(preds[k]) == 0:
            print(f"{k}: No entries found, skip.")
            continue

        acc = scores[k] / total_data_len if total_data_len else 0
        print(f"{k} -> Accuracy: {acc:.4f}")

    print("-" * 50)

gpt-4o-mini



=== Results for model=gpt-4o-mini ===
usc -> Accuracy: 0.5977
--------------------------------------------------
gpt-4o

=== Results for model=gpt-4o ===
usc -> Accuracy: 0.7344
--------------------------------------------------
llama

=== Results for model=llama ===
usc -> Accuracy: 0.5234
--------------------------------------------------


In [None]:
for model in models:
    print(model)

    file_path = f"{config.output_dir}/musr_efficiently/{model}/musr_efficiently_{config.shot_type}.jsonl"
    data = load_model_outputs(file_path)
    
    
    preds = {k: [] for k in keys_to_eval}

    for test_idx, entry in enumerate(data):
        model_outputs = entry.get('model_outputs', [])
        for k in keys_to_eval:
            if k in entry:  
                preds[k].append(entry[k])
            else:
                preds[k].append(None)

    total_data_len = len(data)
    
    scores = {k: 0 for k in keys_to_eval}
    
    for i, entry in enumerate(data):
        if 'entry' not in entry:
            continue
        
        for k in keys_to_eval:
            pred_answer = preds[k][i]
            if pred_answer is not None:
                metrics = ta.evaluate_response([pred_answer], ta[i])
                if metrics and metrics[0]['correct']:
                    scores[k] += 1


    print(f"\n=== Results for model={model} ===")
    for k in keys_to_eval:
        if len(preds[k]) == 0:
            print(f"{k}: No entries found, skip.")
            continue

        acc = scores[k] / total_data_len if total_data_len else 0
        print(f"{k} -> Accuracy: {acc:.4f}")

    print("-" * 50)

gpt-4o-mini

=== Results for model=gpt-4o-mini ===
usc -> Accuracy: 0.7640
--------------------------------------------------
gpt-4o

=== Results for model=gpt-4o ===
usc -> Accuracy: 0.8760
--------------------------------------------------
llama

=== Results for model=llama ===
usc -> Accuracy: 0.6720
--------------------------------------------------


In [61]:
(78.6 + 62.5 + 42.4 + 78.8 + 36.6 + 59.8 + 76.4 ) / 7

62.15714285714286

In [62]:
(79.8 + 72.1 + 50.5 + 82.0 + 45.8 + 73.4 + 87.6 ) / 7


70.17142857142858

In [67]:
(49.6 + 35.6 + 28.8 + 69.6 + 24.4 + + 52.3 + 67.2) /7 

46.785714285714285