In [None]:
import os
import sys
import json

sys.path.append('../')  

from math500.math_utils import *
from math500.parser import *
from math500.grader import *

# MATH-500: for every entry, grade each model output against the reference
# answer, keep the first output graded as wrong, and dump those records as
# JSONL next to the input file.
for model in ['llama']:
    file_path = f"../leap/math500/{model}/math500_mistakes.jsonl"
    wrong_file_path = f"../leap/math500/{model}/math500_wrong_predictions.jsonl"

    # scores[i] collects the grading results recorded for the i-th model output
    # across entries. It starts with 15 slots (as before) and grows on demand so
    # that an entry with more outputs no longer triggers an IndexError.
    scores = [[] for _ in range(15)]

    wrong_entries = []

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]

    for idx, entry in enumerate(data):
        # Reference answer, extracted and stripped the same way as predictions.
        gt = strip_string(extract_answer(entry['answer'], "math"))

        for i, mo in enumerate(entry['model_outputs']):
            while len(scores) <= i:
                scores.append([])

            pred = strip_string(extract_answer(mo, "math"))

            try:
                result = math_equal_process((idx, pred, gt))
                if not result:
                    # Fall back to process_results on the raw output when
                    # math_equal_process does not accept the extracted answer.
                    result = process_results(gt, [mo])
            except TimeoutError:
                # A grading timeout is scored as a miss but is not recorded as a
                # wrong entry, and the remaining outputs are still graded.
                scores[i].append(False)
                continue
            except Exception as error:
                print(f"Error encountered: {error}")
                # Re-raise instead of calling exit(): in a notebook exit() asks
                # the kernel to shut down and hides the traceback.
                raise

            if not result:
                wrong_entries.append({
                    "idx": idx,
                    "entry": entry,
                    "gt": gt,
                    "model_index": i,
                    "model_output": mo,
                    "pred": pred
                })
                # Only the first wrong output of an entry is kept.
                break

            scores[i].append(result)

    if wrong_entries:
        with open(wrong_file_path, 'w', encoding='utf-8') as wf:
            for we in wrong_entries:
                wf.write(json.dumps(we, ensure_ascii=False) + "\n")
        print(f"Saved wrong predictions to {wrong_file_path}")
    else:
        print("No wrong entries found.")


Saved wrong predictions to ../leap/math500/llama/math500_wrong_predictions.jsonl


In [None]:
from collections import defaultdict
from mmlu_pro.mmlu_utils import * 

def extract_answer(text):
    """Return the answer-choice letter (A-J) stated in ``text``.

    Searches for the phrase ``answer is X`` where the letter may optionally be
    wrapped in parentheses. When the phrase is not present, the decision is
    delegated to ``extract_again`` (not defined in this cell; presumably
    provided by the ``mmlu_pro.mmlu_utils`` star import -- confirm).
    """
    found = re.search(r"answer is \(?([A-J])\)?", text)
    if found is None:
        return extract_again(text)
    return found.group(1)

# MMLU-Pro: for every entry, keep the first model output whose extracted choice
# differs from the reference choice, grouped by subject, and write one JSONL
# file per subject.
for model in ['llama']:
    file_path = f"../leap/mmlu_pro/{model}/mmlu_pro_mistakes.jsonl"
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]

    # subject -> list of wrong-prediction records for that subject
    wrong_entries_by_subject = defaultdict(list)

    for idx, entry in enumerate(data):
        subject = entry['subject']
        gt = extract_answer(entry['answer'])

        for i, mo in enumerate(entry['model_outputs']):
            pred = extract_answer(mo)
            if pred != gt:
                wrong_entries_by_subject[subject].append({
                    "idx": idx,
                    "entry": entry,
                    "gt": gt,
                    "model_index": i,
                    "model_output": mo,
                    "pred": pred
                })
                # Only the first wrong output of an entry is kept.
                break

    if not wrong_entries_by_subject:
        # Previously this case fell through to a print of whatever
        # wrong_file_path an earlier cell had left behind.
        print("No wrong entries found.")
        continue

    for subject, wrong_entries in wrong_entries_by_subject.items():
        wrong_file_path = f"../leap/mmlu_pro/{model}/{subject}/mmlu_pro_wrong_predictions.jsonl"
        os.makedirs(os.path.dirname(wrong_file_path), exist_ok=True)

        with open(wrong_file_path, 'w', encoding='utf-8') as wf:
            for we in wrong_entries:
                wf.write(json.dumps(we, ensure_ascii=False) + "\n")
        # Report every file written, not just the last subject's.
        print(f"Saved wrong predictions to {wrong_file_path}")
    
  


  from .autonotebook import tqdm as notebook_tqdm


Saved wrong predictions to ../leap/mmlu_pro/llama/engineering/mmlu_pro_wrong_predictions.jsonl


In [2]:
from gpqa.gpqa_utils import *

# GPQA: for every entry, keep the first model output whose parsed answer
# differs from the parsed reference answer, then write the records as JSONL.
for model in ['llama']:
    file_path = f"../leap/gpqa/{model}/gpqa_mistakes.jsonl"
    wrong_file_path = f"../leap/gpqa/{model}/gpqa_wrong_predictions.jsonl"
    wrong_entries = []

    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as src:
            data = list(map(json.loads, src))
    else:
        print(f"File not found: {file_path}")
        continue

    for idx, entry in enumerate(data):
        gt = parse_sampled_answer(entry['answer'])

        for i, mo in enumerate(entry['model_outputs']):
            pred = parse_sampled_answer(mo)
            if pred != gt:
                # Record the first mismatch only, then move to the next entry.
                wrong_entries.append(dict(
                    idx=idx,
                    entry=entry,
                    gt=gt,
                    model_index=i,
                    model_output=mo,
                    pred=pred,
                ))
                break

    if not wrong_entries:
        print("No wrong entries found.")
    else:
        with open(wrong_file_path, 'w', encoding='utf-8') as out:
            out.writelines(
                json.dumps(record, ensure_ascii=False) + "\n"
                for record in wrong_entries
            )
        print(f"Saved wrong predictions to {wrong_file_path}")


Saved wrong predictions to ../leap/gpqa/llama/gpqa_wrong_predictions.jsonl


In [7]:
from hotpotqa.hotpotqa_utils import *

# HotpotQA: for every entry, keep the first model output whose normalized
# extracted answer differs from the normalized reference answer, then write
# the records as JSONL.
# NOTE(review): `extract_answer` is whichever definition was bound last in the
# kernel (presumably the one from hotpotqa_utils' star import) -- confirm.
for model in ['llama']:
    file_path = f"../leap/hotpotqa/{model}/hotpotqa_mistakes.jsonl"
    wrong_file_path = f"../leap/hotpotqa/{model}/hotpotqa_wrong_predictions.jsonl"
    wrong_entries = []

    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as src:
            data = list(map(json.loads, src))
    else:
        print(f"File not found: {file_path}")
        continue

    for idx, entry in enumerate(data):
        gt = normalize_answer(extract_answer(entry['answer']))

        for i, mo in enumerate(entry['model_outputs']):
            pred = normalize_answer(extract_answer(mo))
            if pred != gt:
                # Record the first mismatch only, then move to the next entry.
                wrong_entries.append(dict(
                    idx=idx,
                    entry=entry,
                    gt=gt,
                    model_index=i,
                    model_output=mo,
                    pred=pred,
                ))
                break

    if not wrong_entries:
        print("No wrong entries found.")
    else:
        with open(wrong_file_path, 'w', encoding='utf-8') as out:
            out.writelines(
                json.dumps(record, ensure_ascii=False) + "\n"
                for record in wrong_entries
            )
        print(f"Saved wrong predictions to {wrong_file_path}")


Saved wrong predictions to ../leap/hotpotqa/llama/hotpotqa_wrong_predictions.jsonl


In [4]:
from drop.drop_utils import *
from drop.drop_utils import _normalize

# DROP: for every entry, keep the first model output that get_metrics does not
# score as a match against the reference answer, then write the records as
# JSONL.
for model in ['llama']:
    file_path = f"../leap/drop/{model}/drop_mistakes.jsonl"
    wrong_file_path = f"../leap/drop/{model}/drop_wrong_predictions.jsonl"
    wrong_entries = []

    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as src:
            data = list(map(json.loads, src))
    else:
        print(f"File not found: {file_path}")
        continue

    for idx, entry in enumerate(data):
        # The reference is passed to get_metrics as extracted; only the
        # prediction goes through _normalize first.
        gt = extract_answer(entry['answer'])

        for i, mo in enumerate(entry['model_outputs']):
            pred = _normalize(extract_answer(mo))

            # First element of the result is used as the match flag
            # (presumably exact match; second value is ignored).
            em = get_metrics(pred, gt)[0]
            if not em:
                # Record the first miss only, then move to the next entry.
                wrong_entries.append(dict(
                    idx=idx,
                    entry=entry,
                    gt=gt,
                    model_index=i,
                    model_output=mo,
                    pred=pred,
                ))
                break

    if not wrong_entries:
        print("No wrong entries found.")
    else:
        with open(wrong_file_path, 'w', encoding='utf-8') as out:
            out.writelines(
                json.dumps(record, ensure_ascii=False) + "\n"
                for record in wrong_entries
            )
        print(f"Saved wrong predictions to {wrong_file_path}")


Saved wrong predictions to ../leap/drop/llama/drop_wrong_predictions.jsonl


In [5]:
from musr.musr import MuSRDataset

from musr.op_icl_fixed import op_fewshot, few_shot_op_instruction, test_op_instruction
from musr.ta_icl_fixed import ta_fewshot, few_shot_ta_instruction, test_ta_instruction


# MuSR object placements: load the dataset wrapper, then for every entry keep
# the first model output whose parsed answer differs from the parsed reference
# answer and write the records as JSONL.
op_path = '../data/musr/object_placements.json'
op = MuSRDataset(op_path)

for model in ['llama']:
    file_path = f"../leap/musr_location/{model}/musr_location_mistakes.jsonl"
    wrong_file_path = f"../leap/musr_location/{model}/musr_location_wrong_predictions.jsonl"
    wrong_entries = []

    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as src:
            data = list(map(json.loads, src))
    else:
        print(f"File not found: {file_path}")
        continue

    for idx, entry in enumerate(data):
        # The 'model_answer' field of evaluate_response's first result is used
        # as the parsed answer. NOTE(review): op[0] is passed as the example
        # for every entry -- presumably only a placeholder; confirm.
        gt_eval = op.evaluate_response([entry['answer']], op[0])
        gt = gt_eval[0]['model_answer']

        for i, mo in enumerate(entry['model_outputs']):
            pred_eval = op.evaluate_response([mo], op[0])
            pred = pred_eval[0]['model_answer']

            if pred != gt:
                # Record the first mismatch only, then move to the next entry.
                wrong_entries.append(dict(
                    idx=idx,
                    entry=entry,
                    gt=gt,
                    model_index=i,
                    model_output=mo,
                    pred=pred,
                ))
                break

    if not wrong_entries:
        print("No wrong entries found.")
    else:
        with open(wrong_file_path, 'w', encoding='utf-8') as out:
            out.writelines(
                json.dumps(record, ensure_ascii=False) + "\n"
                for record in wrong_entries
            )
        print(f"Saved wrong predictions to {wrong_file_path}")


Saved wrong predictions to ../leap/musr_location/llama/musr_location_wrong_predictions.jsonl


In [6]:
# MuSR team allocation: load the dataset wrapper, then for every entry keep the
# first model output whose parsed answer differs from the parsed reference
# answer and write the records as JSONL.
ta_path = '../data/musr/team_allocation.json'
ta = MuSRDataset(ta_path)

for model in ['llama']:
    file_path = f"../leap/musr_efficiently/{model}/musr_efficiently_mistakes.jsonl"
    wrong_file_path = f"../leap/musr_efficiently/{model}/musr_efficiently_wrong_predictions.jsonl"
    wrong_entries = []

    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as src:
            data = list(map(json.loads, src))
    else:
        print(f"File not found: {file_path}")
        continue

    for idx, entry in enumerate(data):
        # The 'model_answer' field of evaluate_response's first result is used
        # as the parsed answer. NOTE(review): ta[0] is passed as the example
        # for every entry -- presumably only a placeholder; confirm.
        gt_eval = ta.evaluate_response([entry['answer']], ta[0])
        gt = gt_eval[0]['model_answer']

        for i, mo in enumerate(entry['model_outputs']):
            pred_eval = ta.evaluate_response([mo], ta[0])
            pred = pred_eval[0]['model_answer']

            if pred != gt:
                # Record the first mismatch only, then move to the next entry.
                wrong_entries.append(dict(
                    idx=idx,
                    entry=entry,
                    gt=gt,
                    model_index=i,
                    model_output=mo,
                    pred=pred,
                ))
                break

    if not wrong_entries:
        print("No wrong entries found.")
    else:
        with open(wrong_file_path, 'w', encoding='utf-8') as out:
            out.writelines(
                json.dumps(record, ensure_ascii=False) + "\n"
                for record in wrong_entries
            )
        print(f"Saved wrong predictions to {wrong_file_path}")


Saved wrong predictions to ../leap/musr_efficiently/llama/musr_efficiently_wrong_predictions.jsonl
