In [None]:
import sys
import os
from pathlib import Path
import json 
from math_utils import *
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt 

from parser import *  
from grader import *
from tqdm import tqdm

sys.path.append('../')  

from utils import * 

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Score the GPT-4o / GPT-4o-mini MATH-500 generations (zero- and few-shot).
#
# For every question, each sampled output in entry['model_outputs'] is graded
# against the ground truth of the matching test problem. We then report:
#   * per-repetition accuracy and its mean over repetitions,
#   * the fraction of questions where *every* sample is correct,
#   * the fraction of questions where *at least one* sample is correct.
#
# NOTE(review): extract_answer / parse_ground_truth / math_equal_process /
# process_results come from the star imports at the top of the notebook; their
# exact semantics are not visible here.
test_path = "../data/math500/test.jsonl"
with open(test_path, 'r', encoding='utf-8') as f:
    test = [json.loads(line) for line in f]

for _type in ["zero", "few"]:
    for model in ['gpt-4o-mini', 'gpt-4o']:
        file_path = f"../result/math500/{model}/math500_{_type}.jsonl"
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue

        with open(file_path, 'r', encoding='utf-8') as f:
            data = [json.loads(line) for line in f]

        num_questions = len(data)

        # Derive the number of repetitions from the data instead of assuming 5,
        # so a run with a different sample count neither raises IndexError nor
        # silently skews the average.
        num_reps = max((len(entry['model_outputs']) for entry in data), default=0)

        # scores[i] collects the grading result of the i-th sample of each question.
        scores = [[] for _ in range(num_reps)]
        any_correct = []
        all_correct_count = 0

        for entry in data:
            # entry["idx"] indexes into the test set loaded above.
            idx = entry["idx"]
            _, gt = parse_ground_truth(test[idx], "math")

            sample_results = []
            for i, model_output in enumerate(entry['model_outputs']):
                pred = extract_answer(model_output, "math")

                # Grading cascade: compare the extracted answer first, then let
                # process_results grade the raw output, and finally retry after
                # running the extractor on the already-extracted answer.
                result = math_equal_process((None, pred, gt))
                if not result:
                    result = process_results(gt, [model_output])
                    if not result:
                        pred = extract_answer(pred, "math")
                        result = math_equal_process((None, pred, gt))

                sample_results.append(result)
                scores[i].append(result)

            any_correct.append(any(sample_results))
            # all([]) is True, so require at least one sample before counting
            # the question as "all correct".
            if sample_results and all(sample_results):
                all_correct_count += 1

        worse_case_acc = all_correct_count / num_questions if num_questions > 0 else 0.0

        # A question that lacks the i-th sample counts as incorrect for repetition i,
        # because the denominator is always the number of questions.
        rep_accuracies = [
            sum(rep_scores) / num_questions if num_questions > 0 else 0.0
            for rep_scores in scores
        ]
        avg_rep_acc = sum(rep_accuracies) / num_reps if num_reps > 0 else 0.0

        any_correct_acc = sum(any_correct) / num_questions if num_questions > 0 else 0.0

        print(f"Results for {_type} using model {model}:")
        print(f"  Worse-case (all correct in a single question): {worse_case_acc:.3f}")

        for i, acc_val in enumerate(rep_accuracies):
            print(f"  Repetition {i}: {acc_val:.3f}")

        print(f"  Average (across {num_reps} samples): {avg_rep_acc:.3f}")
        print(f"  Any-correct Accuracy: {any_correct_acc:.3f}\n")

In [None]:
# llama-3.1-8B-Instruct
# few,zero CoT
# do_sample=True, temperature=1, repeats=5
#
# Grades every sampled response of each question against the ground truth and
# prints per-repetition accuracy, the mean over repetitions, and the fraction
# of questions with at least one correct response.
#
# NOTE(review): result rows are matched to test problems by position (row k of
# the result file <-> row k of test.jsonl) — confirm the files are aligned.
test_path = "../data/math500/test.jsonl"
with open(test_path, 'r', encoding='utf-8') as f:
    test = [json.loads(line) for line in f]

for _type in ['few', 'zero']:

    file_path = f"../result/math500/llama/math500_{_type}.jsonl"
    # Same missing-file handling as the GPT scoring cell.
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    with open(file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]

    if not data:
        print(f"No entries in: {file_path}")
        continue

    # Use the real number of questions rather than a hard-coded 500.
    num_questions = len(data)
    # Number of repetitions; take the maximum so a longer entry cannot overflow idx_acc.
    n = max(len(entry['resps'][0]) for entry in data)

    any_correct = 0
    # idx_acc[r] holds the 0/1 correctness of repetition r for each question.
    idx_acc = [[] for _ in range(n)]

    for idx, entry in enumerate(data):
        _, gt = parse_ground_truth(test[idx], "math")

        flag = False  # becomes True once any response of this question is correct
        # A distinct loop variable keeps the question index `idx` intact.
        for rep_idx, resps in enumerate(entry['resps'][0]):
            pred = extract_answer(resps, "math")
            pred = strip_string(pred)

            # Grading cascade: extracted answer, then the raw response via
            # process_results, then a retry after re-running the extractor.
            result = math_equal_process((rep_idx, pred, gt))
            if not result:
                result = process_results(gt, [resps])
                if not result:
                    pred = extract_answer(pred, "math")
                    result = math_equal_process((None, pred, gt))
            idx_acc[rep_idx].append(int(result))

            if result:
                flag = True
        if flag:
            any_correct += 1

    total = 0
    for rep_idx, acc in enumerate(idx_acc):
        rep_acc = sum(acc) / num_questions
        print(f"{rep_idx}: ", rep_acc)
        total += rep_acc
    print(f"{_type} Avg: ", total / n if n else 0.0)
    print("Any_correct: ", any_correct / num_questions)

0:  0.416
1:  0.442
2:  0.458
3:  0.42
4:  0.41
few Avg:  0.42919999999999997
Any_correct:  0.672
0:  0.412
1:  0.446
2:  0.442
3:  0.45
4:  0.458
zero Avg:  0.44160000000000005
Any_correct:  0.69


In [None]:
def update_predictions_with_is_correct(file_path: str, data_path: str, max_problems: int = 500) -> None:
    """Grade stored candidates and write `pred` / `is_correct` back into the JSON file.

    The JSON file at ``file_path`` is expected to hold a list of candidate
    lists; position ``k`` of every candidate list is treated as an answer to
    problem ``k`` of the JSONL file at ``data_path``. Each candidate dict must
    contain ``model_output``; its ``id`` is set to the problem position and the
    keys ``pred`` and ``is_correct`` are added.

    Args:
        file_path: Path of the likelihood JSON file. It is overwritten in place.
        data_path: Path of the JSONL file with the reference problems.
        max_problems: Only the first ``max_problems`` candidates of each list
            are kept. The truncation is persisted to ``file_path``.

    Returns:
        None. Prints an error and returns early, leaving ``file_path``
        untouched, when an input file is missing or when ``data_path`` has
        fewer problems than there are candidates to grade.

    NOTE(review): parse_ground_truth / extract_answer / strip_string /
    math_equal_process / process_results come from the notebook's star
    imports; their exact semantics are not visible here.
    """
    if not os.path.exists(file_path):
        print(f"Error: JSON file not found at {file_path}")
        return
    with open(file_path, "r", encoding="utf-8") as f:
        json_data = json.load(f)

    json_data = [cand_list[:max_problems] for cand_list in json_data]

    if not os.path.exists(data_path):
        print(f"Error: jsonl file not found at {data_path}")
        return

    with open(data_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]

    # Transpose: one tuple per problem, holding that problem's candidates from
    # every list. zip stops at the shortest list, so candidates past that point
    # in longer lists are written back ungraded.
    problem_groups = list(zip(*json_data))

    # Fail before touching anything rather than raising IndexError half-way through.
    if len(data) < len(problem_groups):
        print(
            f"Error: {data_path} has {len(data)} problems but "
            f"{len(problem_groups)} are required by {file_path}"
        )
        return

    for problem_idx, problem_candidates in enumerate(
        tqdm(problem_groups, desc="Updating problems")
    ):
        _, gt = parse_ground_truth(data[problem_idx], "math")

        for cl in problem_candidates:
            # Normalise the stored id to the positional problem index.
            cl['id'] = problem_idx

            pred = extract_answer(cl['model_output'], "math")
            pred = strip_string(pred)
            cl['pred'] = pred

            # Grading cascade: extracted answer, then the raw output via
            # process_results, then a retry after re-running the extractor.
            result = math_equal_process((None, pred, gt))
            if not result:
                try:
                    result = process_results(gt, [cl['model_output']])
                    if not result:
                        pred = extract_answer(pred, "math")
                        result = math_equal_process((None, pred, gt))
                except AssertionError:
                    # Show the offending output and count the candidate as incorrect.
                    print(cl['model_output'])
                    result = 0

            cl['is_correct'] = result

    print(len(json_data))
    print(len(json_data[0]) if json_data else 0)

    # Write to a sibling temp file and swap it in, so a failure while
    # serialising cannot leave a truncated file in place of the input.
    tmp_path = f"{file_path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(json_data, f, indent=4, ensure_ascii=False)
        os.replace(tmp_path, file_path)
    finally:
        # Only present if the dump or the replace failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print(f"Updated predictions saved at: {file_path}")

In [None]:

# Re-grade the stored likelihood candidates of every listed model.
# update_predictions_with_is_correct rewrites each likelihood JSON in place.
output_dir = "../likelihood/math500"
file_name = 'few/all_likelihoods.json'
models = ['gpt-4o-mini']

# The reference problems are the same for every model.
data_path = "../data/math500/test.jsonl"

for model in models:
    file_path = "/".join((output_dir, model, file_name))
    update_predictions_with_is_correct(file_path, data_path)

Updating problems: 100%|██████████| 500/500 [00:33<00:00, 14.96it/s]


5
500
Updated predictions saved at: ../likelihood_diff/math500/gpt-4o-mini/few/all_likelihoods_2025.json
