In [None]:
import os
import json
from tqdm import tqdm 
import sys 

from dotenv import load_dotenv
from openai import OpenAI

sys.path.append('../')  

from gpqa.gpqa_utils import * 

from math500.math_utils import * 
from math500.parser import *
from math500.grader import * 

from mmlu_pro.mmlu_utils import * 

from hotpotqa.hotpotqa_utils import *

from drop.drop_utils import *

from utils import load_model_outputs

load_dotenv()

client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

def construct_prompt(args): 
    k = 5
    if args.task != 'mmlu_pro': 
        output_res_path = f"../result/{args.task}/{args.model}/{args.task}_{args.shot_type}.jsonl"
        res = load_model_outputs(output_res_path)

        test_time_path = f"../test_time_result/{args.task}/{args.model}/{args.task}_{args.shot_type}.jsonl"
        test_res = load_model_outputs(test_time_path)

        
    system_prompt = ""
    start_prompt = "I have generated the following responses to the question: "
    end_prompt = """\n\nEvaluate these responses.\nSelect the most consistent response based on majority consensus.\nStart your answer with "The most consistent response is Response X" (without quotes)."""

    samples = []
    if args.task == "math500":
        for idx, r in tqdm(enumerate(res)):
            entry = r.get('entry', {})
            question = entry.get('problem', '')
            model_outputs = r.get('model_outputs', [])
            test_outputs = test_res[idx].get('model_outputs', [])
            additional = test_outputs[:k]  


            all_outputs = model_outputs + additional

            user_prompt = start_prompt + f"{question}\n\n"

            for i, output in enumerate(all_outputs):
                user_prompt += f"Response {i}: {output}\n"

            user_prompt += end_prompt

            message = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
            sample = {"idx": idx, "prompt": message, "entry": entry}
            samples.append(sample)
        
    
    
    elif args.task == "gpqa": 
        def format_example(example) -> str:
            prompt = f"Question: {example[0]}"
            prompt += f"\nChoices:\n(A) {example[1]}\n(B) {example[2]}\n(C) {example[3]}\n(D) {example[4]}"
            return prompt
        
        for idx, r in tqdm(enumerate(res)):
            entry = r.get('entry', {})
            model_outputs = r.get('model_outputs', [])
            test_outputs = test_res[idx].get('model_outputs', [])
            additional = test_outputs[:k]  

            all_outputs = model_outputs + additional

            question = format_example(entry)

            user_prompt = start_prompt + f"{question}\n\n"
            for i, output in enumerate(all_outputs):
                user_prompt += f"Response {i}: {output}\n"

            user_prompt += end_prompt

            message = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
            sample = {"idx": idx, "prompt": message, "entry": entry}
            samples.append(sample)

        
    elif args.task == "musr_efficiently":
        for idx, r in tqdm(enumerate(res)):
            entry = r.get('entry', {})
            model_outputs = r.get('model_outputs', [])

            test_outputs = test_res[idx].get('model_outputs', [])
            additional = test_outputs[:k]  

            all_outputs = model_outputs + additional

            question = entry['question'].strip()
            context = entry['context'].strip()
            choices = entry['choices']['text']
            

            labels = ['A', 'B', 'C', 'D', 'E', 'F'][:len(choices)]
            choice_str = '\n'.join([f'{labels[idx]}: {choices[idx]}' for idx in range(len(choices))])
            original_question_part = f"{context}\n\n{question}\n\n{choice_str}"
                        
            user_prompt = start_prompt + f"{original_question_part}\n\n" 
            for i, output in enumerate(all_outputs):
                user_prompt += f"Response {i}: {output}\n"

            user_prompt += end_prompt
            
            message = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
            sample = {"idx": idx, "prompt": message, "entry": entry}
            samples.append(sample)
    else: 
        return None

    return samples

def generate_model_output(model: str, prompt: str, temperature: float = 1.0) -> str:        
    responses = client.chat.completions.create(
        model=model,
        messages=prompt,
        n=1,
        temperature=temperature,
    )

    outputs = [choice.message.content for choice in responses.choices]
    return outputs

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def main(config):
    save_path = f"{config.output_dir}/{config.task}/{config.model}/{config.task}_{config.shot_type}_5.jsonl"
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    samples = construct_prompt(config)
    if config.num_examples != -1: 
        samples = samples[:config.num_examples]

    if samples:
        print(f"Model: {config.model} Task: {config.task}, Shot: {config.shot_type}")
        print(samples[0].keys())
        print("-" * 50)
        prompt = samples[0]["prompt"]
        for message in prompt:
            print(f"Role:\n{message['role']}")
            print(f"Content:\n{message['content']}")
            print("-" * 50)
    else:
        print(f"No samples found for Task: {config.task}, Shot: {config.shot_type}")

    
    if os.path.exists(save_path):
        with open(save_path, 'r', encoding='utf-8') as f:

            existing_data = {json.loads(line)['prompt'][1]['content'] for line in f}  

    else:
        existing_data = set()  


    if samples:

        with open(save_path, "a", encoding='utf-8') as f:  

            for sample in tqdm(samples, total=len(samples)):
                if sample['prompt'][1]['content'] in existing_data:  
                    continue
                try:
                    model_outputs = generate_model_output(config.model, sample["prompt"], config.temperature)
                    sample["prompt_output"] = model_outputs
                    json.dump(sample, f)
                    f.write("\n")
                except Exception as e:
                    print(f"Error processing sample {sample['idx']}: {e}")
                    break

        print(f"Results saved to {save_path}")

In [None]:
class Config:
    def __init__(self):
        self.model = "gpt-4o-mini"  
        self.task = "drop"  # "math500", "mmlu_pro", "gpqa", "drop", "hotpotqa"
        self.shot_type = "few"  
        self.output_dir = "test_time_usc"
        self.num_examples = -1
        self.temperature = 0.0

In [None]:
tasks = ['math500', 'gpqa', 'musr_efficiently']

shots = ["few"]
models = ['gpt-4o-mini']

config = Config()

In [None]:

for model in models:
    for task in tasks:
        for shot in shots:
            config.model = model
            config.task = task
            config.shot_type = shot
            print("=" * 50)
            main(config)



500it [00:00, 53764.86it/s]


Model: gpt-4o-mini Task: math500, Shot: few
dict_keys(['idx', 'prompt', 'entry'])
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
I have generated the following responses to the question: Convert the point $(0,3)$ in rectangular coordinates to polar coordinates.  Enter your answer in the form $(r,\theta),$ where $r > 0$ and $0 \le \theta < 2 \pi.$

Response 0: To convert the point \((0, 3)\) from rectangular coordinates to polar coordinates, we need to determine the values of \(r\) and \(\theta\).

1. **Calculate \(r\)**:
   The polar coordinate \(r\) is the distance from the origin to the point \((x, y)\). It can be calculated using the formula:
   \[
   r = \sqrt{x^2 + y^2}
   \]
   For our point \((0, 3)\):
   \[
   r = \sqrt{0^2 + 3^2} = \sqrt{0 + 9} = \sqrt{9} = 3
   \]

2. **Calculate \(\theta\)**:
   The angle \(\theta\) is calculated from the positive x-axis. Since the point \((0, 3

100%|██████████| 500/500 [08:15<00:00,  1.01it/s]


Results saved to test_time_usc/math500/gpt-4o-mini/math500_few_5.jsonl


198it [00:00, 46446.99it/s]


Model: gpt-4o-mini Task: gpqa, Shot: few
dict_keys(['idx', 'prompt', 'entry'])
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
I have generated the following responses to the question: Question: Two quantum states with energies E1 and E2 have a lifetime of 10^-9 sec and 10^-8 sec, respectively. We want to clearly distinguish these two energy levels. Which one of the following options could be their energy difference so that they can be clearly resolved?

Choices:
(A) 10^-9 eV
(B) 10^-11 eV
(C) 10^-8 eV

(D) 10^-4 eV

Response 0: To determine the energy difference that allows us to clearly resolve two quantum states with lifetimes of \(10^{-9}\) sec and \(10^{-8}\) sec, we need to relate the energy and the uncertainty in energy defined by the Heisenberg uncertainty principle.

The uncertainty principle states that:

\[
\Delta E \cdot \Delta t \geq \frac{\hbar}{2}
\]

where \(\Delta E\) is th

100%|██████████| 198/198 [02:55<00:00,  1.13it/s]


Results saved to test_time_usc/gpqa/gpt-4o-mini/gpqa_few_5.jsonl


250it [00:00, 25654.49it/s]


Model: gpt-4o-mini Task: musr_efficiently, Shot: few
dict_keys(['idx', 'prompt', 'entry'])
--------------------------------------------------
Role:
system
Content:

--------------------------------------------------
Role:
user
Content:
I have generated the following responses to the question: Amidst the vibrant chaos of the Redwood Zoo, nestled in the heart of the city's sprawling jungle, the task of assigning roles was a crucial cog in the machinery of its operation. As the manager, the responsibility of allocating Olivia, Alex, and Mia to the positions of Animal Caretaker and Exhibit Cleaner presented an intriguing conundrum. Each individual, with their distinct personalities and skill sets, added a layer of complexity to this assignment puzzle.

Let's begin with Alex, the tall lad with bright eyes, whose history with the mighty beast of the animal kingdom, lacked a certain comfort. The lad, known to express an almost innate unease around animals larger than him, fell short of the pr

100%|██████████| 250/250 [04:30<00:00,  1.08s/it]

Results saved to test_time_usc/musr_efficiently/gpt-4o-mini/musr_efficiently_few_5.jsonl





In [None]:
def extract_user_number(response_text):
    primary_pattern = r'\[?\s*The most consistent response is Response\s*(\d+)\s*\]?'
    primary_matches = re.findall(primary_pattern, response_text, flags=re.IGNORECASE)
    if primary_matches:

        return int(primary_matches[0])
    

    sentences = re.split(r'[.\n]', response_text.strip())

    sentences = [s.strip() for s in sentences if s.strip()]
    if sentences:
        last_sentence = sentences[-1]
        fallback_matches = re.findall(r'(\d+)', last_sentence)
        if fallback_matches:

            return int(fallback_matches[-1])
    

    return -1

In [None]:
overall_summary = []
for model in models:
    for k in [5, 10]:
        for task in tasks:
            output_res_path = f"../result/{task}/{model}/{task}_{config.shot_type}.jsonl"
            res = load_model_outputs(output_res_path)

            test_time_path = f"../test_time_result/{task}/{model}/{task}_{config.shot_type}.jsonl"
            test_res = load_model_outputs(test_time_path)

            baseline_count = len(res[0].get('model_outputs', []))

            user_counter = Counter()
            total_entries = len(res)

            usc_path = f"{config.output_dir}/{task}/{model}/{task}_{config.shot_type}_{k}.jsonl"

            usc = load_model_outputs(usc_path)


            for entry in tqdm(usc, desc=f"Counting {model}-{task}"):
                raw = entry.get("prompt_output")
                if isinstance(raw, list):
                    raw = raw[0] if raw else None
                num = extract_user_number(raw) if raw else None
                user_counter[num if isinstance(num, int) else "None"] += 1

            for user, cnt in user_counter.items():
                pct = cnt / total_entries * 100
                overall_summary.append({
                    "Model": model,
                    "Task": task,
                    "Extracted User": user,
                    "Count": cnt,
                    "Percentage": round(pct, 2)
                })

            
            out_entries = []
            for i, entry in enumerate(res):
                raw = usc[i].get("prompt_output")
                if isinstance(raw, list):
                    raw = raw[0] if raw else None
                num = extract_user_number(raw) if raw else None
                if not isinstance(num, int):
                    num = 0

                if num < baseline_count:
                    chosen = entry['model_outputs'][num]
                else:
                    tt_idx = num - baseline_count
                    tt_outputs = test_res[i].get('model_outputs', [])
                    if 0 <= tt_idx < len(tt_outputs):
                        chosen = tt_outputs[tt_idx]
                    else:
                        chosen = entry['model_outputs'][0]  # fallback

                usc[i][f'test_time_usc_{k}'] = chosen

            with open(usc_path, "w", encoding="utf-8") as out_file:
                for item in usc:
                    out_file.write(json.dumps(item, ensure_ascii=False) + "\n")

Counting gpt-4o-mini-math500: 100%|██████████| 500/500 [00:00<00:00, 201959.94it/s]
Counting gpt-4o-mini-gpqa: 100%|██████████| 198/198 [00:00<00:00, 343312.19it/s]
Counting gpt-4o-mini-musr_efficiently: 100%|██████████| 250/250 [00:00<00:00, 344473.06it/s]


In [None]:
df_overall = pd.DataFrame(overall_summary)

for model in df_overall["Model"].unique():
    model_df = df_overall[df_overall["Model"] == model]
    
    pivot_table = model_df.pivot_table(
        index="Extracted User",      
        columns="Task",              
        values=["Percentage"], 
        fill_value=0           
    )
    
    print(f"\nModel: {model}")
    display(pivot_table)

In [12]:
keys_to_eval = ['test_time_usc_5', 'test_time_usc_10']

In [None]:
from math500.math_utils import * 
from math500.parser import *
from math500.grader import * 

for model in models:
    for k in [5, 10]: 
        usc_path = f"{config.output_dir}/math500/{model}/math500_{config.shot_type}_{k}.jsonl"

        data = load_model_outputs(usc_path)

        scores = {k: [] for k in keys_to_eval}

        for entry in data:
            idx = entry["idx"]
            

            _, gt = parse_ground_truth(entry['entry'], "math")

            model_outputs = entry.get('model_outputs', [])
            
            for key in keys_to_eval:
                if key not in entry:
                    continue


                pred = extract_answer(entry[key], "math")
                pred = strip_string(pred)


                try:
                    result = math_equal_process((idx, pred, gt))

                    if not result :
                        result = process_results(gt, [entry[key]])
                        if not result:
                            pred = extract_answer(pred, "math")
                            result = math_equal_process((None, pred, gt))

                    scores[key].append(result)

                except TimeoutError:
                    scores[key].append(False)
                except Exception as error:
                    print(f"Error while processing {key} for idx={idx}: {error}")
                    scores[key].append(False)


        print(f"\n===== Evaluation Results for model={model}, shot={config.shot_type} =====")
        for key in keys_to_eval:

            if len(scores[key]) == 0:
                print(f"{key} -> No data / Not found in entries")
                continue

            acc = sum(scores[key]) / len(scores[key])
            print(f"{key} -> Accuracy: {acc:.4f}")
        print("-" * 50)



===== Evaluation Results for model=gpt-4o-mini, shot=few =====
test_time_usc_5 -> Accuracy: 0.7980
test_time_usc_10 -> No data / Not found in entries
--------------------------------------------------

===== Evaluation Results for model=gpt-4o-mini, shot=few =====
test_time_usc_5 -> No data / Not found in entries
test_time_usc_10 -> Accuracy: 0.7960
--------------------------------------------------


In [None]:
from gpqa.gpqa_utils import * 

examples = load_examples("../data/gpqa/gpqa_diamond.csv", seed=0)

for model in models:
    for k in [5, 10]: 
        file_path = f"{config.output_dir}/gpqa/{model}/gpqa_few_{k}.jsonl"
        data = load_model_outputs(file_path)

        scores = {k: 0 for k in keys_to_eval}


        total_data_len = len(data)
        
        for entry, example in zip(data, examples):
            correct_index = example.correct_index  


            model_outputs = entry.get('model_outputs', [])
            results_info = entry.get('results', [])

        
            for key in keys_to_eval:

                if key not in entry:
                    continue


                pred = parse_sampled_answer(entry[key])
                

                if pred is None:
                    is_correct = False
                else:

                    is_correct = (LETTER_TO_INDEX[pred] == correct_index)


                scores[key] += int(is_correct)
        

        print(f"\n===== Evaluation Results for model={model}, shot={config.shot_type} =====")
        for key in keys_to_eval:
            acc = scores[key] / total_data_len if total_data_len else 0
            print(f"{key} -> Accuracy: {acc:.4f}")

        print("-" * 50)



===== Evaluation Results for model=gpt-4o-mini, shot=few =====
test_time_usc_5 -> Accuracy: 0.4141
test_time_usc_10 -> Accuracy: 0.0000
--------------------------------------------------

===== Evaluation Results for model=gpt-4o-mini, shot=few =====
test_time_usc_5 -> Accuracy: 0.0000
test_time_usc_10 -> Accuracy: 0.4242
--------------------------------------------------


In [15]:
from musr.musr import MuSRDataset

mm_path = '../data/musr/murder_mystery.json'
mm = MuSRDataset(mm_path)

ta_path = '../data/musr/team_allocation.json'
ta = MuSRDataset(ta_path)

op_path = '../data/musr/object_placements.json'
op = MuSRDataset(op_path)

In [None]:
for model in models:
    print(model)
    for k in [5, 10]:
        file_path = f"{config.output_dir}/musr_efficiently/{model}/musr_efficiently_few_{k}.jsonl"
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue
        else:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = [json.loads(line) for line in f]
        
        
        preds = {k: [] for k in keys_to_eval}


        for test_idx, entry in enumerate(data):

            model_outputs = entry.get('model_outputs', [])
            results_info = entry.get('results', [])

            
            for k in keys_to_eval:


                if k in ("total_max_prob_diff_output", "max_prob_diff_output"):
                    continue
                if k in entry:  

                    preds[k].append(entry[k])
                else:

                    preds[k].append(None)

        
        total_data_len = len(data)
        

        scores = {k: 0 for k in keys_to_eval}
        

        for i, entry in enumerate(data):


            if 'entry' not in entry:
                continue
            
            for k in keys_to_eval:

                pred_answer = preds[k][i]
                if pred_answer is not None:
                    metrics = ta.evaluate_response([pred_answer], ta[i])
                    if metrics and metrics[0]['correct']:
                        scores[k] += 1


        print(f"\n===== Evaluation Results for model={model}, shot={config.shot_type} =====")
        for k in keys_to_eval:

            if len(preds[k]) == 0:
                print(f"{k}: No entries found, skip.")
                continue

            acc = scores[k] / total_data_len if total_data_len else 0
            print(f"{k} -> Accuracy: {acc:.4f}")

        print("-" * 50)

gpt-4o-mini

===== Evaluation Results for model=gpt-4o-mini, shot=few =====
test_time_usc_5 -> Accuracy: 0.7960
test_time_usc_10 -> Accuracy: 0.0000
--------------------------------------------------

===== Evaluation Results for model=gpt-4o-mini, shot=few =====
test_time_usc_5 -> Accuracy: 0.0000
test_time_usc_10 -> Accuracy: 0.7720
--------------------------------------------------
