In [None]:
import sys
import os
from pathlib import Path
import json 
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt 

from musr import MuSRDataset

from tqdm import tqdm

sys.path.append('../')  

from utils import * 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Locations of the three MuSR task files, relative to this notebook.
mm_path, ta_path, op_path = (
    '../data/musr/murder_mystery.json',
    '../data/musr/team_allocation.json',
    '../data/musr/object_placements.json',
)

# One MuSRDataset per task: murder mysteries (mm), team allocation (ta) and
# object placements (op). Later cells use `ta` and `op` for scoring.
mm, ta, op = (MuSRDataset(task_path) for task_path in (mm_path, ta_path, op_path))

In [5]:
# Greedy few-shot accuracy (in percent) of the GPT models on two MuSR tasks.
# Every result file is JSONL: each record stores the dataset entry under
# 'entry' and the generated answers under 'model_outputs'.
greedy_tasks = [
    # (printed heading, dataset used for scoring, task name used in the paths)
    ("Team allocation", ta, "musr_efficiently"),
    ("Object Placements", op, "musr_location"),
]

for task_label, task_dataset, task_name in greedy_tasks:
    print(task_label)
    for model in ['gpt-4o-mini', 'gpt-4o']:
        file_path = f"../result/{task_name}/{model}/{task_name}_few_greedy.jsonl"
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            # Blank lines are skipped so they do not crash json.loads.
            data = [json.loads(line) for line in f if line.strip()]
        if not data:
            # Avoids a ZeroDivisionError further down on an empty file.
            print(f"No records in: {file_path}")
            continue

        # One correct-counter per output position. The list grows on demand, so
        # a record with more than one output no longer raises IndexError.
        idx_acc = []
        for entry in data:
            for idx, model_output in enumerate(entry['model_outputs']):
                if idx == len(idx_acc):
                    idx_acc.append(0)
                metrics = task_dataset.evaluate_response([model_output], entry['entry'])
                idx_acc[idx] += metrics[0]['correct']
        if not idx_acc:
            print(f"No model outputs in: {file_path}")
            continue

        # Mean accuracy over output positions; with a single (greedy) output
        # per record this is exactly that output's accuracy.
        total = 0
        for ac in idx_acc:
            total += ac / len(data)
        total = total / len(idx_acc) * 100
        print(f"{model} {total:.1f}")

Team allocation
gpt-4o-mini 79.6
gpt-4o 89.6
Object Placements
gpt-4o-mini 62.1
gpt-4o 73.4


In [5]:
# Mean accuracy over the sampled generations of the GPT models, for zero-shot
# and few-shot prompting. Every result file is JSONL: each record stores the
# dataset entry under 'entry' and the sampled answers under 'model_outputs'.
sampled_gpt_tasks = [
    # (printed heading, dataset used for scoring, task name used in the paths)
    ("Team allocation", ta, "musr_efficiently"),
    ("Object Placements", op, "musr_location"),
]

for task_label, task_dataset, task_name in sampled_gpt_tasks:
    print(task_label)
    for model in ['gpt-4o-mini', 'gpt-4o']:
        for shot_type in ['zero', 'few']:
            file_path = f"../result_gpt/{task_name}/{model}/{task_name}_{shot_type}.jsonl"
            if not os.path.exists(file_path):
                print(f"File not found: {file_path}")
                continue
            with open(file_path, 'r', encoding='utf-8') as f:
                # Blank lines are skipped so they do not crash json.loads.
                data = [json.loads(line) for line in f if line.strip()]
            if not data:
                # Avoids a ZeroDivisionError further down on an empty file.
                print(f"No records in: {file_path}")
                continue

            # One correct-counter per sample position. The list is sized from
            # the data instead of assuming exactly five samples per record.
            idx_acc = []
            for entry in data:
                for idx, model_output in enumerate(entry['model_outputs']):
                    if idx == len(idx_acc):
                        idx_acc.append(0)
                    metrics = task_dataset.evaluate_response([model_output], entry['entry'])
                    idx_acc[idx] += metrics[0]['correct']
            if not idx_acc:
                print(f"No model outputs in: {file_path}")
                continue

            # Sum of per-position accuracies, then averaged over the positions
            # actually observed (five for the files this cell was run on).
            total = 0
            for ac in idx_acc:
                total += ac / len(data)

            print(f"{model} {shot_type} {total / len(idx_acc):.4f}")

Team allocation
gpt-4o-mini zero 0.5624
gpt-4o-mini few 0.7696
gpt-4o zero 0.6664
gpt-4o few 0.8696
Object Placements
gpt-4o-mini zero 0.5813
gpt-4o-mini few 0.5938
gpt-4o zero 0.6172
gpt-4o few 0.6969


In [11]:
# Mean accuracy over the sampled generations of the llama model, for zero-shot
# and few-shot prompting. These result files keep the generations under
# entry['resps'][0] and do not embed the dataset entry, so record i is scored
# against dataset[i].
# NOTE(review): this assumes the result file is in the same order as the
# dataset and is not longer than it - confirm against the generation script.
sampled_llama_tasks = [
    # (printed heading, dataset used for scoring, task name used in the paths)
    ("Team allocation", ta, "musr_efficiently"),
    ("Object Placements", op, "musr_location"),
]

for task_label, task_dataset, task_name in sampled_llama_tasks:
    print(task_label)
    for model in ['llama']:
        for shot_type in ['zero', 'few']:
            # The model name is interpolated rather than hard-coded so that adding
            # a model to the list above reads that model's own file.
            file_path = f"../result/{task_name}/{model}/{task_name}_{shot_type}.jsonl"
            if not os.path.exists(file_path):
                print(f"File not found: {file_path}")
                continue
            with open(file_path, 'r', encoding='utf-8') as f:
                # Blank lines are skipped so they do not crash json.loads.
                data = [json.loads(line) for line in f if line.strip()]
            if not data:
                # Avoids a ZeroDivisionError further down on an empty file.
                print(f"No records in: {file_path}")
                continue

            # One correct-counter per sample position. The list is sized from
            # the data instead of assuming exactly five samples per record.
            idx_acc = []
            for test_idx, entry in enumerate(data):
                for idx, model_output in enumerate(entry['resps'][0]):
                    if idx == len(idx_acc):
                        idx_acc.append(0)
                    metrics = task_dataset.evaluate_response([model_output], task_dataset[test_idx])
                    idx_acc[idx] += metrics[0]['correct']
            if not idx_acc:
                print(f"No model outputs in: {file_path}")
                continue

            # Sum of per-position accuracies, then averaged over the positions
            # actually observed (five for the files this cell was run on).
            total = 0
            for ac in idx_acc:
                total += ac / len(data)

            print(f"{model} {shot_type} {total / len(idx_acc):.4f}")

Team allocation
llama zero 0.4304
llama few 0.6480
Object Placements
llama zero 0.5062
llama few 0.5328


In [3]:
def update_likelihoods_with_is_correct(file_path, test_data, eval_func):
    """Add an ``is_correct`` field to every candidate in a likelihoods JSON file.

    The file must contain a JSON list of runs, each run being a list of
    candidate dicts with a ``model_output`` key. Candidates at the same
    position across runs are treated as one problem, and the ``id`` of the
    first run's candidate selects the entry of ``test_data`` to score against.
    The file is rewritten in place with the added field.

    Args:
        file_path: Path of the JSON file to read and overwrite.
        test_data: Indexable collection supporting ``len()``; ``test_data[id]``
            is passed to ``eval_func`` as the reference entry.
        eval_func: Callable invoked as ``eval_func([model_output], entry)``.
            Its return value must be indexable, with ``result[0]['correct']``
            holding the value stored as ``is_correct``.

    Returns:
        None. A missing file only prints an error message.
    """
    if not os.path.exists(file_path):
        print(f"Error: JSON file not found at {file_path}")
        return
    with open(file_path, "r", encoding="utf-8") as f:
        json_data = json.load(f)

    # Transpose runs -> problems. zip() stops at the shortest run, so any
    # surplus candidates of longer runs are left without 'is_correct'.
    problem_groups = list(zip(*json_data))

    for problem_list in tqdm(problem_groups, desc="Updating likelihoods"):
        idx = problem_list[0].get("id")
        # Reject missing, non-integer, negative and too-large ids. A negative
        # id would otherwise index test_data from the end and silently score
        # the candidates against the wrong entry.
        if not isinstance(idx, int) or not 0 <= idx < len(test_data):
            print(f"Warning: Invalid or missing id in candidate: {problem_list[0]}")
            continue

        test_entry = test_data[idx]
        for cl in problem_list:
            metrics = eval_func([cl['model_output']], test_entry)
            cl['is_correct'] = metrics[0]['correct']

    # Write to a sibling temp file and swap it in, so a failure while dumping
    # (e.g. a non-serialisable value) cannot leave the input file truncated.
    tmp_path = f"{file_path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(json_data, f, indent=4, ensure_ascii=False)
        os.replace(tmp_path, file_path)
    finally:
        # Still present only if the dump or the swap failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print(f"Updated predictions saved at: {file_path}")

In [4]:
# Annotate the few-shot likelihood files of every model with 'is_correct',
# for both tasks. Each file is rewritten in place by
# update_likelihoods_with_is_correct.
models = ['gpt-4o-mini', 'gpt-4o', 'llama']
file_name = 'few/all_likelihoods.json'

likelihood_targets = [
    # (directory holding one sub-directory per model, dataset used for scoring)
    ("../likelihood_1B/musr_efficiently/", ta),
    ("../likelihood_1B/musr_location/", op),
]

for output_dir, task_dataset in likelihood_targets:
    for model in models:
        # os.path.join avoids the doubled '/' that plain string formatting
        # produced after the trailing slash of output_dir.
        file_path = os.path.join(output_dir, model, file_name)
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue
        update_likelihoods_with_is_correct(file_path, task_dataset, task_dataset.evaluate_response)


Updating likelihoods: 100%|██████████| 250/250 [00:00<00:00, 10981.35it/s]


Updated predictions saved at: ../likelihood_1B/musr_efficiently//gpt-4o-mini/few/all_likelihoods.json


Updating likelihoods: 100%|██████████| 250/250 [00:00<00:00, 11217.72it/s]


Updated predictions saved at: ../likelihood_1B/musr_efficiently//gpt-4o/few/all_likelihoods.json


Updating likelihoods: 100%|██████████| 250/250 [00:00<00:00, 306.37it/s]


Updated predictions saved at: ../likelihood_1B/musr_efficiently//llama/few/all_likelihoods.json


Updating likelihoods: 100%|██████████| 256/256 [00:00<00:00, 10270.62it/s]


Updated predictions saved at: ../likelihood_1B/musr_location//gpt-4o-mini/few/all_likelihoods.json


Updating likelihoods: 100%|██████████| 256/256 [00:00<00:00, 14643.80it/s]


Updated predictions saved at: ../likelihood_1B/musr_location//gpt-4o/few/all_likelihoods.json


Updating likelihoods: 100%|██████████| 256/256 [00:00<00:00, 1071.01it/s]


Updated predictions saved at: ../likelihood_1B/musr_location//llama/few/all_likelihoods.json


In [7]:
# Per-run accuracy and mean accuracy for the object-placement likelihood
# files, read from the 'is_correct' field of every candidate.
# NOTE(review): this reads from ../likelihood/ whereas the annotating cell
# writes to ../likelihood_1B/ - confirm these files already carry
# 'is_correct', otherwise the lookup below raises KeyError.
output_dir = "../likelihood/musr_location/"
file_name = 'few/all_likelihoods.json'


for model in ['gpt-4o-mini', 'gpt-4o', 'llama']:
    file_path = os.path.join(output_dir, model, file_name)
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    with open(file_path, "r", encoding="utf-8") as f:
        json_data = json.load(f)

    # Transpose runs -> problems; every group holds one candidate per run.
    problem_groups = list(zip(*json_data))
    if not problem_groups:
        # Avoids a ZeroDivisionError below on an empty file.
        print(f"No candidates in: {file_path}")
        continue

    # One correct-counter per run, sized from the file instead of assuming
    # exactly five runs.
    idx_acc = [0] * len(json_data)
    # The label reflects that this loop only reads the file.
    for problem_list in tqdm(problem_groups, desc="Aggregating accuracy"):
        for idx, cl in enumerate(problem_list):
            idx_acc[idx] += cl['is_correct']

    # Print each run's accuracy, then the mean over all runs.
    total = 0
    for ac in idx_acc:
        acc = ac / len(problem_groups)
        total += acc
        print(acc)
    print(total / len(idx_acc))

Updating likelihoods: 100%|██████████| 256/256 [00:00<00:00, 257060.53it/s]


0.6015625
0.6015625
0.55078125
0.58984375
0.625
0.59375


Updating likelihoods: 100%|██████████| 256/256 [00:00<00:00, 165064.08it/s]


0.71875
0.70703125
0.67578125
0.69921875
0.68359375
0.696875


Updating likelihoods: 100%|██████████| 256/256 [00:00<00:00, 160715.73it/s]

0.51171875
0.46875
0.49609375
0.53515625
0.515625
0.50546875



