In [None]:
import sys
import os
from pathlib import Path
import json 
import pandas as pd
import numpy as np
from datasets import load_dataset

from drop_utils import *

sys.path.append('../')  

from utils import * 

from collections import Counter

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import random

# Seed the stdlib RNG so the optional sub-sampling below is repeatable.
random.seed(0)

# Read the parquet file and turn it into a list of per-row dicts in one step,
# so re-running this cell never sees `dataset` in a half-converted state.
dataset = pd.read_parquet(
    "../data/drop/drop_sub.parquet", engine="pyarrow"
).to_dict(orient="records")
# Optional steps, currently disabled:
# dataset = random.sample(dataset, k=500)
# dataset = convert_ndarray_to_list(dataset)

# Decode prompt.json explicitly as UTF-8 (the same encoding the result files
# in this notebook are read with) instead of relying on the platform default.
with open("prompt.json", 'r', encoding='utf-8') as f:
    fewshot = json.load(f)

In [None]:
# Score the stored generations of each GPT model on DROP.
# For every sampled output the best EM / F1 over all gold answers is kept;
# the reported number is the mean over samples of the per-sample mean.
for model in ['gpt-4o', 'gpt-4o-mini']:
    print(model)
    for shot_type in ['zero', 'few']:
        file_path = f"../result/drop/{model}/drop_{shot_type}.jsonl"
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue

        with open(file_path, 'r', encoding='utf-8') as f:
            # Ignore blank lines (e.g. a trailing newline) instead of crashing in json.loads.
            data = [json.loads(line) for line in f if line.strip()]

        # em[i] / f1[i] hold the scores of the i-th sampled output of every entry.
        # The lists grow on demand, so the number of samples per entry is not
        # hard-coded (a fixed size of 5 raised IndexError for more samples and
        # produced nan for fewer).
        em = []
        f1 = []

        for entry in data:
            golds = get_answers(entry['entry'])
            for idx, out in enumerate(entry['model_outputs']):
                if idx == len(em):
                    em.append([])
                    f1.append([])

                pred = extract_answer(out)
                max_em = 0
                max_f1 = 0
                for gold_answer in golds:
                    exact_match, f1_score = get_metrics(pred, gold_answer)
                    # Only gold answers whose first element is non-blank may raise the score.
                    if gold_answer[0].strip():
                        max_em = max(max_em, exact_match)
                        max_f1 = max(max_f1, f1_score)
                em[idx].append(max_em)
                f1[idx].append(max_f1)

        if not em:
            # Nothing to average; avoid np.mean([]) which yields nan plus a warning.
            print(f"No model outputs found in: {file_path}")
            continue

        avg_em = np.mean([np.mean(scores) for scores in em])
        avg_f1 = np.mean([np.mean(scores) for scores in f1])

        print("{} Average EM: {:.4f} Average F1: {:.4f}".format(shot_type, avg_em, avg_f1))


gpt-4o
zero Average EM: 0.7512 Average F1: 0.8528
few Average EM: 0.8064 Average F1: 0.8920
gpt-4o-mini
zero Average EM: 0.7764 Average F1: 0.8562
few Average EM: 0.7684 Average F1: 0.8312


In [None]:
# Score the stored llama generations on DROP.
# Besides the per-sample average EM / F1, track for every entry whether at
# least one of its sampled responses was correct ("any correct").
print("llama")
for shot_type in ['zero', 'few']:
    file_path = f"../result/drop/llama/drop_{shot_type}.jsonl"
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    with open(file_path, 'r', encoding='utf-8') as f:
        # Ignore blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        data = [json.loads(line) for line in f if line.strip()]

    # em[i] / f1[i] hold the scores of the i-th sampled response of every entry.
    # The lists grow on demand, so the number of samples per entry is not
    # hard-coded (a fixed size of 5 raised IndexError for more samples and
    # produced nan for fewer).
    em = []
    f1 = []
    any_correct_em = []
    any_correct_f1 = []

    for entry in data:
        golds = get_answers(entry['doc'])
        # Per-entry flags: did ANY sampled response hit?
        any_em_hit = False
        any_f1_hit = False

        for idx, out in enumerate(entry['resps'][0]):
            if idx == len(em):
                em.append([])
                f1.append([])

            pred = extract_answer(out)
            max_em = 0
            max_f1 = 0
            for gold_answer in golds:
                exact_match, f1_score = get_metrics(pred, gold_answer)
                # Only gold answers whose first element is non-blank may raise the score.
                if gold_answer[0].strip():
                    max_em = max(max_em, exact_match)
                    max_f1 = max(max_f1, f1_score)

            em[idx].append(max_em)
            f1[idx].append(max_f1)

            if max_em == 1:
                any_em_hit = True
            # NOTE(review): "any correct F1" counts every response with a
            # non-zero F1 as a hit, i.e. any token overlap at all — confirm
            # this is the intended criterion.
            if max_f1 != 0:
                any_f1_hit = True

        any_correct_em.append(1 if any_em_hit else 0)
        any_correct_f1.append(1 if any_f1_hit else 0)

    if not em:
        # Nothing to average; avoid np.mean([]) which yields nan plus a warning.
        print(f"No model outputs found in: {file_path}")
        continue

    avg_em = np.mean([np.mean(scores) for scores in em])
    avg_f1 = np.mean([np.mean(scores) for scores in f1])
    avg_any_correct_em = np.mean(any_correct_em)
    avg_any_correct_f1 = np.mean(any_correct_f1)

    print("{} Average EM: {:.4f} Average F1: {:.4f}".format(shot_type, avg_em, avg_f1))
    print("{} Any Correct EM: {:.4f} Any Correct F1: {:.4f}".format(shot_type, avg_any_correct_em, avg_any_correct_f1))


llama
zero Average EM: 0.6044 Average F1: 0.6635
zero Any Correct EM: 0.8580 Any Correct F1: 0.9680
few Average EM: 0.6140 Average F1: 0.6730
few Any Correct EM: 0.8560 Any Correct F1: 0.9680


In [None]:
# Root folder under which add_pred looks up each subject's likelihood file.
output_dir = "../likelihood/drop/"

# Sub-paths (relative to output_dir) of the runs to annotate.
subjects = ["gpt-4o-mini/few", "gpt-4o/few", "llama/few"]

def add_pred(prob_type_filter, output_dir):
    """Annotate a stored likelihood file with predictions and correctness.

    Loads ``<output_dir>/<prob_type_filter>/all_likelihoods.json``, and for
    every record adds three keys before writing the file back in place:

    * ``pred``       -- ``extract_answer(record['model_output'])``
    * ``gt``         -- ``get_answers(record['answer'])``
    * ``is_correct`` -- ``(max_em, max_f1)``, the best exact-match and F1 over
      all gold answers whose first element is non-blank (stored by JSON as a
      two-element list).

    Args:
        prob_type_filter: Sub-path below ``output_dir`` selecting the run,
            e.g. ``"llama/few"``.
        output_dir: Root directory of the likelihood dumps; a trailing slash
            is tolerated.

    Returns:
        None. Prints an error and returns early when the file does not exist.
    """
    # os.path.join avoids the doubled separator the old f-string produced
    # when output_dir already ended with "/".
    likelihoods_file = os.path.join(output_dir, prob_type_filter, "all_likelihoods.json")
    if not os.path.exists(likelihoods_file):
        print(f"Error: {likelihoods_file} not found.")
        return
    with open(likelihoods_file, "r", encoding="utf-8") as f:
        likelihoods = json.load(f)

    # Assumes the JSON is a list of parallel lists of dict records, one inner
    # list per run and one position per problem -- TODO confirm against the
    # code that writes the file. zip(*...) regroups them per problem and stops
    # at the shortest inner list. The dicts are shared with `likelihoods`, so
    # mutating them here updates the structure that is written back.
    problem_groups = list(zip(*likelihoods))
    for problem_likelihoods in tqdm(problem_groups, desc="Processing problems"):
        for cl in problem_likelihoods:
            pred = extract_answer(cl['model_output'])
            cl['pred'] = pred

            gt = get_answers(cl['answer'])
            cl['gt'] = gt

            max_em = 0
            max_f1 = 0
            for gold in gt:
                exact_match, f1_score = get_metrics(pred, gold)
                # Only gold answers whose first element is non-blank may raise the score.
                if gold[0].strip():
                    max_em = max(max_em, exact_match)
                    max_f1 = max(max_f1, f1_score)

            cl['is_correct'] = (max_em, max_f1)

    # Serialise fully BEFORE opening the file for writing: if a value turns
    # out not to be JSON-serialisable, the existing file is left intact
    # instead of being truncated by a half-finished json.dump.
    serialized = json.dumps(likelihoods, indent=4)
    with open(likelihoods_file, "w", encoding="utf-8") as f:
        f.write(serialized)

    print(f"Updated file saved at: {likelihoods_file}")


In [4]:
# Annotate the likelihood file of each configured subject in turn.
for subject in subjects:
    add_pred(prob_type_filter=subject, output_dir=output_dir)

Processing problems: 100%|██████████| 500/500 [00:01<00:00, 415.23it/s]


Updated file saved at: ../likelihood_qwen/drop//llama/few/all_likelihoods.json
