In [1]:
import os
import json
from time import time
from faithfulness_eval_utils.human_in_the_loop_eval import evaluate_faithfulness_hil

## Actual Human-in-the-Loop (HITL) Evaluation

In [3]:
# Run the human-in-the-loop faithfulness evaluation for every dataset file of
# the selected model setups. Each `<data_path>/<model_setup>/<dataset>.json`
# is loaded and handed to `evaluate_faithfulness_hil`; files whose stats or
# results output already exists under `output/hitl/<model_setup>/` are skipped.

SAMPLE_COUNT = 25  # forwarded as `sample_count` to evaluate_faithfulness_hil

# Model setups (sub-directory names below `data_path`) to evaluate in this run.
model_to_evaluate = ["gpt4o_non_cot"]
# All setups, for reference:
# model_to_evaluate = ['mistral_7b_cot', 'mistral_7b_non_cot', 'gpt4o_cot', 'gpt4o_non_cot', 'deepseek_r1_32b_cot']

data_path = os.path.normpath("../inference/converted_outputs/")
# os.listdir yields entries in arbitrary order. Sort so that the run (and in
# particular which file seeds `ids_per_dataset`) is reproducible.
model_setup_dirs = sorted(os.listdir(data_path))

# k: dataset name, v: list of ids.
# The first evaluated file of a dataset stores the IDs returned by
# evaluate_faithfulness_hil; later files with the same dataset identifier
# receive those IDs as `pre_sampled_ids`.
ids_per_dataset: dict[str, list[str]] = {}

# Surface typos in `model_to_evaluate` instead of silently evaluating nothing.
missing_setups = [
    setup for setup in model_to_evaluate if setup not in model_setup_dirs
]
if missing_setups:
    print(f"Warning: model setups not found in {data_path}: {missing_setups}")

t1 = time()

for model_setup_dir in model_setup_dirs:
    model_setup_path = os.path.join(data_path, model_setup_dir)
    if not os.path.isdir(model_setup_path):
        continue

    print(model_setup_dir)
    if model_setup_dir not in model_to_evaluate:
        continue

    print("\n\n+++++++++++++++++++++++++++++++++++++++++++")
    print(f"Processing model setup: {model_setup_dir}")

    dataset_files = sorted(os.listdir(model_setup_path))
    if not dataset_files:
        # Carry the context in the exception itself so it shows in the traceback.
        raise ValueError(f"No files found in {model_setup_dir}...")

    result_dir = os.path.join("output", "hitl", model_setup_dir)

    for dataset_file in dataset_files:
        if not dataset_file.endswith(".json"):
            continue

        t2 = time()
        dataset_path = os.path.join(model_setup_path, dataset_file)

        # Swap only the trailing extension; str.replace would also rewrite a
        # ".json" that happens to occur in the middle of the file name.
        dataset_stem = os.path.splitext(dataset_file)[0]
        stats_output_filename = f"{dataset_stem}.stats.json"
        results_output_filename = f"{dataset_stem}.results.json"

        # Never overwrite existing output: one existing file is enough to skip.
        if os.path.exists(
            os.path.join(result_dir, stats_output_filename)
        ) or os.path.exists(os.path.join(result_dir, results_output_filename)):
            print(
                f"Either {stats_output_filename} or {results_output_filename} already exists in {result_dir}. Skipping evaluation."
            )
            continue

        # NOTE(review): only the first "_"-separated token identifies the
        # dataset, so e.g. "hover_train_*" and "hover_dev_*" would share one
        # ID list. Confirm this matches the file naming convention.
        dataset_identifier = dataset_file.split("_")[0]

        # None means no IDs were sampled for this dataset yet in this run.
        pre_sampled_ids = ids_per_dataset.get(dataset_identifier)

        print(
            f"\nEvaluating {dataset_path} ({dataset_identifier=}) with pre-sampled IDs: {pre_sampled_ids}..."
        )

        with open(dataset_path, encoding="utf-8") as fin:
            data = json.load(fin)

        used_ids: list[str] = evaluate_faithfulness_hil(
            data=data,
            pre_sampled_ids=pre_sampled_ids,
            sample_count=SAMPLE_COUNT,
            result_dir=result_dir,
            stats_filename=stats_output_filename,
            results_filename=results_output_filename,
        )

        # Remember the IDs of the first evaluated file per dataset only.
        ids_per_dataset.setdefault(dataset_identifier, used_ids)

        t3 = time()
        print(
            f"Finished evaluating {dataset_path}. Time elapsed since start: {(t3 - t1) / 60:.2f} minutes, since last file: {(t3 - t2) / 60:.2f} minutes."
        )


deepseek_r1_32b_cot
gpt4o_cot
gpt4o_non_cot


+++++++++++++++++++++++++++++++++++++++++++
Processing model setup: gpt4o_non_cot
Either covid_fact_gpt4o_non_cot.stats.json or covid_fact_gpt4o_non_cot.results.json already exists in output\hitl\gpt4o_non_cot. Skipping evaluation.
Either hover_train_gpt4o_non_cot.stats.json or hover_train_gpt4o_non_cot.results.json already exists in output\hitl\gpt4o_non_cot. Skipping evaluation.
Either politi_hop_gpt4o_non_cot.stats.json or politi_hop_gpt4o_non_cot.results.json already exists in output\hitl\gpt4o_non_cot. Skipping evaluation.
mistral_7b_cot
mistral_7b_non_cot
