# G-EVAL 

In [2]:
import os
import json
from tqdm.notebook import tqdm
import asyncio
from faithfulness_eval_utils.geval import evaluate_geval

## Example: loading and evaluating a sample file

In [10]:
with open("../sample_complex.json") as fin:
    data = json.load(fin)

await evaluate_geval(
    data=data,
    result_dir="output/geval",
)

Saved 5 items to `output/geval/results.json`
Averages written to `output/geval/stats.json`


([{'statement': "Shakespeare wrote 'Romeo and Juliet'.",
   'explanation': "Historical records and publication attributions confirm that Shakespeare wrote 'Romeo and Juliet'.",
   'evidences': ["William Shakespeare is credited as the author of the play 'Romeo and Juliet'.",
    "'Romeo and Juliet' was first published in 1597 and is attributed to Shakespeare."],
   'label': 'SUPPORTED',
   'q1_factual_claims_not_in_evidence': '**No**, the explanation does not make any factual claims not found in the evidence.',
   'q2_reflects_key_points': '**Yes**, the explanation faithfully reflects the key points in the evidence, specifically highlighting the historical records and publication attributions that support Shakespeare\'s authorship of "Romeo and Juliet".',
   'faithfulness_score_0_5': 5.0,
   'justification': '',
   'raw_response': 'Here are my evaluations:\n\n1. **No**, the explanation does not make any factual claims not found in the evidence.\n\n2. **Yes**, the explanation faithfully 

## Evaluating all converted model outputs

In [None]:
SKIP_EXISTING_RESULTS = True

data_path = os.path.normpath("../inference_outputs/converted_outputs/")
model_setup_dirs = os.listdir(data_path)

for model_setup_dir in model_setup_dirs:
    if not os.path.isdir(os.path.join(data_path, model_setup_dir)):
        continue
    dataset_files = os.listdir(os.path.join(data_path, model_setup_dir))
    if not dataset_files:
        print(f"No files found in {model_setup_dir}...")
        raise ValueError()

    for dataset_file in tqdm(dataset_files, desc=f"Processing {model_setup_dir}"):
        if not dataset_file.endswith(".json"):
            continue

        dataset_path = os.path.join(data_path, model_setup_dir, dataset_file)
        result_dir = os.path.join("output", "geval", model_setup_dir)

        if SKIP_EXISTING_RESULTS and os.path.exists(result_dir):
            print(
                f"Skipping {dataset_path} as results already exist in {result_dir}. Please check manually if results are valid."
            )
            if input("Do you want to continue? (y/n): ").strip().lower() != "y":
                print("skipping...")
                continue

        print(f"Evaluating {dataset_path}")
        with open(dataset_path, encoding="utf-8") as fin:
            data = json.load(fin)

        await evaluate_geval(
            data=data,
            result_dir=result_dir,
            stats_filename=f"{dataset_file.replace('.json', '.stats.json')}",
            results_filename=f"{dataset_file.replace('.json', '.eval.json')}",
        )
        print(f"Finished evaluating {dataset_path}")


Processing deepseek_r1_32b_cot:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating ..\outputs\converted_outputs\deepseek_r1_32b_cot\covid_fact_depseek_r1_cot.json
Saved 1017 items to `output\geval\deepseek_r1_32b_cot/results.json`
Averages written to `output\geval\deepseek_r1_32b_cot/stats.json`
Finished evaluating ..\outputs\converted_outputs\deepseek_r1_32b_cot\covid_fact_depseek_r1_cot.json
Evaluating ..\outputs\converted_outputs\deepseek_r1_32b_cot\hover_train_depseek-r1-cot.json
Saved 2076 items to `output\geval\deepseek_r1_32b_cot/results.json`
Averages written to `output\geval\deepseek_r1_32b_cot/stats.json`
Finished evaluating ..\outputs\converted_outputs\deepseek_r1_32b_cot\hover_train_depseek-r1-cot.json
Evaluating ..\outputs\converted_outputs\deepseek_r1_32b_cot\politi_hop_depseek_r1_cot.json
Saved 497 items to `output\geval\deepseek_r1_32b_cot/results.json`
Averages written to `output\geval\deepseek_r1_32b_cot/stats.json`
Finished evaluating ..\outputs\converted_outputs\deepseek_r1_32b_cot\politi_hop_depseek_r1_cot.json


Processing gpt4o_cot:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating ..\outputs\converted_outputs\gpt4o_cot\covid_fact_gpt4o_cot.json
Saved 1050 items to `output\geval\gpt4o_cot/results.json`
Averages written to `output\geval\gpt4o_cot/stats.json`
Finished evaluating ..\outputs\converted_outputs\gpt4o_cot\covid_fact_gpt4o_cot.json
Evaluating ..\outputs\converted_outputs\gpt4o_cot\hover_train_gpt4o_cot.json
Saved 2050 items to `output\geval\gpt4o_cot/results.json`
Averages written to `output\geval\gpt4o_cot/stats.json`
Finished evaluating ..\outputs\converted_outputs\gpt4o_cot\hover_train_gpt4o_cot.json
Evaluating ..\outputs\converted_outputs\gpt4o_cot\politi_hop_gpt4o_cot.json
Saved 497 items to `output\geval\gpt4o_cot/results.json`
Averages written to `output\geval\gpt4o_cot/stats.json`
Finished evaluating ..\outputs\converted_outputs\gpt4o_cot\politi_hop_gpt4o_cot.json


Processing gpt4o_non_cot:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating ..\outputs\converted_outputs\gpt4o_non_cot\covid_fact_gpt4o_non_cot.json
Saved 1050 items to `output\geval\gpt4o_non_cot/results.json`
Averages written to `output\geval\gpt4o_non_cot/stats.json`
Finished evaluating ..\outputs\converted_outputs\gpt4o_non_cot\covid_fact_gpt4o_non_cot.json
Evaluating ..\outputs\converted_outputs\gpt4o_non_cot\hover_train_gpt4o_non_cot.json
Saved 2050 items to `output\geval\gpt4o_non_cot/results.json`
Averages written to `output\geval\gpt4o_non_cot/stats.json`
Finished evaluating ..\outputs\converted_outputs\gpt4o_non_cot\hover_train_gpt4o_non_cot.json
Evaluating ..\outputs\converted_outputs\gpt4o_non_cot\politi_hop_gpt4o_non_cot.json
Saved 497 items to `output\geval\gpt4o_non_cot/results.json`
Averages written to `output\geval\gpt4o_non_cot/stats.json`
Finished evaluating ..\outputs\converted_outputs\gpt4o_non_cot\politi_hop_gpt4o_non_cot.json


Processing mistral_7b_NO_cot:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating ..\outputs\converted_outputs\mistral_7b_NO_cot\covid_fact_mistral_7b_no_cot.json
Saved 1212 items to `output\geval\mistral_7b_NO_cot/results.json`
Averages written to `output\geval\mistral_7b_NO_cot/stats.json`
Finished evaluating ..\outputs\converted_outputs\mistral_7b_NO_cot\covid_fact_mistral_7b_no_cot.json
Evaluating ..\outputs\converted_outputs\mistral_7b_NO_cot\hover_train_mistral_7b_no_cot.json
