# G-EVAL 

In [1]:
import os
import json
from time import time
from typing import Literal
from faithfulness_eval_utils.geval import evaluate_geval

## Example: evaluating a sample file

In [10]:
with open("../sample_complex.json") as fin:
    data = json.load(fin)

await evaluate_geval(
    data=data,
    result_dir="output/geval",
)

Saved 5 items to `output/geval/results.json`
Averages written to `output/geval/stats.json`


([{'statement': "Shakespeare wrote 'Romeo and Juliet'.",
   'explanation': "Historical records and publication attributions confirm that Shakespeare wrote 'Romeo and Juliet'.",
   'evidences': ["William Shakespeare is credited as the author of the play 'Romeo and Juliet'.",
    "'Romeo and Juliet' was first published in 1597 and is attributed to Shakespeare."],
   'label': 'SUPPORTED',
   'q1_factual_claims_not_in_evidence': '**No**, the explanation does not make any factual claims not found in the evidence.',
   'q2_reflects_key_points': '**Yes**, the explanation faithfully reflects the key points in the evidence, specifically highlighting the historical records and publication attributions that support Shakespeare\'s authorship of "Romeo and Juliet".',
   'faithfulness_score_0_5': 5.0,
   'justification': '',
   'raw_response': 'Here are my evaluations:\n\n1. **No**, the explanation does not make any factual claims not found in the evidence.\n\n2. **Yes**, the explanation faithfully 

## Evaluating all model setups

In [None]:
SKIP_EXISTING_RESULTS: Literal["auto", "ask"] = "auto"  # auto for the overnight run

data_path = os.path.normpath("../inference_outputs/converted_outputs/")
model_setup_dirs = os.listdir(data_path)

t1 = time()

for model_setup_dir in model_setup_dirs:
    if not os.path.isdir(os.path.join(data_path, model_setup_dir)):
        continue

    print("\n\n+++++++++++++++++++++++++++++++++++++++++++")
    print(f"Processing model setup: {model_setup_dir}")

    dataset_files = os.listdir(os.path.join(data_path, model_setup_dir))
    if not dataset_files:
        print(f"No files found in {model_setup_dir}...")
        raise ValueError()

    for dataset_file in dataset_files:
        if not dataset_file.endswith(".json"):
            continue

        t2 = time()
        dataset_path = os.path.join(data_path, model_setup_dir, dataset_file)
        result_dir = os.path.join("output", "geval", model_setup_dir)
        stats_output_filename = f"{dataset_file.replace('.json', '.stats.json')}"
        results_output_filename = f"{dataset_file.replace('.json', '.results.json')}"

        if os.path.exists(
            os.path.join(result_dir, stats_output_filename)
        ) or os.path.exists(os.path.join(result_dir, results_output_filename)):
            print(
                f"Either {stats_output_filename} or {results_output_filename} already exists in {result_dir}."
            )
            if SKIP_EXISTING_RESULTS == "ask":
                if (
                    input("Do you want to recreate these files? (y/n): ")
                    .strip()
                    .lower()
                    != "y"
                ):
                    print("Skipping these files...")
                    continue
            elif SKIP_EXISTING_RESULTS == "auto":
                print(
                    f"Skipping {dataset_path} as results already exist in {result_dir}. Please check manually if results are valid."
                )
                continue

        print(f"\nEvaluating {dataset_path}...")
        with open(dataset_path, encoding="utf-8") as fin:
            data = json.load(fin)

        await evaluate_geval(
            data=data,
            result_dir=result_dir,
            stats_filename=stats_output_filename,
            results_filename=results_output_filename,
        )
        t3 = time()
        print(
            f"Finished evaluating {dataset_path}. Time elapsed since start: {(t3 - t1) / 60:.2f} minutes, since last file: {(t3 - t2) / 60:.2f} minutes."
        )




+++++++++++++++++++++++++++++++++++++++++++
Processing model setup: deepseek_r1_32b_cot

Evaluating ..\inference_outputs\converted_outputs\deepseek_r1_32b_cot\covid_fact_depseek_r1_cot.json...
Saved 1017 items to `output\geval\deepseek_r1_32b_cot/covid_fact_depseek_r1_cot.results.json`
Averages written to `output\geval\deepseek_r1_32b_cot/covid_fact_depseek_r1_cot.stats.json`
Finished evaluating ..\inference_outputs\converted_outputs\deepseek_r1_32b_cot\covid_fact_depseek_r1_cot.json. Time elapsed since start: 41.17 minutes, since last file: 41.17 minutes.

Evaluating ..\inference_outputs\converted_outputs\deepseek_r1_32b_cot\hover_train_depseek-r1-cot.json...
Saved 2076 items to `output\geval\deepseek_r1_32b_cot/hover_train_depseek-r1-cot.results.json`
Averages written to `output\geval\deepseek_r1_32b_cot/hover_train_depseek-r1-cot.stats.json`
Finished evaluating ..\inference_outputs\converted_outputs\deepseek_r1_32b_cot\hover_train_depseek-r1-cot.json. Time elapsed since start: 113