In [1]:
import os
import json

In [2]:
def _mean_faithfulness_score(items, score_key='faithfulness_score_0_5'):
    """Return ``(average, count)`` for the numeric ``score_key`` values in ``items``.

    Entries that are not dicts, that lack ``score_key``, or whose value is not
    a real number (``None``, strings, booleans, ...) are ignored instead of
    aborting the whole computation. The average is ``0`` when nothing could be
    scored, in which case ``count`` is ``0`` as well.
    """
    scores = []
    for item in items:
        if not isinstance(item, dict):
            continue
        value = item.get(score_key)
        # bool is a subclass of int; True/False are not meaningful scores.
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            scores.append(value)

    count = len(scores)
    average = sum(scores) / count if count else 0
    return average, count


def calculate_average_faithfulness_score(outer_folder_path):
    """Write a ``*.stats.json`` next to every ``*.results.json`` under a folder.

    The folder is walked recursively. Each ``*.results.json`` file is expected
    to contain a JSON list; the mean of the numeric ``faithfulness_score_0_5``
    values found in its dict entries is written to a sibling file whose name
    ends in ``.stats.json`` instead of ``.results.json``. The stats file holds
    ``average_faithfulness_score`` and ``scored_item_count`` (the number of
    entries that contributed to the average, so an average of ``0`` caused by
    "no scores at all" can be told apart from a genuine zero).

    Processing is best-effort: a file that cannot be read, parsed, or written
    is reported on stdout and skipped, and the remaining files are still
    processed. Progress is printed for every file.

    Args:
        outer_folder_path: Root directory to search.

    Returns:
        None. Results are written to disk and reported on stdout.
    """
    results_suffix = '.results.json'
    stats_suffix = '.stats.json'

    # os.walk yields nothing for a missing path, which would make a typo in the
    # folder name look like a successful (but empty) run.
    if not os.path.isdir(outer_folder_path):
        print(f"Folder not found: {outer_folder_path}")
        return

    for root, dirs, files in os.walk(outer_folder_path):
        # Sorting in place makes os.walk descend in a reproducible order.
        dirs.sort()
        for file_name in sorted(files):
            if not file_name.endswith(results_suffix):
                continue

            file_path = os.path.join(root, file_name)
            print(f"Processing: {file_path}")
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                if not isinstance(data, list):
                    print(f"  -> Skipping '{file_name}': Not a list of results as expected.")
                    continue

                avg_score, count = _mean_faithfulness_score(data)
                if count == 0:
                    print(f"  -> Warning: no numeric 'faithfulness_score_0_5' values in '{file_name}'; average recorded as 0.")

                # Swap only the trailing suffix; str.replace would also rewrite
                # any earlier occurrence inside the file name.
                stats_file_name = file_name[:-len(results_suffix)] + stats_suffix
                stats_file_path = os.path.join(root, stats_file_name)

                with open(stats_file_path, 'w', encoding='utf-8') as f:
                    json.dump(
                        {
                            "average_faithfulness_score": avg_score,
                            "scored_item_count": count,
                        },
                        f,
                        indent=4,
                    )
                print(f"  -> Created '{stats_file_name}' with average score: {avg_score:.2f}")

            except json.JSONDecodeError as e:
                print(f"  -> Error decoding JSON from '{file_name}': {e}")
            except Exception as e:
                # Deliberately broad: one bad file must not stop the batch.
                print(f"  -> An unexpected error occurred with '{file_name}': {e}")

In [3]:
# Generate a .stats.json beside every .results.json found under the geval output folder.
calculate_average_faithfulness_score(
    outer_folder_path='experiments/output/geval',
)

Processing: experiments/output/geval/mistral_7b_cot/politi_hop_mistral_7b_cot.results.json
  -> Created 'politi_hop_mistral_7b_cot.stats.json' with average score: 4.74
Processing: experiments/output/geval/mistral_7b_cot/hover_train_mistral_7b_cot.results.json
  -> Created 'hover_train_mistral_7b_cot.stats.json' with average score: 4.37
Processing: experiments/output/geval/mistral_7b_cot/covid_fact_mistral_7b_cot.results.json
  -> Created 'covid_fact_mistral_7b_cot.stats.json' with average score: 3.84
Processing: experiments/output/geval/mistral_7b_non_cot/hover_train_mistral_7b_no_cot.results.json
  -> Created 'hover_train_mistral_7b_no_cot.stats.json' with average score: 4.39
Processing: experiments/output/geval/mistral_7b_non_cot/covid_fact_mistral_7b_no_cot.results.json
  -> Created 'covid_fact_mistral_7b_no_cot.stats.json' with average score: 3.97
Processing: experiments/output/geval/mistral_7b_non_cot/politi_hop_mistral_7b_no_cot.results.json
  -> Created 'politi_hop_mistral_7b_no