In [None]:
import json
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob


In [64]:
def get_dataset_name_from_filename(filename):
    """Return the known dataset prefix that `filename` starts with, else None.

    The prefixes are checked in a fixed order and the first match is returned
    verbatim as the dataset name.
    """
    known_prefixes = ('covid_fact', 'hover_train', 'politi_hop')
    for prefix in known_prefixes:
        if filename.startswith(prefix):
            return prefix
    return None

def load_all_results(base_dir='.'):
    """
    Load every ``*.results.json`` file below ``<base_dir>/experiments/output``
    into a unified list of records.

    The evaluation method, model configuration and file name are read from the
    first three path components *below* the output directory, i.e.
    ``experiments/output/<eval_method>/<model_config>/<filename>``. Files for
    which ``get_dataset_name_from_filename`` returns no dataset name are
    skipped silently.

    Args:
        base_dir: Directory that contains the ``experiments`` folder.
            Defaults to the current working directory.

    Returns:
        The list accumulated through ``handle_result`` over all files that
        could be read, or an empty list when no result file is found. A file
        that fails to load or parse is reported on stdout and skipped.
    """
    all_records = []

    output_root = os.path.join(base_dir, 'experiments', 'output')
    search_path = os.path.join(output_root, '**', '*.results.json')
    # glob returns files in arbitrary order; sort so that files are processed
    # (and records appended) in a reproducible order.
    result_files = sorted(glob.glob(search_path, recursive=True))

    if not result_files:
        print("FATAL: No '.results.json' files found. Make sure you are running this script from the root of your repository.")
        return []

    print(f"Found {len(result_files)} result files. Processing...")

    for file_path in result_files:
        try:
            # Resolve the components relative to the output directory instead
            # of searching the whole path for 'output': that search picks the
            # wrong directory when base_dir itself contains a folder named
            # 'output'.
            rel_path = os.path.relpath(file_path, output_root)
            parts = rel_path.replace('\\', '/').split('/')
            if len(parts) < 3:
                print(f"Skipping file with unexpected path structure: {file_path}")
                continue

            eval_method, model_config, filename = parts[:3]

            dataset_name = get_dataset_name_from_filename(filename)
            if not dataset_name:
                continue

            with open(file_path, 'r', encoding='utf-8') as f:
                if eval_method == 'ragas':
                    # ragas is stored as jsonl: one JSON document per
                    # non-blank line.
                    results = [json.loads(line) for line in f if line.strip()]
                else:
                    results = json.load(f)
            all_records = handle_result(all_records, eval_method, model_config, dataset_name, results)
        except Exception as e:
            # Deliberately best-effort: one unreadable file must not abort the
            # whole load, but the failure is reported rather than hidden.
            print(f"{file_path}: {e}")
            continue

    return all_records

def handle_result(all_records, eval_method, model_config, dataset_name, results):
    """Append one long-format record per metric for every entry in `results`.

    Each appended record has the keys ``id``, ``model_config``, ``dataset``,
    ``metric_name`` and ``score_value``. Which metric(s) are emitted, and which
    key of the entry the score is read from, depends on `eval_method`; entries
    of an unrecognised method produce no records. Missing keys yield ``None``.

    `all_records` is extended in place and also returned.
    """
    # eval method -> ((metric_name, key holding the score in the entry), ...)
    metric_table = (
        ('ragas', (('ragas_faithfulness', 'faithfulness_score'),)),
        ('hitl', (('human_score', 'human_score'),)),
        ('geval', (('geval_score', 'faithfulness_score_0_5'),)),
        ('entailment', (('e2x_prob', 'e2x_entail_prob'),
                        ('x2e_prob', 'x2e_entail_prob'))),
    )
    selected = next(
        (fields for method, fields in metric_table if eval_method == method),
        (),
    )

    for entry in results:
        for metric_name, score_key in selected:
            record = {
                'id': entry.get('id'),
                'model_config': model_config,
                'dataset': dataset_name,
                'metric_name': metric_name,
                'score_value': entry.get(score_key),
            }
            all_records.append(record)
    return all_records


results = load_all_results()
# Shuffle with a seeded, local generator so the record order is reproducible
# on Restart & Run All and NumPy's global random state is left untouched.
np.random.default_rng(42).shuffle(results)

# Collect every record that has no score. The metric name of each such record
# is printed, except for human_score records, which are only counted.
none = []
for record in results:
    if record['score_value'] is None:
        if record['metric_name'] != 'human_score':
            print(record['metric_name'])
        none.append(record)
print(len(none))

Found 60 result files. Processing...
26


In [79]:
# Long format (one row per record) -> wide format: one row per
# (id, model_config, dataset) and one column per metric_name.
raw_df = pd.DataFrame(results)
df_wide = pd.pivot_table(
    raw_df,
    index=['id', 'model_config', 'dataset'],
    columns='metric_name',
    values='score_value',
    aggfunc='mean',  # the pandas default, spelled out: duplicate keys are averaged
).reset_index()

# Column layout and shape of the wide and the long frame.
print(df_wide.columns, df_wide.shape)
print(raw_df.columns, raw_df.shape)

Index(['id', 'model_config', 'dataset', 'e2x_prob', 'geval_score',
       'human_score', 'ragas_faithfulness', 'x2e_prob'],
      dtype='object', name='metric_name') (17485, 8)
Index(['id', 'model_config', 'dataset', 'metric_name', 'score_value'], dtype='object') (70347, 5)


In [80]:
# Preview the first five rows of the wide table.
df_wide.head(n=5)

metric_name,id,model_config,dataset,e2x_prob,geval_score,human_score,ragas_faithfulness,x2e_prob
0,covid_fact-000001,deepseek_r1_32b_cot,covid_fact,0.11289,5.0,,0.6,0.001309
1,covid_fact-000001,gpt4o_cot,covid_fact,0.176205,5.0,,1.0,0.002086
2,covid_fact-000001,gpt4o_non_cot,covid_fact,0.340726,5.0,,1.0,0.001994
3,covid_fact-000001,mistral_7b_cot,covid_fact,0.165425,5.0,,1.0,0.002111
4,covid_fact-000001,mistral_7b_non_cot,covid_fact,0.712899,5.0,,-1.0,0.002513
