In [8]:
import json
from pathlib import Path

def load_results(path):
    """Load the known ``summary_results_*.json`` files found under ``path``.

    Args:
        path: Directory (``str`` or ``Path``) expected to contain the summary files.

    Returns:
        dict: Maps each expected file name to its parsed JSON content, or to
        ``None`` when that file is missing, unreadable, or not valid JSON.
        All six expected file names are always present as keys.
    """
    files = [
        "summary_results_AIME.json",
        "summary_results_AMC.json",
        "summary_results_gsm8k.json",
        "summary_results_hendrycks_math_500.json",
        "summary_results_hendrycks_math.json",
        "summary_results_Olympiad.json",
    ]

    results = {}
    base_path = Path(path)

    for file in files:
        try:
            with open(base_path / file, 'r', encoding='utf-8') as f:
                results[file] = json.load(f)
        except (OSError, ValueError):
            # Best-effort load: a missing/unreadable file (OSError) or a file
            # that is not valid UTF-8 JSON (json.JSONDecodeError and
            # UnicodeDecodeError are both ValueError subclasses) is recorded as
            # None. Anything else -- notably KeyboardInterrupt -- propagates
            # instead of being silently swallowed.
            results[file] = None

    return results

In [9]:
def process_results(data):
    """Reduce loaded summary JSON documents to the metrics that get reported.

    Args:
        data: Mapping of file name -> parsed JSON content (or ``None`` for a
            file that could not be loaded; such entries are skipped).

    Returns:
        dict: For file names containing ``"gsm8k"``, the value stored under
        ``"gsm8k_cot_zeroshot"`` in the first entry of ``content["results"]``
        that has that key. For every other file, a dict holding whichever of
        ``exact_match_accuracy`` / ``llm_check_result_accuracy`` are present.
        Files that yield nothing are omitted.
    """
    accuracy_keys = ("exact_match_accuracy", "llm_check_result_accuracy")
    summary = {}

    for name, payload in data.items():
        if payload is None:
            continue

        if "gsm8k" not in name:
            # Regular benchmark: keep only the accuracy fields that exist.
            picked = {key: payload[key] for key in accuracy_keys if key in payload}
            if picked:
                summary[name] = picked
            continue

        # gsm8k: the score sits one level down, under content["results"].
        if "results" not in payload:
            continue
        for run_metrics in payload["results"].values():
            if "gsm8k_cot_zeroshot" in run_metrics:
                summary[name] = run_metrics["gsm8k_cot_zeroshot"]
                break

    return summary

In [10]:
import pandas as pd

def _format_percent(value):
    """Render a numeric value as a percentage string with two decimals.

    The value is multiplied by 100, i.e. it is treated as a fraction.
    Non-numeric values are returned unchanged.
    """
    if isinstance(value, (int, float)):
        return f"{value * 100:.2f}%"
    return value


def create_results_table(results, folder_name):
    """Create a single-row table with the folder name and one column per metric.

    Args:
        results: Mapping of summary file name (``summary_results_<task>.json``)
            to either a dict of metrics or a single score. ``None`` values are
            skipped.
        folder_name: Label stored in the ``Folder`` column.

    Returns:
        pandas.DataFrame: One row. Dict-valued tasks contribute a
        ``<task>_<metric>`` column per metric; scalar-valued tasks contribute a
        single ``<task>`` column. Numeric values are formatted as percentages.
    """
    row = {"Folder": folder_name}

    # Define the order of tasks (this is also the column order).
    task_order = ["AIME", "AMC", "Olympiad", "hendrycks_math", "hendrycks_math_500", "gsm8k"]

    # Index the results by their bare task name so each task is matched
    # exactly. Substring matching would let "hendrycks_math" pick up the
    # "hendrycks_math_500" file and hide the full hendrycks_math results.
    by_task = {}
    for filename, metrics in results.items():
        task_name = filename.replace("summary_results_", "").replace(".json", "")
        by_task.setdefault(task_name, metrics)

    for task in task_order:
        metrics = by_task.get(task)
        if metrics is None:
            continue

        if isinstance(metrics, dict):
            # Files with multiple metrics: one column per metric.
            for metric_name, value in metrics.items():
                row[f"{task}_{metric_name}"] = _format_percent(value)
        else:
            # Single score (gsm8k).
            row[task] = _format_percent(metrics)

    return pd.DataFrame([row])

def combine_tables(tables):
    """Combine multiple single-row tables into one table.

    Args:
        tables: Iterable of pandas DataFrames.

    Returns:
        pandas.DataFrame: The inputs stacked row-wise with a fresh 0..n-1
        index, or an empty DataFrame when ``tables`` is empty (``pd.concat``
        would otherwise raise ``ValueError`` on an empty sequence).
    """
    tables = list(tables)
    if not tables:
        return pd.DataFrame()
    return pd.concat(tables, ignore_index=True)

In [11]:
# Example: Adding multiple folders
def process_multiple_folders(folder_paths):
    """Build one combined results table from several result folders.

    Args:
        folder_paths: Iterable of ``(path, name)`` pairs; ``path`` is the folder
            to load and ``name`` is the label used for that folder's row.

    Returns:
        The per-folder single-row tables merged by ``combine_tables``.
    """
    # load -> extract metrics -> one-row table, for each folder in order
    per_folder_tables = [
        create_results_table(process_results(load_results(folder)), label)
        for folder, label in folder_paths
    ]
    return combine_tables(per_folder_tables)


In [18]:
# Root directory that holds one sub-folder of summary_results_*.json files per
# evaluated model. Edit this single constant if the results are moved.
RESULTS_ROOT = Path(
    "/home/aiscuser/Small-Model-Learnability-Gap/lm-evaluation-harness/Cascade_results"
)

# (sub-folder under RESULTS_ROOT, label shown in the "Folder" column)
MODEL_RUNS = [
    ("Qwen__Qwen2.5-7B-Instruct", "Qwen2.5-7B-Instruct"),
    ("__home__aiscuser__CascadeDistill__saves__deepseek_qwen2.5-7b-instruct_qwen2.5-1.5b-instruct_20k__deepseek_qwen2.5-7b-instruct_20k_sft_ds3__20251001-032444", "Qwen2.5-7B-Instruct SFT"),
    ("__home__aiscuser__CascadeDistill__saves__deepseek_qwen2.5-7b-instruct_qwen2.5-1.5b-instruct_20k__deepseek_qwen2.5-7b-instruct_20k_math-verify-skip_cascade_ds3__20251002-120950", "Qwen2.5-7B-Instruct Cascade"),
    ("Qwen__Qwen2.5-1.5B-Instruct", "Qwen2.5-1.5B-Instruct"),
    ("__home__aiscuser__CascadeDistill__saves__deepseek_qwen2.5-7b-instruct_qwen2.5-1.5b-instruct_20k__deepseek_qwen2.5-1.5b-instruct_20k_sft_ds3__20251001-004541", "Qwen2.5-1.5B-Instruct SFT"),
    ("__home__aiscuser__CascadeDistill__saves__deepseek_qwen2.5-7b-instruct_qwen2.5-1.5b-instruct_20k__deepseek_qwen2.5-1.5b-instruct_20k_math-verify-skip_cascade_ds3__20251002-073227", "Qwen2.5-1.5B-Instruct Cascade"),
    ("Qwen__Qwen2.5-3B-Instruct", "Qwen2.5-3B-Instruct"),
    ("Qwen__Qwen2.5-3B", "Qwen2.5-3B"),
    ("__home__aiscuser__CascadeDistill__saves__deepseek_qwen2.5-7b-instruct_qwen2.5-1.5b-instruct_20k__deepseek_qwen2.5-3b_20k_sft_ds3__20251003-012549", "Qwen2.5-3B SFT"),
    ("__home__aiscuser__CascadeDistill__saves__deepseek_qwen2.5-7b-instruct_qwen2.5-1.5b-instruct_20k__deepseek_qwen2.5-3b_20k_math-verify-skip_cascade_ds3__20251003-044546", "Qwen2.5-3B Cascade"),
    ("meta-llama__Llama-3.2-3B-Instruct", "Llama-3.2-3B-Instruct"),
    ("__home__aiscuser__CascadeDistill__saves__deepseek_qwen2.5-7b-instruct_qwen2.5-1.5b-instruct_20k__deepseek_Llama-3.2-3B-instruct_20k_sft_ds3__20251002-184518", "Llama-3.2-3B-Instruct SFT"),
    ("__home__aiscuser__CascadeDistill__saves__deepseek_qwen2.5-7b-instruct_qwen2.5-1.5b-instruct_20k__deepseek_Llama-3.2-3B-instruct_20k_math-verify-skip_cascade_ds3__20251002-220228", "Llama-3.2-3B-Instruct Cascade"),
]

# Same (path, label) pairs as before, with the shared root joined on once.
folders = [(str(RESULTS_ROOT / run_dir), label) for run_dir, label in MODEL_RUNS]

combined_table = process_multiple_folders(folders)
combined_table

Unnamed: 0,Folder,AIME_exact_match_accuracy,AIME_llm_check_result_accuracy,AMC_exact_match_accuracy,AMC_llm_check_result_accuracy,Olympiad_exact_match_accuracy,Olympiad_llm_check_result_accuracy,hendrycks_math_500_exact_match_accuracy,hendrycks_math_500_llm_check_result_accuracy,gsm8k
0,Qwen2.5-7B-Instruct,10.00%,16.67%,0.00%,40.00%,25.48%,39.85%,73.00%,76.80%,82.18%
1,Qwen2.5-7B-Instruct SFT,3.33%,6.67%,0.00%,42.50%,19.56%,27.26%,61.20%,64.20%,80.82%
2,Qwen2.5-7B-Instruct Cascade,3.33%,3.33%,0.00%,32.50%,18.67%,26.07%,58.00%,60.40%,79.91%
3,Qwen2.5-1.5B-Instruct,0.00%,3.33%,0.00%,30.00%,12.74%,20.30%,47.20%,49.40%,66.34%
4,Qwen2.5-1.5B-Instruct SFT,0.00%,0.00%,0.00%,12.50%,4.15%,6.22%,26.40%,27.60%,48.67%
5,Qwen2.5-1.5B-Instruct Cascade,0.00%,0.00%,0.00%,10.00%,2.96%,4.15%,22.40%,23.00%,52.54%
6,Qwen2.5-3B-Instruct,0.00%,6.67%,0.00%,35.00%,18.52%,28.30%,62.80%,65.80%,76.80%
7,Qwen2.5-3B,3.33%,3.33%,0.00%,25.00%,15.41%,23.11%,47.80%,50.00%,69.07%
8,Qwen2.5-3B SFT,3.33%,3.33%,0.00%,30.00%,10.81%,14.37%,43.80%,45.00%,67.02%
9,Qwen2.5-3B Cascade,0.00%,0.00%,0.00%,17.50%,7.70%,10.07%,33.20%,34.40%,63.91%


In [19]:
def export_table_to_csv(table, filename="results_table.csv"):
    """Write ``table`` to ``filename`` as CSV and print where it went.

    Args:
        table: DataFrame to export; its index is not written.
        filename: Destination path of the CSV file.
    """
    table.to_csv(path_or_buf=filename, index=False)
    print("Table exported to: {}".format(filename))

# Export current table: write the combined results to "model_results.csv".
# The name is a relative path, so it is resolved against the current working directory.
export_table_to_csv(combined_table, "model_results.csv")

Table exported to: model_results.csv
