In [1]:
import json
import pandas as pd
import numpy as np
from pathlib import Path
from dictor import dictor

def extract_experiment_data(json_file):
    variant_name = Path(json_file).relative_to('./results').parts[0]

    with open(json_file, 'r') as f:
        data = json.load(f)

    # Extract metadata
    model_family = dictor(data, 'args.model_family')
    peft_method = dictor(data, 'args.peft')
    task = dictor(data, 'args.task')

    # Get metrics
    # Some tasks use eval_accuracy, others eval_matthews_correlation
    accuracy = dictor(data, 'eval_accuracy') or dictor(data, 'eval_matthews_correlation') or 0.0

    eval_runtime = data.get('eval_runtime', 0.0)

    # Get training-specific metrics
    trainable_params = dictor(data, 'train.trainable_params_count', 0.0)
    train_runtime = dictor(data, 'train.train_time', 0.0)

    # Calculate Average GPU Memory (Allocated)
    memory_list = dictor(data, 'train.memory_allocated', [])
    avg_memory = np.mean(memory_list) if memory_list else 0.0

    return {
        "Model Family": model_family,
        "PEFT Method": peft_method,
        "Task": task,
        "Variant": variant_name,
        "Accuracy/Metric": round(accuracy, 4),
        "Trainable Params (M)": round(trainable_params, 4),
        "Train Runtime (s)": round(train_runtime, 2),
        "Eval Runtime (s)": round(eval_runtime, 2),
        "Avg GPU Memory (MB)": round(avg_memory, 2)
    }

def aggregate_experiment_results(root_dir):
    """
    Finds all .json files under a directory recursively, extracts data,
    and concatenates them into one large DataFrame.
    """
    root_path = Path(root_dir)
    # Recursively find all JSON files
    json_files = list(root_path.rglob("*.json"))

    if not json_files:
        print(f"No JSON files found in {root_dir}")
        return pd.DataFrame()

    all_dfs = []
    for f in json_files:
        row = extract_experiment_data(f)
        if row:
            all_dfs.append(row)

    if not all_dfs:
        print("No valid data extracted from found files.")
        return pd.DataFrame()

    # Concatenate all individual DataFrames by row
    final_df = pd.DataFrame.from_records(all_dfs)

    # Sort for better readability
    sort_cols = ["Task", "Model Family", "Variant"]
    final_df = final_df.sort_values(by=[c for c in sort_cols if c in final_df.columns])

    return final_df


In [2]:
df = aggregate_experiment_results('./results/')

fft
fft
lora
lora
lora
lora
lora
lora


In [3]:
df

Unnamed: 0,Model Family,PEFT Method,Task,Variant,Accuracy/Metric,Trainable Params (M),Train Runtime (s),Eval Runtime (s),Avg GPU Memory (MB)
1,bert,lora,rte,fft,0.5812,109.4838,31.37,0.39,1797.64
5,bert,lora,rte,lora,0.5632,0.2964,11.04,0.25,463.09
6,bert,lora,rte,lora,0.6101,0.2964,10.96,0.24,463.09
7,bert,lora,rte,lora,0.5993,0.2964,10.87,0.25,463.09
0,bert,lora,wnli,fft,0.4648,109.4838,18.46,0.11,1787.08
2,bert,lora,wnli,lora,0.5634,0.2964,2.86,0.08,463.09
3,bert,lora,wnli,lora,0.5634,0.2964,3.45,0.09,462.3
4,bert,lora,wnli,lora,0.5634,0.2964,2.91,0.08,463.09
