In [1]:
import pandas as pd
import json
import os
from glob import glob
import re

# --- Configuration ---
# JSON_FOLDER: directory holding the input JSON result files.
# CSV_FOLDER:  directory that receives the generated CSV files.
JSON_FOLDER, CSV_FOLDER = 'data_json', 'data_csv'

# Create the output directory up front; an already-existing directory is fine.
os.makedirs(CSV_FOLDER, exist_ok=True)

# Echo the configured locations so the notebook output records them.
print(
    f"Input folder: '{JSON_FOLDER}'",
    f"Output folder: '{CSV_FOLDER}'",
    sep="\n",
)

Input folder: 'data_json'
Output folder: 'data_csv'


In [2]:
# One DataFrame per processed run; re-created on every execution of this cell
# so re-running the cell does not accumulate duplicates.
all_runs_data = []


def _run_file_sort_key(path):
    """Sort key that orders result files by the integer before '.json'.

    A plain string sort places '...-10.json' before '...-2.json'; because
    run ids are assigned from the position in the sorted list, that would
    attach the wrong run id to files once there are ten or more runs.
    Files whose suffix is not an integer are placed after all numbered
    files, ordered by path.
    """
    numbered = re.search(r'-(\d+)\.json$', os.path.basename(path))
    if numbered:
        return (0, int(numbered.group(1)), path)
    return (1, 0, path)


# Assuming filenames like 'transformer_architecture_scaling_results-1.json'
json_files = sorted(
    glob(os.path.join(JSON_FOLDER, 'transformer_architecture_scaling_results-*.json')),
    key=_run_file_sort_key,
)

if not json_files:
    print(f"❌ ERROR: No JSON files found in '{JSON_FOLDER}'.")
    print("Please ensure your files are named 'transformer_architecture_scaling_results-n.json'.")
else:
    print(f"Found {len(json_files)} JSON files to process...")

    # run_id is the 1-based position in the numerically ordered file list.
    for run_id, file_path in enumerate(json_files, start=1):
        print(f"\n--- Processing Run {run_id} ({os.path.basename(file_path)}) ---")

        # JSON text is UTF-8; do not rely on the platform default encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if not isinstance(data, dict):
            print(f"  - Warning: Top-level JSON in '{file_path}' is not an object. Skipping file.")
            continue

        # Flatten the new JSON structure: one record per experiment key.
        flattened_data = []
        for experiment_key, metrics in data.items():
            # Use regex to parse the experiment key, e.g., "decoder_base_n10000"
            match = re.match(r'(\w+?)_(\w+?)_n(\d+)', experiment_key)
            if not match:
                print(f"  - Warning: Could not parse key '{experiment_key}'. Skipping.")
                continue

            if not isinstance(metrics, dict):
                print(f"  - Warning: Entry '{experiment_key}' is not an object. Skipping.")
                continue

            model_name, size_variant, ds_size = match.groups()

            # Get the architecture type from the config dictionary inside the JSON.
            # `or {}` also covers an explicit `"config": null`.
            config = metrics.get('config') or {}
            arch_type = config.get('arch_type', 'unknown') if isinstance(config, dict) else 'unknown'

            record = {
                'run_id': run_id,
                'experiment_key': experiment_key,
                'model_name': model_name,
                'size_variant': size_variant,
                'arch_type': arch_type,
                'dataset_size': int(ds_size),
                'n_params': metrics.get('n_params'),
                'val_loss': metrics.get('final_val_loss'),
                'val_accuracy': metrics.get('final_val_accuracy'),
                'training_time': metrics.get('training_time_sec')
            }
            flattened_data.append(record)

        # Build the frame once from the list of records.
        run_df = pd.DataFrame(flattened_data)

        # Save individual run to CSV
        output_path = os.path.join(CSV_FOLDER, f'run_{run_id}.csv')
        run_df.to_csv(output_path, index=False)
        print(f"✅ Saved individual run data to '{output_path}'")

        all_runs_data.append(run_df)

    # Only announce completion when there was something to process.
    print("\n--- All individual files processed. ---")

Found 5 JSON files to process...

--- Processing Run 1 (transformer_architecture_scaling_results-1.json) ---
✅ Saved individual run data to 'data_csv/run_1.csv'

--- Processing Run 2 (transformer_architecture_scaling_results-2.json) ---
✅ Saved individual run data to 'data_csv/run_2.csv'

--- Processing Run 3 (transformer_architecture_scaling_results-3.json) ---
✅ Saved individual run data to 'data_csv/run_3.csv'

--- Processing Run 4 (transformer_architecture_scaling_results-4.json) ---
✅ Saved individual run data to 'data_csv/run_4.csv'

--- Processing Run 5 (transformer_architecture_scaling_results-5.json) ---
✅ Saved individual run data to 'data_csv/run_5.csv'

--- All individual files processed. ---


In [3]:
# Merge every per-run frame into a single table and persist it as one CSV.
if not all_runs_data:
    # The previous cell produced no frames, so there is nothing to combine.
    print("\nNo data was processed.")
else:
    combined_output_path = os.path.join(CSV_FOLDER, 'all_runs_combined.csv')

    # ignore_index=True replaces the per-run row labels with one fresh 0..N-1 index.
    combined_df = pd.concat(all_runs_data, ignore_index=True)
    combined_df.to_csv(combined_output_path, index=False)

    print(f"\n🎉 Successfully combined all runs.")
    print(f"✅ Master data file saved to '{combined_output_path}'")

    # Show the first rows, then the total row count.
    print("\nCombined DataFrame preview:")
    display(combined_df.head())
    print(f"\nTotal experiments processed: {len(combined_df)}")



🎉 Successfully combined all runs.
✅ Master data file saved to 'data_csv/all_runs_combined.csv'

Combined DataFrame preview:


Unnamed: 0,run_id,experiment_key,model_name,size_variant,arch_type,dataset_size,n_params,val_loss,val_accuracy,training_time
0,1,decoder_small_n1000,decoder,small,decoder_only,1000,778504,8.55114,0.0,1.873749
1,1,decoder_small_n5000,decoder,small,decoder_only,5000,778504,7.951559,0.0,4.987655
2,1,decoder_small_n10000,decoder,small,decoder_only,10000,778504,7.622734,0.0,7.977101
3,1,decoder_base_n1000,decoder,base,decoder_only,1000,2343304,8.500344,0.0,1.692486
4,1,decoder_base_n5000,decoder,base,decoder_only,5000,2343304,7.77394,0.0,7.70684



Total experiments processed: 90
