In [1]:
import pandas as pd
import json
import os
from glob import glob

# --- Configuration ---
# Input and output directories, given relative to the current working
# directory (expected to be the root of the project folder).
JSON_FOLDER, CSV_FOLDER = 'data_json', 'data_csv'

# Ensure the destination directory is present; a no-op when it already exists.
os.makedirs(CSV_FOLDER, exist_ok=True)

# Echo both locations so the reader can confirm them at a glance.
print(f"Reading from: '{JSON_FOLDER}'", f"Will save to: '{CSV_FOLDER}'", sep="\n")

Reading from: 'data_json'
Will save to: 'data_csv'


In [2]:
def _run_file_sort_key(path):
    """Return a sort key that orders result files by their numeric suffix.

    The key is built from the text after the last '-' in the file name
    (extension removed). A plain string sort would place
    'gpu_scaling_results-10.json' ahead of 'gpu_scaling_results-2.json';
    comparing the suffix as an integer avoids that. Files whose suffix is
    not an integer sort after all numbered files, ordered by path.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    suffix = stem.rsplit('-', 1)[-1]
    try:
        return (0, int(suffix), path)
    except ValueError:
        return (1, 0, path)


# One DataFrame per run; rebuilt from scratch every time this cell executes.
all_runs_data = []
json_files = sorted(
    glob(os.path.join(JSON_FOLDER, 'gpu_scaling_results-*.json')),
    key=_run_file_sort_key,
)

if not json_files:
    # Report the folder that was actually searched, not a hard-coded name.
    print(f"❌ ERROR: No JSON files found in the '{JSON_FOLDER}' folder.")
    print("Please make sure your result files are named 'gpu_scaling_results-n.json' and are in the correct directory.")
else:
    print(f"Found {len(json_files)} JSON files to process...")

# Metric fields copied from each model entry; a missing key becomes None.
metric_keys = ('arch_type', 'n_params', 'train_loss', 'val_loss', 'gen_gap', 'training_time')

# run_id is the 1-based position of the file in the numerically sorted list.
for run_id, file_path in enumerate(json_files, start=1):
    print(f"\n--- Processing Run {run_id} ({os.path.basename(file_path)}) ---")

    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Flatten the nested {dataset_size: {model_name: {metric: value}}} mapping
    # into one flat record per (dataset_size, model_name) pair.
    flattened_data = [
        {
            'run_id': run_id,
            'dataset_size': int(ds_size),
            'model_name': model_name,
            **{key: metrics.get(key) for key in metric_keys},
        }
        for ds_size, models in data.items()
        for model_name, metrics in models.items()
    ]

    # Create a DataFrame for the current run
    run_df = pd.DataFrame(flattened_data)

    # Save the individual run to its own CSV
    output_filename = f'run_{run_id}.csv'
    output_path = os.path.join(CSV_FOLDER, output_filename)
    run_df.to_csv(output_path, index=False)
    print(f"✅ Saved individual run data to '{output_path}'")

    # Keep the frame so a later step can combine all runs.
    all_runs_data.append(run_df)

print("\n--- All individual files processed. ---")

Found 5 JSON files to process...

--- Processing Run 1 (gpu_scaling_results-1.json) ---
✅ Saved individual run data to 'data_csv/run_1.csv'

--- Processing Run 2 (gpu_scaling_results-2.json) ---
✅ Saved individual run data to 'data_csv/run_2.csv'

--- Processing Run 3 (gpu_scaling_results-3.json) ---
✅ Saved individual run data to 'data_csv/run_3.csv'

--- Processing Run 4 (gpu_scaling_results-4.json) ---
✅ Saved individual run data to 'data_csv/run_4.csv'

--- Processing Run 5 (gpu_scaling_results-5.json) ---
✅ Saved individual run data to 'data_csv/run_5.csv'

--- All individual files processed. ---


In [3]:
# Merge the per-run frames held in all_runs_data into one master table,
# write it to CSV_FOLDER, and show a short preview.
if not all_runs_data:
    print("\nNo data was processed. Skipping combination step.")
else:
    # Stack every run's rows and renumber the index from zero.
    combined_df = pd.concat(all_runs_data, ignore_index=True)

    # Persist the combined table without the index column.
    combined_output_path = os.path.join(CSV_FOLDER, 'all_runs_combined.csv')
    combined_df.to_csv(combined_output_path, index=False)

    print(
        "\n🎉 Successfully combined all runs into a single file.",
        f"✅ Master data file saved to '{combined_output_path}'",
        "\nCombined DataFrame preview:",
        sep="\n",
    )
    display(combined_df.head())
    print(f"\nTotal experiments processed: {len(combined_df)}")


🎉 Successfully combined all runs into a single file.
✅ Master data file saved to 'data_csv/all_runs_combined.csv'

Combined DataFrame preview:


Unnamed: 0,run_id,dataset_size,model_name,arch_type,n_params,train_loss,val_loss,gen_gap,training_time
0,1,400,transformer_nano,transformer,3243968,10.391734,10.584248,0.192514,0.879869
1,1,400,transformer_micro,transformer,6637056,10.010585,10.341223,0.330638,0.959431
2,1,400,lstm_nano,lstm,3283601,10.767841,10.776723,0.008882,0.918976
3,1,400,lstm_micro,lstm,6616273,10.698589,10.704955,0.006366,1.014123
4,1,400,gru_nano,gru,3279377,10.734201,10.772248,0.038048,0.941737



Total experiments processed: 160
