In [1]:
import pandas as pd
import os

# --- Configuration ---
# Folder holding the input CSV and folder that receives this notebook's output.
CSV_FOLDER = 'data_csv'
ANALYSIS_FOLDER = 'statistical_analysis'

# Location of the combined results file that the loading cell reads.
combined_csv_path = os.path.join(CSV_FOLDER, 'all_runs_combined.csv')

# Make sure the output folder exists; exist_ok=True makes re-running this cell safe.
os.makedirs(ANALYSIS_FOLDER, exist_ok=True)

# Echo both locations so the reader can confirm them before anything is read or written.
print(
    f"Reading combined data from: '{combined_csv_path}'",
    f"Will save analysis to: '{ANALYSIS_FOLDER}'",
    sep="\n",
)


Reading combined data from: 'data_csv/all_runs_combined.csv'
Will save analysis to: 'statistical_analysis'


In [2]:
# Load the combined results into `df`. A missing file is reported with a hint
# rather than a traceback; the cells below only run when `df` is defined.
try:
    df = pd.read_csv(combined_csv_path)
except FileNotFoundError:
    # Remove any `df` left in the kernel by an earlier execution of this cell.
    # Otherwise the `'df' in locals()` guards further down would still pass and
    # the notebook would aggregate and save stale data after this error.
    globals().pop('df', None)
    print(f"❌ ERROR: File not found at '{combined_csv_path}'.")
    print("Please run the 'preprocessing.ipynb' notebook first.")
else:
    # Success path kept out of the `try` so that only the read itself is
    # covered by the FileNotFoundError handler.
    print("✅ Successfully loaded the combined data.")
    print("\nData preview:")
    display(df.head())


✅ Successfully loaded the combined data.

Data preview:


Unnamed: 0,run_id,experiment_key,model_name,size_variant,arch_type,dataset_size,n_params,val_loss,val_accuracy,training_time
0,1,decoder_small_n1000,decoder,small,decoder_only,1000,778504,8.55114,0.0,1.873749
1,1,decoder_small_n5000,decoder,small,decoder_only,5000,778504,7.951559,0.0,4.987655
2,1,decoder_small_n10000,decoder,small,decoder_only,10000,778504,7.622734,0.0,7.977101
3,1,decoder_base_n1000,decoder,base,decoder_only,1000,2343304,8.500344,0.0,1.692486
4,1,decoder_base_n5000,decoder,base,decoder_only,5000,2343304,7.77394,0.0,7.70684


In [3]:
if 'df' in locals():
    # Columns that together identify one experimental condition; rows that
    # agree on all of them are pooled into a single summary row.
    grouping_keys = ['arch_type', 'size_variant', 'dataset_size', 'n_params']

    # Metrics summarised within each condition.
    metrics_to_aggregate = ['val_loss', 'val_accuracy', 'training_time']

    # Mean and standard deviation of every metric per condition. reset_index()
    # turns the grouping keys back into ordinary columns.
    per_condition = df.groupby(grouping_keys)[metrics_to_aggregate]
    statistical_summary = per_condition.agg(['mean', 'std']).reset_index()

    # agg() produces two-level column labels such as ('val_loss', 'mean'), and
    # the grouping keys come back as (key, ''). Join each pair with '_' and
    # strip the dangling underscore so every column has one flat name.
    flat_names = []
    for label_parts in statistical_summary.columns:
        flat_names.append('_'.join(label_parts).strip('_'))
    statistical_summary.columns = flat_names

    # Order rows by architecture, then size variant, then dataset size.
    statistical_summary = statistical_summary.sort_values(
        by=['arch_type', 'size_variant', 'dataset_size']
    )

    print("✅ Statistical aggregation complete.")
    print("\nSummary preview:")
    display(statistical_summary.head())


✅ Statistical aggregation complete.

Summary preview:


Unnamed: 0,arch_type,size_variant,dataset_size,n_params,val_loss_mean,val_loss_std,val_accuracy_mean,val_accuracy_std,training_time_mean,training_time_std
0,decoder_only,base,1000,2343304,8.488891,0.012199,0.0,0.0,1.595834,0.14834
1,decoder_only,base,5000,2343304,7.782166,0.009035,0.0,0.0,7.497437,0.297877
2,decoder_only,base,10000,2343304,7.162202,0.030113,0.0,0.0,14.78504,0.135116
3,decoder_only,small,1000,778504,8.551616,0.008455,0.0,0.0,1.093013,0.444149
4,decoder_only,small,5000,778504,7.953337,0.006059,0.0,0.0,4.298236,0.424087


In [4]:
if 'statistical_summary' in locals():
    # Write the summary table into the analysis folder. index=False keeps the
    # positional row index out of the file.
    summary_output_path = os.path.join(ANALYSIS_FOLDER, 'statistical_summary.csv')
    statistical_summary.to_csv(summary_output_path, index=False)

    # Confirm the write and show where the file landed.
    print(
        "\n🎉 Successfully saved the statistical summary.",
        f"✅ Analysis file saved to '{summary_output_path}'",
        sep="\n",
    )



🎉 Successfully saved the statistical summary.
✅ Analysis file saved to 'statistical_analysis/statistical_summary.csv'
