In [1]:
import pandas as pd
import os

# --- Configuration ---
# Folder the combined CSV is read from, and folder the analysis is written to.
ANALYSIS_FOLDER = 'statistical_analysis'
CSV_FOLDER = 'data_csv'

# Ensure the output folder exists up front (no error if it is already there).
os.makedirs(ANALYSIS_FOLDER, exist_ok=True)

# Full path of the combined-runs input file inside CSV_FOLDER.
combined_csv_path = os.path.join(CSV_FOLDER, 'all_runs_combined.csv')

# Echo the resolved locations so the reader can confirm them at a glance.
print(f"Reading combined data from: '{combined_csv_path}'")
print(f"Will save analysis to: '{ANALYSIS_FOLDER}'")

Reading combined data from: 'data_csv/all_runs_combined.csv'
Will save analysis to: 'statistical_analysis'


In [2]:
# Load the combined runs into `df`. Only the read itself is guarded; the
# success messages and preview run in the `else` branch once it has worked.
try:
    df = pd.read_csv(combined_csv_path)
except FileNotFoundError:
    # Missing input is reported rather than raised. On a fresh kernel `df`
    # stays undefined, which the later cells check for before running.
    print(f"❌ ERROR: File not found at '{combined_csv_path}'.")
    print("Please run the 'preprocessing.ipynb' notebook first to generate this file.")
else:
    print("✅ Successfully loaded the combined data.")
    print("\nData preview:")
    display(df.head())


✅ Successfully loaded the combined data.

Data preview:


Unnamed: 0,run_id,dataset_size,model_name,arch_type,n_params,train_loss,val_loss,gen_gap,training_time
0,1,400,transformer_nano,transformer,3243968,10.391734,10.584248,0.192514,0.879869
1,1,400,transformer_micro,transformer,6637056,10.010585,10.341223,0.330638,0.959431
2,1,400,lstm_nano,lstm,3283601,10.767841,10.776723,0.008882,0.918976
3,1,400,lstm_micro,lstm,6616273,10.698589,10.704955,0.006366,1.014123
4,1,400,gru_nano,gru,3279377,10.734201,10.772248,0.038048,0.941737


In [3]:
# Only aggregate when the load cell actually produced `df`.
if 'df' in locals():
    # Experimental conditions that identify one group of repeated runs.
    grouping_keys = ['dataset_size', 'model_name', 'arch_type', 'n_params']

    # Numeric columns summarised within each group.
    metrics_to_aggregate = ['val_loss', 'train_loss', 'gen_gap', 'training_time']

    # Mean and standard deviation of every metric per group; reset_index()
    # turns the grouping keys back into ordinary columns.
    statistical_summary = (
        df.groupby(grouping_keys)[metrics_to_aggregate]
        .agg(['mean', 'std'])
        .reset_index()
    )

    # Collapse the two-level column labels into single strings, e.g.
    # ('val_loss', 'mean') -> 'val_loss_mean'. The key columns have an empty
    # second level, so the dangling underscore left by the join is stripped.
    statistical_summary.columns = [
        joined.strip('_')
        for joined in map('_'.join, statistical_summary.columns.values)
    ]

    print("✅ Statistical aggregation complete.")
    print("\nSummary preview:")
    display(statistical_summary.head())

✅ Statistical aggregation complete.

Summary preview:


Unnamed: 0,dataset_size,model_name,arch_type,n_params,val_loss_mean,val_loss_std,train_loss_mean,train_loss_std,gen_gap_mean,gen_gap_std,training_time_mean,training_time_std
0,400,gru_micro,gru,6582993,10.435173,0.192486,10.487455,0.020701,-0.052282,0.189808,1.06692,0.083106
1,400,gru_nano,gru,3279377,10.702211,0.082433,10.710784,0.01899,-0.008573,0.065711,0.941317,0.013385
2,400,lstm_micro,lstm,6616273,10.616153,0.085293,10.671145,0.02565,-0.054992,0.087814,1.024124,0.01179
3,400,lstm_nano,lstm,3283601,10.760715,0.02974,10.762294,0.00562,-0.001579,0.027997,0.940713,0.03547
4,400,mlp_micro,mlp,6499793,10.70725,0.065135,10.726447,0.016862,-0.019196,0.060843,0.863518,0.022959


In [4]:
# Only write the file when the aggregation cell produced a summary.
if 'statistical_summary' in locals():
    # Destination of the summary CSV inside the analysis folder.
    summary_output_path = os.path.join(
        ANALYSIS_FOLDER,
        'statistical_summary.csv',
    )

    # index=False: the row index is not written as a column.
    statistical_summary.to_csv(summary_output_path, index=False)

    print(f"\n🎉 Successfully saved the statistical summary.")
    print(f"✅ Analysis file saved to '{summary_output_path}'")


🎉 Successfully saved the statistical summary.
✅ Analysis file saved to 'statistical_analysis/statistical_summary.csv'
