In [41]:
import json
import os
import pandas as pd
import ast
import numpy as np

In [55]:
def _parse_mean_std(text, source="<string>"):
    """Extract ``(mean, std)`` as floats from the text of a stats file.

    Two layouts are accepted:

    * a JSON object containing ``"mean"`` and ``"std"`` keys (any key order,
      any whitespace / indentation);
    * the legacy two-line ``key: value`` layout, where the first non-blank
      line carries the mean and the second carries the std.

    ``source`` is only used to make the error message point at the file.
    Raises ``ValueError`` when neither layout matches.
    """
    try:
        payload = json.loads(text)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError: the text is not JSON,
        # so fall through to the legacy line-based layout below.
        payload = None

    if isinstance(payload, dict) and "mean" in payload and "std" in payload:
        return float(payload["mean"]), float(payload["std"])

    # Legacy layout: values are positional, so only the line order matters.
    lines = [line for line in text.splitlines() if line.strip()]
    try:
        mean_val = float(lines[0].split(":")[1].strip())
        std_val = float(lines[1].split(":")[1].strip())
    except (IndexError, ValueError) as exc:
        raise ValueError(
            f"Could not parse mean/std from {source}: expected a JSON object "
            f'with "mean" and "std" keys or two "key: value" lines'
        ) from exc
    return mean_val, std_val


def load_mean_std_from_json(
    row,
    data_dir="/scratch/lcornelis/data/data_louisa/processed",
    num_folds=5,
):
    """
    Given a row from the CSV, build the stats filename,
    load it, and return (mu, sigma).

    Parameters
    ----------
    row : mapping (e.g. a pandas Series or a dict)
        Must provide the ``config/train_loop_config/`` fields ``y_val``,
        ``adj_thresh``, ``num_nodes``, ``mutation``, ``sex``, ``modality``,
        ``seed`` and ``fold``. ``split`` is optional and defaults to
        ``"train"``. ``mutation`` and ``sex`` are Python list literals stored
        as strings (e.g. ``"['M', 'F']"``).
    data_dir : str, optional
        Directory containing the stats files.
    num_folds : int, optional
        Number of folds encoded in the filename (``"<num_folds>fold_<fold>"``).

    Returns
    -------
    (float, float)
        ``(mean, std)`` read from the file.

    Raises
    ------
    KeyError
        If a required field is missing from ``row``.
    OSError
        If the stats file cannot be opened (e.g. ``FileNotFoundError``).
    ValueError
        If the file content is neither of the supported layouts.

    Supported file layouts:
      {
        "mean": <float>,
        "std": <float>
      }
    or the legacy two-line form (mean first, std second):
      mean: <float>
      std: <float>
    """
    prefix = "config/train_loop_config/"

    # 1) Extract needed fields from the row
    y_val = row[prefix + "y_val"]
    adj = row[prefix + "adj_thresh"]
    num_nodes = row[prefix + "num_nodes"]

    # mutation / sex are stored as stringified lists; join their items with commas
    mutation = ast.literal_eval(row[prefix + "mutation"])
    mutation_str = f"mutation_{','.join(mutation)}"

    sex = ast.literal_eval(row[prefix + "sex"])
    sex_str = f'sex_{",".join(sex)}'

    modality = row[prefix + "modality"]

    experiment_id = (
        f"ftd_y_val_{y_val}_adj_thresh_{adj}_num_nodes_{num_nodes}"
        f"_{mutation_str}_{modality}_{sex_str}"
    )

    split = row.get(prefix + "split", "train")
    random_state = row[prefix + "seed"]
    fold = row[prefix + "fold"]

    # 2) Construct the stats filename and its full path
    filename = (
        f"{experiment_id}_{split}_random_state_{random_state}_"
        f"{num_folds}fold_{fold}.json"
    )
    full_path = os.path.join(data_dir, filename)

    # 3) Load the file and extract mean, std
    with open(full_path, "r") as f:
        text = f.read()

    return _parse_mean_std(text, source=full_path)

In [56]:
def compute_rmse_original(row):
    """
    Return the validation RMSE on the original target scale for one row.

    The row's "val_loss" is taken to be an MSE measured in normalized
    (z-score) space. It is rescaled with the fold-specific sigma obtained
    from load_mean_std_from_json(row) and then square-rooted.
    A missing "val_loss" yields NaN without loading the stats file.
    """
    mse_z = row["val_loss"]
    if not pd.isna(mse_z):
        _, sigma = load_mean_std_from_json(row)
        # MSE_x = MSE_z * sigma^2; the square root turns it into an RMSE.
        mse_x = mse_z * (sigma ** 2)
        return np.sqrt(mse_x)
    return np.nan

def concatenate_trial_ids(trial_ids):
    """
    Collapse the trial_id values of one group into a single
    comma-separated string (each value is converted to str first).
    """
    id_strings = trial_ids.astype(str).tolist()
    return ",".join(id_strings)

def main(
    input_csv="ray_results_search_hyperparameters.csv",
    output_csv="kfold_mean_std_results_with_original.csv",
):
    """
    Aggregate per-fold trial results into one row per hyperparameter config.

    Steps:
      1. Read the trial results from ``input_csv``.
      2. Add an ``rmse_original`` column via ``compute_rmse_original``.
      3. Group by every ``config/train_loop_config/`` column except the fold.
      4. Compute mean and std of ``val_loss``, ``rmse_original`` and
         ``train_loss`` per group, and join the group's ``trial_id`` values.
      5. Flatten the resulting two-level column names by joining the levels
         with ``_``, print the first rows, and write the table to
         ``output_csv``.

    Parameters
    ----------
    input_csv : str, optional
        Path of the CSV holding one row per trial.
    output_csv : str, optional
        Path the aggregated table is written to (without the index).
    """
    # 1) Read the CSV
    df = pd.read_csv(input_csv)

    # 2) Create a new column with the validation RMSE in original scale
    df["rmse_original"] = df.apply(compute_rmse_original, axis=1)

    # 3) Identify columns to group by (all 'config/train_loop_config/' except fold)
    fold_col = "config/train_loop_config/fold"
    all_config_cols = [
        c for c in df.columns
        if c.startswith("config/train_loop_config/") and c != fold_col
    ]

    # 4) Group by everything except the fold column; dropna=False keeps
    #    configurations whose key columns contain missing values.
    grouped = df.groupby(all_config_cols, dropna=False)

    # 5) Compute mean & std across folds for relevant metrics:
    #    "val_loss" (normalized), "rmse_original" (original scale), "train_loss".
    metrics_of_interest = ["val_loss", "rmse_original", "train_loss"]

    agg_spec = {metric: ["mean", "std"] for metric in metrics_of_interest}
    agg_spec["trial_id"] = concatenate_trial_ids

    # reset_index() turns the group keys back into ordinary columns.
    agg_df = grouped.agg(agg_spec).reset_index()

    # 6) Flatten the multi-level columns; group-key columns have an empty
    #    second level, so the trailing "_" left by the join is stripped.
    agg_df.columns = ["_".join(col).rstrip("_") for col in agg_df.columns.to_flat_index()]

    # 7) Inspect & save results
    print(agg_df.head())
    agg_df.to_csv(output_csv, index=False)

# Run the aggregation: reads the Ray results CSV, prints the head of the
# per-configuration summary, and writes it to
# kfold_mean_std_results_with_original.csv.
main()

   config/train_loop_config/seed  config/train_loop_config/batch_size  \
0                             42                                    8   
1                             42                                    8   
2                             42                                    8   
3                             42                                    8   
4                             42                                    8   

  config/train_loop_config/lr_scheduler  config/train_loop_config/dropout  \
0                     CosineAnnealingLR                               0.1   
1                     CosineAnnealingLR                               0.1   
2                     CosineAnnealingLR                               0.1   
3                     CosineAnnealingLR                               0.1   
4                     CosineAnnealingLR                               0.1   

  config/train_loop_config/act  config/train_loop_config/num_nodes  \
0                          e