In [44]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import re
import os
import shutil

In [12]:
def parse_txt_to_dict(file_path):
    """
    Reads a text file and converts its contents into a dictionary.

    Args:
        file_path (str): The path to the text file.

    Returns:
        dict: A dictionary containing the key-value pairs from the file.
    """
    data = {}
    try:
        with open(file_path, "r") as file:
            for line in file:
                # Check if the line contains a colon
                if ':' in line:
                    key, value = line.strip().split(":", 1)  # Split by the first colon
                    data[key.strip()] = value.strip()
    except FileNotFoundError:
        print(f"Error: The file at '{file_path}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
    
    return data



In [13]:
res = {}

for i in range(1,20):
    res[i] = parse_txt_to_dict(f"experiments/e{i}/train/results.txt")
    

In [14]:
for key, sub_dict in res.items():
    if 'sid' in sub_dict:
        del sub_dict['sid']


In [20]:
for key, metrics in res.items():
    for metric, value in metrics.items():
        # Remove the trailing comma and convert to float
        res[key][metric] = float(value.strip(','))

In [21]:
res

{1: {'shd': 1.0,
  'fn': 0.0,
  'fp': 0.0,
  'rev': 1.0,
  'nll_val': 0.9370526385102842},
 2: {'shd': 3.0,
  'fn': 1.0,
  'fp': 1.0,
  'rev': 1.0,
  'nll_val': 1.1314296597652729},
 3: {'shd': 3.0,
  'fn': 0.0,
  'fp': 1.0,
  'rev': 2.0,
  'nll_val': 0.8313778447102383},
 4: {'shd': 5.0,
  'fn': 2.0,
  'fp': 2.0,
  'rev': 1.0,
  'nll_val': 0.8513567092661044},
 5: {'shd': 3.0,
  'fn': 1.0,
  'fp': 0.0,
  'rev': 2.0,
  'nll_val': 0.9880804649793543},
 6: {'shd': 4.0,
  'fn': 1.0,
  'fp': 2.0,
  'rev': 1.0,
  'nll_val': 0.5872324226444693},
 7: {'shd': 0.0,
  'fn': 0.0,
  'fp': 0.0,
  'rev': 0.0,
  'nll_val': 0.31692471616848},
 8: {'shd': 13.0,
  'fn': 3.0,
  'fp': 7.0,
  'rev': 3.0,
  'nll_val': 0.8428364766174895},
 9: {'shd': 14.0,
  'fn': 2.0,
  'fp': 6.0,
  'rev': 6.0,
  'nll_val': 0.3171223817977142},
 10: {'shd': 2.0,
  'fn': 1.0,
  'fp': 0.0,
  'rev': 1.0,
  'nll_val': 1.1433606461066543},
 11: {'shd': 5.0,
  'fn': 2.0,
  'fp': 1.0,
  'rev': 2.0,
  'nll_val': 1.013161373866038}

In [45]:

# Shortened labels for x-axis
short_labels = {
    1: "e1: perfect",
    2: "e2: perfect-unknown",
    3: "e3: imperfect",
    4: "e4: perfect",
    5: "e5: perfect-unknown",
    6: "e6: imperfect",
    7: "e7: perfect",
    8: "e8: perfect-unknown",
    9: "e9: imperfect",
    10: "e10: no interventions",
    11: "e11: removed",
    12: "e12: no interventions",
    13: "e13: removed",
    14: "e14: no interventions",
    15: "e15: removed",
    16: "e16: no interventions",
    17: "e17: perfect-unknown",
    18: "e18: perfect",
    19: "e19: perfect longer patience"
}

# Grouping cases
cases = {
    "Generalization Across Intervention Types\nCase 1 (Linear Structure)": [2, 3, 1],
    "Generalization Across Intervention Types\nCase 2 (Additive Noise Neural Network Structure)": [5, 6, 4],
    "Generalization Across Intervention Types\nCase 3 (Nonlinear Non-Additive Noise Structure)": [8, 9, 7],
    "Identifiability with Varying Numbers of Interventions\nCase 1 (Perfect - Linear Structure)": [10, 11, 1],
    "Identifiability with Varying Numbers of Interventions\nCase 2 (Perfect - Additive Noise Neural Network Structure)": [12, 13, 4],
    "Identifiability with Varying Numbers of Interventions\nCase 3 (Imperfect - Nonlinear Non-Additive Noise Structure)": [14, 15, 3],
    "Synthetic Economic Growth Data Identifiability Tests": [16, 17, 18, 19]
}

# Create a directory to save the figures
output_dir = "results"
os.makedirs(output_dir, exist_ok=True)  # Create the directory if it doesn't exist

# Function to clean the case name for filenames (removes parentheses and contents)
def clean_case_name(case_name):
    return re.sub(r'\s*\([^)]*\)', '', case_name).replace(' ', '_').replace('\n', '_').replace(':', '')

# Function to plot and save SHD, FN, FP, REV together (bar plot)
def plot_combined_bars(subset, title, filename):
    # Prepare data in long format for seaborn
    long_format = subset.melt(id_vars=["short_label"], 
                              value_vars=["shd", "fn", "fp", "rev"], 
                              var_name="Metric", 
                              value_name="Value")

    # Plot
    plt.figure(figsize=(10, 6))
    ax = sns.barplot(data=long_format, x="short_label", y="Value", hue="Metric", palette="viridis")
    
    # Add values on top of the bars
    for bar in ax.patches:
        ax.annotate(
            f'{int(bar.get_height())}',  # Show as integer
            (bar.get_x() + bar.get_width() / 2, bar.get_height()),
            ha='center', va='bottom', fontsize=10, color='black'
        )
    
    # Formatting
    plt.title(title, fontsize=14)
    plt.xlabel("Experiments", fontsize=12)
    plt.ylabel("Value", fontsize=12)
    plt.xticks(rotation=30, ha="right")  # Rotate x-axis labels
    plt.legend(title="Metrics", fontsize=10)
    plt.tight_layout()
    
    # Save the figure
    plt.savefig(os.path.join(output_dir, filename), dpi=300)  # Save as high-resolution image
    plt.close()  # Close the plot to free up memory

# Function to plot and save NLL values (scatter plot)
def plot_nll_scatter(subset, title, filename):
    plt.figure(figsize=(10, 6))
    ax = plt.gca()  # Get current axes
    
    # Create custom labels with NLL values for the legend
    custom_labels = [f"{row['short_label']} (NLL: {row['nll_val']:.5f})" for _, row in subset.iterrows()]
    
    for idx, row in enumerate(subset.iterrows()):
        _, data = row
        # Plot scatter points
        ax.scatter(data['short_label'], data['nll_val'], label=custom_labels[idx], s=100)
    
    # Formatting
    plt.title(f"NLL Values - {title}", fontsize=14)
    plt.xlabel("Experiments", fontsize=12)
    plt.ylabel("Negative Log-Likelihood (NLL)", fontsize=12)
    plt.xticks(rotation=30, ha="right")  # Rotate x-axis labels
    
    # Legend inside the plot
    plt.legend(
        title="Experiments", 
        loc='lower left',  # Place in the lower left inside the plot
        fontsize=10, 
        frameon=True,  # Add a border to the legend
        fancybox=True,  # Rounded edges
        framealpha=0.8  # Transparent background for legend
    )
    plt.grid(True)
    plt.tight_layout()
    
    # Save the figure
    plt.savefig(os.path.join(output_dir, filename), dpi=300)  # Save as high-resolution image
    plt.close()  # Close the plot to free up memory

# Loop through cases and save both bar and scatter plots
for case_name, experiments in cases.items():
    # Subset the DataFrame based on the experiments in the case
    subset = df[df['experiment'].isin(experiments)]
    
    # Set the order of the short_label column as specified in the cases dictionary
    ordered_labels = [short_labels[exp] for exp in experiments]  # Map experiments to short labels
    subset['short_label'] = pd.Categorical(subset['short_label'], categories=ordered_labels, ordered=True)
    subset = subset.sort_values('short_label')  # Sort by the specified order
    
    # Clean case name for filenames
    cleaned_name = clean_case_name(case_name)
    
    # Filenames for saving
    bar_filename = f"{cleaned_name}_bar_plot.png"
    scatter_filename = f"{cleaned_name}_scatter_plot.png"
    
    # Plot and save the figures
    plot_combined_bars(subset, title=case_name, filename=bar_filename)
    plot_nll_scatter(subset, title=case_name, filename=scatter_filename)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['short_label'] = pd.Categorical(subset['short_label'], categories=ordered_labels, ordered=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['short_label'] = pd.Categorical(subset['short_label'], categories=ordered_labels, ordered=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
 

In [47]:
# Create a directory to save the tables
tables_output_dir = "results/tables"
os.makedirs(tables_output_dir, exist_ok=True)  # Create the directory if it doesn't exist

# Loop through cases and prepare tables
for case_name, experiments in cases.items():
    # Subset the DataFrame based on the experiments in the case
    subset = df[df['experiment'].isin(experiments)]
    
    # Set the order of the short_label column as specified in the cases dictionary
    ordered_labels = [short_labels[exp] for exp in experiments]  # Map experiments to short labels
    subset['short_label'] = pd.Categorical(subset['short_label'], categories=ordered_labels, ordered=True)
    subset = subset.sort_values('short_label')  # Sort by the specified order

    # Select relevant columns and rename them for clarity
    table = subset[['short_label', 'fp', 'fn', 'rev', 'shd', 'nll_val']]
    table = table.rename(columns={'short_label': 'Experiment', 'nll_val': 'NLL'})  # Rename columns

    # Clean case name for filename
    cleaned_name = clean_case_name(case_name)

    # Save the table as a CSV file
    csv_filename = f"{cleaned_name}.csv"
    table.to_csv(os.path.join(tables_output_dir, csv_filename), index=False)
    print(f"Saved table for case: {case_name} -> {csv_filename}")


Saved table for case: Generalization Across Intervention Types
Case 1 (Linear Structure) -> Generalization_Across_Intervention_Types_Case_1.csv
Saved table for case: Generalization Across Intervention Types
Case 2 (Additive Noise Neural Network Structure) -> Generalization_Across_Intervention_Types_Case_2.csv
Saved table for case: Generalization Across Intervention Types
Case 3 (Nonlinear Non-Additive Noise Structure) -> Generalization_Across_Intervention_Types_Case_3.csv
Saved table for case: Identifiability with Varying Numbers of Interventions
Case 1 (Perfect - Linear Structure) -> Identifiability_with_Varying_Numbers_of_Interventions_Case_1.csv
Saved table for case: Identifiability with Varying Numbers of Interventions
Case 2 (Perfect - Additive Noise Neural Network Structure) -> Identifiability_with_Varying_Numbers_of_Interventions_Case_2.csv
Saved table for case: Identifiability with Varying Numbers of Interventions
Case 3 (Imperfect - Nonlinear Non-Additive Noise Structure) -> I

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['short_label'] = pd.Categorical(subset['short_label'], categories=ordered_labels, ordered=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['short_label'] = pd.Categorical(subset['short_label'], categories=ordered_labels, ordered=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
 

In [46]:
# Define the range of experiments
experiment_range = range(1, 20)

# Define the directory paths and filenames
base_dir = "experiments"
output_dir = "results"  # Directory where the renamed files will be saved
os.makedirs(output_dir, exist_ok=True)  # Create output directory if it doesn't exist

# Iterate through the range of experiments
for i in experiment_range:
    train_dir = os.path.join(base_dir, f"e{i}", "train")  # Path to the train directory
    experiment_dir = os.path.join(base_dir, f"e{i}")  # Path to the experiment directory

    # File copying from the train directory
    if os.path.exists(train_dir):  # Check if the train directory exists
        adjacency_src = os.path.join(train_dir, "adjacency.png")
        learning_curves_src = os.path.join(train_dir, "learning-curves.png")

        # Copy adjacency.png with a new name
        if os.path.exists(adjacency_src):  # Check if the file exists
            adjacency_dst = os.path.join(output_dir, f"adjacency{i}.png")
            shutil.copy(adjacency_src, adjacency_dst)
            print(f"Copied: {adjacency_src} -> {adjacency_dst}")

        # Copy learning-curves.png with a new name
        if os.path.exists(learning_curves_src):  # Check if the file exists
            learning_curves_dst = os.path.join(output_dir, f"learning-curves{i}.png")
            shutil.copy(learning_curves_src, learning_curves_dst)
            print(f"Copied: {learning_curves_src} -> {learning_curves_dst}")

    # Additional copying for specific i values (2, 5, 8, 17)
    if i in [2, 5, 8, 17]:
        interv_w_src = os.path.join(experiment_dir, "interv_w.png")

        # Copy interv_w.png with a new name
        if os.path.exists(interv_w_src):  # Check if the file exists
            interv_w_dst = os.path.join(output_dir, f"interv_w{i}.png")
            shutil.copy(interv_w_src, interv_w_dst)
            print(f"Copied: {interv_w_src} -> {interv_w_dst}")


Copied: experiments\e1\train\adjacency.png -> results\adjacency1.png
Copied: experiments\e1\train\learning-curves.png -> results\learning-curves1.png
Copied: experiments\e2\train\adjacency.png -> results\adjacency2.png
Copied: experiments\e2\train\learning-curves.png -> results\learning-curves2.png
Copied: experiments\e2\interv_w.png -> results\interv_w2.png
Copied: experiments\e3\train\adjacency.png -> results\adjacency3.png
Copied: experiments\e3\train\learning-curves.png -> results\learning-curves3.png
Copied: experiments\e4\train\adjacency.png -> results\adjacency4.png
Copied: experiments\e4\train\learning-curves.png -> results\learning-curves4.png
Copied: experiments\e5\train\adjacency.png -> results\adjacency5.png
Copied: experiments\e5\train\learning-curves.png -> results\learning-curves5.png
Copied: experiments\e5\interv_w.png -> results\interv_w5.png
Copied: experiments\e6\train\adjacency.png -> results\adjacency6.png
Copied: experiments\e6\train\learning-curves.png -> results

In [52]:
import pandas as pd
import re

# Dataset mapping
experiment_data = [
    (1, "Generalization Across Intervention Types", 1, "data_p10_e10_n10000_linear_struct"),
    (2, "Generalization Across Intervention Types", 1, "data_p10_e10_n10000_linear_struct"),
    (3, "Generalization Across Intervention Types", 1, "data_p10_e10_n10000_linear_brutal_param"),
    (4, "Generalization Across Intervention Types", 2, "data_p10_e10_n10000_nn_struct"),
    (5, "Generalization Across Intervention Types", 2, "data_p10_e10_n10000_nn_struct"),
    (6, "Generalization Across Intervention Types", 2, "data_p10_e10_n10000_nn_brutal_param"),
    (7, "Generalization Across Intervention Types", 3, "data_p10_e10_n10000_nnadd_struct"),
    (8, "Generalization Across Intervention Types", 3, "data_p10_e10_n10000_nnadd_struct"),
    (9, "Generalization Across Intervention Types", 3, "data_p10_e10_n10000_nnadd_brutal_param"),
    (10, "Identifiability with Varying Numbers of Interventions", 1, "data_p10_e10_n10000_linear_struct"),
    (11, "Identifiability with Varying Numbers of Interventions", 1, "data_p10_e10_n10000_linear_struct"),
    (1, "Identifiability with Varying Numbers of Interventions", 1, "data_p10_e10_n10000_linear_struct"),
    (12, "Identifiability with Varying Numbers of Interventions", 2, "data_p10_e10_n10000_nn_struct"),
    (13, "Identifiability with Varying Numbers of Interventions", 2, "data_p10_e10_n10000_nn_struct"),
    (4, "Identifiability with Varying Numbers of Interventions", 2, "data_p10_e10_n10000_nn_struct"),
    (14, "Identifiability with Varying Numbers of Interventions", 3, "data_p10_e10_n10000_linear_brutal_param"),
    (15, "Identifiability with Varying Numbers of Interventions", 3, "data_p10_e10_n10000_linear_brutal_param"),
    (3, "Identifiability with Varying Numbers of Interventions", 3, "data_p10_e10_n10000_linear_brutal_param"),
    (16, "Synthetic Economic Growth Data Identifiability Tests", "Econ Data", "data_p10_e20_n10000_macro_policy_scenario"),
    (17, "Synthetic Economic Growth Data Identifiability Tests", "Econ Data", "data_p10_e20_n10000_macro_policy_scenario"),
    (18, "Synthetic Economic Growth Data Identifiability Tests", "Econ Data", "data_p10_e20_n10000_macro_policy_scenario"),
    (19, "Synthetic Economic Growth Data Identifiability Tests", "Econ Data", "data_p10_e20_n10000_macro_policy_scenario")
]

# Shortened labels mapping
short_labels = {
    1: "e1: perfect",
    2: "e2: perfect-unknown",
    3: "e3: imperfect",
    4: "e4: perfect",
    5: "e5: perfect-unknown",
    6: "e6: imperfect",
    7: "e7: perfect",
    8: "e8: perfect-unknown",
    9: "e9: imperfect",
    10: "e10: no interventions",
    11: "e11: removed",
    12: "e12: no interventions",
    13: "e13: removed",
    14: "e14: no interventions",
    15: "e15: removed",
    16: "e16: no interventions",
    17: "e17: perfect-unknown",
    18: "e18: perfect",
    19: "e19: perfect longer patience"
}

# Function to extract dataset details
def parse_dataset_name(dataset_name):
    match = re.match(r"data_p(\d+)_e(\d+)_n(\d+)_(.*)", dataset_name)
    if match:
        num_nodes = int(match.group(1))
        num_edges = int(match.group(2))
        num_instances = int(match.group(3))
        data_name = match.group(4)
        return num_nodes, num_edges, num_instances, data_name
    return None, None, None, None

# Create a list to hold the rows of the DataFrame
rows = []

# Build the mapping
for experiment_id, experiment, case, dataset_name in experiment_data:
    num_nodes, num_edges, num_instances, data_name = parse_dataset_name(dataset_name)
    rows.append({
        "Experiment": f"e{experiment_id}",
        "Category": experiment,
        "Case": case,
        "Number of Nodes": num_nodes,
        "Number of Edges": num_edges,
        "Number of Instances": num_instances,
        "Dataset Name": data_name,
        "Independent Variable": short_labels[experiment_id]  # Add the short label
    })

# Create the DataFrame
experiment_mapping_df = pd.DataFrame(rows)



In [53]:
experiment_mapping_df

Unnamed: 0,Experiment,Category,Case,Number of Nodes,Number of Edges,Number of Instances,Dataset Name,Independent Variable
0,e1,Generalization Across Intervention Types,1,10,10,10000,linear_struct,e1: perfect
1,e2,Generalization Across Intervention Types,1,10,10,10000,linear_struct,e2: perfect-unknown
2,e3,Generalization Across Intervention Types,1,10,10,10000,linear_brutal_param,e3: imperfect
3,e4,Generalization Across Intervention Types,2,10,10,10000,nn_struct,e4: perfect
4,e5,Generalization Across Intervention Types,2,10,10,10000,nn_struct,e5: perfect-unknown
5,e6,Generalization Across Intervention Types,2,10,10,10000,nn_brutal_param,e6: imperfect
6,e7,Generalization Across Intervention Types,3,10,10,10000,nnadd_struct,e7: perfect
7,e8,Generalization Across Intervention Types,3,10,10,10000,nnadd_struct,e8: perfect-unknown
8,e9,Generalization Across Intervention Types,3,10,10,10000,nnadd_brutal_param,e9: imperfect
9,e10,Identifiability with Varying Numbers of Interv...,1,10,10,10000,linear_struct,e10: no interventions


In [55]:
# Save the DataFrame as a CSV file
output_file = "results/tables/experiment_mapping.csv"
experiment_mapping_df.to_csv(output_file, index=False)
print(f"Experiment mapping saved to {output_file}")

Experiment mapping saved to results/tables/experiment_mapping.csv
