In [None]:
import copy
import json
import os
from collections import deque, defaultdict

def diagnose_and_fix_config(input_file, output_file):
    """Check a variable-graph config for inconsistencies, reorder it, and save it.

    Steps performed:

    1. Clean each variable's ``parent_indices``: exogenous variables lose all
       parents, self-references are removed, and indices that do not refer to
       an existing variable are dropped (each case is reported on stdout).
    2. Topologically sort the variables so every parent precedes its children,
       deferring the variables named ``'t'`` and ``'yf'`` for as long as the
       dependencies allow.
    3. Rewrite ``parent_indices`` for the new order and point
       ``index_of_intervention`` / ``index_of_outcome`` at ``'t'`` / ``'yf'``
       (or remap the stored index when that variable is absent).
    4. Write the updated config to ``output_file`` as indented JSON.

    Args:
        input_file: Path of the JSON config to read. It must contain a
            ``'setup_sequence_sample_space'`` list whose items are dicts with
            a ``'variable_name'`` key.
        output_file: Path to write to; missing parent directories are created.

    Returns:
        None. If a dependency cycle remains after cleaning, a fatal error is
        printed and nothing is written.
    """
    print(f"ü©∫  DIAGNOSING: {input_file}")
    print("="*60)

    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(input_file, 'r', encoding='utf-8') as f:
        config = json.load(f)

    original_list = config['setup_sequence_sample_space']
    num_vars = len(original_list)

    # --- PHASE 1: LOGICAL CHECKS (Before Reordering) ---

    # Remember where 't' and 'yf' sit so the sort can defer them.
    t_idx_orig = -1
    yf_idx_orig = -1
    for i, item in enumerate(original_list):
        if item['variable_name'] == 't':
            t_idx_orig = i
        elif item['variable_name'] == 'yf':
            yf_idx_orig = i

    for i, item in enumerate(original_list):
        # Work on a private copy and write it back once, so every later phase
        # sees the cleaned list (and the key exists even if it was missing).
        parents = list(item.get('parent_indices') or [])

        if item.get('exogenous', False) and parents:
            print(f"‚ùå  LOGIC ERROR (Var {i} '{item['variable_name']}'): Marked Exogenous but has parents {parents}. Removing parents.")
            parents = []

        if i in parents:
            print(f"‚ùå  CRITICAL ERROR (Var {i} '{item['variable_name']}'): Self-loop detected. Removing.")
            # Drop every occurrence; a leftover duplicate would still be a cycle.
            parents = [p for p in parents if p != i]

        item['parent_indices'] = parents

    # --- PHASE 2: TOPOLOGICAL REORDERING WITH LATE SCHEDULING ---

    print("\nüîÑ  REORDERING VARIABLES (Prioritizing 't' and 'yf' at the end)...")

    # graph maps parent -> children; in_degree counts unresolved parents.
    graph = defaultdict(list)
    in_degree = {i: 0 for i in range(num_vars)}

    for child_idx, item in enumerate(original_list):
        valid_parents = []
        for parent_idx in item['parent_indices']:
            # in_degree's keys are exactly the existing indices, so this also
            # rejects negative and otherwise unknown values.
            if parent_idx not in in_degree:
                print(f"‚ùå  ERROR: Var '{item['variable_name']}' points to non-existent parent index {parent_idx}")
                continue
            valid_parents.append(parent_idx)
            graph[parent_idx].append(child_idx)
            in_degree[child_idx] += 1
        # Keep only resolvable parents so the remap below cannot fail.
        item['parent_indices'] = valid_parents

    available_nodes = [i for i in range(num_vars) if in_degree[i] == 0]
    sorted_indices = []
    special_nodes = (t_idx_orig, yf_idx_orig)

    # Kahn's algorithm with a selection rule: take any ordinary ready node
    # first; only when none is ready take 't', and 'yf' after that.
    while available_nodes:
        selected_node = next(
            (n for n in available_nodes if n not in special_nodes), None
        )
        if selected_node is None:
            # Only 't' and/or 'yf' are ready here; 't' goes before 'yf'.
            if t_idx_orig in available_nodes:
                selected_node = t_idx_orig
            else:
                selected_node = available_nodes[0]

        available_nodes.remove(selected_node)
        sorted_indices.append(selected_node)

        # Release children whose parents have now all been placed.
        for neighbor in graph[selected_node]:
            in_degree[neighbor] -= 1
            if in_degree[neighbor] == 0:
                available_nodes.append(neighbor)

    if len(sorted_indices) != num_vars:
        print("‚ùå  FATAL ERROR: Circular dependency detected. Cannot fix file.")
        return

    old_to_new_map = {old_idx: new_idx for new_idx, old_idx in enumerate(sorted_indices)}

    new_list = []
    print(f"\n{'New Idx':<8} {'Variable Name'}")
    print("-" * 30)
    for new_idx, old_idx in enumerate(sorted_indices):
        item = copy.deepcopy(original_list[old_idx])
        # Translate parent references into positions in the new ordering.
        item['parent_indices'] = sorted(old_to_new_map[p] for p in item['parent_indices'])
        new_list.append(item)
        print(f"{new_idx:<8} {item['variable_name']}")

    # --- PHASE 3: ENFORCE 't' AND 'yf' ---

    print("\nüîç  ENFORCING RULES (Intervention='t', Outcome='yf')...")

    new_intervention_index = None
    new_outcome_index = None
    for idx, item in enumerate(new_list):
        if item['variable_name'] == 't':
            new_intervention_index = idx
        elif item['variable_name'] == 'yf':
            new_outcome_index = idx

    new_config = copy.deepcopy(config)
    new_config['setup_sequence_sample_space'] = new_list

    for key, var_name, role, found_idx in (
        ('index_of_intervention', 't', 'Intervention', new_intervention_index),
        ('index_of_outcome', 'yf', 'Outcome', new_outcome_index),
    ):
        if found_idx is not None:
            print(f"‚úÖ  Found '{var_name}' at index {found_idx}. Setting as {role}.")
            new_config[key] = found_idx
            continue

        print(f"‚ö†Ô∏è  WARNING: Could not find variable named '{var_name}'. Kept original {role.lower()} index (remapped).")
        if key in config:
            stored_idx = config[key]
            if stored_idx in old_to_new_map:
                new_config[key] = old_to_new_map[stored_idx]
            else:
                # Nothing sensible to remap to; keep the value and say so
                # instead of raising KeyError.
                print(f"‚ö†Ô∏è  WARNING: '{key}' value {stored_idx!r} does not refer to an existing variable. Left unchanged.")

    # --- PHASE 4: SAVE ---

    # Create the destination directory so a fresh environment does not fail.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(new_config, f, indent=2)

    print(f"\nüéâ  SUCCESS: Fixed file saved to: {output_file}")

# Usage
if __name__ == "__main__":
    # Single-file example run; the input name is resolved against the current
    # working directory, so it has to match a config file present there.
    diagnose_and_fix_config(
        input_file="ihdp_complete_3_long.json",
        output_file="ihdp_complete_3_long_FIXED.json",
    )

ü©∫  DIAGNOSING: ihdp_complete_3_long.json

üîÑ  REORDERING VARIABLES (Prioritizing 't' and 'yf' at the end)...

New Idx  Variable Name
------------------------------
0        birth_weight
1        birth_order
2        mother_age
3        smoked_cigarettes
4        drank_alcohol
5        worked_during_pregnancy
6        education
7        head_circumference
8        sex_male
9        twin
10       married
11       weeks_preterm
12       prenatal_care
13       used_drugs
14       neonatal_health_index
15       firstborn
16       t
17       yf

üîç  ENFORCING RULES (Intervention='t', Outcome='yf')...
‚úÖ  Found 't' at index 16. Setting as Intervention.
‚úÖ  Found 'yf' at index 17. Setting as Outcome.

üéâ  SUCCESS: Fixed file saved to: ihdp_complete_3_long_FIXED.json


In [None]:
# Fix every config: dataset indices 0-9 crossed with the three length variants.
# `length` stays a module-level name because the download cell reads it too.
length = ['short', 'med', 'long']
for dataset_idx in range(10):
  for variant in length:
    diagnose_and_fix_config(
      f"ihdp_complete_{dataset_idx}_{variant}.json",
      f"/content/fixed/ihdp_complete_{dataset_idx}_{variant}_FIXED.json",
    )

In [None]:
from google.colab import files
# Trigger a browser download of the fixed dataset-9 config for each length variant.
for variant in length:
  files.download(f"/content/fixed/ihdp_complete_9_{variant}_FIXED.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>