In [29]:
import pandas as pd

In [30]:
# Load the stored networks table, then report its dimensions and column names.
df = pd.read_parquet("networks.parquet")
print(
    f"Dataset shape: {df.shape}",
    f"Columns: {list(df.columns)}",
    sep="\n",
)

Dataset shape: (1680, 21)
Columns: ['model_id', 'dag_id', 'naming_variant_id', 'n_nodes', 'target_treewidth', 'achieved_treewidth', 'dag_method', 'naming_strategy', 'dirichlet_alpha', 'determinism_fraction', 'arity_min', 'arity_max', 'structural_seed', 'cpt_seed', 'sample_idx', 'network_description', 'cpd_arrays', 'nodes', 'edges', 'edges_count', 'created_at']


In [31]:
import json
import numpy as np
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.factors.discrete import TabularCPD

def reconstruct_bayesian_network(row):
    """
    Reconstruct a pgmpy DiscreteBayesianNetwork from a dataset row.

    The row must provide three JSON-encoded fields:
      * ``nodes``      - list of node names
      * ``edges``      - list of ``[parent, child]`` pairs
      * ``cpd_arrays`` - mapping of node name to its CPD table

    Note: CPD arrays are stored using cpd.get_values() which returns the 2D format
    expected by TabularCPD constructor, making reconstruction straightforward.
    The number of states of a variable is taken from the first dimension of its
    own table, and states are labelled ``s0``, ``s1``, ...

    Every entry of ``nodes`` is registered on the model, including nodes that
    do not appear in any edge, so that their CPDs can be attached.

    Args:
        row: A pandas Series (row from the dataset); anything indexable by the
            three keys above works.

    Returns:
        DiscreteBayesianNetwork: The reconstructed pgmpy model, after
        ``check_model()`` has been run on it.

    Raises:
        KeyError: If a node (or a parent referenced by an edge) has no entry
            in ``cpd_arrays``.
        Exception: Whatever pgmpy raises when a table is inconsistent with the
            declared cardinalities or when ``check_model()`` rejects the model
            (a ValueError in current pgmpy releases - confirm for your version).
    """
    # Parse the stored data
    nodes = json.loads(row['nodes'])
    edges = json.loads(row['edges'])
    cpd_arrays = json.loads(row['cpd_arrays'])

    # Create the network structure. Building from the edge list alone would
    # leave out nodes that have no incident edge, so register every node.
    model = DiscreteBayesianNetwork(edges)
    model.add_nodes_from(nodes)

    # Map each child to its parents in a single pass over the edges; parents
    # keep the order in which they appear in the edge list.
    # NOTE(review): this assumes the columns of each stored table follow the
    # parents in that same order - confirm against the code that wrote the data.
    parents_of = {}
    for edge in edges:
        parents_of.setdefault(edge[1], []).append(edge[0])

    # Each table is converted to an ndarray at most once, on first use.
    table_cache = {}

    def table_for(name):
        if name not in table_cache:
            table_cache[name] = np.array(cpd_arrays[name])
        return table_cache[name]

    def state_labels(cardinality):
        # Default state names: s0, s1, s2, ...
        return [f"s{i}" for i in range(cardinality)]

    # Reconstruct CPDs
    cpds = []
    for node in nodes:
        cpd_values = table_for(node)
        variable_card = cpd_values.shape[0]  # Number of states for this variable
        parents = parents_of.get(node, [])

        state_names = {node: state_labels(variable_card)}
        cpd_kwargs = dict(
            variable=node,
            variable_card=variable_card,
            values=cpd_values,
        )

        if parents:
            # A parent's cardinality is the row count of the parent's own table.
            evidence_card = [table_for(parent).shape[0] for parent in parents]
            for parent, card in zip(parents, evidence_card):
                state_names[parent] = state_labels(card)
            cpd_kwargs.update(evidence=parents, evidence_card=evidence_card)

        cpds.append(TabularCPD(state_names=state_names, **cpd_kwargs))

    # Add CPDs to the model
    model.add_cpds(*cpds)

    # Verify the model is valid
    model.check_model()

    return model

# Smoke test: rebuild the network stored in the first row of the dataset.
print("=== Reconstructing Bayesian Network from Dataset ===")
sample_row = df.iloc[0]
model_line = f"Reconstructing model: {sample_row['model_id']}"
print(model_line)
size_line = f"Nodes: {sample_row['n_nodes']}, Edges: {sample_row['edges_count']}"
print(size_line)

reconstructed_model = reconstruct_bayesian_network(sample_row)
# Report how many nodes the rebuilt model actually contains.
print(f"✓ Successfully reconstructed model with {len(reconstructed_model.nodes())} nodes")


=== Reconstructing Bayesian Network from Dataset ===
Reconstructing model: model_0001
Nodes: 10, Edges: 9
✓ Successfully reconstructed model with 10 nodes
