# BN Generation Parameter Sweep

This notebook sweeps over the DAG/BN generation parameters outlined in `notebooks/graph_generation/ideas.md` and materializes multiple discrete Bayesian network (BN) variants for each generated DAG.

It varies:
- n (number of variables)
- target treewidth
- variable arity (fixed or range)
- CPT skewness (Dirichlet alpha)
- determinism fraction (mostly 0%)
- variable naming strategy

Outputs:
- CSV with per-variant metadata
- On-screen CPT previews for a small sample



In [1]:
import sys
from pathlib import Path
import re
from os import getenv
import pandas as pd
import numpy as np
from openai import OpenAI

# Ensure src is importable.
# repo_root is the directory two levels above the current working directory
# (parents[1] of the resolved "."), so the import below only resolves when the
# kernel's cwd sits two levels under the repository root.
# NOTE(review): presumably the kernel is started in notebooks/graph_generation/ — confirm.
repo_root = Path(".").resolve().parents[1]
sys.path.append(str(repo_root / 'src'))

from graph_generation import generate_dag_with_treewidth
from bn_generation import generate_variants_for_dag
from cpd_utils import cpd_to_ascii_table
from discrete.discrete_inference import format_probability_query, query_probability
from bn_query_sweep import compute_query_complexity, compute_all_query_complexities, generate_bayesian_networks_and_metadata


In [2]:
# Parameter grids: each list below is one axis of the sweep.
ns = [8]  # number of variables
treewidths = [4, 5, 7]  # target treewidths
arity_specs = [
    # Fixed-arity alternative (currently disabled): {"type": "fixed", "fixed": 2}
    {"type": "range", "min": 2, "max": 3},
]
dirichlet_alphas = [1.0, 0.5]  # Dirichlet alpha values (CPT skewness)
determinism_fracs = [0.0]  # determinism fractions
# Naming strategy axis; the wider alternative (currently disabled) is
# ['simple', 'confusing', 'semantic'].
naming_strategies = ['confusing']
variants_per_combo = 4
base_seed = 42

# Empty accumulators. The generation cell that follows rebinds
# all_bayesian_networks, rows and preview_samples with its results.
rows, preview_samples, all_bayesian_networks = [], [], []
sample_counter = 0

In [3]:
# Run the centralized sweep helper over the whole parameter grid and unpack its
# three results: the generated networks, their metadata rows, and preview samples.
# NOTE(review): the recorded output reports 12 preview samples even though
# max_preview_samples=3 — presumably the cap is not a global limit; confirm
# against bn_query_sweep.
(
    all_bayesian_networks,
    rows,
    preview_samples,
) = generate_bayesian_networks_and_metadata(
    ns=ns,
    treewidths=treewidths,
    arity_specs=arity_specs,
    dirichlet_alphas=dirichlet_alphas,
    determinism_fracs=determinism_fracs,
    naming_strategies=naming_strategies,
    variants_per_combo=variants_per_combo,
    base_seed=base_seed,
    max_preview_samples=3,
)

# One summary line per returned collection.
print(
    f"Generated {len(all_bayesian_networks)} Bayesian networks",
    f"Created {len(rows)} metadata rows",
    f"Collected {len(preview_samples)} preview samples",
    sep="\n",
)


Generated 24 Bayesian networks
Created 24 metadata rows
Collected 12 preview samples


In [None]:
# Filter out BNs where the number of edges is less than twice the number of nodes,
# i.e. keep a BN only if it has at least `min_edges_per_node` edges per node.
from bn_utils import num_edges

# Density threshold used by the filter below (edges required per node).
min_edges_per_node = 2

# zip() stops at the shorter input without raising, so a length mismatch would
# silently drop the unmatched tail. Fail loudly instead.
if len(all_bayesian_networks) != len(rows):
    raise ValueError(
        f"all_bayesian_networks has {len(all_bayesian_networks)} entries but rows has "
        f"{len(rows)}; they must be aligned one-to-one"
    )

filtered_bn_list = []
filtered_rows = []

# Map original BN indices to their indices in the filtered list.
# NOTE: the master lists are overwritten at the end of this cell, so re-running
# the cell without re-running the generation cell rebuilds this mapping from the
# already-filtered lists rather than from the original ones.
original_bn_idx_to_filtered_idx = {}

for orig_idx, (bn_dict, row) in enumerate(zip(all_bayesian_networks, rows)):
    bn = bn_dict["bn"]
    n_nodes = len(bn.nodes())
    n_edges = num_edges(bn)
    if n_edges >= min_edges_per_node * n_nodes:
        # The next free slot in the filtered list is this BN's new index.
        original_bn_idx_to_filtered_idx[orig_idx] = len(filtered_bn_list)
        filtered_bn_list.append(bn_dict)
        filtered_rows.append(row)

# Overwrite the master lists with the filtered ones
all_bayesian_networks = filtered_bn_list
rows = filtered_rows

In [None]:
# Report how many networks survived the filter, then show their metadata as a table.
print(f"After filtering, {len(all_bayesian_networks)} Bayesian networks remain.")

df = pd.DataFrame(rows)
# Drop columns 'determinism', 'target_tw', 'arity' and 'n' before display.
df = df.drop(columns=['determinism', 'target_tw', 'arity', 'n'])
display(df)
print(f"Total variants: {len(df)}")

After filtering, 16 Bayesian networks remain.


Unnamed: 0,achieved_tw,naming,alpha,seed,variant_index,num_edges,num_nodes
0,5,confusing,1.0,44,0,17,8
1,5,confusing,1.0,10017,1,17,8
2,5,confusing,1.0,19990,2,17,8
3,5,confusing,1.0,29963,3,17,8
4,5,confusing,0.5,45,0,17,8
5,5,confusing,0.5,10018,1,17,8
6,5,confusing,0.5,19991,2,17,8
7,5,confusing,0.5,29964,3,17,8
8,7,confusing,1.0,46,0,28,8
9,7,confusing,1.0,10019,1,28,8


Total variants: 16
