In [5]:
import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator 
from pgmpy.inference import VariableElimination

In [17]:
import numpy as np
import pandas as pd

# pgmpy imports
from pgmpy.models import BayesianNetwork           # renamed from BayesianModel
from pgmpy.estimators import BayesianEstimator    # avoids zero-counts with pseudo_counts
from pgmpy.inference import VariableElimination

# Read data
heartDisease = pd.read_csv('heart.csv')
# Replace '?' and coerce numeric columns to numeric (non-numeric -> NaN)
heartDisease = heartDisease.replace('?', np.nan)

# List of columns we will use (adjust if your CSV has different names)
# Keep categorical columns as-is if already 0/1, else convert appropriately
num_cols = ['age', 'trestbps', 'chol', 'thalach']
cat_cols = ['sex', 'fbs', 'exang', 'restecg', 'heartdisease']  # adjust column names if different

# Ensure numeric columns are numeric
for c in num_cols:
    heartDisease[c] = pd.to_numeric(heartDisease[c], errors='coerce')

# Impute numeric columns with median (you may choose mean or dropna)
for c in num_cols:
    med = heartDisease[c].median()
    heartDisease[c] = heartDisease[c].fillna(med)

# If categorical columns contain missing values, fill with mode
for c in cat_cols:
    if c in heartDisease.columns:
        heartDisease[c] = pd.to_numeric(heartDisease[c], errors='coerce')
        heartDisease[c] = heartDisease[c].fillna(heartDisease[c].mode()[0]).astype(int)

# Discretize numeric columns into bins (choose bin counts as needed)
# We'll store the cut bins so we can map back a continuous value to its discrete bin index.
bins_def = {}
n_bins = {'age':5, 'trestbps':3, 'chol':3, 'thalach':3}  # tweak as needed

for col, k in n_bins.items():
    # use pd.cut with labels 0..k-1 so values become integers (discrete states)
    labels = list(range(k))
    heartDisease[col + '_bin'] = pd.cut(heartDisease[col], bins=k, labels=labels, include_lowest=True)
    # convert categorical codes to int
    heartDisease[col + '_bin'] = heartDisease[col + '_bin'].astype(int)
    bins_def[col] = heartDisease[[col, col + '_bin']].drop_duplicates().sort_values(col)

# Show mapping (example)
print("Example bin mapping (sample):")
for col in n_bins:
    # print bin edges by using pandas.cut again to inspect the intervals
    _, edges = pd.cut(heartDisease[col], bins=n_bins[col], retbins=True, include_lowest=True)
    print(f"{col} bins (edges): {edges}")

# Build Bayesian Network structure (use BayesianNetwork)
model = BayesianNetwork([
    ('age_bin','trestbps_bin'),
    ('age_bin','fbs'),
    ('sex','trestbps_bin'),
    ('exang','trestbps_bin'),
    ('trestbps_bin','heartdisease'),
    ('fbs','heartdisease'),
    ('heartdisease','restecg'),
    ('heartdisease','thalach_bin'),
    ('heartdisease','chol_bin')
])

# Fit the model: use the discretized columns and categorical columns
fit_cols = ['age_bin','trestbps_bin','chol_bin','thalach_bin','sex','fbs','exang','restecg','heartdisease']
model.fit(heartDisease[fit_cols], estimator=BayesianEstimator, prior_type='dirichlet', pseudo_counts=1)

# Inference
infer = VariableElimination(model)

# Example: to query for a specific continuous age=38, map to its bin first
age_value = 38
age_bin = int(pd.cut([age_value], bins=n_bins['age'], labels=list(range(n_bins['age'])), include_lowest=True)[0])
print(f"age={age_value} -> age_bin={age_bin}")

# Query probability of heartdisease given age_bin
q = infer.query(variables=['heartdisease'], evidence={'age_bin': age_bin})
print(q)

# Example: Query probability of heartdisease given chol=230
chol_value = 230
chol_bin = int(pd.cut([chol_value], bins=n_bins['chol'], labels=list(range(n_bins['chol'])), include_lowest=True)[0])
print(f"chol={chol_value} -> chol_bin={chol_bin}")

q2 = infer.query(variables=['heartdisease'], evidence={'chol_bin': chol_bin})
print(q2)


Example bin mapping (sample):
age bins (edges): [28.952 38.6   48.2   57.8   67.4   77.   ]
trestbps bins (edges): [ 93.894      129.33333333 164.66666667 200.        ]
chol bins (edges): [125.562 272.    418.    564.   ]
thalach bins (edges): [ 70.869      114.66666667 158.33333333 202.        ]
age=38 -> age_bin=2


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

+-----------------+---------------------+
| heartdisease    |   phi(heartdisease) |
| heartdisease(0) |              0.5088 |
+-----------------+---------------------+
| heartdisease(1) |              0.1869 |
+-----------------+---------------------+
| heartdisease(2) |              0.1313 |
+-----------------+---------------------+
| heartdisease(3) |              0.1172 |
+-----------------+---------------------+
| heartdisease(4) |              0.0558 |
+-----------------+---------------------+
chol=230 -> chol_bin=1


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

+-----------------+---------------------+
| heartdisease    |   phi(heartdisease) |
| heartdisease(0) |              0.3785 |
+-----------------+---------------------+
| heartdisease(1) |              0.2150 |
+-----------------+---------------------+
| heartdisease(2) |              0.1833 |
+-----------------+---------------------+
| heartdisease(3) |              0.1459 |
+-----------------+---------------------+
| heartdisease(4) |              0.0773 |
+-----------------+---------------------+
