In [None]:
# pip install pgmpy
import pandas as pd
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Processed Cleveland heart-disease data from the UCI repository.  The file
# ships without a header row, so the column names are supplied explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
column_names = [
    "age",
    "sex",
    "cp",
    "trestbps",
    "chol",
    "fbs",
    "restecg",
    "thalach",
    "exang",
    "oldpeak",
    "slope",
    "ca",
    "thal",
    "target",
]

# The raw file writes missing entries as "?" (per the UCI documentation they
# occur in "ca" and "thal").  Without na_values those columns are parsed as
# object/string columns; mapping "?" to NaN keeps every column numeric and
# lets pandas report the gaps (e.g. df.isna().sum()).  Rows containing NaN
# are kept, so the row count is unchanged.
df = pd.read_csv(url, names=column_names, na_values="?")
print(df)

# Re-code each categorical column (and the label) as consecutive integer
# codes.  A single LabelEncoder is refit for every column, so once the loop
# finishes it only remembers the classes of the last column, "target".
le = LabelEncoder()
categorical_columns = (
    "sex",
    "cp",
    "fbs",
    "restecg",
    "exang",
    "slope",
    "ca",
    "thal",
    "target",
)
for column in categorical_columns:
    df[column] = le.fit_transform(df[column])

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
features = df.drop("target", axis=1)
labels = df["target"]
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=42
)

# Bayesian Network structure: a deliberately simple graph in which a handful
# of features are each a direct parent of "target"; no other edges exist.
model = BayesianNetwork(
    [(parent, "target") for parent in ("age", "sex", "chol", "trestbps", "exang")]
)
# Conditional Probability Distributions (CPDs).
#
# Every parent is a two-state root node with a hand-picked ("simplified")
# prior, written as a single column: [[P(state 0)], [P(state 1)]].
cpd_age = TabularCPD("age", 2, [[0.7], [0.3]])
cpd_sex = TabularCPD("sex", 2, [[0.5], [0.5]])
cpd_chol = TabularCPD("chol", 2, [[0.6], [0.4]])
cpd_trestbps = TabularCPD("trestbps", 2, [[0.65], [0.35]])
cpd_exang = TabularCPD("exang", 2, [[0.5], [0.5]])

# CPD for target given age, sex, chol, trestbps and exang.  Five binary
# parents give 2**5 = 32 columns, one per combination of parent states; the
# two rows are complementary, so every column sums to 1.
# NOTE(review): which parent combination each column corresponds to follows
# pgmpy's ordering convention for the evidence list -- check the TabularCPD
# documentation before editing individual entries.
target_parents = ["age", "sex", "chol", "trestbps", "exang"]

# Probabilities for target = 0 (no heart disease)
p_target_0 = [
    0.9, 0.6, 0.8, 0.4, 0.7, 0.3, 0.5, 0.2,
    0.8, 0.7, 0.9, 0.6, 0.6, 0.3, 0.4, 0.1,
    0.9, 0.8, 0.7, 0.5, 0.8, 0.6, 0.5, 0.4,
    0.7, 0.6, 0.9, 0.7, 0.6, 0.4, 0.5, 0.6,
]
# Probabilities for target = 1 (heart disease present)
p_target_1 = [
    0.1, 0.4, 0.2, 0.6, 0.3, 0.7, 0.5, 0.8,
    0.2, 0.3, 0.1, 0.4, 0.4, 0.7, 0.6, 0.9,
    0.1, 0.2, 0.3, 0.5, 0.2, 0.4, 0.5, 0.6,
    0.3, 0.4, 0.1, 0.3, 0.4, 0.6, 0.5, 0.4,
]

cpd_target = TabularCPD(
    variable="target",
    variable_card=2,
    values=[p_target_0, p_target_1],
    evidence=target_parents,
    evidence_card=[2] * len(target_parents),
)
# Add CPDs to the model
model.add_cpds(cpd_age, cpd_sex, cpd_chol, cpd_trestbps, cpd_exang, cpd_target)

# Validate the model.  An explicit raise is used instead of `assert` so the
# check still runs when Python is started with -O, which strips assert
# statements (and with them the check_model() call itself).
if not model.check_model():
    raise ValueError(
        "Bayesian network failed validation: check_model() returned False"
    )

# Perform inference with variable elimination on the validated network.
inference = VariableElimination(model)

# Evidence: all five parents are observed in state 1.
# NOTE(review): earlier annotations read these states as "60+ years", "male",
# "high cholesterol", "normal blood pressure" and "no exercise-induced
# angina".  The CPDs define no state names, and in the UCI encoding
# exang == 1 means angina IS present -- confirm what each state is meant to
# represent before relying on those labels.
evidence = dict(age=1, sex=1, chol=1, trestbps=1, exang=1)

# Query the posterior over "target" given the evidence; the result covers
# both target = 0 and target = 1.
result = inference.query(variables=["target"], evidence=evidence)

# Header line first, then the result table on the following line(s).
print("Probability of Heart Disease (1 = present, 0 = absent):", result, sep="\n")

      age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0    63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1    67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2    67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3    37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4    41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   
..    ...  ...  ...       ...    ...  ...      ...      ...    ...      ...   
298  45.0  1.0  1.0     110.0  264.0  0.0      0.0    132.0    0.0      1.2   
299  68.0  1.0  4.0     144.0  193.0  1.0      0.0    141.0    0.0      3.4   
300  57.0  1.0  4.0     130.0  131.0  0.0      0.0    115.0    1.0      1.2   
301  57.0  0.0  2.0     130.0  236.0  0.0      2.0    174.0    0.0      0.0   
302  38.0  1.0  3.0     138.0  175.0  0.0      0.0    173.0    0.0      0.0   

     slope   ca thal  target  
0      3.0  0.0  6.0