# **Diplomatura en Ciencia de Datos - UNNE - 2024**
### Módulo 4: Aprendizaje Automático
### Clase 8: Redes Bayesianas

![alt text](https://www.researchgate.net/profile/Ronan-Daly/publication/220254267/figure/fig2/AS:393986503135236@1470945110452/The-ASIA-Bayesian-network-structure.png)


## Asian Disease

En este ejemplo las CPD (distribuciones de probabilidades condicionales) son valores conocidos.

![alt text](https://i.sstatic.net/IaI0b.png)

In [None]:
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination

# Network structure: every tuple is a directed edge (parent, child).
# NOTE(review): the classic ASIA network has no Bronchitis -> X-ray edge;
# this example adds one — confirm that this is intentional.
model = BayesianNetwork(
    [
        ("Smoking", "Lung Cancer"),
        ("Smoking", "Bronchitis"),
        ("Asia", "Tuberculosis"),
        ("Tuberculosis", "TB or Lung Cancer"),
        ("Lung Cancer", "TB or Lung Cancer"),
        ("TB or Lung Cancer", "Positive X-ray"),
        ("Bronchitis", "Positive X-ray"),
        ("TB or Lung Cancer", "Dyspnea"),
        ("Bronchitis", "Dyspnea"),
    ]
)

# Conditional probability tables (CPDs).
# All variables are binary: row 0 is the "no" state, row 1 the "yes" state.
# Each column is one combination of parent states; with two parents the
# columns run (0, 0), (0, 1), (1, 0), (1, 1) in the order given in `evidence`.

# Prior: 30% of people smoke.
cpd_smoking = TabularCPD(
    variable="Smoking",
    variable_card=2,
    values=[[0.7], [0.3]],
)

# Prior: 20% of people have visited Asia.
cpd_asia = TabularCPD(
    variable="Asia",
    variable_card=2,
    values=[[0.8], [0.2]],
)

# Tuberculosis given Asia: 1% without a visit, 5% with one.
cpd_tb = TabularCPD(
    variable="Tuberculosis",
    variable_card=2,
    values=[
        [0.99, 0.95],  # no tuberculosis
        [0.01, 0.05],  # tuberculosis
    ],
    evidence=["Asia"],
    evidence_card=[2],
)

# Lung cancer given Smoking: 1% for non-smokers, 10% for smokers.
cpd_lc = TabularCPD(
    variable="Lung Cancer",
    variable_card=2,
    values=[
        [0.99, 0.90],  # no lung cancer
        [0.01, 0.10],  # lung cancer
    ],
    evidence=["Smoking"],
    evidence_card=[2],
)

# Bronchitis given Smoking: 10% for non-smokers, 40% for smokers.
cpd_bronchitis = TabularCPD(
    variable="Bronchitis",
    variable_card=2,
    values=[
        [0.9, 0.6],  # no bronchitis
        [0.1, 0.4],  # bronchitis
    ],
    evidence=["Smoking"],
    evidence_card=[2],
)

# Deterministic logical OR of Tuberculosis and Lung Cancer.
cpd_tb_or_lc = TabularCPD(
    variable="TB or Lung Cancer",
    variable_card=2,
    values=[
        [1.0, 0.0, 0.0, 0.0],  # neither disease present
        [0.0, 1.0, 1.0, 1.0],  # at least one disease present
    ],
    evidence=["Tuberculosis", "Lung Cancer"],
    evidence_card=[2, 2],
)

# X-ray result given (TB or Lung Cancer, Bronchitis).
cpd_xray = TabularCPD(
    variable="Positive X-ray",
    variable_card=2,
    values=[
        [0.99, 0.7, 0.8, 0.1],  # negative x-ray
        [0.01, 0.3, 0.2, 0.9],  # positive x-ray
    ],
    evidence=["TB or Lung Cancer", "Bronchitis"],
    evidence_card=[2, 2],
)

# Dyspnea given (TB or Lung Cancer, Bronchitis).
cpd_dyspnea = TabularCPD(
    variable="Dyspnea",
    variable_card=2,
    values=[
        [0.9, 0.7, 0.8, 0.1],  # no dyspnea
        [0.1, 0.3, 0.2, 0.9],  # dyspnea
    ],
    evidence=["TB or Lung Cancer", "Bronchitis"],
    evidence_card=[2, 2],
)

# Attach all eight tables to the network.
model.add_cpds(
    cpd_smoking,
    cpd_asia,
    cpd_tb,
    cpd_lc,
    cpd_bronchitis,
    cpd_tb_or_lc,
    cpd_xray,
    cpd_dyspnea,
)

# Sanity check: structure and CPDs must be mutually consistent.
assert model.check_model()


In [None]:
# Inference engine over the ASIA network (variable elimination).
inference = VariableElimination(model)

# P(Dyspnea | Smoking = 1, Positive X-ray = 1):
# dyspnea distribution for a smoker whose X-ray came back positive.
result = inference.query(
    variables=["Dyspnea"],
    evidence={"Smoking": 1, "Positive X-ray": 1},
)
print(result)


In [None]:
# P(TB or Lung Cancer | Smoking = 0, Dyspnea = 1):
# chance that a non-smoker with dyspnea has tuberculosis or lung cancer.
result = inference.query(
    variables=["TB or Lung Cancer"],
    evidence={"Smoking": 0, "Dyspnea": 1},
)
print(result)


## Weather and Rain

![alt text](https://i.sstatic.net/NEAYo.jpg)

In [None]:
# Sprinkler network: C = Cloudy, R = Rain, S = Sprinkler, W = Wet grass.
weather_model = BayesianNetwork(
    [
        ("C", "R"),  # cloudiness influences rain
        ("C", "S"),  # cloudiness influences sprinkler use
        ("R", "W"),  # rain wets the grass
        ("S", "W"),  # the sprinkler wets the grass
    ]
)

# CPDs: row 0 is the "false" state, row 1 the "true" state of each variable.

# Cloudy is a fair coin.
cpd_C = TabularCPD(variable="C", variable_card=2, values=[[0.5], [0.5]])

# Rain given Cloudy.
cpd_R = TabularCPD(
    variable="R",
    variable_card=2,
    values=[
        [0.8, 0.2],
        [0.2, 0.8],
    ],
    evidence=["C"],
    evidence_card=[2],
)

# Sprinkler given Cloudy.
cpd_S = TabularCPD(
    variable="S",
    variable_card=2,
    values=[
        [0.5, 0.9],
        [0.5, 0.1],
    ],
    evidence=["C"],
    evidence_card=[2],
)

# Wet grass given (Sprinkler, Rain); columns follow the `evidence` order.
cpd_W = TabularCPD(
    variable="W",
    variable_card=2,
    values=[
        [1.0, 0.1, 0.1, 0.01],
        [0.0, 0.9, 0.9, 0.99],
    ],
    evidence=["S", "R"],
    evidence_card=[2, 2],
)

# Register the tables and verify the model is consistent.
weather_model.add_cpds(cpd_C, cpd_R, cpd_S, cpd_W)
assert weather_model.check_model()

# Exact inference by variable elimination.
weather_inference = VariableElimination(weather_model)

# P(W | R = 1): distribution of wet grass given that it is raining.
weather_result = weather_inference.query(
    variables=["W"],
    evidence={"R": 1},
)
print(weather_result)


## Symptoms

In [None]:
# Diagnosis network: D = Disease, S = Symptom, T = Test result.
# The disease is the common cause of both the symptom and the test outcome.
medical_model = BayesianNetwork(
    [
        ("D", "S"),
        ("D", "T"),
    ]
)

# CPDs: row 0 is the absent/negative state, row 1 the present/positive state.

# Prior prevalence of the disease: 5%.
cpd_D = TabularCPD(variable="D", variable_card=2, values=[[0.95], [0.05]])

# Symptom given Disease.
cpd_S = TabularCPD(
    variable="S",
    variable_card=2,
    values=[
        [0.99, 0.30],
        [0.01, 0.70],
    ],
    evidence=["D"],
    evidence_card=[2],
)

# Test result given Disease.
cpd_T = TabularCPD(
    variable="T",
    variable_card=2,
    values=[
        [0.90, 0.20],
        [0.10, 0.80],
    ],
    evidence=["D"],
    evidence_card=[2],
)

# Register the tables and verify the model is consistent.
medical_model.add_cpds(cpd_D, cpd_S, cpd_T)
assert medical_model.check_model()

# Exact inference by variable elimination.
medical_inference = VariableElimination(medical_model)

# P(T | S = 1): test-result distribution when the symptom is present.
medical_result = medical_inference.query(
    variables=["T"],
    evidence={"S": 1},
)
print(medical_result)


Learning CPDs from Data (Medical Diagnosis)  
We'll use a simple dataset that contains information about a medical diagnosis system. We'll assume that we have data for three variables:

- Disease (D): Whether a patient has a disease (1) or not (0).
- Symptom (S): Whether a patient has a symptom (1) or not (0).
- Test Result (T): Whether a test result is positive (1) or negative (0).

First, let’s simulate some data and then learn the probabilities from it.

Step-by-Step:
- Create a dataset: Simulate a dataset based on the structure of the Bayesian network.
- Learn CPDs: Use Maximum Likelihood Estimation (MLE) to learn the CPDs from the dataset.

In [None]:
import pandas as pd
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination

# Ten hand-written patient records, one column per network variable.
data = pd.DataFrame(
    {
        "D": [0, 0, 0, 1, 1, 1, 1, 1, 0, 0],  # disease: 0 = no, 1 = yes
        "S": [0, 0, 1, 1, 1, 1, 1, 0, 0, 0],  # symptom: 0 = no, 1 = yes
        "T": [0, 0, 1, 1, 0, 1, 1, 0, 0, 1],  # test: 0 = negative, 1 = positive
    }
)

# Fixed structure: Disease is the parent of both Symptom and Test result.
model = BayesianNetwork(
    [
        ("D", "S"),
        ("D", "T"),
    ]
)

# Estimate every CPD from the records by Maximum Likelihood Estimation.
model.fit(data, estimator=MaximumLikelihoodEstimator)

# Show the estimated tables.
for cpd in model.get_cpds():
    print(cpd)

# Exact inference on the fitted network.
inference = VariableElimination(model)

# P(D | T = 1): disease distribution given a positive test.
result = inference.query(
    variables=["D"],
    evidence={"T": 1},
)
print(result)


Explanation:
- Simulating Data: A small dataset is created with variables D (Disease), S (Symptom), and T (Test). This represents patients' disease status, symptoms, and test results.
- Define the Structure: We define the structure of the Bayesian network where Disease (D) influences both Symptom (S) and Test result (T).
- Fit the Model: Using Maximum Likelihood Estimation (MLE), we fit the model to the data. This learns the conditional probability distributions (CPDs) from the dataset.
- Check Learned CPDs: We print the CPDs to inspect the learned probabilities.
- Perform Inference: Finally, we query the model to compute the probability of having the disease (D) given a positive test result (T=1).

Learning the Structure from Data

If you don’t know the structure of the Bayesian network, you can also learn it from the data using structure learning algorithms like Hill-Climb Search or Constraint-Based Search (e.g., PC algorithm). Here's how to learn the structure:

In [None]:
from pgmpy.estimators import HillClimbSearch, BicScore

# Greedy hill-climbing search over DAG structures for the columns of `data`,
# scoring each candidate structure with BIC.
hc = HillClimbSearch(data)
best_model = hc.estimate(
    scoring_method=BicScore(data),
)

# Show the directed edges of the best-scoring structure found.
print(best_model.edges())


## Titanic

The Titanic dataset contains the following key columns:

- Pclass: Passenger class (1 = 1st, 2 = 2nd, 3 = 3rd).
- Sex: Gender (0 = female, 1 = male).
- Age: Age of the passenger.
- Fare: Fare paid by the passenger.
- Embarked: Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton).
- Survived: Survival status (0 = No, 1 = Yes).

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the Titanic dataset (downloadable from Kaggle or other sources).
titanic_data = pd.read_csv("titanic.csv")

# Keep only the columns used by the model. The explicit .copy() makes this an
# independent frame, so the assignments below modify it directly instead of
# a view of the original (avoids SettingWithCopyWarning).
titanic_data = titanic_data[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Survived']].copy()

# Impute missing values: median for Age, most frequent value for Embarked.
# The result is assigned back rather than using fillna(inplace=True) on a
# column selection: that chained form warns on recent pandas versions and
# leaves the frame unchanged when copy-on-write is enabled.
titanic_data['Age'] = titanic_data['Age'].fillna(titanic_data['Age'].median())
titanic_data['Embarked'] = titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0])

# Encode categorical columns as integers. LabelEncoder numbers the classes in
# sorted order, hence the mappings below.
# NOTE(review): the mappings assume the raw values are 'female'/'male' and
# 'C'/'Q'/'S' — confirm against the CSV.
labelencoder = LabelEncoder()
titanic_data['Sex'] = labelencoder.fit_transform(titanic_data['Sex'])  # female = 0, male = 1
titanic_data['Embarked'] = labelencoder.fit_transform(titanic_data['Embarked'])  # C = 0, Q = 1, S = 2

# NOTE(review): Age and Fare are left as raw numeric values. The discrete
# estimators used later treat every distinct value as a separate state;
# consider binning them (e.g. pd.cut / pd.qcut) before learning.

# Hold out 30% of the rows; random_state fixes the split for reproducibility.
train_data, test_data = train_test_split(titanic_data, test_size=0.3, random_state=42)

In [None]:
# Bare expression: the notebook renders the preprocessed DataFrame as the cell output.
titanic_data

In [None]:
from pgmpy.estimators import HillClimbSearch, BicScore

# Hill-climbing structure search on the training split, scored with BIC.
hc = HillClimbSearch(train_data)
best_model = hc.estimate(
    scoring_method=BicScore(train_data),
)

# Show the directed edges of the learned structure.
print("Learned structure:", best_model.edges())


In [None]:
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.models import BayesianNetwork

# Build the Bayesian network from the learned structure.
bn_model = BayesianNetwork(best_model.edges())

# Constructing from edges alone drops any variable the search left without
# edges. Add every node of the learned DAG explicitly so those variables still
# receive a (marginal) CPD and can be used as query or evidence variables.
bn_model.add_nodes_from(best_model.nodes())

# Estimate all CPDs from the training split by Maximum Likelihood Estimation.
bn_model.fit(train_data, estimator=MaximumLikelihoodEstimator)

# Show the estimated tables.
for cpd in bn_model.get_cpds():
    print(cpd)


In [None]:
from pgmpy.inference import VariableElimination

# Exact inference (variable elimination) on the fitted Titanic network.
inference = VariableElimination(bn_model)

# P(Survived | Pclass = 1, Sex = 1): survival distribution for a
# first-class male passenger (Sex was encoded as 1 = male).
result = inference.query(
    variables=["Survived"],
    evidence={"Pclass": 1, "Sex": 1},
)
print(result)


## Adult

Predict whether the annual income of an individual exceeds $50K/yr based on census data. This dataset is also known as the "Census Income" dataset.

The dataset contains features like:

- Age: The age of the individual.
- Education: Level of education (e.g., Bachelor's, Masters).
- Marital Status: Marital status of the individual.
- Occupation: Type of occupation.
- Work Hours per Week: Number of hours worked per week.
- Race: Race of the individual.
- Sex: Gender of the individual.
- Income: Income level, either ">50K" or "<=50K".

In [3]:
import pandas as pd

# Load the Adult (Census Income) dataset.
data = pd.read_csv('adult.csv')

# Assign column names.
# NOTE(review): this assumes adult.csv has exactly 16 columns (a leading id
# plus the 15 census fields) and a header row; a different column count raises
# ValueError here — confirm against the file.
columns = ['id', 'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
           'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
           'hours-per-week', 'native-country', 'income']
data.columns = columns

# Keep only the modelled features. The explicit .copy() makes this an
# independent frame, so the column assignments below do not write into a
# view of the full table (avoids SettingWithCopyWarning).
data = data[['age', 'education', 'marital-status', 'occupation', 'race', 'sex', 'hours-per-week', 'income']].copy()

# Replace each categorical column by its integer category code
# (codes follow the sorted order of the distinct values).
for col in ['education', 'marital-status', 'occupation', 'race', 'sex']:
    data[col] = data[col].astype('category').cat.codes

# Binary target: 1 = income >50K, 0 = otherwise.
# The label is normalised first (surrounding whitespace and a trailing '.'
# removed) so ' >50K', '>50K' and '>50K.' are all recognised; the original
# comparison matched only the exact string ' >50K'.
income_label = data['income'].astype(str).str.strip().str.rstrip('.')
data['income'] = (income_label == '>50K').astype(int)

# NOTE(review): 'age' and 'hours-per-week' stay as raw integers, so the
# discrete estimators used later treat each distinct value as its own state.

In [None]:
from pgmpy.estimators import HillClimbSearch, BicScore

# Hill-climbing structure search over the encoded Adult columns,
# with BIC as the score for candidate structures.
hc = HillClimbSearch(data)
best_model = hc.estimate(
    scoring_method=BicScore(data),
)

# Show the directed edges of the learned structure.
print(best_model.edges())


In [None]:
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.models import BayesianNetwork

# Build the Bayesian network from the learned structure.
model = BayesianNetwork(best_model.edges())

# Constructing from edges alone drops any variable the search left without
# edges. Add every node of the learned DAG explicitly so those variables still
# receive a (marginal) CPD and can be used as query or evidence variables.
model.add_nodes_from(best_model.nodes())

# Estimate all CPDs (conditional probability distributions) by MLE.
model.fit(data, estimator=MaximumLikelihoodEstimator)

# Show the estimated table of every variable.
for cpd in model.get_cpds():
    print(cpd)


Codes assigned to the `marital-status` categories:

- 0 - Divorced
- 1 - Married-AF-spouse
- 2 - Married-civ-spouse
- 3 - Married-spouse-absent
- 4 - Never-married
- 5 - Separated
- 6 - Widowed


In [None]:
from pgmpy.inference import VariableElimination

# Exact inference (variable elimination) on the fitted Adult network.
inference = VariableElimination(model)

# P(income | hours-per-week = 20, marital-status = 6, age = 51):
# income distribution for a 51-year-old with marital-status code 6 who
# works 20 hours per week. Evidence values must be states seen in the data.
result = inference.query(
    variables=["income"],
    evidence={"hours-per-week": 20, "marital-status": 6, "age": 51},
)
print(result)
