# Midterm 2 - Assignment 5

### Main objective

Learn the structure of the Bayesian Network (BN) resulting from the dataset DSET4 using two BN structure learning algorithms of your choice. For instance, you can consider the algorithms implemented in pgmpy or any other comparable library (e.g. see the libraries listed in Lecture 7). Compare and discuss the results obtained with the two algorithms. Also discuss any hyperparameter or design choices you had to make.

### Data processing

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Read the car dataset from the UCI repository. Column names are passed
# explicitly via `names`, so the first line of the file is treated as data.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"
names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
data = pd.read_csv(url, names=names)

# Quick look at the raw categorical values.
print(data.head())

# Split the frame into the attribute columns (X) and the 'class' target (y).
# No missing-value handling is performed in this cell.
target_col = names[-1]
X = data.drop(columns=[target_col])
y = data[target_col]

# One-hot encode attributes and target separately; each encoder returns a
# sparse matrix, which is densified with .toarray().
# NOTE(review): every categorical variable becomes several mutually exclusive
# 0/1 columns, and the cells below feed these columns to structure learning
# as if they were independent variables - confirm this is the intended design.
feature_encoder = OneHotEncoder()
target_encoder = OneHotEncoder()
X_encoded = feature_encoder.fit_transform(X).toarray()
y_encoded = target_encoder.fit_transform(y.values.reshape(-1, 1)).toarray()

# Full encoded matrix: attribute indicator columns followed by class columns.
data_encoded = np.hstack((X_encoded, y_encoded))

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y_encoded, test_size=0.2, random_state=42
)

# Re-attach the class columns to the right of the attribute columns per split.
data_train_encoded = np.hstack((X_train, y_train))
data_test_encoded = np.hstack((X_test, y_test))

# One print call; sep="\n" reproduces the line-by-line output, including the
# blank lines produced by the "\n" item.
print("TR data", data_train_encoded, "\n", "TS data", data_test_encoded, sep="\n")

  buying  maint doors persons lug_boot safety  class
0  vhigh  vhigh     2       2    small    low  unacc
1  vhigh  vhigh     2       2    small    med  unacc
2  vhigh  vhigh     2       2    small   high  unacc
3  vhigh  vhigh     2       2      med    low  unacc
4  vhigh  vhigh     2       2      med    med  unacc
TR data
[[0. 0. 0. ... 0. 1. 0.]
 [0. 0. 1. ... 0. 1. 0.]
 [0. 1. 0. ... 0. 1. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 1. 0.]
 [0. 0. 1. ... 0. 0. 0.]]


TS data
[[1. 0. 0. ... 0. 1. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 1. 0.]
 ...
 [0. 0. 1. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 1. ... 0. 1. 0.]]


### PC algorithm

In [2]:
from pgmpy.estimators import PC
from pgmpy.models import BayesianNetwork

# Constraint-based structure learning with the PC algorithm on the training
# matrix. The DataFrame is built from a bare ndarray, so the variables are
# labelled by integer column index.
pc = PC(pd.DataFrame(data_train_encoded))

# Only return_type is passed, so the conditional-independence test,
# significance level and remaining settings are the library defaults.
dag = pc.estimate(return_type='dag')

# Copy the learned DAG into a BayesianNetwork so it can be fitted and queried
# later; nodes are added first so that isolated nodes are kept as well.
model_pc = BayesianNetwork()
model_pc.add_nodes_from(dag.nodes())
model_pc.add_edges_from(dag.edges())

# Show the learned structure (heading, nodes, edges - one per line).
print(
    "Nodes and edges of the PC model:",
    model_pc.nodes(),
    model_pc.edges(),
    sep="\n",
)

  from .autonotebook import tqdm as notebook_tqdm
Working for n conditional variables: 5: 100%|██████████| 5/5 [00:04<00:00,  1.01s/it]INFO:pgmpy:Reached maximum number of allowed conditional variables. Exiting
Working for n conditional variables: 5: 100%|██████████| 5/5 [00:04<00:00,  1.01it/s]

Nodes and edges of the PC model:
[5, 4, 21, 22, 18, 19, 20, 14, 12, 6, 7, 24, 23, 2, 0, 3, 13, 16, 17, 1, 8, 9, 15, 11, 10]
[(5, 4), (5, 7), (21, 22), (21, 23), (18, 19), (18, 24), (18, 20), (20, 24), (20, 19), (14, 12), (14, 13), (12, 23), (6, 4), (6, 7), (6, 5), (2, 0), (2, 3), (13, 12), (16, 17), (1, 0), (1, 3), (1, 2), (8, 9), (8, 10), (8, 11), (15, 17), (15, 16), (11, 9), (11, 10), (10, 9)]





### Hill Climbing Algorithm

In [3]:
# Score-based structure learning with Hill Climbing search
from pgmpy.estimators import HillClimbSearch

# The search runs on the training matrix; variables are labelled by integer
# column index because the DataFrame is built from a bare ndarray.
hc = HillClimbSearch(pd.DataFrame(data_train_encoded))

# Candidate structures are ranked with the K2 score; every other search
# setting is left at the library default.
best_model = hc.estimate(scoring_method='k2score')

# Copy the best structure found into a BayesianNetwork for later fitting.
model_hc = BayesianNetwork()
model_hc.add_nodes_from(best_model.nodes())
model_hc.add_edges_from(best_model.edges())

# Show the learned structure (heading, nodes, edges - one per line).
print(
    "Nodes and edges of the HC model:",
    model_hc.nodes(),
    model_hc.edges(),
    sep="\n",
)


  0%|          | 56/1000000 [00:00<4:37:15, 60.11it/s]

Nodes and edges of the HC model:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
[(0, 2), (0, 1), (0, 12), (1, 2), (3, 2), (3, 1), (3, 0), (3, 12), (4, 3), (5, 4), (5, 6), (6, 4), (7, 4), (7, 6), (7, 3), (7, 5), (9, 8), (9, 10), (9, 11), (10, 8), (11, 8), (11, 10), (12, 13), (12, 14), (14, 13), (15, 17), (16, 17), (16, 15), (17, 21), (18, 19), (18, 20), (18, 21), (19, 21), (19, 12), (19, 22), (19, 7), (20, 19), (21, 23), (21, 3), (21, 7), (22, 23), (22, 5), (22, 21), (22, 0), (22, 6), (22, 1), (23, 12), (23, 3), (23, 7), (24, 23), (24, 18), (24, 21), (24, 15), (24, 0), (24, 5)]





### Data fitting

In [4]:
# Parameter learning: estimate the CPDs of both learned structures with
# Maximum Likelihood Estimation (other estimators are available in pgmpy).
from pgmpy.estimators import MaximumLikelihoodEstimator

# Fit the PC model first, then the HC model, each on its own DataFrame view
# of the training matrix.
for network in (model_pc, model_hc):
    network.fit(pd.DataFrame(data_train_encoded), estimator=MaximumLikelihoodEstimator)


### Predictions 

In [5]:
# Predict with each fitted network. Only the attribute columns (X_test) are
# supplied, so predict() returns values for the columns that are absent from
# the input, i.e. the class indicator columns.

# PC model
y_pred_pc = model_pc.predict(pd.DataFrame(X_test))
print("Predictions using PC", y_pred_pc, sep="\n")

# HC model
y_pred_hc = model_hc.predict(pd.DataFrame(X_test))
print("Predictions using HC", y_pred_hc, sep="\n")

100%|██████████| 346/346 [00:02<00:00, 131.96it/s]


Predictions using PC
      24   21   22   23
0    0.0  0.0  0.0  1.0
1    0.0  0.0  0.0  1.0
2    0.0  0.0  0.0  1.0
3    0.0  0.0  0.0  1.0
4    0.0  0.0  0.0  1.0
..   ...  ...  ...  ...
341  0.0  0.0  0.0  1.0
342  0.0  0.0  0.0  1.0
343  0.0  0.0  0.0  1.0
344  0.0  0.0  0.0  1.0
345  0.0  0.0  0.0  1.0

[346 rows x 4 columns]


100%|██████████| 346/346 [00:00<00:00, 2369.09it/s]


Predictions using HC
      24   21   22   23
0    0.0  0.0  0.0  1.0
1    0.0  1.0  0.0  0.0
2    0.0  0.0  0.0  1.0
3    0.0  1.0  0.0  0.0
4    0.0  0.0  0.0  1.0
..   ...  ...  ...  ...
341  0.0  0.0  0.0  1.0
342  0.0  0.0  0.0  1.0
343  0.0  0.0  0.0  1.0
344  0.0  0.0  0.0  1.0
345  0.0  0.0  0.0  1.0

[346 rows x 4 columns]


In [6]:
# Print nodes and edges of both learned networks side by side for comparison.

# PC model structure
print("Nodes in PC model: " + str(model_pc.nodes()))
print("Edges in PC model: " + str(model_pc.edges()))

# Blank lines between the two models
print("\n")

# HC model structure
print("Nodes in HC model: " + str(model_hc.nodes()))
print("Edges in HC model: " + str(model_hc.edges()))

Nodes in PC model: [5, 4, 21, 22, 18, 19, 20, 14, 12, 6, 7, 24, 23, 2, 0, 3, 13, 16, 17, 1, 8, 9, 15, 11, 10]
Edges in PC model: [(5, 4), (5, 7), (21, 22), (21, 23), (18, 19), (18, 24), (18, 20), (20, 24), (20, 19), (14, 12), (14, 13), (12, 23), (6, 4), (6, 7), (6, 5), (2, 0), (2, 3), (13, 12), (16, 17), (1, 0), (1, 3), (1, 2), (8, 9), (8, 10), (8, 11), (15, 17), (15, 16), (11, 9), (11, 10), (10, 9)]


Nodes in HC model: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
Edges in HC model: [(0, 2), (0, 1), (0, 12), (1, 2), (3, 2), (3, 1), (3, 0), (3, 12), (4, 3), (5, 4), (5, 6), (6, 4), (7, 4), (7, 6), (7, 3), (7, 5), (9, 8), (9, 10), (9, 11), (10, 8), (11, 8), (11, 10), (12, 13), (12, 14), (14, 13), (15, 17), (16, 17), (16, 15), (17, 21), (18, 19), (18, 20), (18, 21), (19, 21), (19, 12), (19, 22), (19, 7), (20, 19), (21, 23), (21, 3), (21, 7), (22, 23), (22, 5), (22, 21), (22, 0), (22, 6), (22, 1), (23, 12), (23, 3), (23, 7), (24, 23), (24, 18),

### Structure scores of the models

In [7]:
from scores import compute_structure_score

# Report the structure scores of both networks on the held-out test matrix.
# compute_structure_score is a project-local helper; it is called here only
# for the output it prints.
structure_reports = (
    ("Structure score for PC:", model_pc),
    ("Structure score for HC:", model_hc),
)
for position, (heading, network) in enumerate(structure_reports):
    if position:
        # Blank lines between consecutive reports.
        print("\n")
    print(heading)
    compute_structure_score(network, data_test_encoded)

Structure score for PC:
k2 score: -3384.823199700444
bdeu score: -3424.808248803261
bic score: -3375.2576757455663
bds score: -3781.977868765447


Structure score for HC:
k2 score: -2957.889126228843
bdeu score: -2958.594353603475
bic score: -3152.0380245178853
bds score: -3494.292343403773


### Correlation scores of the models

In [8]:
from scores import compute_correlation_score

# Report the correlation scores of both networks on the held-out test matrix.
# compute_correlation_score is a project-local helper; it is called here only
# for the output it prints.
correlation_reports = (
    ("Correlation score for PC:", model_pc),
    ("Correlation score for HC:", model_hc),
)
for position, (heading, network) in enumerate(correlation_reports):
    if position:
        # Blank lines between consecutive reports.
        print("\n")
    print(heading)
    compute_correlation_score(network, data_test_encoded)

Correlation score for PC:
chi_square score (accuracy): 0.9314516129032258
g_sq score (accuracy): 0.9249492900608519
log_likelihood score (accuracy): 0.9249492900608519
freeman_tuckey score (accuracy): 0.9224489795918367
modified_log_likelihood score (accuracy): 0.9202453987730062
neyman score (accuracy): 0.9202453987730062
cressie_read score (accuracy): 0.9292929292929293


Correlation score for HC:
chi_square score (accuracy): 0.5482866043613707
g_sq score (accuracy): 0.5534591194968553
log_likelihood score (accuracy): 0.5534591194968553
freeman_tuckey score (accuracy): 0.5523809523809524
modified_log_likelihood score (accuracy): 0.554140127388535
neyman score (accuracy): 0.554140127388535
cressie_read score (accuracy): 0.55


### Final considerations