# Beyond Naïve Bayes: complex Bayesian Network Architectures
Build two or three Bayes networks of more complex architecture for (a smaller version of) this data set, increasing the number
of connections among the nodes. Construct one of them semi-manually (e.g use K2 algorithm and vary the maximum number of parents), and two others – using Weka’s algorithms for learning Bayes net construction (e.g. use TAN or Hill Climbing algorithms). Run the experiments described in item 5 on these new Bayes network architectures. Record, compare and analyse the outputs, in the light of the previous conclusions about the given data. 

In [26]:
import math
import numpy as np
import pandas as pd
#from pomegranate import *

import pgmpy.models
import pgmpy.inference
import networkx as nx
import pylab as plt

In [5]:
pixel_values = [x for x in range(0, 256)]



In [6]:
# Example

# Create a bayesian network
model = pgmpy.models.BayesianModel([('Guest', 'Monty'), 
                                    ('Prize', 'Monty')])

# Define conditional probability distributions (CPD)
# Probability of guest selecting door 0, 1 and 2
cpd_guest = pgmpy.factors.discrete.TabularCPD('Guest', 3, [[0.33], [0.33], [0.33]])
# Probability that the price is behind door 0, 1 and 2
cpd_prize = pgmpy.factors.discrete.TabularCPD('Prize', 3, [[0.33], [0.33], [0.33]])
# Probability that Monty selects a door (0, 1, 2), when we know which door the guest has selected and we know were the prize is
cpd_monty = pgmpy.factors.discrete.TabularCPD('Monty', 3, [[0, 0, 0, 0, 0.5, 1, 0, 1, 0.5], 
                                                           [0.5, 0, 1, 0, 0, 0, 1, 0, 0.5], 
                                                           [0.5, 1, 0, 1, 0.5, 0, 0, 0, 0]], 
                                              evidence=['Guest', 'Prize'], 
                                              evidence_card=[3, 3])

# Add CPDs to the network structure
model.add_cpds(cpd_guest, cpd_prize, cpd_monty)

# Check if the model is valid, throw an exception otherwise
model.check_model()
# Print probability distributions
print('Probability distribution, P(Guest)')
print(cpd_guest)
print()
print('Probability distribution, P(Price)')
print(cpd_prize)
print()
print('Joint probability distribution, P(Monty | Guest, Price)')
print(cpd_monty)
print()

# Plot the model
nx.draw(model, with_labels=True)
#plt.savefig('C:\\DATA\\Python-data\\bayesian-networks\\monty-hall.png')
plt.close()

# Perform variable elimination for inference
# Variable elimination (VE) is a an exact inference algorithm in bayesian networks
infer = pgmpy.inference.VariableElimination(model)

# Calculate probabilites for doors including prize, the guest has selected door 0 and Monty has selected door 2
posterior_probability = infer.query(['Prize'], evidence={'Guest': 0, 'Monty': 2})

# Print posterior probability
print('Posterior probability, Guest(0) and Monty(2)')
print(posterior_probability)
print()

Finding Elimination Order: : : 0it [00:00, ?it/s]
0it [00:00, ?it/s]Probability distribution, P(Guest)
+----------+------+
| Guest(0) | 0.33 |
+----------+------+
| Guest(1) | 0.33 |
+----------+------+
| Guest(2) | 0.33 |
+----------+------+

Probability distribution, P(Price)
+----------+------+
| Prize(0) | 0.33 |
+----------+------+
| Prize(1) | 0.33 |
+----------+------+
| Prize(2) | 0.33 |
+----------+------+

Joint probability distribution, P(Monty | Guest, Price)
+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+
| Guest    | Guest(0) | Guest(0) | Guest(0) | Guest(1) | Guest(1) | Guest(1) | Guest(2) | Guest(2) | Guest(2) |
+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+
| Prize    | Prize(0) | Prize(1) | Prize(2) | Prize(0) | Prize(1) | Prize(2) | Prize(0) | Prize(1) | Prize(2) |
+----------+----------+----------+----------+----------+----------+---------

## Specific for 

In [7]:
from Scripts import helperfn as hf
from Scripts import pixelFinder as pf 
from Scripts.NaiveBayse import SamNaiveBayseGaussian as nbg
%load_ext autoreload
%autoreload 2


Finding Elimination Order: : : 0it [00:00, ?it/s]


In [8]:
pixels = pf.get_top_pixels(5)

In [9]:
raw_data = pf.data_lists()

In [10]:
X, y = raw_data[1]

In [11]:
five_pixels = np.take(X, pixels[1], axis=1)
five_pixels.shape

(9690, 5)

In [13]:
test = five_pixels[0:423]
y = y[0:423]



In [14]:
pixels[0]

array([1714, 1762, 1761, 1743, 1694], dtype=int64)

In [15]:
model = pgmpy.models.BayesianModel([('1362', 'speed limit 20'), 
                                    ('982', 'speed limit 20'),
                                    ('1030', 'speed limit 20'),
                                    ('1315', 'speed limit 20'),
                                    ('1314', 'speed limit 20')])

In [16]:
def get_CPD(data, pixel_index, cardinality=256):
    #conditional probability distribution table for pixel at index
    entries = data.shape[0]
    calc = np.zeros(cardinality, dtype=int)

    for i in range(entries):
        val = data[i][pixel_index].astype(int)
        calc[val] += 1

    calc = calc/entries
    return calc[..., None].tolist()

def pixels_as_string(pixels):
    return list(map(str, pixels))



In [17]:
#cpd_1362 = pgmpy.factors.discrete.TabularCPD('1362', 256, get_CPD(test, 0))
#print('Probability distribution, P(1362)')
#print(cpd_1362)

In [18]:
cpd_labels = pixels_as_string(pixels[1])

independant_cpd_pixel_tables = [pgmpy.factors.discrete.TabularCPD(value, 256, get_CPD(test, index)) for index, value in enumerate(cpd_labels)]
independant_cpd_class_table = pgmpy.factors.discrete.TabularCPD('speed limit 20', 2, get_CPD(y, 0, cardinality=2))

In [19]:
print(independant_cpd_class_table)

+-------------------+----------+
| speed limit 20(0) | 0.496454 |
+-------------------+----------+
| speed limit 20(1) | 0.503546 |
+-------------------+----------+


In [21]:
#len(independant_cpd_tables)
#independant_cpd_tables[0].get_values()[55]

# need equal width binning for the grayscale values, try bin sizes of between 8 and 16

In [22]:
from itertools import combinations

from pgmpy.estimators import HillClimbSearch, ExhaustiveSearch
from pgmpy.estimators import K2Score

In [36]:
test = pd.DataFrame(five_pixels[0:423])
y = pd.DataFrame(y[0:423])


In [38]:
y.columns = ['y']
test = test.join(y)
test

Unnamed: 0,1362,982,1030,1315,1314,y
0,160.0,191.0,155.0,166.0,195.0,0
1,218.0,141.0,132.0,192.0,226.0,0
2,166.0,142.0,135.0,148.0,194.0,0
3,152.0,136.0,127.0,121.0,161.0,0
4,212.0,212.0,151.0,192.0,224.0,0
...,...,...,...,...,...,...
418,255.0,255.0,255.0,254.0,255.0,1
419,255.0,254.0,253.0,255.0,255.0,1
420,255.0,250.0,222.0,255.0,255.0,1
421,255.0,226.0,211.0,255.0,255.0,1


In [31]:
test = pd.DataFrame(test)
scoring_method = K2Score(data=test)
est = HillClimbSearch(data=test, scoring_method=scoring_method)
estimated_model = est.estimate(max_indegree=4, max_iter=int(1e4))

KeyboardInterrupt: 