# Probabilistic Graphical Models Using Python

# Bayesian Network Fundamentals

## Representing independencies using pgmpy

In [32]:
# Firstly we need to import IndependenceAssertion
from pgmpy.independencies import IndependenceAssertion

In [33]:
# IndependenceAssertion(X, Y, Z) states that X is independent of Y given Z.
# Leaving out the third argument yields the unconditional assertion
# X _|_ Y, as the displayed repr shows.
assertion1 = IndependenceAssertion('X', 'Y')
assertion1

(X _|_ Y)

In [34]:
# Conditional form: X is independent of Y given Z.
assertion2 = IndependenceAssertion('X', 'Y', 'Z')
assertion2

(X _|_ Y | Z)

In [35]:
from pgmpy.independencies import Independencies
# There are multiple ways to create an Independencies object: we can
# either initialize an empty object or initialize it with some
# assertions.

In [36]:
# Start from an empty container and list the assertions it holds
# (none yet).
independencies = Independencies() # Empty object
independencies.get_assertions()

[]

In [37]:
# Register the two assertions created earlier, then list the contents again.
independencies.add_assertions(assertion1, assertion2)
independencies.get_assertions()

[(X _|_ Y), (X _|_ Y | Z)]

# Representing joint probability distributions using pgmpy

In [38]:
from pgmpy.factors.discrete import JointProbabilityDistribution as Joint
# Uniform joint distribution over two binary variables: each of the
# 2 x 2 outcomes has probability 0.25.
distribution = Joint(
    ['coin1', 'coin2'],
    [2, 2],
    [0.25] * 4,
)

In [39]:
# Ask whether coin1 and coin2 are independent under this joint distribution.
distribution.check_independence(['coin1'], ['coin2'])

True

# Conditional probability distribution

## Representing CPDs using pgmpy

**Restaurant example**\
When choosing a restaurant, we don't look only at the quality of the food;
we may also want to consider other attributes,
such as the cost, location, size, and so on.

In [40]:
from pgmpy.factors.discrete import TabularCPD
# A TabularCPD is built from three pieces of information: the variable's
# name, its cardinality (the number of states it can take), and one
# probability value per state (one row per state).
quality = TabularCPD(
    variable='Quality',
    variable_card=3,
    values=[
        [0.3],
        [0.5],
        [0.2],
    ],
)
print(quality)

+------------+-----+
| Quality(0) | 0.3 |
+------------+-----+
| Quality(1) | 0.5 |
+------------+-----+
| Quality(2) | 0.2 |
+------------+-----+


In [41]:
# Names of the variables this CPD is defined over.
quality.variables

['Quality']

In [42]:
# Number of states of each variable in the CPD.
quality.cardinality

array([3])

In [43]:
# Probability values stored in the CPD.
quality.values

array([0.3, 0.5, 0.2])

In [44]:
# Marginal distribution of the two-state variable Location.
location = TabularCPD(
    variable='Location',
    variable_card=2,
    values=[
        [0.6],
        [0.4],
    ],
)
print(location)

+-------------+-----+
| Location(0) | 0.6 |
+-------------+-----+
| Location(1) | 0.4 |
+-------------+-----+


In [45]:
# Cost is conditioned on two evidence variables: Q (3 states) and L (2 states).
# Each column is one (Q, L) combination with L varying fastest:
# (Q0, L0), (Q0, L1), (Q1, L0), (Q1, L1), (Q2, L0), (Q2, L1).
# Each row holds the probabilities of one Cost state; every column sums to 1.
# NOTE(review): the evidence names 'Q' and 'L' differ from the 'Quality' and
# 'Location' names used by the standalone CPDs in this notebook.
cost = TabularCPD(
    variable='Cost',
    variable_card=2,
    evidence=['Q', 'L'],
    evidence_card=[3, 2],
    values=[
        [0.8, 0.6, 0.1, 0.6, 0.6, 0.05],  # Cost(0)
        [0.2, 0.4, 0.9, 0.4, 0.4, 0.95],  # Cost(1)
    ],
)

In [46]:
# Render the conditional table: one column per (Q, L) evidence combination.
print(cost)

+---------+------+------+------+------+------+------+
| Q       | Q(0) | Q(0) | Q(1) | Q(1) | Q(2) | Q(2) |
+---------+------+------+------+------+------+------+
| L       | L(0) | L(1) | L(0) | L(1) | L(0) | L(1) |
+---------+------+------+------+------+------+------+
| Cost(0) | 0.8  | 0.6  | 0.1  | 0.6  | 0.6  | 0.05 |
+---------+------+------+------+------+------+------+
| Cost(1) | 0.2  | 0.4  | 0.9  | 0.4  | 0.4  | 0.95 |
+---------+------+------+------+------+------+------+


# Graph theory

# Bayesian models

Example: getting late for school

<img src="images/gettinglateforschool.jpg" width="450">

Each random variable is discrete with only two possible states, {yes, no}.

In [47]:
from pgmpy.models import BayesianModel
# Start with an empty network; nodes, edges and CPDs are added in the
# following cells.
# NOTE(review): newer pgmpy releases appear to rename this class
# (BayesianNetwork) -- confirm against the installed version.
model = BayesianModel()

In [48]:
# Declare the two nodes explicitly, then the directed edge rain -> traffic_jam.
model.add_nodes_from(['rain', 'traffic_jam'])
model.add_edge('rain', 'traffic_jam')

If we add an edge between nodes that are not yet present in the model,
pgmpy automatically adds those nodes to the model.

In [49]:
# 'accident' was never added as a node; the node listing shows that
# add_edge created it.
model.add_edge('accident', 'traffic_jam')
model.nodes()

NodeView(('rain', 'traffic_jam', 'accident'))

In [50]:
# List the directed edges currently in the model.
model.edges()

OutEdgeView([('rain', 'traffic_jam'), ('accident', 'traffic_jam')])

In [51]:
from pgmpy.factors.discrete import TabularCPD
# Marginal CPDs for the two root causes.
cpd_rain = TabularCPD(
    variable='rain',
    variable_card=2,
    values=[[0.4], [0.6]],
)
cpd_accident = TabularCPD(
    variable='accident',
    variable_card=2,
    values=[[0.2], [0.8]],
)

# traffic_jam is conditioned on both causes: one column per
# (rain, accident) combination, one row per traffic_jam state.
cpd_traffic_jam = TabularCPD(
    variable='traffic_jam',
    variable_card=2,
    values=[
        [0.9, 0.6, 0.7, 0.1],
        [0.1, 0.4, 0.3, 0.9],
    ],
    evidence=['rain', 'accident'],
    evidence_card=[2, 2],
)

In [52]:
# Associate the CPDs with the model. To do so, we just need to call the add_cpds method, and pgmpy automatically works out which CPD belongs to which node.

In [53]:
# Attach the three CPDs to the model, then list what is registered.
model.add_cpds(cpd_rain, cpd_accident, cpd_traffic_jam)
model.get_cpds()

[<TabularCPD representing P(rain:2) at 0x7fa437152d90>,
 <TabularCPD representing P(accident:2) at 0x7fa437152dd0>,
 <TabularCPD representing P(traffic_jam:2 | rain:2, accident:2) at 0x7fa437152e50>]

In [54]:
# Extend the network with traffic_jam -> long_queues.
model.add_node('long_queues')
model.add_edge('traffic_jam', 'long_queues')

# One column per traffic_jam state, one row per long_queues state.
cpd_long_queues = TabularCPD(
    variable='long_queues',
    variable_card=2,
    values=[
        [0.9, 0.2],
        [0.1, 0.8],
    ],
    evidence=['traffic_jam'],
    evidence_card=[2],
)

In [55]:
# Show the table: one column per state of traffic_jam.
print(cpd_long_queues)

+----------------+----------------+----------------+
| traffic_jam    | traffic_jam(0) | traffic_jam(1) |
+----------------+----------------+----------------+
| long_queues(0) | 0.9            | 0.2            |
+----------------+----------------+----------------+
| long_queues(1) | 0.1            | 0.8            |
+----------------+----------------+----------------+


In [56]:
# Register the CPD for long_queues, then add the two remaining nodes.
model.add_cpds(cpd_long_queues)
model.add_nodes_from(['getting_up_late',
'late_for_school'])

In [57]:
# late_for_school gets two parents: getting_up_late and traffic_jam.
model.add_edges_from(
[('getting_up_late', 'late_for_school'),
('traffic_jam', 'late_for_school')])


In [58]:
# Marginal CPD for the parentless node getting_up_late.
cpd_getting_up_late = TabularCPD(
    variable='getting_up_late',
    variable_card=2,
    values=[
        [0.6],
        [0.4],
    ],
)

In [59]:
# late_for_school is conditioned on its two parents: one column per
# (getting_up_late, traffic_jam) combination, one row per state.
cpd_late_for_school = TabularCPD(
    variable='late_for_school',
    variable_card=2,
    values=[
        [0.9, 0.45, 0.8, 0.1],
        [0.1, 0.55, 0.2, 0.9],
    ],
    evidence=['getting_up_late', 'traffic_jam'],
    evidence_card=[2, 2],
)

In [60]:
# Attach the last two CPDs; the listing now shows one CPD per node.
model.add_cpds(cpd_getting_up_late, cpd_late_for_school)
model.get_cpds()

[<TabularCPD representing P(rain:2) at 0x7fa437152d90>,
 <TabularCPD representing P(accident:2) at 0x7fa437152dd0>,
 <TabularCPD representing P(traffic_jam:2 | rain:2, accident:2) at 0x7fa437152e50>,
 <TabularCPD representing P(long_queues:2 | traffic_jam:2) at 0x7fa437151550>,
 <TabularCPD representing P(getting_up_late:2) at 0x7fa437151dd0>,
 <TabularCPD representing P(late_for_school:2 | getting_up_late:2, traffic_jam:2) at 0x7fa4371514d0>]

pgmpy also provides a check_model method that checks whether the model and all the associated CPDs are consistent.

In [61]:
# Validate the model together with its associated CPDs.
model.check_model()

True

If a wrong CPD has been associated with the model and we want to remove it, we can use the remove_cpds method:

In [62]:
# Drop the CPD registered for 'late_for_school' and list what remains.
# After this call the model no longer holds a CPD for that node.
model.remove_cpds('late_for_school')
model.get_cpds()

[<TabularCPD representing P(rain:2) at 0x7fa437152d90>,
 <TabularCPD representing P(accident:2) at 0x7fa437152dd0>,
 <TabularCPD representing P(traffic_jam:2 | rain:2, accident:2) at 0x7fa437152e50>,
 <TabularCPD representing P(long_queues:2 | traffic_jam:2) at 0x7fa437151550>,
 <TabularCPD representing P(getting_up_late:2) at 0x7fa437151dd0>]

### Reasoning pattern in Bayesian networks

Would the probability of having a road accident change if I knew that there was a
traffic jam? What are the chances that it rained heavily today if some student
comes late to class?

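$$
P_{J}=P(A, R, J, G, L, Q)=P(A)\, P(R)\, P(G)\, P(J \mid A, R)\, P(Q \mid J)\, P(L \mid G, J)
$$

where $A$ = accident, $R$ = rain, $J$ = traffic jam, $G$ = getting up late, $L$ = late for school, and $Q$ = long queues.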

In [63]:
# accident and rain are both parents of traffic_jam; with nothing observed
# there is no active trail between them.
model.is_active_trail('accident', 'rain')

False

In [64]:
# Observing their common child traffic_jam activates the trail between
# accident and rain.
model.is_active_trail('accident', 'rain',
observed='traffic_jam')

True

In [65]:
# The only connection runs through late_for_school, the common child of
# getting_up_late and traffic_jam; unobserved, it leaves the trail inactive.
model.is_active_trail('getting_up_late', 'rain')

False

In [66]:
# Observing late_for_school opens the trail from getting_up_late through
# traffic_jam back to rain.
model.is_active_trail('getting_up_late', 'rain',
observed='late_for_school')

True

### Rule CPD

<img src="images/RuleCPD.jpg" width="450">

In [None]:
# Tree-structured CPD for A. Each tuple is either
#   (node, Factor over A, edge label)  -- a leaf holding a distribution, or
#   (node, child node, edge label)     -- an internal edge.
# The original cell had the tail of every Factor value list displaced to the
# end of the cell, which made it a syntax error; the lists are re-joined here
# in order, so every leaf distribution sums to 1.
# NOTE(review): this cell has no execution count, and it imports from
# `pgmpy.factors` while the rest of the notebook uses
# `pgmpy.factors.discrete` -- confirm that the installed pgmpy still
# provides TreeCPD and Factor before running it.
from pgmpy.factors import TreeCPD, Factor
tree_cpd = TreeCPD([
    ('B', Factor(['A'], [2], [0.8, 0.2]), '0'),
    ('B', 'C', '1'),
    ('C', Factor(['A'], [2], [0.1, 0.9]), '0'),
    ('C', 'D', '1'),
    ('D', Factor(['A'], [2], [0.9, 0.1]), '0'),
    ('D', Factor(['A'], [2], [0.4, 0.6]), '1')])

## Markov Network Fundamentals