In [None]:
%load_ext watermark
%watermark -a 'Christian Schuhegger' -u -d -v -p numpy,pandas,matplotlib,seaborn,rpy2,libpgm,pgmpy,networkx,dsbasics,pytest

In [None]:
%matplotlib inline
import numpy as np, scipy, scipy.stats as stats, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
import pgmpy, pgmpy.models, pgmpy.factors.discrete, pgmpy.inference, libpgm, pytest
import datetime, time
import matplotlib.dates
import pytz
from dateutil import relativedelta
import timeit

# pandas: allow very wide frames to be shown without column elision or wrapping.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# pd.set_option('display.float_format', lambda x: '%.2f' % x)

# NumPy: show more edge items, suppress scientific notation for small values,
# and widen printed lines to 180 characters. The line width is set through the
# public print-options API; assigning to the private attribute
# `np.core.arrayprint._line_width` is not read by current NumPy (so it has no
# effect on printing) and `np.core` itself is deprecated in NumPy 2.
np.set_printoptions(edgeitems=10, suppress=True, linewidth=180)

# Apply seaborn's default theme to all subsequent matplotlib/seaborn figures.
sns.set()

In [None]:
%load_ext autoreload
%autoreload 1
%aimport dsbasics.bn

In [None]:
import locale
# Switch every locale category to the portable 'C' locale before rpy2 is
# imported further down in this cell.
# NOTE(review): presumably this keeps R's number formatting and messages
# independent of the user's locale -- confirm the original motivation.
locale.setlocale(locale.LC_ALL, 'C')

import rpy2, rpy2.rinterface, rpy2.robjects, rpy2.robjects.packages, rpy2.robjects.lib, rpy2.robjects.lib.grid, \
    rpy2.robjects.lib.ggplot2, rpy2.robjects.pandas2ri, rpy2.interactive.process_revents, \
    rpy2.interactive, rpy2.robjects.lib.grdevices
# rpy2.interactive.process_revents.start()
# Enable automatic conversion between pandas objects and R data structures.
rpy2.robjects.pandas2ri.activate()

# Handles to R's "base" and "utils" packages.
base = rpy2.robjects.packages.importr('base')
utils = rpy2.robjects.packages.importr('utils')

# Use the first CRAN mirror in R's list for any package download.
utils.chooseCRANmirror(ind=1)

# R packages this notebook depends on.
packnames = ('bnlearn', 'gRain')

# Install only the packages that are not yet available in the R library.
names_to_install = [
    pkg for pkg in packnames
    if not rpy2.robjects.packages.isinstalled(pkg)
]
if names_to_install:
    # install.packages() takes an R character vector, hence the StrVector.
    utils.install_packages(rpy2.robjects.StrVector(names_to_install))

# Python-side handles to the R packages used in the rest of the notebook.
grdevices = rpy2.robjects.packages.importr('grDevices')
bnlearn = rpy2.robjects.packages.importr('bnlearn')
gRain = rpy2.robjects.packages.importr('gRain')

The test case follows exercise 5.1 from "Doing Bayesian Data Analysis" by John K. Kruschke:
* https://sites.google.com/site/doingbayesiandataanalysis/exercises
* [Kruschke-DBDA2E-ExerciseSolutions.pdf](https://sites.google.com/site/doingbayesiandataanalysis/exercises/Kruschke-DBDA2E-ExerciseSolutions.pdf) p. 17

In [None]:
# Prior belief over the disease state, as a one-column frame indexed by state.
p_disease_present = 0.001
prior = pd.DataFrame(
    {'disease-state': [p_disease_present, 1 - p_disease_present]},
    index=pd.Index(['disease-present', 'disease-absent']),
)

# Likelihood table: one row per test outcome, one column per disease state.
# Each column sums to 1.
p_test_positive_given_disease_present = 0.99
p_test_positive_given_disease_absent = 0.05

disease_test_cpd_df = pd.DataFrame(
    [
        [p_test_positive_given_disease_present,
         p_test_positive_given_disease_absent],
        [1 - p_test_positive_given_disease_present,
         1 - p_test_positive_given_disease_absent],
    ],
    index=pd.Index(['test-positive', 'test-negative']),
    columns=['disease-present', 'disease-absent'],
)

# Sequential Bayesian updating: start from the prior and, for each observed
# test outcome in turn (first a positive test, then a negative retest),
# multiply by the matching likelihood row and renormalise. The multiplication
# aligns on the disease-state labels. A single update with the 'test-positive'
# row alone is the one-test posterior.
posterior = prior['disease-state'].copy()
for test_outcome in ('test-positive', 'test-negative'):
    selected_row = disease_test_cpd_df.loc[test_outcome]
    unnormalized = selected_row * posterior
    posterior = unnormalized / unnormalized.sum()


In [None]:
# Show the prior distribution over the disease state.
prior

In [None]:
# Show the likelihood table: test outcome (rows) given disease state (columns).
disease_test_cpd_df

In [None]:
# Show the posterior after updating on a positive test followed by a negative test.
posterior

In [None]:
# Marginal distribution of the root node 'disease-state':
# state 0 = disease present, state 1 = disease absent.
p_disease_absent = 1.0 - p_disease_present
disease_state_CPD = pgmpy.factors.discrete.TabularCPD(
    variable='disease-state',
    variable_card=2,
    values=[[p_disease_present], [p_disease_absent]],
)
print(disease_state_CPD)

In [None]:
def make_test_result_cpd(variable, p_positive_given_present, p_positive_given_absent):
    """Build the CPD of one binary diagnostic test given 'disease-state'.

    The table has two rows (state 0 = test positive, state 1 = test negative)
    and two columns, one per state of the single evidence variable
    'disease-state' (column 0 = disease present, column 1 = disease absent).

    Parameters
    ----------
    variable : str
        Name of the test-result node the CPD belongs to.
    p_positive_given_present : float
        P(test positive | disease present).
    p_positive_given_absent : float
        P(test positive | disease absent).

    Returns
    -------
    pgmpy.factors.discrete.TabularCPD
        The conditional probability table P(variable | disease-state).
    """
    return pgmpy.factors.discrete.TabularCPD(
        variable=variable,
        variable_card=2,
        values=[[p_positive_given_present, p_positive_given_absent],
                [1 - p_positive_given_present, 1 - p_positive_given_absent]],
        evidence=['disease-state'],
        evidence_card=[2])


# Both test nodes use the same test characteristics, so they are built from
# one helper instead of two copy-pasted constructor calls.
test_result_CPD_1 = make_test_result_cpd(
    'test-result1',
    p_test_positive_given_disease_present,
    p_test_positive_given_disease_absent)
print(test_result_CPD_1)

test_result_CPD_2 = make_test_result_cpd(
    'test-result2',
    p_test_positive_given_disease_present,
    p_test_positive_given_disease_absent)
print(test_result_CPD_2)

In [None]:
# Network structure: 'disease-state' is the single parent of both test-result
# nodes.
model = pgmpy.models.BayesianModel()
model.add_nodes_from(['disease-state', 'test-result1', 'test-result2'])
for test_node in ('test-result1', 'test-result2'):
    model.add_edge('disease-state', test_node)

# Attach the probability tables, then let pgmpy validate the model.
# check_model() stays the last expression so its result is shown as the cell
# output.
model.add_cpds(disease_state_CPD, test_result_CPD_1, test_result_CPD_2)
model.check_model()

Related pgmpy issue, kept for reference:
* [TypeError: object of type 'dict_keyiterator' has no len()](https://github.com/pgmpy/pgmpy/issues/927)

In [None]:
# Query P(disease-state | test-result1 = state 0) with belief propagation.
# State 0 of a test-result node is the first CPD row, i.e. a positive test;
# element 0 of the resulting factor is the 'disease present' state.
evidence = {'test-result1': 0}
query_vars = ['disease-state']
infr1 = pgmpy.inference.BeliefPropagation(model)
bp_result = infr1.query(variables=query_vars, evidence=evidence)
p_disease = bp_result['disease-state'].values[0]
# Must agree with the reference value to a relative tolerance of 1e-7.
assert p_disease == pytest.approx(0.01943463, rel=0.0000001)

In [None]:
# Show P(disease present | first test positive) as computed by belief propagation.
p_disease

In [None]:
# Repeat the same query (same `query_vars` and `evidence` as the belief
# propagation cell) with variable elimination; both engines must agree with
# the reference value.
infr2 = pgmpy.inference.VariableElimination(model)
ve_result = infr2.query(variables=query_vars, evidence=evidence)
p_disease = ve_result['disease-state'].values[0]
assert p_disease == pytest.approx(0.01943463, rel=0.0000001)

In [None]:
# Show P(disease present | first test positive) as computed by variable elimination.
p_disease