In [None]:
%load_ext watermark
%watermark -a 'Christian Schuhegger' -u -d -v -p numpy,pandas,matplotlib,seaborn,rpy2,libpgm,pgmpy,networkx,graphviz,xarray,dsbasics,pytest
#,pygraphviz

In [None]:
%matplotlib inline
import numpy as np, scipy, scipy.stats as stats, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
import pgmpy, pgmpy.models, pgmpy.factors.discrete, pgmpy.inference, libpgm, pytest
import datetime, time
import matplotlib.dates
import pytz
from dateutil import relativedelta
import timeit

# Let pandas show wide frames without truncating columns or wrapping rows early.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Configure numpy printing through the public API only.
# The line width used to be set by assigning to the private attribute
# ``np.core.arrayprint._line_width``; that is not a supported interface, so the
# width is now passed as ``linewidth`` together with the other print options.
np.set_printoptions(edgeitems=10, suppress=True, linewidth=180)

sns.set()

In [None]:
from IPython.display import display, HTML

from IPython.display import display_html
def display_side_by_side(*args):
    """Render several tables next to each other in the notebook output.

    Parameters
    ----------
    *args : pandas.DataFrame, pandas.Series or numpy.ndarray
        Objects to show. Arrays are wrapped in a ``DataFrame`` and Series are
        converted with ``to_frame()`` so that every item can be rendered with
        ``to_html()``.

    Returns
    -------
    None
        The combined HTML is sent to the notebook via ``display_html``.
    """
    fragments = []
    for obj in args:
        if isinstance(obj, np.ndarray):
            obj = pd.DataFrame(obj)
        elif isinstance(obj, pd.Series):
            # A Series has no ``to_html``; render it as a one-column frame.
            obj = obj.to_frame()
        fragments.append(obj.to_html())
    # Only patch the opening tags. Replacing the bare word 'table' would also
    # rewrite the closing ``</table>`` tags and any cell text containing it.
    html_str = ''.join(fragments).replace('<table', '<table style="display:inline"')
    display_html(html_str, raw=True)

# Stylesheet snippet that sets `flex-direction: row` on elements with the
# `.output` class. It is only defined here: the line that would inject it into
# the page (below) is commented out, so the rule is currently not applied.
CSS = """
.output {
    flex-direction: row;
}
"""

#HTML('<style>{}</style>'.format(CSS))

In [None]:
%load_ext autoreload
%autoreload 1
%aimport dsbasics.bn

* [rpy2](https://rpy2.bitbucket.io/) on Bitbucket
* [rpy2](https://rpy2.github.io/) on GitHub Pages

* [bnlearn](http://www.bnlearn.com/)
* [manual](http://www.bnlearn.com/documentation/man/)

In [None]:
import locale
locale.setlocale(locale.LC_ALL, 'C')

import rpy2, rpy2.rinterface, rpy2.robjects, rpy2.robjects.packages, rpy2.robjects.lib, rpy2.robjects.lib.grid, \
    rpy2.robjects.lib.ggplot2, rpy2.robjects.pandas2ri, rpy2.interactive.process_revents, \
    rpy2.interactive, rpy2.robjects.lib.grdevices
# rpy2.interactive.process_revents.start()
rpy2.robjects.pandas2ri.activate()

# Handles to R's "base" and "utils" packages.
base = rpy2.robjects.packages.importr('base')
utils = rpy2.robjects.packages.importr('utils')

# Use the first CRAN mirror in the list for package downloads.
utils.chooseCRANmirror(ind=1)

# R packages this notebook relies on.
packnames = ('bnlearn', 'gRain')

# Install only the packages that are not installed yet.
names_to_install = [
    pkg
    for pkg in packnames
    if not rpy2.robjects.packages.isinstalled(pkg)
]
if names_to_install:
    utils.install_packages(rpy2.robjects.StrVector(names_to_install))

# Import the R packages used below.
grdevices = rpy2.robjects.packages.importr('grDevices')
bnlearn = rpy2.robjects.packages.importr('bnlearn')
gRain = rpy2.robjects.packages.importr('gRain')

The test case follows exercise 5.1 from "Doing Bayesian Data Analysis" by John K. Kruschke:
* https://sites.google.com/site/doingbayesiandataanalysis/exercises
* [Kruschke-DBDA2E-ExerciseSolutions.pdf](https://sites.google.com/site/doingbayesiandataanalysis/exercises/Kruschke-DBDA2E-ExerciseSolutions.pdf) p. 17

In [None]:
# Prior over the disease state, indexed by state label.
p_disease_present = 0.001
prior = pd.DataFrame(
    {'disease-state': [p_disease_present, 1-p_disease_present]},
    index=pd.Index(['disease-present', 'disease-absent']),
)

# Probability of a positive test for each disease state.
p_test_positive_given_disease_present = 0.99
p_test_positive_given_disease_absent = 0.05

# Likelihood table: one row per test outcome, one column per disease state.
disease_test_cpd_df = pd.DataFrame(
    {
        'disease-present': [p_test_positive_given_disease_present,
                            1 - p_test_positive_given_disease_present],
        'disease-absent': [p_test_positive_given_disease_absent,
                           1 - p_test_positive_given_disease_absent],
    },
    index=pd.Index(['test-positive', 'test-negative']),
)

# Sequential Bayesian update: start from the prior, then for each observed
# test outcome multiply by the matching likelihood row and renormalise.
posterior = prior['disease-state'].copy()
for observed_result in ('test-positive', 'test-negative'):
    likelihood = disease_test_cpd_df.loc[observed_result]
    unnormalised = likelihood * posterior
    posterior = unnormalised / float(unnormalised.sum())


In [None]:
prior

In [None]:
disease_test_cpd_df

In [None]:
posterior

In [None]:
# CPD of the parentless 'disease-state' node: a single column with one row per
# state (present, absent).
disease_state_CPD = pgmpy.factors.discrete.TabularCPD(
    'disease-state',
    2,
    values=[
        [p_disease_present],
        [1.0 - p_disease_present],
    ],
)
print(disease_state_CPD)

In [None]:
# CPD of the first test result given the disease state.
# First row: positive-test probabilities; second row: their complements.
# Columns follow the states of 'disease-state' (present, absent).
test_result_CPD_1 = pgmpy.factors.discrete.TabularCPD(
    'test-result1',
    2,
    values=[
        [p_test_positive_given_disease_present, p_test_positive_given_disease_absent],
        [1 - p for p in (p_test_positive_given_disease_present,
                         p_test_positive_given_disease_absent)],
    ],
    evidence=['disease-state'],
    evidence_card=[2],
)
print(test_result_CPD_1)

In [None]:
# CPD of the second test result given the disease state; it uses the same
# probabilities as the first test.
# First row: positive-test probabilities; second row: their complements.
test_result_CPD_2 = pgmpy.factors.discrete.TabularCPD(
    'test-result2',
    2,
    values=[
        [p_test_positive_given_disease_present, p_test_positive_given_disease_absent],
        [1 - p for p in (p_test_positive_given_disease_present,
                         p_test_positive_given_disease_absent)],
    ],
    evidence=['disease-state'],
    evidence_card=[2],
)
print(test_result_CPD_2)

In [None]:
# Network structure: 'disease-state' is the common parent of both test results.
model = pgmpy.models.BayesianModel()
model.add_nodes_from(['disease-state', 'test-result1', 'test-result2'])
for test_node in ('test-result1', 'test-result2'):
    model.add_edge('disease-state', test_node)

# Attach one CPD per node and let pgmpy validate the resulting model.
model.add_cpds(disease_state_CPD, test_result_CPD_1, test_result_CPD_2)
model.check_model()

* [TypeError: object of type 'dict_keyiterator' has no len()](https://github.com/pgmpy/pgmpy/issues/927)

In [None]:
# infr1 = pgmpy.inference.BeliefPropagation(model)
# evidence = {'test-result1': 0}
# query_vars = ['disease-state']
# p_disease = infr1.query(variables=query_vars, evidence=evidence)['disease-state'].values[0]
# # pprint('{0:f}'.format(p_disease))
# assert p_disease == pytest.approx(0.01943463, 0.0000001)

In [None]:
# p_disease

In [None]:
# infr2 = pgmpy.inference.VariableElimination(model)
# p_disease = infr2.query(variables=query_vars, evidence=evidence)['disease-state'].values[0]
# assert p_disease == pytest.approx(0.01943463, 0.0000001)

In [None]:
# p_disease

In [None]:
# Probability table of the disease state in long format: one row per state,
# with its probability in column 'p'.
df_disease_state_pm_table = pd.DataFrame(
    {
        'disease-present': [True, False],
        'p': [p_disease_present, 1-p_disease_present],
    },
    columns=['disease-present', 'p'],
)
df_disease_state_pm_table

In [None]:
# Conditional probability table of the test result given the disease state in
# long format: one row per (disease-present, test-result) combination, with
# the probability in column 'p'.
df_test_when_disease_cpm_table = pd.DataFrame(
    {
        'disease-present': [True, True, False, False],
        'test-result': [True, False, True, False],
        'p': [
            p_test_positive_given_disease_present,
            1- p_test_positive_given_disease_present,
            p_test_positive_given_disease_absent,
            1- p_test_positive_given_disease_absent,
        ],
    },
    columns=['disease-present', 'test-result', 'p'],
)
df_test_when_disease_cpm_table

In [None]:
import qgrid,itertools

* [pd.Categorical](https://pandas.pydata.org/pandas-docs/version/0.22.0/categorical.html)

In [None]:
# Ordered categorical over the two possible values of 'disease-present'.
disease_present = pd.Categorical(
    values=[False, True],
    dtype=pd.CategoricalDtype(categories=[False, True], ordered=True),
)
disease_present.categories

In [None]:
# Ordered categorical over the two possible values of 'test-result'.
test_result = pd.Categorical(
    values=[False, True],
    dtype=pd.CategoricalDtype(categories=[False, True], ordered=True),
)
test_result

In [None]:
# All (disease-present, test-result) combinations, with the first variable
# varying slowest.
dp_tr_product = [
    (dp_value, tr_value)
    for dp_value in list(disease_present.categories)
    for tr_value in list(test_result.categories)
]
dp_tr_product

In [None]:
# Template table with one row per variable combination and an empty (NaN)
# probability column, shown in a qgrid widget.
df_tmp = (
    pd.DataFrame(dp_tr_product, columns=['disease-present', 'test-result'])
    .assign(p=np.nan)
)
qgrid_widget = qgrid.show_grid(df_tmp, show_toolbar=True)
qgrid_widget

In [None]:
qgrid_widget.get_changed_df().head()

In [None]:
# Pivot the long-format table into a matrix, addressing the columns by position:
#   all but the last two columns -> row index
#   second-to-last column        -> column headers
#   last column                  -> cell values
# For this table that is index='disease-present', columns='test-result', values='p'.
pd.pivot_table(
    df_test_when_disease_cpm_table,
    values=df_test_when_disease_cpm_table.columns[-1],
    index=df_test_when_disease_cpm_table.columns[:-2].tolist(),
    columns=df_test_when_disease_cpm_table.columns[-2:-1].tolist(),
)

* [networkx](https://github.com/networkx/networkx)
* [tutorial](https://networkx.github.io/documentation/stable/tutorial.html)

In [None]:
import networkx as nx

In [None]:
# Directed graph whose nodes are the variable columns of the table (every
# column except the trailing probability column), joined by a single edge
# from the first variable to the second.
dg = nx.DiGraph()
dg.add_nodes_from(df_test_when_disease_cpm_table.columns[:-1])
dg.add_edges_from([tuple(df_test_when_disease_cpm_table.columns[:-1])])

In [None]:
list(nx.connected_components(dg.to_undirected()))

In [None]:
nx.draw(dg, with_labels=True)

In [None]:
# pos = nx.nx_agraph.graphviz_layout(dg)
# nx.draw(dg, with_labels=True, pos=pos)

In [None]:
list(dg.nodes())

In [None]:
list(dg.edges())

* [graphviz](https://github.com/xflr6/graphviz)
* [graphviz.readthedocs](https://graphviz.readthedocs.io/en/stable/)
* [examples/notebook.ipynb](https://github.com/xflr6/graphviz/blob/master/examples/notebook.ipynb)

In [None]:
import graphviz

In [None]:
dot = graphviz.Digraph(comment='The Round Table')

In [None]:
# Nodes as (identifier, label) pairs.
for node_id, label in (
    ('A', 'King Arthur'),
    ('B', 'Sir Bedevere the Wise'),
    ('L', 'Sir Lancelot the Brave'),
):
    dot.node(node_id, label)

# Two plain edges starting at A, then a B -> L edge with constraint='false'.
dot.edges([('A', 'B'), ('A', 'L')])
dot.edge('B', 'L', constraint='false')

In [None]:
dot

In [None]:
# Copy the networkx graph into a graphviz Digraph, node by node and edge by edge.
dg_dot = graphviz.Digraph(comment='disease-network')
for node_name in dg.nodes():
    dg_dot.node(node_name)
for tail, head in dg.edges():
    dg_dot.edge(tail, head)

dg_dot

* [xarray](https://xarray.pydata.org/en/stable/)
* [xArray_seminar](http://meteo.unican.es/work/xarray_seminar/xArray_seminar.html)

In [None]:
import xarray as xr

In [None]:
df_test_when_disease_cpm_table

In [None]:
# Conditional probability table of the test result given the disease state as
# a labelled 2-d array: rows are 'disease-present' (False, True), columns are
# 'test-result' (False, True).
# The entries are derived from the CPD parameters defined earlier in the
# notebook rather than repeating their numeric values, so this array cannot
# drift from the other representations of the same table.
df_test_when_disease_cpm_xr = xr.DataArray(
    [
        [1 - p_test_positive_given_disease_absent, p_test_positive_given_disease_absent],
        [1 - p_test_positive_given_disease_present, p_test_positive_given_disease_present],
    ],
    coords={'disease-present': [False, True], 'test-result': [False, True]},
    dims=('disease-present', 'test-result'))
df_test_when_disease_cpm_xr

In [None]:
df_test_when_disease_cpm_xr.loc[False, True]

In [None]:
df_test_when_disease_cpm_xr.loc[{'disease-present': False, 'test-result': True}]

In [None]:
df_test_when_disease_cpm_xr.loc[{'disease-present': False, 'test-result': False}]

In [None]:
df_test_when_disease_cpm_xr.loc[{'disease-present': True, 'test-result': True}]

In [None]:
df_test_when_disease_cpm_xr.loc[{'disease-present': True, 'test-result': False}]

In [None]:
df_test_when_disease_cpm_xr.to_dataframe(name='p').reset_index()

In [None]:
df_test_when_disease_cpm_table.to_xarray()

In [None]:
# Group by the two variable columns, convert the result to xarray and take the
# raw values of 'p' as a plain array.
pvalue_array = (
    df_test_when_disease_cpm_table
    .groupby(['disease-present', 'test-result'])
    .max()
    .to_xarray()['p']
    .values
)
pvalue_array

In [None]:
df_test_when_disease_cpm_xr

In [None]:
%load_ext rpy2.ipython

* R [gl](https://www.rdocumentation.org/packages/base/versions/3.5.0/topics/gl): Generate Factor Levels

In [None]:
%%R
# Build a data frame `ii` from the `iris3` array and compare it with `iris`.
dni3 <- dimnames(iris3)
print(dni3)
# aperm() reorders the array dimensions to (1, 3, 2) and matrix() reshapes the
# result into 4 columns. The column names come from the second dimnames entry,
# with " L." / " W." replaced by ".Length" / ".Width". The Species factor is
# generated by gl(3, 50): 3 levels, each repeated 50 times, labelled from the
# third dimnames entry with the first "S" / "V" replaced by lower case.
ii <- data.frame(matrix(aperm(iris3, c(1,3,2)), 
                        ncol = 4, 
                        dimnames = list(NULL, sub(" L.",".Length", sub(" W.",".Width", dni3[[2]])))),
                 Species = gl(3, 50, labels = sub("S", "s", sub("V", "v", dni3[[3]]))))
print(typeof(ii))
# Check that the rebuilt data frame matches the built-in `iris` data set.
all.equal(ii, iris) # TRUE

In [None]:
# import os
# os.environ['EDITOR']

In [None]:
# %%R
# print(?lm)

In [None]:
# %%R
# iris3

In [None]:
# %%R
# iris

In [None]:
# %%R
# aperm(iris3, c(1,3,2))

In [None]:
# %%R
# sub(" L.",".Length", sub(" W.",".Width", dni3[[2]]))

In [None]:
# %%R
# list(NULL, sub(" L.",".Length", sub(" W.",".Width", dni3[[2]])))

In [None]:
# %%R
# matrix(aperm(iris3, c(1,3,2)),
#        ncol = 4,
#        dimnames = list(NULL, sub(" L.",".Length", sub(" W.",".Width", dni3[[2]]))))

In [None]:
%%R -i df_test_when_disease_cpm_table
# print(typeof(df_test_when_disease_cpm_table))
# print(typeof(df_test_when_disease_cpm_table[1]))
# print(df_test_when_disease_cpm_table[1])
# print(df_test_when_disease_cpm_table[2])
df_test_when_disease_cpm_table

In [None]:
%%R -i pvalue_array
pvalue_array

* [ENH: Mosaic plot and DataArray](https://github.com/pydata/xarray/issues/779)

In [None]:
# %%R -i df_test_when_disease_cpm_xr
# df_test_when_disease_cpm_xr