In [None]:
# Reproducibility stamp: print the author (-a), "last updated" marker (-u), date (-d),
# Python version (-v) and the installed versions of the listed packages (-p).
%load_ext watermark
%watermark -a 'Christian Schuhegger' -u -d -v -p numpy,xarray,scipy,pandas,matplotlib,seaborn,qgrid,rpy2,libpgm,pgmpy,networkx,graphviz,pybnl,pytest

In [None]:
%matplotlib inline
import numpy as np, pandas as pd, xarray as xr, matplotlib.pyplot as plt, seaborn as sns
import networkx as nx, graphviz, networkx.algorithms.dag

# pandas display: show up to 500 columns and allow up to 1000 characters per output line.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# pd.set_option('display.float_format', lambda x: '%.2f' % x)

# NumPy printing: 10 edge items per dimension before eliding, no scientific notation for
# small numbers, and 180 characters per line. `linewidth` is the supported print option;
# it replaces the former assignment to the private `np.core.arrayprint._line_width`
# attribute, which is not part of NumPy's public API.
np.set_printoptions(edgeitems=10, suppress=True, linewidth=180)

# Apply seaborn's default theme to the matplotlib figures created in this notebook.
sns.set()

In [None]:
from IPython.display import display, HTML

from IPython.display import display_html
def display_side_by_side(*args):
    """Render several tables next to each other in the notebook output.

    Parameters
    ----------
    *args : pandas.DataFrame or numpy.ndarray
        Objects to show. NumPy arrays are wrapped in a ``pd.DataFrame`` first;
        every other argument must provide a ``to_html()`` method.

    Returns
    -------
    None
        The concatenated HTML is passed to ``display_html(..., raw=True)``.
    """
    fragments = []
    for df in args:
        if isinstance(df, np.ndarray):
            df = pd.DataFrame(df)
        fragments.append(df.to_html())
    # Patch only the opening <table> tags. Replacing the bare word 'table'
    # would also rewrite the closing tags (yielding invalid markup) and any
    # column name or cell value that happens to contain 'table'.
    html_str = ''.join(fragments).replace('<table', '<table style="display:inline"')
    display_html(html_str, raw=True)

# Style sheet snippet that lays out `.output` elements in a row.
# NOTE(review): CSS is not referenced by any of the visible cells — confirm whether it is still needed.
CSS = """
.output {
    flex-direction: row;
}
"""

# Inject a style rule that forces elements with the `.container` class to 70% width.
display(HTML("<style>.container { width:70% !important; }</style>"))

In [None]:
# Load rpy2's IPython extension; it provides the %%R cell magic used by the R cells below.
%load_ext rpy2.ipython

In [None]:
# Pick up edits to pybnl.bn without restarting the kernel: autoreload mode 1 reloads only
# the modules registered with %aimport. The %aimport line is also the only place in the
# visible cells where pybnl is imported.
%load_ext autoreload
%autoreload 1
%aimport pybnl.bn

In [None]:
import locale
# Switch every locale category to the portable "C" locale.
# NOTE(review): presumably done to avoid locale-dependent behaviour when exchanging data with R — confirm.
locale.setlocale(locale.LC_ALL, 'C')

import rpy2, rpy2.rinterface, rpy2.robjects, rpy2.robjects.packages, rpy2.robjects.lib, rpy2.robjects.lib.grid, \
    rpy2.robjects.lib.ggplot2, rpy2.robjects.pandas2ri, rpy2.interactive.process_revents, \
    rpy2.interactive, rpy2.robjects.lib.grdevices
# rpy2.interactive.process_revents.start()
# Enable rpy2's conversion between R data.frames and pandas DataFrames.
rpy2.robjects.pandas2ri.activate()

# learning.test

Before we look at the `marks` data-set, let's first look at a test network provided in the `bnlearn` package:
[networks](http://www.bnlearn.com/documentation/networks/)
<img src='http://www.bnlearn.com/documentation/networks/learning.test.png' width=400>

In [None]:
%%R -o rdf_lt
# Attach bnlearn here so that data(learning.test) does not depend on the package having
# been attached by some other cell; this keeps the cell runnable on a fresh kernel.
library(bnlearn)
data(learning.test)
# Copy the data set to the name that `-o` exports to the Python namespace.
rdf_lt = learning.test

## Converting the R data.frame into a python pd.DataFrame and converting to CategoricalDtype

After loading the data-set, we need to convert it so that all variables are of type `CategoricalDtype`. See the pandas documentation about [Categorical Data](https://pandas.pydata.org/pandas-docs/stable/categorical.html) for more details.

In [None]:
# `%%R -o rdf_lt` already hands over an object that supports the pandas DataFrame API
# used below, so no explicit rpy2 conversion call is needed.

# Ordered categorical dtypes: three levels for columns A-E, two levels for column F.
ct1 = pd.api.types.CategoricalDtype(['a', 'b', 'c'], ordered=True)
ct2 = pd.api.types.CategoricalDtype(['a', 'b'], ordered=True)

# Build a new, converted frame rather than aliasing rdf_lt and assigning columns in place;
# the frame exported from R stays untouched and the cell is safe to re-run.
df_lt = rdf_lt.astype({**{col: ct1 for col in 'ABCDE'}, 'F': ct2})

df_lt.info()

## ConstraintBasedNetFromDataDiscreteBayesNetwork

In [None]:
# Learn a network from df_lt with pybnl's constraint-based learner (going by the class name)
# and fit it.
cbnet = pybnl.bn.ConstraintBasedNetFromDataDiscreteBayesNetwork(df_lt)
cbnet.fit()
#display_side_by_side(cbnet.structure().dot(),cbnet.structure().cpdag().dot())
# Cell output: the dot() rendering of the CPDAG derived from the fitted structure.
cbnet.structure().cpdag().dot()

## ScoreBasedNetFromDataDiscreteBayesNetwork

In [None]:
# Same data, this time with pybnl's score-based learner (going by the class name).
sbnet = pybnl.bn.ScoreBasedNetFromDataDiscreteBayesNetwork(df_lt)
sbnet.fit()
# Cell output: the dot() rendering of the CPDAG derived from the fitted structure.
sbnet.structure().cpdag().dot()

## HybridScoreAndConstainedBasedNetFromDataDiscreteBayesNetwork

In [None]:
# NOTE(review): this cell contains only disabled code (the hybrid learner without an explicit
# `algorithm` argument). Either re-enable it or delete the cell.
# hnet1 = pybnl.bn.HybridScoreAndConstainedBasedNetFromDataDiscreteBayesNetwork(df_lt)
# hnet1.fit()
# hnet1.structure().cpdag().dot()

In [None]:
# Hybrid learner with an explicitly selected algorithm.
# NOTE(review): 'rxmax2_sihitonpc_tabu' is a pybnl algorithm key — see pybnl.bn for what it selects.
hnet2 = pybnl.bn.HybridScoreAndConstainedBasedNetFromDataDiscreteBayesNetwork(df_lt, algorithm='rxmax2_sihitonpc_tabu')
hnet2.fit()
# Cell output: the dot() rendering of the CPDAG derived from the fitted structure.
hnet2.structure().cpdag().dot()

In [None]:
# Show the result of vstructs() on the hybrid network's CPDAG
# (presumably its v-structures — confirm against pybnl).
hnet2.structure().cpdag().vstructs()

# marks

Let's take the detour of loading the `R` data set, writing it to CSV and then loading the CSV via pandas from python. That way we can be sure we have a typical starting position in a `python` data workflow.

In [None]:
%%R -o marks
# Attach bnlearn, load its `marks` data set and write it to marks.csv in the current
# working directory. write.csv also writes the row names as the first column by default.
library(bnlearn)
data(marks)
write.csv(marks, file = "marks.csv")

In [None]:
# Read the CSV back with pandas: the first column becomes the index and all remaining
# values are cast to float64.
pd_marks = pd.read_csv('marks.csv', index_col=0).astype(np.float64)
pd_marks.head()

In [None]:
# Discretize the float-valued marks with pybnl's helper and inspect the resulting dtypes.
# NOTE(review): the binning rule is defined inside pybnl.bn.discretize — not visible here.
dmarks = pybnl.bn.discretize(pd_marks)
dmarks.info()

In [None]:
# Peek at the first rows of the discretized frame.
dmarks.head()

Let's also immediately create a marks data-frame that includes one additional latent variable that we will need later:

In [None]:
# Work on a copy so that dmarks itself stays without the extra column.
ldmarks = dmarks.copy()
# The return value is discarded, so the helper presumably adds the 'LAT' column to ldmarks
# in place; 3 is presumably the number of levels of the latent variable — confirm against pybnl.
pybnl.bn.augment_df_with_latent_variable(ldmarks, 'LAT', 3)
print(pybnl.bn.levels_of_latent_variable(ldmarks,'LAT'))
ldmarks.head()

## HybridScoreAndConstainedBasedNetFromDataDiscreteBayesNetwork

In [None]:
# Three interchangeable learners for the discretized marks; only one assignment is active.
# NOTE(review): the section heading names the hybrid learner, but the score-based learner
# is the one actually run here — align the heading or the code.
# net_dmarks = pybnl.bn.HybridScoreAndConstainedBasedNetFromDataDiscreteBayesNetwork(dmarks, algorithm='rxmax2_sihitonpc_tabu')
# net_dmarks = pybnl.bn.ConstraintBasedNetFromDataDiscreteBayesNetwork(dmarks)
net_dmarks = pybnl.bn.ScoreBasedNetFromDataDiscreteBayesNetwork(dmarks)
net_dmarks.fit()
# Cell output: the dot() rendering of the CPDAG derived from the fitted structure.
net_dmarks.structure().cpdag().dot()

## StructuralEMNetFromDataDiscreteBayesNetwork

In [None]:
# Fit pybnl's structural-EM learner (going by the class name) on the frame that carries
# the additional 'LAT' column.
net_ldmarks = pybnl.bn.StructuralEMNetFromDataDiscreteBayesNetwork(ldmarks)
net_ldmarks.fit()
# Cell output: the dot() rendering of the CPDAG derived from the fitted structure.
net_ldmarks.structure().cpdag().dot()