In [1]:
import pandas as pd
import numpy as np
import bamt.preprocessors
import bamt.networks as Nets
from sklearn.preprocessing import OneHotEncoder

from itertools import product
from pgmpy.estimators import K2Score


In [2]:
# Read the synthetic dataset and discard the 'Unnamed: 0' column in one chained
# expression (presumably a row index that was written out with the CSV - confirm).
data = pd.read_csv("synthetic.csv").drop(columns=['Unnamed: 0'])

In [3]:
# Preview the loaded frame as the cell's rich output.
data

Unnamed: 0,origin1,origin2,cubed,negative_root3,hyperbola,combined,marker
0,2.119492,0.737207,0.174883,-0.794917,0.118959,-0.620034,1
1,-2.625095,0.031537,-0.477368,-0.337130,0.461323,-0.814498,2
2,-2.894855,0.009888,-0.505449,-0.261298,0.660181,-0.766747,1
3,2.688882,0.292918,0.486818,-0.737814,0.044543,-0.250996,1
4,1.990172,0.243591,0.188168,-0.630120,0.361815,-0.441953,0
...,...,...,...,...,...,...,...
1995,-1.058996,0.001015,-0.052232,-0.223609,0.293884,-0.275842,1
1996,0.970856,0.000148,-0.099516,-0.051751,0.158146,-0.151267,0
1997,1.772789,0.719539,0.154715,-0.843346,0.163430,-0.688631,1
1998,2.566010,1.369119,0.448988,-1.271503,0.171588,-0.822514,1


In [4]:
# Columns that are already categorical and therefore must not be binned.
categoricals = ['marker']
# Number of bins every continuous feature is cut into (gradations like Low, Mid, High).
N_BINS = 3

# Replace each continuous column by its integer bin code; categorical columns are kept as-is.
data_discretized = data.copy(deep=True)
conts = data.columns if categoricals is None else data.columns.difference(categoricals)
for feat in conts:
    data_discretized[feat] = pd.cut(data[feat], bins=N_BINS, labels=False, duplicates='drop')

# One-hot encode the discretized dataset as a dense array.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current name first and fall back for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
encoded = encoder.fit_transform(X=data_discretized)

# Node names "<feature><level>", grouped per feature. The levels are read from
# the fitted encoder so every name is guaranteed to label the matching column.
nodes_by_feature = [
    [feat + str(int(level)) for level in levels]
    for feat, levels in zip(data_discretized.columns, encoder.categories_)
]

# Flat list of all gradation nodes, in encoded-column order.
categories = [node for nodes in nodes_by_feature for node in nodes]

data_discretized_enc = pd.DataFrame(encoded, columns=categories, dtype='uint8')

# Blacklist of edges which connect gradations of the same feature (self-pairs
# included). It is built from the per-feature groups rather than by stripping the
# last character of the node name, so it stays correct with ten or more levels
# and with feature names that end in a digit.
ublacklist = [pair for nodes in nodes_by_feature for pair in product(nodes, nodes)]

params = {'bl_add': ublacklist}

bn = Nets.DiscreteBN()


In [5]:
# Every one-hot gradation column becomes a node typed 'disc'.
nodes_descriptor = {'types': dict.fromkeys(categories, 'disc')}
bn.add_nodes(nodes_descriptor)

In [6]:
# Show the descriptor that was handed to bn.add_nodes (node name -> 'disc').
nodes_descriptor

{'types': {'origin10': 'disc',
  'origin11': 'disc',
  'origin12': 'disc',
  'origin20': 'disc',
  'origin21': 'disc',
  'origin22': 'disc',
  'cubed0': 'disc',
  'cubed1': 'disc',
  'cubed2': 'disc',
  'negative_root30': 'disc',
  'negative_root31': 'disc',
  'negative_root32': 'disc',
  'hyperbola0': 'disc',
  'hyperbola1': 'disc',
  'hyperbola2': 'disc',
  'combined0': 'disc',
  'combined1': 'disc',
  'combined2': 'disc',
  'marker0': 'disc',
  'marker1': 'disc',
  'marker2': 'disc'}}

In [7]:
# Seed NumPy's global RNG before structure learning
# (presumably so the learned structure is reproducible - confirm that add_edges draws from it).
np.random.seed(42)

# The one-hot frame is stored as uint8; it is passed to bamt as int32.
encoded_int32 = data_discretized_enc.astype("int32")
k2_scoring = ("K2", K2Score)

# Learn edges with the K2 score; `params` carries the same-feature blacklist.
bn.add_edges(
    encoded_int32,
    scoring_function=k2_scoring,
    params=params,
    progress_bar=False,
)

In [8]:
# Display labels for the values of 'marker'.
# NOTE(review): the network's nodes are one-hot columns such as 'marker0';
# confirm that a mapper keyed by 'marker' is actually applied by plot2.
marker_labels = {0: 'Zero', 1: 'One', 2: 'Two'}
bn.plot2("visualization_result", "bn_after.html", custom_mapper={'marker': marker_labels})

Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 
