In [None]:
%load_ext watermark
%watermark -a 'Christian Schuhegger' -u -d -v -p numpy,xarray,scipy,pandas,sklearn,matplotlib,seaborn,qgrid,rpy2,libpgm,pgmpy,networkx,graphviz,pybnl,pytest

In [None]:
%matplotlib inline
import numpy as np, pandas as pd, xarray as xr, matplotlib.pyplot as plt, seaborn as sns
import sklearn, sklearn.pipeline
import networkx as nx, graphviz, networkx.algorithms.dag
import random
import itertools

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# pd.set_option('display.float_format', lambda x: '%.2f' % x)
# Configure NumPy array printing in one call through the public API.
# The previous assignment to the private `np.core.arrayprint._line_width`
# is not read by current NumPy versions; `linewidth` is the supported option.
np.set_printoptions(edgeitems=10, suppress=True, linewidth=180)

sns.set()

In [None]:
from IPython.display import display, HTML

from IPython.display import display_html
def display_side_by_side(*args):
    """Render several tables next to each other in the notebook output.

    Parameters
    ----------
    *args : pandas.DataFrame or numpy.ndarray
        Objects to display. NumPy arrays (including ndarray subclasses) are
        wrapped in a DataFrame first; everything else must provide `to_html()`.

    The concatenated HTML is passed to `display_html(..., raw=True)`;
    nothing is returned.
    """
    html_str = ''
    for df in args:
        if isinstance(df, np.ndarray):
            df = pd.DataFrame(df)
        html_str += df.to_html()
    # Only the opening tags get the inline style. Replacing every occurrence
    # of 'table' would also turn '</table>' into invalid markup and would
    # rewrite column names or cell values that contain the word 'table'.
    html_str = html_str.replace('<table', '<table style="display:inline"')
    display_html(html_str, raw=True)

CSS = """
.output {
    flex-direction: row;
}
"""

def display_graphs_side_by_side(*args):
    """Show several graph objects in one row of an HTML table.

    Each argument must provide `_repr_svg_()` returning its SVG markup as a
    string; every SVG is placed in its own table cell and the resulting table
    is passed to `display_html(..., raw=True)`.
    """
    cells = ''.join('<td>' + graph._repr_svg_() + '</td>' for graph in args)
    display_html('<table><tr>' + cells + '</tr></table>', raw=True)
    

display(HTML("<style>.container { width:70% !important; }</style>"))

In [None]:
%load_ext rpy2.ipython

In [None]:
%load_ext autoreload
%autoreload 1
%aimport pybnl.bn

In [None]:
%aimport dsbasics.bin

In [None]:
import locale
locale.setlocale(locale.LC_ALL, 'C')

import rpy2, rpy2.rinterface, rpy2.robjects, rpy2.robjects.packages, rpy2.robjects.lib, rpy2.robjects.lib.grid, \
    rpy2.robjects.lib.ggplot2, rpy2.robjects.pandas2ri, rpy2.interactive.process_revents, \
    rpy2.interactive, rpy2.robjects.lib.grdevices
# rpy2.interactive.process_revents.start()
rpy2.robjects.pandas2ri.activate()

In [None]:
rpackageversionfn = rpy2.robjects.r('packageVersion')
print(rpackageversionfn("bnlearn")[0])

# House Prices in Ames, Iowa

* [Ames, Iowa: Alternative to the Boston Housing Data as an End of Semester Regression Project](http://ww2.amstat.org/publications/jse/v19n3/decock.pdf)
  * [AmesResidential.pdf](https://ww2.amstat.org/publications/jse/v19n3/decock/AmesResidential.pdf)
  * [DataDocumentation.txt](https://ww2.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt)
  * [AmesHousing.txt](https://ww2.amstat.org/publications/jse/v19n3/decock/AmesHousing.txt)
  * [AmesHousing.xls](http://www.amstat.org/publications/jse/v19n3/decock/AmesHousing.xls)
  * Also on [kaggle](https://www.kaggle.com/c/house-prices-advanced-regression-techniques)

The example below reproduces the example from chapter 5 (page 79) of [Bayesian Networks and BayesiaLab: A Practical Introduction for Researchers](https://www.amazon.com/Bayesian-Networks-BayesiaLab-Introduction-Researchers/dp/0996533303).

In [None]:
# Load the tab-separated Ames housing file; the first file column becomes the index.
# NOTE: pandas' default NA markers include the literal string "NA", so any "NA"
# codes in the file are read as NaN (this matters for the "Filtered Values" section).
df = pd.read_csv('./AmesHousing.txt.gz', sep='\t', index_col=0)
# Left-pad the MS SubClass codes with zeros to a width of 3, turning them into strings.
df['MS SubClass'] = df['MS SubClass'].apply(lambda x: '{0:0>3}'.format(x))
# Preview the first 5 rows and the first 15 columns.
df.iloc[:5,:15]

In [None]:
df.columns

In [None]:
discrete_non_null, discrete_with_null, continuous_non_null, continuous_with_null, levels_map = pybnl.bn.discrete_and_continuous_variables_with_and_without_nulls(df, cutoff=30)
# discrete_non_null, discrete_with_null, continuous_non_null, continuous_with_null, levels_map

In [None]:
ddf = df.copy()
#cat_columns = ['Alley', 'Bedroom AbvGr', 'Bldg Type', 'Bsmt Cond', ]
cat_columns = [
    'MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Land Contour', 'Lot Config', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style',
    'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Foundation', 'Heating', 'Central Air', 'Garage Type', 'Misc Feature', 'Sale Type', 'Sale Condition'
] + [
    'Overall Qual', 'Overall Cond'
]
cat_columns_ordinal = [
    ('Lot Shape',      ['Reg','IR1','IR2','IR3']),
    ('Utilities',      ['AllPub','NoSewr','NoSeWa','ELO']),
    ('Land Slope',     ['Gtl', 'Mod', 'Sev']),
    ('Exter Qual',     ['Ex', 'Gd', 'TA', 'Fa', 'Po']),
    ('Exter Cond',     ['Ex', 'Gd', 'TA', 'Fa', 'Po']),
    ('Bsmt Qual',      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']),
    ('Bsmt Cond',      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']),
    ('Bsmt Exposure',  ['Gd', 'Av', 'Mn', 'No', 'NA']),
    ('BsmtFin Type 1', ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA']),
    ('BsmtFin Type 2', ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA']),
    ('Heating QC',     ['Ex', 'Gd', 'TA', 'Fa', 'Po']),
    ('Electrical',     ['SBrkr', 'FuseA', 'FuseF', 'FuseP', 'Mix']),
    ('Kitchen Qual',   ['Ex', 'Gd', 'TA', 'Fa', 'Po']),
    ('Functional',     ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal']),
    ('Fireplace Qu',   ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']),
    ('Garage Finish',  ['Fin', 'RFn', 'Unf', 'NA']),
    ('Garage Qual',    ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']),
    ('Garage Cond',    ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']),
    ('Paved Drive',    ['Y', 'P', 'N']),
    ('Pool QC',        ['Ex', 'Gd', 'TA', 'Fa', 'NA']),
    ('Fence',          ['GdPrv', 'MnPrv', 'GdWo', 'MnWw', 'NA']),
]

continuous_columns = [
    'Lot Frontage', 'Lot Area', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch',
    'Screen Porch', 'Pool Area', 'Misc Val', 'SalePrice'
]
discrete_columns = ['Year Built', 'Year Remod/Add', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'TotRms AbvGrd', 'Fireplaces', 'Garage Yr Blt', 'Garage Cars', 'Mo Sold', 'Yr Sold', 'Bedroom AbvGr', 'Kitchen AbvGr']# do not exist: 'Bedroom',  'Kitchen'

# Nominal categoricals: a purely numeric level set is sorted and becomes an
# ordered categorical; any other level set is used as given and stays unordered.
for col in cat_columns:
    levels = levels_map[col]
    is_numeric = all(np.issubdtype(type(level), np.number) for level in levels)
    if is_numeric:
        levels = sorted(levels)
    ddf[col] = df[col].astype(pd.api.types.CategoricalDtype(levels, ordered=is_numeric))

# Ordinal categoricals: the level lists are reversed before use, so the last
# listed level becomes the lowest category of the ordered dtype.
for col, levels in cat_columns_ordinal:
    ddf[col] = df[col].astype(pd.api.types.CategoricalDtype(list(reversed(levels)), ordered=True))

# Continuous measurements are stored as floats.
for col in continuous_columns:
    ddf[col] = df[col].astype(float)

# Count-like columns become int, unless they contain nulls (then float).
for col in discrete_columns:
    target_dtype = float if df[col].isnull().any() else int
    ddf[col] = df[col].astype(target_dtype)
    
# col   = 'Alley'
# ddf[col]
# ddf[~pd.isnull(ddf[col])][col]
# value = np.nan
# ddf.loc[df[col]==value,col]

[Working with Pandas: Fixing messy column names](https://medium.com/@chaimgluck1/working-with-pandas-fixing-messy-column-names-42a54a6659cd)

In [None]:
# Make the column names identifier-friendly: trim whitespace, turn spaces into
# underscores and drop parentheses. `regex=False` requests literal replacement
# explicitly; the default of `regex` differs between pandas versions and
# '(' / ')' are not valid regular expressions on their own.
ddf.columns = ddf.columns.str.strip().str.replace(' ', '_', regex=False).str.replace('(', '', regex=False).str.replace(')', '', regex=False)

In [None]:
type(ddf.columns)

In [None]:
ddf.head()

In [None]:
# ddf.to_hdf('AmesHousing.h5', 'AmesHousing',format='table', append=False)

In [None]:
# pd.read_hdf('AmesHousing.h5', 'AmesHousing').head()

## Treating Filtered Values ('FV')

See page 84 in "Bayesian Networks and BayesiaLab"

### Bsmt fields

In [None]:
# Basement-related columns, first with their original names ...
bsmt_fields_ = ['Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Bsmt Full Bath', 'Bsmt Half Bath',
               'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF']
# ... then normalised the same way as ddf.columns. regex=False makes the
# replacements literal regardless of the pandas version's default.
bsmt_fields = pd.Index(bsmt_fields_).str.strip().str.replace(' ', '_', regex=False).str.replace('(', '', regex=False).str.replace(')', '', regex=False)
ddf[bsmt_fields].query('Bsmt_Qual == "NA"')
# ddf[ddf['Bsmt_Qual'] == 'NA'][bsmt_fields]
# df[bsmt_fields_][df['Bsmt Qual'] == 'NA']

It seems that there are no filtered values for the 'Bsmt' fields, i.e. every home would have a basement.

[Querying for NaN and other names in Pandas](https://stackoverflow.com/questions/26535563/querying-for-nan-and-other-names-in-pandas)

In [None]:
ddf[bsmt_fields][pd.isnull(ddf.Bsmt_Qual)].head()

In [None]:
df[bsmt_fields_][pd.isnull(df['Bsmt Qual'])].head()

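But there are quite a lot of `NaN` entries. The data description defines an "NA" value meaning "No Basement", yet no row of the loaded data set contains the string "NA". A likely reason is that `pd.read_csv` treats the literal string "NA" as a missing value by default, so these `NaN` entries are most likely supposed to be "NA".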

In [None]:
bsmt_na_fields = ['Bsmt_Qual', 'Bsmt_Cond', 'Bsmt_Exposure', 'BsmtFin_Type_1', 'BsmtFin_Type_2']
ddf.loc[pd.isnull(ddf.Bsmt_Qual), bsmt_na_fields] = "NA"
ddf[bsmt_fields].query('Bsmt_Qual == "NA"').head()

### Electrical field

In [None]:
ddf.Electrical.value_counts(dropna=False)

In [None]:
ddf[pd.isnull(ddf.Electrical)]

The one NaN value seems to be a missing value

### Fireplaces

In [None]:
# Fireplace-related columns: original names, then the normalised names used in ddf.
fireplaces_fields_ = ['Fireplaces', 'Fireplace Qu']
# regex=False makes the replacements literal regardless of the pandas version's default.
fireplaces_fields = pd.Index(fireplaces_fields_).str.strip().str.replace(' ', '_', regex=False).str.replace('(', '', regex=False).str.replace(')', '', regex=False)
ddf[fireplaces_fields].query('Fireplaces == 0').head()

In [None]:
ddf.loc[ddf.Fireplaces == 0,['Fireplace_Qu']] = 'NA'
ddf[fireplaces_fields].query('Fireplaces == 0').head()

### Garage fields

In [None]:
# Garage-related columns: original names, then the normalised names used in ddf.
garage_fields_ = ['Garage Type', 'Garage Finish', 'Garage Cars', 'Garage Qual', 'Garage Cond', 'Garage Yr Blt', 'Garage Area']
# regex=False makes the replacements literal regardless of the pandas version's default.
garage_fields = pd.Index(garage_fields_).str.strip().str.replace(' ', '_', regex=False).str.replace('(', '', regex=False).str.replace(')', '', regex=False)
ddf[garage_fields][pd.isnull(ddf.Garage_Type)].head()

In [None]:
# Extend the Garage_Type categories with 'NA'. The categories are built as a
# list (existing order, 'NA' appended once) instead of a set: set iteration
# order depends on string hashing, so the category order could differ between
# kernel sessions. The membership check keeps the cell safe to re-run.
garage_type_levels = list(ddf.Garage_Type.dtype.categories)
if 'NA' not in garage_type_levels:
    garage_type_levels.append('NA')
ddf['Garage_Type'] = ddf['Garage_Type'].astype(str)\
    .astype(pd.api.types.CategoricalDtype(garage_type_levels))
ddf.Garage_Type.dtype.categories

In [None]:
ddf.loc[pd.isnull(ddf.Garage_Type),['Garage_Type', 'Garage_Finish', 'Garage_Qual', 'Garage_Cond']] = 'NA'
ddf.loc[ddf.Garage_Type == 'NA',['Garage_Yr_Blt']] = -1.0
#ddf[garage_fields][pd.isnull(ddf.Garage_Yr_Blt)]
#ddf['Garage_Yr_Blt'] = ddf['Garage_Yr_Blt'].astype(int)
ddf[garage_fields][ddf.Garage_Type == 'NA'].head()

### Mas Vnr fields

In [None]:
# Masonry-veneer columns: original names, then the normalised names used in ddf.
mas_vnr_fields_ = ['Mas Vnr Type', 'Mas Vnr Area']
# regex=False makes the replacements literal regardless of the pandas version's default.
mas_vnr_fields = pd.Index(mas_vnr_fields_).str.strip().str.replace(' ', '_', regex=False).str.replace('(', '', regex=False).str.replace(')', '', regex=False)
ddf[mas_vnr_fields][pd.isnull(ddf.Mas_Vnr_Type)].head()

In [None]:
ddf.Mas_Vnr_Type.dtype

In [None]:
ddf.loc[pd.isnull(ddf.Mas_Vnr_Type), ['Mas_Vnr_Type']] = 'None'
ddf.loc[ddf.Mas_Vnr_Type == 'None', ['Mas_Vnr_Area']] = 0.0

In [None]:
ddf[mas_vnr_fields][ddf.Mas_Vnr_Type == 'None'].head()

### Pool fields

In [None]:
# Pool-related columns: original names, then the normalised names used in ddf.
pool_fields_ = ['Pool QC', 'Pool Area']
# regex=False makes the replacements literal regardless of the pandas version's default.
pool_fields = pd.Index(pool_fields_).str.strip().str.replace(' ', '_', regex=False).str.replace('(', '', regex=False).str.replace(')', '', regex=False)
ddf[pool_fields][pd.isnull(ddf.Pool_QC)].head()

In [None]:
ddf.loc[pd.isnull(ddf.Pool_QC), ['Pool_QC']] = 'NA'
ddf[pool_fields][ddf.Pool_QC == 'NA'].head()

### Fence field

In [None]:
# Fence column: original name, then the normalised name used in ddf.
fence_fields_ = ['Fence']
# regex=False makes the replacements literal regardless of the pandas version's default.
fence_fields = pd.Index(fence_fields_).str.strip().str.replace(' ', '_', regex=False).str.replace('(', '', regex=False).str.replace(')', '', regex=False)
ddf.loc[pd.isnull(ddf.Fence), ['Fence']] = 'NA'
ddf[fence_fields][ddf.Fence == 'NA'].head()

### Misc Feature field

In [None]:
# Misc-feature column: original name, then the normalised name used in ddf.
misc_feature_fields_ = ['Misc Feature']
# regex=False makes the replacements literal regardless of the pandas version's default.
misc_feature_fields = pd.Index(misc_feature_fields_).str.strip().str.replace(' ', '_', regex=False).str.replace('(', '', regex=False).str.replace(')', '', regex=False)
# Extend the Misc_Feature categories with 'NA'. The categories are built as a
# list (existing order, 'NA' appended once) instead of a set: set iteration
# order depends on string hashing, so the category order could differ between
# kernel sessions. The membership check keeps the cell safe to re-run.
misc_feature_levels = list(ddf.Misc_Feature.dtype.categories)
if 'NA' not in misc_feature_levels:
    misc_feature_levels.append('NA')
ddf['Misc_Feature'] = ddf['Misc_Feature'].astype(str)\
    .astype(pd.api.types.CategoricalDtype(misc_feature_levels))
# Rows without a misc feature (null after the conversion) get the explicit 'NA' level.
ddf.loc[pd.isnull(ddf.Misc_Feature), ['Misc_Feature']] = 'NA'
ddf[misc_feature_fields][ddf.Misc_Feature == 'NA'].head()

### Check remaining nan fields

In [None]:
_, discrete_with_null_, _, continuous_with_null_, _ = pybnl.bn.discrete_and_continuous_variables_with_and_without_nulls(ddf, cutoff=30)
discrete_with_null_, continuous_with_null_

In [None]:
ddf[bsmt_fields][pd.isnull(ddf.Bsmt_Exposure)]

Just set the few NaN values to 'No'

In [None]:
ddf.loc[pd.isnull(ddf.Bsmt_Exposure),['Bsmt_Exposure']] = 'No'

In [None]:
ddf[bsmt_fields][pd.isnull(ddf.BsmtFin_Type_2)]

Just set the few NaN values to 'Unf'

In [None]:
ddf.loc[pd.isnull(ddf.BsmtFin_Type_2),['BsmtFin_Type_2']] = 'Unf'

In [None]:
ddf[bsmt_fields][pd.isnull(ddf.Bsmt_Full_Bath)]

In [None]:
ddf[bsmt_fields][pd.isnull(ddf.Bsmt_Half_Bath)]

Just set all the NaN values to 0.0

In [None]:
ddf.loc[pd.isnull(ddf.Bsmt_Full_Bath),['Bsmt_Full_Bath','Bsmt_Half_Bath','BsmtFin_SF_1','BsmtFin_SF_2','Bsmt_Unf_SF','Total_Bsmt_SF']] = [0.0,0.0,0.0,0.0,0.0,0.0]

In [None]:
ddf[['Electrical']][pd.isnull(ddf.Electrical)]

Just set this single NaN value to 'Mix'

In [None]:
ddf.loc[pd.isnull(ddf.Electrical),['Electrical']] = 'Mix'

The remaining NaN garage fields seem to be really missing values so don't touch them.

In [None]:
ddf[garage_fields][pd.isnull(ddf.Garage_Finish)]

In [None]:
ddf[garage_fields][pd.isnull(ddf.Garage_Cars)]

In [None]:
ddf[garage_fields][pd.isnull(ddf.Garage_Qual)]

In [None]:
ddf[garage_fields][pd.isnull(ddf.Garage_Cond)]

The Lot_Frontage NaN fields seem to be really missing values so don't touch them.

In [None]:
ddf[pd.isnull(ddf.Lot_Frontage)].head()

In [None]:
_, discrete_with_null_, _, continuous_with_null_, _ = pybnl.bn.discrete_and_continuous_variables_with_and_without_nulls(ddf, cutoff=30)
discrete_with_null_, continuous_with_null_

## Binning / Discretization

In [None]:
ddf1 = ddf.copy()

### The target variable is SalePrice

In [None]:
ddf.SalePrice.describe()

In [None]:
# Bin the target into five left-closed price bands; the last band is open-ended.
# `np.inf` replaces the `np.PINF` alias, which was removed in NumPy 2.0.
ddf1['SalePrice'] = pd.cut(ddf.SalePrice, [0.0, 75000.0, 150000.0, 225000.0, 300000.0, np.inf], right=False)
ddf1['SalePrice'].value_counts()

### Continuous, discrete and ordinal variables

In [None]:
# All continuous columns except the target. A comprehension keeps the original
# column order; the former set difference produced an order that depends on
# string hashing and could therefore change between kernel sessions.
continuous_columns_without_sale_price = [c for c in continuous_columns if c != 'SalePrice']

In [None]:
# Columns to be binned against the target: ordinal, continuous (without the
# target itself), discrete and the two overall-rating columns.
target_variable_decision_tree_binning_variables_ = [c for c,r in cat_columns_ordinal] + continuous_columns_without_sale_price + discrete_columns + ['Overall Qual', 'Overall Cond']
# Normalise the names the same way as ddf.columns. regex=False makes the
# replacements literal regardless of the pandas version's default.
target_variable_decision_tree_binning_variables = pd.Index(target_variable_decision_tree_binning_variables_).str.strip().str.replace(' ', '_', regex=False).str.replace('(', '', regex=False).str.replace(')', '', regex=False)
target_variable_decision_tree_binning_variables

In [None]:
# Bin every selected column against the binned SalePrice target.
tvbt = dsbasics.bin.TargetVariableDecisionTreeBinTransformer(max_leaf_nodes=3)
# Assign with ddf1[cols] = ... so the columns are replaced and take the dtype
# of the transformer output. Newer pandas versions try to write
# `.loc[:, cols] = ...` into the existing columns in place, which can lose the
# binned dtype.
# NOTE(review): assumes fit_transform returns one output column per input
# column, in the same order — confirm against dsbasics.bin.
ddf1[target_variable_decision_tree_binning_variables] = \
    tvbt.fit_transform(ddf[target_variable_decision_tree_binning_variables], ddf1.SalePrice)
ddf1.head()

In [None]:
len(df.columns),len(ddf1.columns)

In [None]:
ddf['Overall_Qual'].value_counts().sort_index().plot(kind='bar');

In [None]:
ddf1['Overall_Qual'].value_counts().sort_index().plot(kind='bar');

### Convert interval indices to strings

In [None]:
ddf1 = pybnl.bn.convert_interval_index_categories_to_string_categories(ddf1,inplace=False)

In [None]:
ddf1.Bsmt_Full_Bath.dtype

### Rename columns to fit with R conventions

In [None]:
ddf1 = ddf1.rename(columns={
    "1st_Flr_SF":"X1st_Flr_SF",
    "2nd_Flr_SF":"X2nd_Flr_SF",
    "3Ssn_Porch":"X3Ssn_Porch",
    "Year_Remod/Add":"Year_Remod_Add"
})

### Drop unused columns

In [None]:
# Drop the PID column if present. errors='ignore' makes this a no-op when the
# column is already gone, so the cell can be re-run without a guard and
# without mutating the frame in place.
ddf1 = ddf1.drop(columns=['PID'], errors='ignore')
# ddf1.columns

#### Drop the 'Alley' column, which contains mostly null values and will not contribute to the quality of the fit

In [None]:
# Drop the Alley column if present. errors='ignore' makes this a no-op when
# the column is already gone, so the cell can be re-run without a guard and
# without mutating the frame in place.
ddf1 = ddf1.drop(columns=['Alley'], errors='ignore')
# ddf1.columns

The remaining columns with null values are:

In [None]:
_, discrete_with_null_, _, continuous_with_null_, _ = pybnl.bn.discrete_and_continuous_variables_with_and_without_nulls(ddf1, cutoff=30)
discrete_with_null_, continuous_with_null_

The statistics of the NaN values are as follows. Only the column `Lot_Frontage` still has a relevant number of null values; for the `Garage` columns we could simply filter out the 2 rows that contain null values:

In [None]:
# One value-count table (NaN included) per column listed below, shown side by side.
null_report_columns = ['Lot_Frontage', 'Garage_Yr_Blt', 'Garage_Finish', 'Garage_Cars',
                       'Garage_Area', 'Garage_Qual', 'Garage_Cond']
display_side_by_side(*[
    pd.DataFrame(ddf1[name].value_counts(dropna=False)) for name in null_report_columns
])

In [None]:
# Row mask: True where at least one of the listed columns is null.
null_check_columns = ['Lot_Frontage', 'Garage_Yr_Blt', 'Garage_Finish', 'Garage_Cars',
                      'Garage_Area', 'Garage_Qual', 'Garage_Cond']
null_value_idx = ddf1[null_check_columns].isnull().any(axis=1)
null_value_idx.value_counts()

In [None]:
ddf_without_null_values = ddf1[~null_value_idx]

In [None]:
ddf_with_null_values = ddf1

## Naive Bayes Classifier

### bnlearn by hand

#### Define net

In [None]:
# Naive Bayes structure: one node per column and an arc from the class node
# (SalePrice) to every other column.
dg = nx.DiGraph()

dg.add_nodes_from(ddf1.columns.values)

# Select the class node by name instead of assuming it is the last column, so
# that reordering or appending columns cannot silently change the target.
target_mask = ddf1.columns.values == 'SalePrice'
assert target_mask.sum() == 1, "expected exactly one 'SalePrice' column in ddf1"
in_vars = ddf1.columns.values[~target_mask]
out_var = ddf1.columns.values[target_mask]   # still a one-element array, as before
dg.add_edges_from(list(itertools.product(out_var, in_vars)))

In [None]:
ns = pybnl.bn.digraph2netstruct(dg)
# ns.dot()
display(HTML(ns.dot()._repr_svg_()))

In [None]:
nbn1 = pybnl.bn.NetAndDataDiscreteBayesNetwork(ldf=ddf_without_null_values, dg=dg)
nbn1.fit()
tmp_rnet = nbn1.rnet
tmp_rfit = nbn1.rfit
tmp_data = pybnl.bn.pydf_to_factorrdf(ddf_without_null_values)

In [None]:
%%R -i tmp_rnet -i tmp_rfit -i tmp_data -o tmp_arc_strength
tmp_arc_strength = arc.strength(tmp_rnet, tmp_data, criterion="loglik")
# tmp_arc_strength = arc.strength(tmp_rnet, tmp_data, criterion="bic")
# tmp_arc_strength = arc.strength(tmp_rnet, tmp_data, criterion="mc-mi")
# tmp_arc_strength = arc.strength(tmp_rnet, tmp_data, criterion="mi")

In [None]:
# tmp_arc_strength = pybnl.bn.factorrdf_to_pydf(tmp_arc_strength)
# Convert the R result returned by the %%R cell into a Python object.
# NOTE(review): `ri2py` is the rpy2 2.x converter name (rpy2 3.x renamed it to
# `rpy2py`) — confirm against the installed rpy2 version.
tmp_arc_strength = rpy2.robjects.pandas2ri.ri2py(tmp_arc_strength)
# tmp_arc_strength['log_strength'] = np.log(tmp_arc_strength.strength)

In [None]:
tmp_arc_strength.sort_values(['strength'], ascending=True).iloc[:20,:]

In [None]:
tmp_arc_strength.query('to == "Neighborhood"')

In [None]:
cpts = nbn1.to_xrds()
cpts['cptNeighborhood']

In [None]:
cpts['cptSalePrice']

In [None]:
pt_saleprice_neighborhood = cpts['cptNeighborhood'] * cpts['cptSalePrice']
pt_saleprice_neighborhood

In [None]:
nzx, nzy = np.nonzero(pt_saleprice_neighborhood.values)
nzx, nzy

In [None]:
nz_val = pt_saleprice_neighborhood.values[nzx, nzy]
nz_val

In [None]:
import math

In [None]:
# Hand-computed mutual information of the two axes of the probability table:
# sum over the non-zero cells of p_ij * (log p_ij - log(p_i * p_j)).
# Assumes the table is a joint distribution (entries sum to 1) — the
# normalisation terms at the end of the log_outer line are commented out.
contingency = pt_saleprice_neighborhood.values
pi = np.ravel(contingency.sum(axis=1))  # marginal of the first axis (row sums)
pj = np.ravel(contingency.sum(axis=0))  # marginal of the second axis (column sums)
contingency_nm = nz_val                 # non-zero cells only, so the log below is finite
log_contingency_nm = np.log(contingency_nm)
outer = pi.take(nzx) * pj.take(nzy)     # product of the two marginals for each non-zero cell
log_outer = -np.log(outer) #+ math.log(pi.sum()) + math.log(pj.sum())
mi = (contingency_nm * log_contingency_nm + contingency_nm * log_outer)
mi.sum()

In [None]:
float((pt_saleprice_neighborhood * np.log((pt_saleprice_neighborhood+np.finfo(np.float64).eps) / pt_saleprice_neighborhood.sum(['SalePrice']) / pt_saleprice_neighborhood.sum(['Neighborhood']))).sum())

In [None]:
nbn1.to_dtcpm_dict()['cptNeighborhood']

In [None]:
tmp_grain = nbn1.grain

In [None]:
%%R -i tmp_grain -o rpt_saleprice_neighborhood
rpt_saleprice_neighborhood = querygrain(tmp_grain, nodes = c("SalePrice", "Neighborhood"), type = "joint")

In [None]:
np.array(rpt_saleprice_neighborhood).T

In [None]:
pt_saleprice_neighborhood

### bnlearn via naive.bayes

### sklearn MultinomialNB