In [None]:
%load_ext watermark
%watermark -a 'Christian Schuhegger' -u -d -v -p numpy,xarray,scipy,pandas,sklearn,matplotlib,seaborn,qgrid,rpy2,libpgm,pgmpy,networkx,graphviz,pybnl,pytest

In [None]:
%matplotlib inline
import numpy as np, pandas as pd, xarray as xr, matplotlib.pyplot as plt, seaborn as sns
import sklearn, sklearn.pipeline, sklearn.model_selection, sklearn.naive_bayes, sklearn.metrics
import networkx as nx, graphviz, networkx.algorithms.dag
import random
import itertools, collections
import tqdm
import warnings

# Widen the pandas rendering so that the many columns of the data set stay visible.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# pd.set_option('display.float_format', lambda x: '%.2f' % x)
# Configure numpy printing through the public API only. The former assignment to the
# private `np.core.arrayprint._line_width` is not read by current numpy releases
# (and `np.core` itself is deprecated in numpy 2), so the intended line width of 180
# never took effect.
np.set_printoptions(edgeitems=10, suppress=True, linewidth=180)

sns.set()

In [None]:
from IPython.display import display, HTML

from IPython.display import display_html
def display_side_by_side(*args):
    """Render several tables next to each other in a single output cell.

    Each argument is either an object exposing ``to_html()`` (e.g. a pandas
    DataFrame) or a numpy array, which is wrapped in a ``pd.DataFrame`` first.
    The concatenated HTML is emitted via ``display_html``; nothing is returned.
    """
    html_str = ''
    for df in args:
        if isinstance(df, np.ndarray):
            df = pd.DataFrame(df)
        html_str += df.to_html()
    # Patch only the opening <table ...> tags. Replacing every occurrence of the
    # word 'table' also turned the closing tags into the invalid
    # '</table style="display:inline">' and would mangle any cell or column text
    # that happens to contain the word 'table'.
    html_str = html_str.replace('<table', '<table style="display:inline"')
    display_html(html_str, raw=True)

CSS = """
.output {
    flex-direction: row;
}
"""

def display_graphs_side_by_side(*args):
    """Show the SVG rendering of every given graph in its own cell of a one-row HTML table.

    Each argument must provide a ``_repr_svg_()`` method returning SVG markup.
    """
    cells = ''.join('<td>' + graph._repr_svg_() + '</td>' for graph in args)
    display_html('<table><tr>' + cells + '</tr></table>', raw=True)
    

display(HTML("<style>.container { width:70% !important; }</style>"))

In [None]:
%load_ext rpy2.ipython

In [None]:
%load_ext autoreload
%autoreload 1
%aimport pybnl.bn

In [None]:
%aimport dsbasics.bin

In [None]:
import locale
locale.setlocale(locale.LC_ALL, 'C')

import rpy2, rpy2.rinterface, rpy2.robjects, rpy2.robjects.packages, rpy2.robjects.lib, rpy2.robjects.lib.grid, \
    rpy2.robjects.lib.ggplot2, rpy2.robjects.pandas2ri, rpy2.interactive.process_revents, \
    rpy2.interactive, rpy2.robjects.lib.grdevices
# rpy2.interactive.process_revents.start()
rpy2.robjects.pandas2ri.activate()

In [None]:
# Report the versions of the R packages this notebook relies on.
rpackageversionfn = rpy2.robjects.r('packageVersion')
for r_package in ("bnlearn", "gRain"):
    print(rpackageversionfn(r_package)[0])

# House Prices in Ames, Iowa

* [Ames, Iowa: Alternative to the Boston Housing Data as an End of Semester Regression Project](http://ww2.amstat.org/publications/jse/v19n3/decock.pdf)
  * [AmesResidential.pdf](https://ww2.amstat.org/publications/jse/v19n3/decock/AmesResidential.pdf)
  * [DataDocumentation.txt](https://ww2.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt)
  * [AmesHousing.txt](https://ww2.amstat.org/publications/jse/v19n3/decock/AmesHousing.txt)
  * [AmesHousing.xls](http://www.amstat.org/publications/jse/v19n3/decock/AmesHousing.xls)
  * Also on [kaggle](https://www.kaggle.com/c/house-prices-advanced-regression-techniques)

The below example reproduces the example from chapter 5 (page 79) in [Bayesian Networks and BayesiaLab: A Practical Introduction for Researchers](https://www.amazon.com/Bayesian-Networks-BayesiaLab-Introduction-Researchers/dp/0996533303).

In [None]:
# Load the tab-separated Ames data; the first column becomes the index.
df = pd.read_csv('./AmesHousing.txt.gz', sep='\t', index_col=0)
# Left-pad the MS SubClass codes with zeros to a width of three characters (e.g. '020').
df['MS SubClass'] = df['MS SubClass'].map('{0:0>3}'.format)
df.iloc[:5, :15]

In [None]:
df.columns

In [None]:
discrete_non_null, discrete_with_null, continuous_non_null, continuous_with_null, levels_map = pybnl.bn.discrete_and_continuous_variables_with_and_without_nulls(df, cutoff=30)
# discrete_non_null, discrete_with_null, continuous_non_null, continuous_with_null, levels_map

In [None]:
ddf = df.copy()
#cat_columns = ['Alley', 'Bedroom AbvGr', 'Bldg Type', 'Bsmt Cond', ]
cat_columns = [
    'MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Land Contour', 'Lot Config', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style',
    'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Foundation', 'Heating', 'Central Air', 'Garage Type', 'Misc Feature', 'Sale Type', 'Sale Condition'
] + [
    'Overall Qual', 'Overall Cond'
]
cat_columns_ordinal = [
    ('Lot Shape',      ['Reg','IR1','IR2','IR3']),
    ('Utilities',      ['AllPub','NoSewr','NoSeWa','ELO']),
    ('Land Slope',     ['Gtl', 'Mod', 'Sev']),
    ('Exter Qual',     ['Ex', 'Gd', 'TA', 'Fa', 'Po']),
    ('Exter Cond',     ['Ex', 'Gd', 'TA', 'Fa', 'Po']),
    ('Bsmt Qual',      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']),
    ('Bsmt Cond',      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']),
    ('Bsmt Exposure',  ['Gd', 'Av', 'Mn', 'No', 'NA']),
    ('BsmtFin Type 1', ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA']),
    ('BsmtFin Type 2', ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA']),
    ('Heating QC',     ['Ex', 'Gd', 'TA', 'Fa', 'Po']),
    ('Electrical',     ['SBrkr', 'FuseA', 'FuseF', 'FuseP', 'Mix']),
    ('Kitchen Qual',   ['Ex', 'Gd', 'TA', 'Fa', 'Po']),
    ('Functional',     ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal']),
    ('Fireplace Qu',   ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']),
    ('Garage Finish',  ['Fin', 'RFn', 'Unf', 'NA']),
    ('Garage Qual',    ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']),
    ('Garage Cond',    ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']),
    ('Paved Drive',    ['Y', 'P', 'N']),
    ('Pool QC',        ['Ex', 'Gd', 'TA', 'Fa', 'NA']),
    ('Fence',          ['GdPrv', 'MnPrv', 'GdWo', 'MnWw', 'NA']),
]

continuous_columns = [
    'Lot Frontage', 'Lot Area', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch',
    'Screen Porch', 'Pool Area', 'Misc Val', 'SalePrice'
]
discrete_columns = ['Year Built', 'Year Remod/Add', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'TotRms AbvGrd', 'Fireplaces', 'Garage Yr Blt', 'Garage Cars', 'Mo Sold', 'Yr Sold', 'Bedroom AbvGr', 'Kitchen AbvGr']# do not exist: 'Bedroom',  'Kitchen'

# Nominal columns: numeric levels are sorted and become an ordered categorical,
# all other level sets become an unordered categorical.
for col in cat_columns:
    levels = levels_map[col]
    numeric_levels = all(np.issubdtype(type(level), np.number) for level in levels)
    if numeric_levels:
        levels = sorted(levels)
    ddf[col] = df[col].astype(pd.api.types.CategoricalDtype(levels, ordered=numeric_levels))

# Ordinal columns: the level lists above are reversed here before they define the category order.
for col, levels in cat_columns_ordinal:
    ordinal_dtype = pd.api.types.CategoricalDtype(list(reversed(levels)), ordered=True)
    ddf[col] = df[col].astype(ordinal_dtype)

for col in continuous_columns:
    ddf[col] = df[col].astype(float)

# Discrete columns stay integer unless they contain nulls, in which case float is used.
for col in discrete_columns:
    ddf[col] = df[col].astype(float if df[col].isnull().any() else int)
    
# col   = 'Alley'
# ddf[col]
# ddf[~pd.isnull(ddf[col])][col]
# value = np.nan
# ddf.loc[df[col]==value,col]

[Working with Pandas: Fixing messy column names](https://medium.com/@chaimgluck1/working-with-pandas-fixing-messy-column-names-42a54a6659cd)

In [None]:
# Normalise the column names: spaces become underscores, parentheses are removed.
# `regex=False` makes the literal replacement explicit: '(' and ')' are regex
# metacharacters and the default of `regex` differs between pandas versions.
ddf.columns = (
    ddf.columns.str.strip()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

In [None]:
type(ddf.columns)

In [None]:
ddf.head()

In [None]:
# ddf.to_hdf('AmesHousing.h5', 'AmesHousing',format='table', append=False)

In [None]:
# pd.read_hdf('AmesHousing.h5', 'AmesHousing').head()

## Treating Filtered Values ('FV')

See page 84 in "Bayesian Networks and BayesiaLab"

### Bsmt fields

In [None]:
bsmt_fields_ = ['Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Bsmt Full Bath', 'Bsmt Half Bath',
                'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF']
# Same name normalisation as for `ddf.columns`; `regex=False` keeps '(' and ')'
# literal regardless of the pandas version's default.
bsmt_fields = (
    pd.Index(bsmt_fields_).str.strip()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)
ddf[bsmt_fields].query('Bsmt_Qual == "NA"')
# ddf[ddf['Bsmt_Qual'] == 'NA'][bsmt_fields]
# df[bsmt_fields_][df['Bsmt Qual'] == 'NA']

It seems that there are no filtered values for the 'Bsmt' fields, i.e. every home appears to have a basement.

[Querying for NaN and other names in Pandas](https://stackoverflow.com/questions/26535563/querying-for-nan-and-other-names-in-pandas)

In [None]:
ddf[bsmt_fields][pd.isnull(ddf.Bsmt_Qual)].head()

In [None]:
df[bsmt_fields_][pd.isnull(df['Bsmt Qual'])].head()

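But there are quite a lot of 'NaN' entries. The data description defines "NA" as "No Basement", yet the loaded data set contains no such value. Most likely these NaN entries are exactly those "NA" values: by default `pd.read_csv` treats the string `NA` as a missing-value marker and converts it to NaN.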

In [None]:
ddf[bsmt_fields][pd.isnull(ddf.Bsmt_Qual) & ~pd.isnull(ddf.BsmtFin_Type_2)].head()

In [None]:
bsmt_na_fields = ['Bsmt_Qual', 'Bsmt_Cond', 'Bsmt_Exposure', 'BsmtFin_Type_1', 'BsmtFin_Type_2']
ddf.loc[pd.isnull(ddf.Bsmt_Qual), bsmt_na_fields] = "NA"
ddf[bsmt_fields].query('Bsmt_Qual == "NA"').head()

### Electrical field

In [None]:
ddf.Electrical.value_counts(dropna=False)

In [None]:
ddf[pd.isnull(ddf.Electrical)]

The one NaN value seems to be a missing value

### Fireplaces

In [None]:
fireplaces_fields_ = ['Fireplaces', 'Fireplace Qu']
# Normalised column names; `regex=False` keeps '(' and ')' literal on every pandas version.
fireplaces_fields = (
    pd.Index(fireplaces_fields_).str.strip()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)
ddf[fireplaces_fields].query('Fireplaces == 0').head()

In [None]:
ddf.loc[ddf.Fireplaces == 0,['Fireplace_Qu']] = 'NA'
ddf[fireplaces_fields].query('Fireplaces == 0').head()

### Garage fields

In [None]:
garage_fields_ = ['Garage Type', 'Garage Finish', 'Garage Cars', 'Garage Qual', 'Garage Cond', 'Garage Yr Blt', 'Garage Area']
# Normalised column names; `regex=False` keeps '(' and ')' literal on every pandas version.
garage_fields = (
    pd.Index(garage_fields_).str.strip()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)
ddf[garage_fields][pd.isnull(ddf.Garage_Type)].head()

In [None]:
# Extend the Garage_Type categories by an explicit 'NA' level.
# The levels are kept in a list: building them from a set made the category order,
# and with it the integer codes derived later via `.cat.codes`, depend on string
# hashing and therefore vary between kernel sessions.
garage_type_levels = list(ddf.Garage_Type.dtype.categories)
if 'NA' not in garage_type_levels:  # keeps the cell idempotent when it is re-run
    garage_type_levels.append('NA')
ddf['Garage_Type'] = ddf['Garage_Type'].astype(str)\
    .astype(pd.api.types.CategoricalDtype(garage_type_levels))
ddf.Garage_Type.dtype.categories

In [None]:
ddf.loc[pd.isnull(ddf.Garage_Type),['Garage_Type', 'Garage_Finish', 'Garage_Qual', 'Garage_Cond']] = 'NA'
ddf.loc[ddf.Garage_Type == 'NA',['Garage_Yr_Blt']] = -1.0
#ddf[garage_fields][pd.isnull(ddf.Garage_Yr_Blt)]
#ddf['Garage_Yr_Blt'] = ddf['Garage_Yr_Blt'].astype(int)
ddf[garage_fields][ddf.Garage_Type == 'NA'].head()

### Mas Vnr fields

In [None]:
mas_vnr_fields_ = ['Mas Vnr Type', 'Mas Vnr Area']
# Normalised column names; `regex=False` keeps '(' and ')' literal on every pandas version.
mas_vnr_fields = (
    pd.Index(mas_vnr_fields_).str.strip()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)
ddf[mas_vnr_fields][pd.isnull(ddf.Mas_Vnr_Type)].head()

In [None]:
ddf[mas_vnr_fields][(ddf.Mas_Vnr_Type == 'None') & (ddf.Mas_Vnr_Area != 0.0)]

In [None]:
ddf.Mas_Vnr_Type.dtype

In [None]:
ddf.loc[pd.isnull(ddf.Mas_Vnr_Type), ['Mas_Vnr_Type']] = 'None'
# ddf.loc[ddf.Mas_Vnr_Type == 'None', ['Mas_Vnr_Area']] = 0.0

In [None]:
ddf.loc[pd.isnull(ddf.Mas_Vnr_Area), ['Mas_Vnr_Area']] = 0.0

In [None]:
ddf[mas_vnr_fields][ddf.Mas_Vnr_Type == 'None'].head()

### Pool fields

In [None]:
pool_fields_ = ['Pool QC', 'Pool Area']
# Normalised column names; `regex=False` keeps '(' and ')' literal on every pandas version.
pool_fields = (
    pd.Index(pool_fields_).str.strip()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)
ddf[pool_fields][pd.isnull(ddf.Pool_QC)].head()

In [None]:
ddf.loc[pd.isnull(ddf.Pool_QC), ['Pool_QC']] = 'NA'
ddf[pool_fields][ddf.Pool_QC == 'NA'].head()

### Fence field

In [None]:
fence_fields_ = ['Fence']
# Normalised column names; `regex=False` keeps '(' and ')' literal on every pandas version.
fence_fields = (
    pd.Index(fence_fields_).str.strip()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)
ddf.loc[pd.isnull(ddf.Fence), ['Fence']] = 'NA'
ddf[fence_fields][ddf.Fence == 'NA'].head()

### Misc Feature field

In [None]:
misc_feature_fields_ = ['Misc Feature']
# Normalised column names; `regex=False` keeps '(' and ')' literal on every pandas version.
misc_feature_fields = (
    pd.Index(misc_feature_fields_).str.strip()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)
# Extend the Misc_Feature categories by an explicit 'NA' level. A list (instead of a
# set) keeps the category order, and hence the category codes, reproducible between runs.
misc_feature_levels = list(ddf.Misc_Feature.dtype.categories)
if 'NA' not in misc_feature_levels:  # keeps the cell idempotent when it is re-run
    misc_feature_levels.append('NA')
ddf['Misc_Feature'] = ddf['Misc_Feature'].astype(str)\
    .astype(pd.api.types.CategoricalDtype(misc_feature_levels))
# Every remaining null is interpreted as "no miscellaneous feature".
ddf.loc[pd.isnull(ddf.Misc_Feature), ['Misc_Feature']] = 'NA'
ddf[misc_feature_fields][ddf.Misc_Feature == 'NA'].head()

### Check remaining nan fields

In [None]:
_, discrete_with_null_, _, continuous_with_null_, _ = pybnl.bn.discrete_and_continuous_variables_with_and_without_nulls(ddf, cutoff=30)
discrete_with_null_, continuous_with_null_

In [None]:
ddf[bsmt_fields][pd.isnull(ddf.Bsmt_Exposure)]

Just set the few NaN values to 'No'

In [None]:
ddf.loc[pd.isnull(ddf.Bsmt_Exposure),['Bsmt_Exposure']] = 'No'

In [None]:
ddf[bsmt_fields][pd.isnull(ddf.BsmtFin_Type_2)]

Just set the few NaN values to 'Unf'

In [None]:
# Rows whose Bsmt_Qual is null were already set to 'NA' in the 'Bsmt fields' section,
# so the NaN values left here belong to homes whose other basement fields are filled in.
# Fill them with 'Unf', as the accompanying text and the pipeline's fill list
# ('BsmtFin Type 2', 'Unf') state, rather than with 'NA'.
ddf.loc[pd.isnull(ddf.BsmtFin_Type_2),['BsmtFin_Type_2']] = 'Unf'

In [None]:
ddf[bsmt_fields][pd.isnull(ddf.Bsmt_Full_Bath)]

In [None]:
ddf[bsmt_fields][pd.isnull(ddf.Bsmt_Half_Bath)]

Just set all the NaN values to 0.0

In [None]:
ddf.loc[pd.isnull(ddf.Bsmt_Full_Bath),['Bsmt_Full_Bath','Bsmt_Half_Bath','BsmtFin_SF_1','BsmtFin_SF_2','Bsmt_Unf_SF','Total_Bsmt_SF']] = [0.0,0.0,0.0,0.0,0.0,0.0]

In [None]:
ddf[['Electrical']][pd.isnull(ddf.Electrical)]

Just set this single NaN value to 'Mix'

In [None]:
ddf.loc[pd.isnull(ddf.Electrical),['Electrical']] = 'Mix'

The remaining NaN garage fields seem to be really missing values so don't touch them.

In [None]:
ddf[garage_fields][pd.isnull(ddf.Garage_Finish)]

In [None]:
ddf[garage_fields][pd.isnull(ddf.Garage_Cars)]

In [None]:
ddf[garage_fields][pd.isnull(ddf.Garage_Qual)]

In [None]:
ddf[garage_fields][pd.isnull(ddf.Garage_Cond)]

The Lot_Frontage NaN fields seem to be really missing values so don't touch them.

In [None]:
ddf[pd.isnull(ddf.Lot_Frontage)].head()

In [None]:
_, discrete_with_null_, _, continuous_with_null_, _ = pybnl.bn.discrete_and_continuous_variables_with_and_without_nulls(ddf, cutoff=30)
discrete_with_null_, continuous_with_null_

## Binning / Discretization

In [None]:
ddf1 = ddf.copy()

### The target variable is SalePrice

In [None]:
ddf.SalePrice.describe()

In [None]:
# Discretise the target into left-closed price bands [lower, upper).
# `np.inf` replaces the alias `np.PINF`, which was removed in NumPy 2.0.
sale_price_bin_edges = [0.0, 75000.0, 150000.0, 225000.0, 300000.0, np.inf]
ddf1['SalePrice'] = pd.cut(ddf.SalePrice, sale_price_bin_edges, right=False)
ddf1['SalePrice'].value_counts()

### Continuous, discrete and ordinal variables

In [None]:
# Every continuous column except the target. The comprehension keeps the order of
# `continuous_columns`; the former set difference produced an arbitrary,
# hash-dependent order that could differ between kernel sessions.
continuous_columns_without_sale_price = [name for name in continuous_columns if name != 'SalePrice']

In [None]:
# Variables that get binned with respect to the target: ordinal, continuous (without the
# target itself), discrete, and the two numeric quality ratings.
target_variable_decision_tree_binning_variables_ = [c for c,r in cat_columns_ordinal] + continuous_columns_without_sale_price + discrete_columns + ['Overall Qual', 'Overall Cond']
# Normalised column names; `regex=False` keeps '(' and ')' literal on every pandas version.
target_variable_decision_tree_binning_variables = (
    pd.Index(target_variable_decision_tree_binning_variables_).str.strip()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)
target_variable_decision_tree_binning_variables

In [None]:
tmp_ddf_before_binning = ddf.copy()

In [None]:
tvbt = dsbasics.bin.TargetVariableDecisionTreeBinTransformer0(max_leaf_nodes=3)
# Replace the columns wholesale via `ddf1[cols] = ...` rather than `ddf1.loc[:, cols] = ...`:
# on recent pandas versions the `.loc[:, cols]` form writes into the existing columns in
# place and tries to keep their current dtype, whereas the columns here must take on the
# dtype of the binned result (presumably interval categories, given the later conversion
# of interval categories to strings -- confirm against the transformer).
ddf1[target_variable_decision_tree_binning_variables] = \
    tvbt.fit_transform(ddf[target_variable_decision_tree_binning_variables], ddf1.SalePrice)
ddf1.head()

In [None]:
len(df.columns),len(ddf1.columns)

In [None]:
ddf['Overall_Qual'].value_counts().sort_index().plot(kind='bar');

In [None]:
ddf1['Overall_Qual'].value_counts().sort_index().plot(kind='bar');

### Convert interval indices to strings

In [None]:
ddf1 = pybnl.bn.convert_interval_index_categories_to_string_categories(ddf1,inplace=False)

In [None]:
ddf1.Bsmt_Full_Bath.dtype

### Rename columns to fit with R conventions

In [None]:
ddf1 = ddf1.rename(columns={
    "1st_Flr_SF":"X1st_Flr_SF",
    "2nd_Flr_SF":"X2nd_Flr_SF",
    "3Ssn_Porch":"X3Ssn_Porch",
    "Year_Remod/Add":"Year_Remod_Add"
})

### Drop unused columns

In [None]:
# Remove the PID column when present; `errors='ignore'` turns the call into a no-op otherwise.
ddf1 = ddf1.drop(columns=['PID'], errors='ignore')
# ddf1.columns

#### Drop the 'Alley' column, which contains most null values and will not contribute to the quality of the fit

In [None]:
# Remove the Alley column when present; `errors='ignore'` turns the call into a no-op otherwise.
ddf1 = ddf1.drop(columns=['Alley'], errors='ignore')
# ddf1.columns

The remaining columns with null values are:

In [None]:
_, discrete_with_null_, _, continuous_with_null_, _ = pybnl.bn.discrete_and_continuous_variables_with_and_without_nulls(ddf1, cutoff=30)
discrete_with_null_, continuous_with_null_

The statistics of the NaN values are as follows. Only the column `Lot_Frontage` still has a relevant number of null values; the 2 `Garage` rows that contain null values could simply be filtered out:

In [None]:
# Value counts (including NaN) of every column that still contains nulls, shown side by side.
display_side_by_side(*[
    pd.DataFrame(ddf1[column_name].value_counts(dropna=False))
    for column_name in ['Lot_Frontage', 'Garage_Yr_Blt', 'Garage_Finish', 'Garage_Cars',
                        'Garage_Area', 'Garage_Qual', 'Garage_Cond']
])

In [None]:
# Boolean mask of the rows that have a null in at least one of the listed columns.
null_check_columns = ['Lot_Frontage', 'Garage_Yr_Blt', 'Garage_Finish', 'Garage_Cars',
                      'Garage_Area', 'Garage_Qual', 'Garage_Cond']
null_value_idx = ddf1[null_check_columns].isnull().any(axis=1)
null_value_idx.value_counts()

In [None]:
ddf_without_null_values = ddf1[~null_value_idx]

In [None]:
ddf_with_null_values = ddf1

## Naive Bayes Classifier

### bnlearn by hand

#### Define net

In [None]:
# Naive Bayes structure: the last column is the single parent of every other column.
in_vars = ddf1.columns.values[:-1]
out_var = ddf1.columns.values[-1:]

dg = nx.DiGraph()
dg.add_nodes_from(ddf1.columns.values)
dg.add_edges_from((parent, child) for parent in out_var for child in in_vars)

In [None]:
ns = pybnl.bn.digraph2netstruct(dg)
# ns.dot()
display(HTML(ns.dot()._repr_svg_()))

In [None]:
nbn1 = pybnl.bn.NetAndDataDiscreteBayesNetwork(ldf=ddf_without_null_values, dg=dg, predict_var='SalePrice')
nbn1.fit();

In [None]:
N=20 # restricted to speed up runtime of notebook
y_pred = nbn1.predict(ddf_without_null_values.iloc[:N,:-1])

In [None]:
y_true = ddf_without_null_values.iloc[:N,-1:]

In [None]:
sklearn.metrics.accuracy_score(y_true, y_pred)

In [None]:
tmp_cm = sklearn.metrics.confusion_matrix(y_true, y_pred)
tmp_cm

In [None]:
np.diag(tmp_cm).sum()/tmp_cm.sum()

In [None]:
nbn1.arc_strength_info().head()

In [None]:
nbn1.arc_strength_info().tail()

In [None]:
display(HTML(nbn1.dot()._repr_svg_()))

### bnlearn via naive.bayes

In [None]:
mbn = pybnl.bn.MultinomialNB()
mbn.fit(ddf_without_null_values[in_vars], ddf_without_null_values[out_var[0]])

In [None]:
mbn.arc_strength_info().head()

In [None]:
# display(HTML(mbn.dot()._repr_svg_()))

### Predicted vs. real

In [None]:
# Build the comparison frame column-wise. Passing a list of two sequences to
# `pd.DataFrame` creates two *rows* (one per sequence), after which assigning the two
# column names fails with a length mismatch.
# Assumes `mbn.predict` returns a 1-d sequence (array, Categorical or Series) -- TODO confirm.
mbn_predict_df = pd.DataFrame(collections.OrderedDict([
    # Both columns get a fresh 0..n-1 index so that they line up positionally.
    ('predicted', pd.Series(mbn.predict(ddf_without_null_values[in_vars])).reset_index(drop=True)),
    ('actual', ddf_without_null_values[out_var[0]].reset_index(drop=True)),
]))
mbn_predict_df.head()

In [None]:
mbn.predict_proba(ddf_without_null_values[in_vars]).head()

### Predicted vs. real with probabilities

In [None]:
# Build the frame column-wise: a list of sequences passed to `pd.DataFrame` becomes
# rows, so the subsequent assignment of four column names would fail.
# The class probabilities are computed once and reused for both probability columns.
mbn_proba = mbn.predict_proba(ddf_without_null_values[in_vars])
mbn_actual = ddf_without_null_values[out_var[0]]
mbn_predict_with_probabilities_df = pd.DataFrame(collections.OrderedDict([
    # Assumes `mbn.predict` returns a 1-d sequence (array, Categorical or Series) -- TODO confirm.
    ('predicted', pd.Series(mbn.predict(ddf_without_null_values[in_vars])).reset_index(drop=True)),
    ('predicted-probability', mbn_proba.max(axis=1).reset_index(drop=True)),
    ('actual', mbn_actual.reset_index(drop=True)),
    # Probability assigned to the true class: per row, pick the column addressed by the
    # category code of the actual value (like the original indexing, this relies on the
    # probability columns being ordered like the categories).
    ('actual-probability', mbn_proba.values[np.arange(len(mbn_actual)), mbn_actual.cat.codes.values]),
]))
mbn_predict_with_probabilities_df.head()

### Label scores / how well do you get the labels right

In [None]:
sklearn.metrics.accuracy_score(mbn_predict_df.actual, mbn_predict_df.predicted)

In [None]:
mbn_predict_cm = sklearn.metrics.confusion_matrix(mbn_predict_df.actual, mbn_predict_df.predicted)
mbn_predict_cm

In [None]:
np.diag(mbn_predict_cm).sum()/mbn_predict_cm.sum()

In [None]:
sklearn.metrics.precision_score(mbn_predict_df.actual, mbn_predict_df.predicted, average='micro')

In [None]:
sklearn.metrics.recall_score(mbn_predict_df.actual, mbn_predict_df.predicted, average='micro')

### sklearn MultinomialNB

In [None]:
clf = sklearn.naive_bayes.MultinomialNB()
X_ = ddf_without_null_values[in_vars]
X  = X_.apply(lambda x: x.cat.codes, axis=0)
y_ = ddf_without_null_values[out_var[0]]
y  = y_.cat.codes
clf.fit(X, y)
# 

In [None]:
clf_predict_with_probabilities_df = pd.DataFrame(
    collections.OrderedDict([
        ('predicted', pybnl.bn.from_codes_to_category(clf.predict(X),y_.dtype)),
        ('predicted-probability', clf.predict_proba(X).max(axis=1)),
        ('actual', y_.reset_index(drop=True)), 
        ('actual-probability', clf.predict_proba(X)[np.array(range(len(y))).reshape(-1,1),y.values.reshape(-1,1)].reshape(-1))
    ])
)
clf_predict_with_probabilities_df.head()

In [None]:
y_.dtype.categories

In [None]:
sklearn.metrics.accuracy_score(clf_predict_with_probabilities_df.actual, clf_predict_with_probabilities_df.predicted)

In [None]:
# vertically: true label
# horizontally: predicted label
clf_predict_cm = sklearn.metrics.confusion_matrix(clf_predict_with_probabilities_df.actual, clf_predict_with_probabilities_df.predicted)
clf_predict_cm

#### Usage of the SKLearnMultinomialNBWrapper to avoid the manual mapping between numbers and categories

In [None]:
clf_ = pybnl.bn.SKLearnMultinomialNBWrapper()
clf_.fit(ddf_without_null_values[in_vars], ddf_without_null_values[out_var[0]])

In [None]:
clf_.predict(ddf_without_null_values[in_vars]).head()

## End to end sklearn pipeline

In [None]:
df.iloc[:5,:15]

### sklearn pipeline for data transformation steps

In [None]:
df_test = df.copy()
# df_test.dsbmd = {}
df_test.__dict__['dsbmd'] = {}
df_test.dsbmd

In [None]:
cat_columns = [
    'MS SubClass', 'MS Zoning', 'Street', 'Land Contour', 'Lot Config', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', #'Alley', 
    'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Foundation', 'Heating', 'Central Air', 'Garage Type', 'Misc Feature', 'Sale Type', 'Sale Condition'
] + [
    'Overall Qual', 'Overall Cond'
]
cat_columns_ordinal = [
    ('Lot Shape',      ['Reg','IR1','IR2','IR3']),
    ('Utilities',      ['AllPub','NoSewr','NoSeWa','ELO']),
    ('Land Slope',     ['Gtl', 'Mod', 'Sev']),
    ('Exter Qual',     ['Ex', 'Gd', 'TA', 'Fa', 'Po']),
    ('Exter Cond',     ['Ex', 'Gd', 'TA', 'Fa', 'Po']),
    ('Bsmt Qual',      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']),
    ('Bsmt Cond',      ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']),
    ('Bsmt Exposure',  ['Gd', 'Av', 'Mn', 'No', 'NA']),
    ('BsmtFin Type 1', ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA']),
    ('BsmtFin Type 2', ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA']),
    ('Heating QC',     ['Ex', 'Gd', 'TA', 'Fa', 'Po']),
    ('Electrical',     ['SBrkr', 'FuseA', 'FuseF', 'FuseP', 'Mix']),
    ('Kitchen Qual',   ['Ex', 'Gd', 'TA', 'Fa', 'Po']),
    ('Functional',     ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal']),
    ('Fireplace Qu',   ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']),
    ('Garage Finish',  ['Fin', 'RFn', 'Unf', 'NA']),
    ('Garage Qual',    ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']),
    ('Garage Cond',    ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']),
    ('Paved Drive',    ['Y', 'P', 'N']),
    ('Pool QC',        ['Ex', 'Gd', 'TA', 'Fa', 'NA']),
    ('Fence',          ['GdPrv', 'MnPrv', 'GdWo', 'MnWw', 'NA']),
]

continuous_columns = [
    'Lot Frontage', 'Lot Area', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch',
    'Screen Porch', 'Pool Area', 'Misc Val', 'SalePrice'
]
discrete_columns = ['Year Built', 'Year Remod/Add', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'TotRms AbvGrd', 'Fireplaces', 'Garage Yr Blt', 'Garage Cars', 'Mo Sold', 'Yr Sold', 'Bedroom AbvGr', 'Kitchen AbvGr']# do not exist: 'Bedroom',  'Kitchen'


In [None]:
cat_columns_NA = [
    ('Misc Feature', ['Shed', 'Gar2', 'Othr', 'TenC', 'Elev', 'NA']),
    ('Garage Type', ['Attchd', 'Detchd', 'BuiltIn', 'Basment', '2Types', 'CarPort', 'NA']),
]

In [None]:
# df['Garage Type'].value_counts(dropna=False)

In [None]:
tmp_cat_columns_ordinal = [e for e,_ in cat_columns_ordinal]
# tmp_cat_columns_ordinal

In [None]:
# (column, fill value) pairs that are passed to NullToNATransformer further down.
# NOTE(review): 'Misc Feature' is listed twice with the same value, and 'Bsmt Exposure' /
# 'BsmtFin Type 2' are listed twice with different values ('NA' first, then 'No' / 'Unf').
# Confirm how NullToNATransformer treats repeated columns before removing any entry.
tmp_cat_columns_NA = [(e, 'NA') for e,_ in cat_columns_NA]
tmp_cat_columns_NA += [('Bsmt Qual', 'NA'), ('Bsmt Cond', 'NA'), ('Bsmt Exposure', 'NA'), ('BsmtFin Type 1', 'NA'), ('BsmtFin Type 2', 'NA')]
tmp_cat_columns_NA += [('Fireplace Qu', 'NA')]
tmp_cat_columns_NA += [('Garage Finish', 'NA'), ('Garage Qual', 'NA'), ('Garage Cond', 'NA'), ('Garage Yr Blt', -1.0)]
# Every entry is a tuple now; 'Mas Vnr Area' used to be the only list-typed pair.
tmp_cat_columns_NA += [('Mas Vnr Type', 'None'), ('Mas Vnr Area', 0.0)]
tmp_cat_columns_NA += [('Pool QC', 'NA')]
tmp_cat_columns_NA += [('Fence', 'NA')]
tmp_cat_columns_NA += [('Misc Feature', 'NA')]
tmp_cat_columns_NA += [('Bsmt Exposure', 'No'), ('BsmtFin Type 2', 'Unf'), ('Bsmt Full Bath', 0.0), ('Bsmt Half Bath', 0.0), ('BsmtFin SF 1', 0.0), ('BsmtFin SF 2', 0.0), ('Bsmt Unf SF', 0.0), ('Total Bsmt SF', 0.0)]
tmp_cat_columns_NA += [('Electrical', 'Mix')]
# tmp_cat_columns_NA

In [None]:
'{}'.format(sorted(['020', '060', '120', '050', '085', '160', '080', '030', '090', '190', '045', '070', '075', '040', '180', '150']))

In [None]:
tmp_cat_columns_ordinal_1 = [(e, l[::-1]) for e,l in cat_columns_ordinal + cat_columns_NA]

tmp_cat_columns_ordinal_1 += [
    ('MS SubClass', ['020', '030', '040', '045', '050', '060', '070', '075', '080', '085', '090', '120', '150', '160', '180', '190']),
    ('MS Zoning', ['RL', 'RH', 'FV', 'RM', 'C (all)', 'I (all)', 'A (agr)']), 
    ('Street', ['Pave', 'Grvl']),
    ('Land Contour', ['Lvl', 'HLS', 'Bnk', 'Low']),
    ('Lot Config', ['Corner', 'Inside', 'CulDSac', 'FR2', 'FR3']),
    ('Neighborhood', ['NAmes', 'Gilbert', 'StoneBr', 'NWAmes', 'Somerst', 'BrDale', 'NPkVill', 'NridgHt', 'Blmngtn', 'NoRidge', 'SawyerW', 'Sawyer', 'Greens', 'BrkSide', 'OldTown', 'IDOTRR', 'ClearCr', 'SWISU', 'Edwards', 'CollgCr', 'Crawfor', 'Blueste', 'Mitchel', 'Timber', 'MeadowV', 'Veenker', 'GrnHill', 'Landmrk']),
    ('Condition 1', ['Norm', 'Feedr', 'PosN', 'RRNe', 'RRAe', 'Artery', 'PosA', 'RRAn', 'RRNn']),
    ('Condition 2', ['Norm', 'Feedr', 'PosA', 'PosN', 'Artery', 'RRNn', 'RRAe', 'RRAn']),
    ('Bldg Type', ['1Fam', 'TwnhsE', 'Twnhs', 'Duplex', '2fmCon']),
    ('House Style', ['1Story', '2Story', '1.5Fin', 'SFoyer', 'SLvl', '2.5Unf', '1.5Unf', '2.5Fin']),
    ('Roof Style', ['Hip', 'Gable', 'Mansard', 'Gambrel', 'Shed', 'Flat']),
    ('Roof Matl', ['CompShg', 'WdShake', 'Tar&Grv', 'WdShngl', 'Membran', 'ClyTile', 'Roll', 'Metal']),
    ('Exterior 1st', ['BrkFace', 'VinylSd', 'Wd Sdng', 'CemntBd', 'HdBoard', 'Plywood', 'MetalSd', 'AsbShng', 'WdShing', 'Stucco', 'AsphShn', 'BrkComm', 'CBlock', 'PreCast', 'Stone', 'ImStucc']),
    ('Exterior 2nd', ['Plywood', 'VinylSd', 'Wd Sdng', 'BrkFace', 'CmentBd', 'HdBoard', 'Wd Shng', 'MetalSd', 'ImStucc', 'Brk Cmn', 'AsbShng', 'Stucco', 'AsphShn', 'CBlock', 'Stone', 'PreCast', 'Other']),
    ('Mas Vnr Type', ['Stone', 'None', 'BrkFace', 'BrkCmn', 'CBlock']),
    ('Foundation', ['CBlock', 'PConc', 'Wood', 'BrkTil', 'Slab', 'Stone']),
    ('Heating', ['GasA', 'GasW', 'Grav', 'Wall', 'Floor', 'OthW']),
    ('Central Air', ['Y', 'N']),
    ('Sale Type', ['WD ', 'New', 'COD', 'ConLI', 'Con', 'ConLD', 'Oth', 'ConLw', 'CWD', 'VWD']),
    ('Sale Condition', ['Normal', 'Partial', 'Family', 'Abnorml', 'Alloca', 'AdjLand']),
    ('Overall Qual', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
    ('Overall Cond', [1, 2, 3, 4, 5, 6, 7, 8, 9]),
]

tmp_levels_map = dict(tmp_cat_columns_ordinal_1)
# tmp_levels_map

In [None]:
it = dsbasics.bin.MetaDataInitTransformer()

dc = dsbasics.bin.DropColumnTransformer(['PID', 'Alley'])

ct = dsbasics.bin.CategoricalTransformer(
    categorical_columns = cat_columns,
    ordered_categorical_columns = tmp_cat_columns_ordinal, 
    discrete_columns = discrete_columns, 
    continuous_columns = continuous_columns, 
    levels_map = tmp_levels_map
)

nt = dsbasics.bin.NullToNATransformer(null_to_NA_columns = tmp_cat_columns_NA)

pc = dsbasics.bin.PandasCutBinTransformer({'SalePrice': [75000.0, 150000.0, 225000.0, 300000.0]})

tvbt = dsbasics.bin.TargetVariableDecisionTreeBinTransformer(
    max_leaf_nodes=3, 
    binning_variables=target_variable_decision_tree_binning_variables_
)

lt = dsbasics.bin.CategoryLevelsAsStringsTransformer()

fn = dsbasics.bin.FilterNullTransformer()

pl = sklearn.pipeline.Pipeline(
    memory=None,
    steps=[
        ('init', it),
        ('drop_columns', dc),
        ('ct', ct),
        ('null_to_NA', nt),
        ('target_variable_binning', pc),
        ('decision_tree_discretization', tvbt),
        ('levels_as_strings', lt),
        ('filter_null_rows', fn),
])

tmp = pl.fit_transform(df.iloc[:,:-1], df.SalePrice)
tmp.iloc[:5,:15]

In [None]:
ddf_without_null_values.shape

In [None]:
tmp.shape

The difference is due to record 1357 that we left in our hand example untouched (see above):

In [None]:
set(tmp.index) - set(ddf_without_null_values.index)

The other rows match perfectly, which you can check via the following:

In [None]:
# for i in tqdm.tqdm(range(2438)):
#     idx = ddf_without_null_values.index[i]
#     row1 = ddf_without_null_values.iloc[:,:-1].loc[idx].reset_index(drop=True)
#     row2 = tmp.loc[idx].reset_index(drop=True)
#     if not row1.equals(row2):
#         print(idx)
#         break

# eql=(row1==row2)
# tmp_cmp = pd.DataFrame(collections.OrderedDict(nm=tmp.columns,row1=row1, row2=row2, eql=eql))
# tmp_cmp[~tmp_cmp.eql]

In [None]:
pl.named_steps['filter_null_rows'].null_fields

As SalePrice is not part of the X argument of the transform you can only look at the final outcome for this value via the internal state of the last transform step:

In [None]:
pl.named_steps['levels_as_strings'].df.SalePrice.dtype

You can also have a look at some of the other transformed fields, such as nominal, ordered and continuous ones. The nominal fields stay untouched, because the tree regression binning only works for ordered datatypes:

In [None]:
tmp.Garage_Type.dtype

In [None]:
tmp.Garage_Type.value_counts(dropna=False)

In [None]:
tmp.Bsmt_Qual.dtype

In [None]:
tmp.Overall_Qual.dtype

In [None]:
tmp.Lot_Area.dtype

### sklearn pipeline with bayes network

The above data pipeline has element 1357 in addition to what we did above by hand. In order to generate comparable results let's drop this row:

In [None]:
# Work on a separate frame that lacks record 1357; `df` itself stays untouched.
df_ = df.drop(index=1357)

The following data pipeline predicts the labels:

In [None]:
# Final estimator of the pipeline: pybnl's MultinomialNB inside the dsbasics meta-data wrapper.
mbnp = dsbasics.bin.MetaDataTransformerClassifierOrRegressorWrapper(pybnl.bn.MultinomialNB())

it = dsbasics.bin.MetaDataInitTransformer()

# PID and Alley are removed from the input columns.
dc = dsbasics.bin.DropColumnTransformer(['PID', 'Alley'])

# Column role declarations (nominal / ordinal / discrete / continuous) come from lists
# defined earlier in the notebook.
ct = dsbasics.bin.CategoricalTransformer(
    categorical_columns = cat_columns,
    ordered_categorical_columns = tmp_cat_columns_ordinal, 
    discrete_columns = discrete_columns, 
    continuous_columns = continuous_columns, 
    levels_map = tmp_levels_map
)

# presumably maps missing values of the listed columns to an explicit 'NA' level - TODO confirm
nt = dsbasics.bin.NullToNATransformer(null_to_NA_columns = tmp_cat_columns_NA)

# Fixed bin edges for the target variable SalePrice.
pc = dsbasics.bin.PandasCutBinTransformer({'SalePrice': [75000.0, 150000.0, 225000.0, 300000.0]})

# Decision-tree based binning of the listed variables, limited to 3 leaf nodes per tree.
tvbt = dsbasics.bin.TargetVariableDecisionTreeBinTransformer(
    max_leaf_nodes=3, 
    binning_variables=target_variable_decision_tree_binning_variables_,
)

lt = dsbasics.bin.CategoryLevelsAsStringsTransformer()

fn = dsbasics.bin.FilterNullTransformer()

# The steps run in the listed order; the wrapped classifier is the last step.
pl1 = sklearn.pipeline.Pipeline(
    memory=None,
    steps=[
        ('init', it),
        ('drop_columns', dc),
        ('ct', ct),
        ('null_to_NA', nt),
        ('target_variable_binning', pc),
        ('decision_tree_discretization', tvbt),
        ('levels_as_strings', lt),
        ('filter_null_rows', fn),
        ('mbn', mbnp)
])

# X: every column of df_ except the last one; y: the SalePrice column.
pl1.fit(df_.iloc[:,:-1], df_.SalePrice)
pl1_pred_y = pl1.predict(df_.iloc[:,:-1])

In [None]:
pl1_actual_y = pl1.steps[-1][1].y

The `accuracy_score` gives as expected the exact same values as above with our manual case.

In [None]:
sklearn.metrics.accuracy_score(pl1_actual_y, pl1_pred_y)

Something like the call below does not work in general, because the pipeline, by default, only transforms `X`; the real y-values passed to `score` are not run through the transformation steps:

In [None]:
# pl1.score(df_.iloc[:,:-1], df_.iloc[:,-1])

### Pay attention to dataframes containing categories; they are tricky!

Data frames that contain categories store the type information per column. If you take in principle two data frames that contain the same string values but use different encodings you can get unexpected results.

`tmp_row1` is a row series of the data frame generated by the pipeline `pl1` as an intermediate step. As `tmp_row1` is a series, it knows nothing about category datatypes and only stores strings. `tmp_row1_df`, on the other hand, is a dataframe and knows about categories and their encoding:

In [None]:
tmp_row1    = pl1.steps[-1][1].df[in_vars].loc[1,:]
tmp_row1_df = pl1.steps[-1][1].df[in_vars].loc[[1],:]

`tmp_row2` and `tmp_row2_df` are the corresponding objects for the data generated manually above in `ddf_without_null_values`:

In [None]:
tmp_row2    = ddf_without_null_values[in_vars].loc[1,:]
tmp_row2_df = ddf_without_null_values[in_vars].loc[[1],:]

If we now put the rows (remember: they don't know anything about the categories) into a data-frame to compare the two we see no differences (the result is an empty data-frame):

In [None]:
tmp_row_cmp = pd.DataFrame(collections.OrderedDict(l=tmp_row1, r=tmp_row2))
tmp_row_cmp['dodiffer'] = tmp_row_cmp.l != tmp_row_cmp.r
tmp_row_cmp[tmp_row_cmp.dodiffer]

If we look at the data-frames by eye we also do not see any differences:

In [None]:
tmp_row1_df

In [None]:
tmp_row2_df

But if we would put them together into a single data-frame via `pd.concat` for example, we will get differences:

In [None]:
differences_columns = pybnl.bn.data_frame_data_type_diff(tmp_row1_df, tmp_row2_df)
differences_columns

In [None]:
pd.concat([tmp_row1_df, tmp_row2_df])[differences_columns]

If we would do the same in the inverse order we get:

In [None]:
pd.concat([tmp_row2_df, tmp_row1_df])[differences_columns]

The reason why we can use above the `pl1.steps[-1][1].base_classifier` on the `ddf_without_null_values` data-frame, which is a category data-frame with a different encoding, is that internally the `predict` (and `predict_proba`) methods use `pybnl.bn.coerce_data_frame_types` function to ensure the correct encoding (by first converting the values to strings and then to a category data-type again).

In [None]:
pd.concat([tmp_row1_df, pybnl.bn.coerce_data_frame_types(tmp_row2_df,tmp_row1_df)])[differences_columns]

In [None]:
pd.concat([tmp_row2_df, pybnl.bn.coerce_data_frame_types(tmp_row1_df,tmp_row2_df)])[differences_columns]

## Scoring and scores

### median_absolute_error

Let's first look at what the best outcome would look like if we predict all labels right and how far our predicted value would differ from the real value.

The real price is in `ddf_without_null_values_saleprice`

In [None]:
ddf_without_null_values_saleprice = ddf.loc[ddf_without_null_values.index]['SalePrice']

The true labels are in `ddf_without_null_values['SalePrice']`

If we map each category to the median we get the following:

In [None]:
tmp_saleprice_category_to_median_mapping = \
    ddf_without_null_values_saleprice.groupby(ddf_without_null_values['SalePrice']).median()
# tmp_saleprice_category_to_median_mapping_dict = \
#     dict(zip(tmp_saleprice_category_to_median_mapping.index, tmp_saleprice_category_to_median_mapping.values))
tmp_saleprice_category_to_median_mapping

The error introduced alone because of the discretization of the sale price even if you predict the "label" correctly is:

In [None]:
ddf_without_null_values_saleprice_reconstruct = ddf_without_null_values['SalePrice'].map(tmp_saleprice_category_to_median_mapping)
minimum_reconstruction_error = \
    sklearn.metrics.median_absolute_error(ddf_without_null_values_saleprice, ddf_without_null_values_saleprice_reconstruct)
minimum_reconstruction_error

The reconstructed values for our `mbn` model that we created by hand are:

In [None]:
mbn_pred_y = pd.Series(mbn.predict(ddf_without_null_values[in_vars])).map(tmp_saleprice_category_to_median_mapping)
mbn_pred_y.head()

And the resulting error is:

In [None]:
mbn_reconstruction_error = sklearn.metrics.median_absolute_error(ddf_without_null_values_saleprice, mbn_pred_y)
mbn_reconstruction_error

The biggest part of this error is coming from the binning/discretization. Only a small part of the error is coming from our model predicting the wrong label.

In [None]:
(mbn_reconstruction_error - minimum_reconstruction_error)/minimum_reconstruction_error

### Other metrics to look at

There are other metrics to quantify the error, e.g.:

In [None]:
np.sqrt(sklearn.metrics.mean_squared_error(ddf_without_null_values_saleprice, mbn_pred_y))

In [None]:
sklearn.metrics.mean_absolute_error(ddf_without_null_values_saleprice, mbn_pred_y)

[What is the difference between the R2 and the explained variance score in Scikit-learn?](https://stats.stackexchange.com/questions/210168/what-is-the-difference-between-r2-and-variance-score-in-scikit-learn)
* [explained-variance-score](http://scikit-learn.org/stable/modules/model_evaluation.html#explained-variance-score)
* [r2-score](http://scikit-learn.org/stable/modules/model_evaluation.html#r2-score)

In [None]:
sklearn.metrics.explained_variance_score(ddf_without_null_values_saleprice, mbn_pred_y)

In [None]:
sklearn.metrics.r2_score(ddf_without_null_values_saleprice, mbn_pred_y)

### sklearn end-to-end pipeline with predicted y values (regression)

And the next one predicts the y-values via the mechanism we saw above where we map the labels to the medians for the training values falling into that label-category:

In [None]:
# Final estimator: pybnl's MultinomialNB wrapped in the classifier-to-regressor helper, so that
# the pipeline predicts y-values instead of labels.
mbny = dsbasics.bin.ClassifierToRegressorHelper(pybnl.bn.MultinomialNB())

it = dsbasics.bin.MetaDataInitTransformer()

# PID and Alley are removed from the input columns.
dc = dsbasics.bin.DropColumnTransformer(['PID', 'Alley'])

# Column role declarations (nominal / ordinal / discrete / continuous) come from lists
# defined earlier in the notebook.
ct = dsbasics.bin.CategoricalTransformer(
    categorical_columns = cat_columns,
    ordered_categorical_columns = tmp_cat_columns_ordinal, 
    discrete_columns = discrete_columns, 
    continuous_columns = continuous_columns, 
    levels_map = tmp_levels_map
)

# presumably maps missing values of the listed columns to an explicit 'NA' level - TODO confirm
nt = dsbasics.bin.NullToNATransformer(null_to_NA_columns = tmp_cat_columns_NA)

# Fixed bin edges for the target variable SalePrice.
pc = dsbasics.bin.PandasCutBinTransformer({'SalePrice': [75000.0, 150000.0, 225000.0, 300000.0]})

# Decision-tree based binning of the listed variables, limited to 3 leaf nodes per tree.
tvbt = dsbasics.bin.TargetVariableDecisionTreeBinTransformer(
    max_leaf_nodes=3, 
    binning_variables=target_variable_decision_tree_binning_variables_
)

lt = dsbasics.bin.CategoryLevelsAsStringsTransformer()

fn = dsbasics.bin.FilterNullTransformer()

# The steps run in the listed order; the regressor helper is the last step.
pl2 = sklearn.pipeline.Pipeline(
    memory=None,
    steps=[
        ('init', it),
        ('drop_columns', dc),
        ('ct', ct),
        ('null_to_NA', nt),
        ('target_variable_binning', pc),
        ('decision_tree_discretization', tvbt),
        ('levels_as_strings', lt),
        ('filter_null_rows', fn),
        ('mbny', mbny)
])

# X: every column of df_ except the last one; y: the SalePrice column.
pl2.fit(df_.iloc[:,:-1], df_.SalePrice)
pl2_pred_y = pl2.predict(df_.iloc[:,:-1])

The pipeline is now doing the same as we did above manually as a sequence of transformations and the results match (as you will be able to see below). First let's look at the predicted results:

In [None]:
pl2_pred_y.head()

Then let's verify that the length of both predicted sequences is the same:

In [None]:
len(pl2_pred_y), len(mbn_pred_y)

Then let's compare the predicted values generated by hand above in `mbn_pred_y` and the predicted values generated by `pl2` in `pl2_pred_y`. In order to do that we first have to make sure that the index values of `mbn_pred_y` match the data-frame indices:

In [None]:
mbn_pred_y.index = ddf_without_null_values.index

Now let's compare:

In [None]:
def generate_cmp_data_frame(ldf1_pred_y, ldf2_pred_y):
    """Place two prediction series side by side and compute their absolute difference.

    Parameters
    ----------
    ldf1_pred_y, ldf2_pred_y : pandas.Series
        Numeric predictions. Both must carry the same index labels in the same order.

    Returns
    -------
    pandas.DataFrame
        Columns ``df1``, ``df2`` and ``delta`` (``|df1 - df2|``), indexed like ``ldf1_pred_y``.

    Raises
    ------
    RuntimeError
        If the two indices are not identical, including when they differ in length.
    """
    # Index.equals also copes with indices of different length. An element-wise `==` raises a
    # ValueError in that case, before the descriptive RuntimeError below could be produced.
    if not ldf1_pred_y.index.equals(ldf2_pred_y.index):
        raise RuntimeError('Indices of the two data frames do not match! {}'.format(dsbasics.bin.index_compare(ldf1_pred_y, ldf2_pred_y)))
    ldf_cmp = pd.DataFrame(collections.OrderedDict(df1=ldf1_pred_y, df2=ldf2_pred_y), index=ldf1_pred_y.index)
    ldf_cmp['delta'] = np.abs(ldf_cmp.df1 - ldf_cmp.df2)
    return ldf_cmp
tmp_df_cmp = generate_cmp_data_frame(mbn_pred_y, pl2_pred_y)

And as you can see there are 0 differences:

In [None]:
tmp_df_cmp.delta.value_counts()

In [None]:
# tmp_df_cmp.sort_values(['delta'], ascending=False).head()

Now let's use a few score methods. First the `r2_score` integrated into the predictor:

In [None]:
pl2.score(df_.iloc[:,:-1], df_.iloc[:,-1])

Let's recreate this by hand. First we need to sub-select the real values from the input for which we have predictions:

In [None]:
pl2_without_null_values_saleprice = df_.iloc[:,-1].loc[pl2_pred_y.index]
# pl2_without_null_values_saleprice.head()

And then we can call the scoring function:

In [None]:
sklearn.metrics.r2_score(pl2_without_null_values_saleprice, pl2_pred_y)

We can also use other scoring functions of course:

In [None]:
sklearn.metrics.median_absolute_error(pl2_without_null_values_saleprice, pl2_pred_y)

It does not make an awful lot of sense for a regressor, but we can also look at the label probabilities as follows:

In [None]:
pl2.steps[-1][1].base_classifier.predict_proba(ddf_without_null_values[in_vars].loc[[1],:])

We can also access the data-frame of the last transform sequence step used during fitting via: `pl2.steps[-1][1].df` and repeat the call above:

In [None]:
pl2.steps[-1][1].base_classifier.predict_proba(pl2.steps[-1][1].df[in_vars].loc[[1],:])

### sklearn end-to-end pipeline with predicted y values (regression) with the sklearn.naive_bayes.MultinomialNB 

In [None]:
# Final estimator: the sklearn MultinomialNB wrapper inside the classifier-to-regressor helper.
skl_mnby = dsbasics.bin.ClassifierToRegressorHelper(pybnl.bn.SKLearnMultinomialNBWrapper())

it = dsbasics.bin.MetaDataInitTransformer()

# PID and Alley are removed from the input columns.
dc = dsbasics.bin.DropColumnTransformer(['PID', 'Alley'])

# Column role declarations (nominal / ordinal / discrete / continuous) come from lists
# defined earlier in the notebook.
ct = dsbasics.bin.CategoricalTransformer(
    categorical_columns = cat_columns,
    ordered_categorical_columns = tmp_cat_columns_ordinal,
    discrete_columns = discrete_columns,
    continuous_columns = continuous_columns,
    levels_map = tmp_levels_map
)

# presumably maps missing values of the listed columns to an explicit 'NA' level - TODO confirm
nt = dsbasics.bin.NullToNATransformer(null_to_NA_columns = tmp_cat_columns_NA)

# Fixed bin edges for the target variable SalePrice.
pc = dsbasics.bin.PandasCutBinTransformer({'SalePrice': [75000.0, 150000.0, 225000.0, 300000.0]})

# Decision-tree based binning of the listed variables, limited to 3 leaf nodes per tree.
tvbt = dsbasics.bin.TargetVariableDecisionTreeBinTransformer(
    max_leaf_nodes=3,
    binning_variables=target_variable_decision_tree_binning_variables_
)

lt = dsbasics.bin.CategoryLevelsAsStringsTransformer()

fn = dsbasics.bin.FilterNullTransformer()

# The steps run in the listed order; the regressor helper is the last step.
pl3 = sklearn.pipeline.Pipeline(
    memory=None,
    steps=[
        ('init', it),
        ('drop_columns', dc),
        ('ct', ct),
        ('null_to_NA', nt),
        ('target_variable_binning', pc),
        ('decision_tree_discretization', tvbt),
        ('levels_as_strings', lt),
        ('filter_null_rows', fn),
        ('skl_mnby', skl_mnby)
])

# X: every column of df_ except the last one; y: the SalePrice column.
pl3.fit(df_.iloc[:,:-1], df_.SalePrice)
# The predictions must come from pl3 itself, i.e. the pipeline fitted in this cell.
pl3_pred_y = pl3.predict(df_.iloc[:,:-1])

In [None]:
pl3.score(df_.iloc[:,:-1], df_.SalePrice)

### Cross validation

* [Tutorial on MultinomialNB](http://universityofbigdata.net/competition/tutorial/5681717746597888?lang=en)
* [Vectorization, Multinomial Naive Bayes Classifier and Evaluation](https://www.ritchieng.com/machine-learning-multinomial-naive-bayes-vectorization/)
* [Computing cross-validated metrics](http://scikit-learn.org/stable/modules/cross_validation.html)
* [scoring-parameter](http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter)

Above we have looked at scores evaluating our model against the training data itself. A sufficiently flexible model can fit the training data (almost) perfectly but then fail at predicting unseen values. This is where [cross validation](https://en.wikipedia.org/wiki/Cross-validation_(statistics)) comes in. You train the model on a certain sub-set of the overall data and you evaluate its predictions on the remaining, unseen data.

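Below we validate our model on 10 random train/test splits (`sklearn.model_selection.ShuffleSplit`). Note that this is repeated random sub-sampling and not a strict partition into folds as in [10-fold cross-validation](https://en.wikipedia.org/wiki/Cross-validation_(statistics)#k-fold_cross-validation).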

In [None]:
def my_cross_val_score(clf, X, y, cv=sklearn.model_selection.ShuffleSplit(n_splits=10, random_state=0)):
    """Fit ``clf`` on every training split of ``cv`` and score it on the matching test split.

    Parameters
    ----------
    clf : estimator
        Needs ``fit(X, y)`` and ``predict(X)``. ``predict`` has to return an object with an
        ``index`` (e.g. a pandas Series), because the true values are re-aligned to it.
    X : pandas.DataFrame
        Features; rows are selected positionally per split.
    y : pandas.Series
        Target values in the same row order as ``X``.
    cv : splitter
        Object whose ``split(X, y)`` yields positional ``(train, test)`` index arrays.

    Returns
    -------
    pandas.DataFrame
        One row per split (0, 1, ...) with the columns ``median_absolute_error`` and ``r2``.
    """
    score_columns = ['median_absolute_error', 'r2']
    # Collect the rows first and build the frame once, instead of enlarging an empty
    # (object-dtype) frame row by row.
    records = []

    for train, test in tqdm.tqdm(list(cv.split(X, y))):
        clf.fit(X.iloc[train, :], y.iloc[train])
        tmp_pred_y = clf.predict(X.iloc[test, :])

        # Only keep the true values for which a prediction exists; the predictions are
        # re-aligned by index label (the estimator may not return a row for every test row).
        y_test = y.iloc[test].loc[tmp_pred_y.index]

        records.append([
            sklearn.metrics.median_absolute_error(y_test, tmp_pred_y),
            sklearn.metrics.r2_score(y_test, tmp_pred_y),
        ])

    return pd.DataFrame(records, columns=score_columns)

In [None]:
X, y      = df_.iloc[:,:-1], df_.SalePrice
# scores_pl2 = my_cross_val_score(pl2, X, y)

In [None]:
# scores_pl2.describe()

In [None]:
# scores_pl3 = my_cross_val_score(pl3, X, y)

In [None]:
# scores_pl3.describe()

We can see that the [bnlearn implementation](http://www.bnlearn.com/documentation/man/naive.bayes.html) of Naive Bayes performs better than the  [sklearn implementation](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html). This matches our observations from above where we've seen that the scores for the bnlearn implementation when training and evaluating on the full data are better than the sklearn ones.

I still do not understand why, because my expectations would be that Naive Bayes is a standard algorithm and different implementations should just match.

In [None]:
def my_learning_curve(clf, X, y, train_sizes=np.array([ 0.1, 0.33, 0.55, 0.78, 1. ])):
    """Compute train and cross-validation scores of ``clf`` for growing sub-samples of the data.

    Parameters
    ----------
    clf : estimator
        Needs ``fit(X, y)`` and ``predict(X)``; ``predict`` has to return an object with an
        ``index`` so that the true values can be re-aligned to the predictions.
    X : pandas.DataFrame
        Features.
    y : pandas.Series
        Target values in the same row order as ``X``.
    train_sizes : iterable of float
        Sub-sample sizes handed to ``train_test_split`` as ``test_size``; the value ``1.``
        means "use all data".

    Returns
    -------
    pandas.DataFrame
        Indexed by sub-sample size, with the columns ``train_median_absolute_error``,
        ``train_r2``, ``cv_median_absolute_error`` and ``cv_r2``.
    """
    result_columns = ['train_median_absolute_error', 'train_r2', 'cv_median_absolute_error', 'cv_r2']
    # One entry per sub-sample size. A repeated size overwrites its earlier entry, exactly
    # like a repeated `.loc[size] = ...` assignment would.
    rows = collections.OrderedDict()

    for ts in train_sizes:
        if ts == 1.:
            X_, y_ = X.copy(), y.copy()
        else:
            # The "test" part of the split serves as the sub-sample of relative size ts.
            _, X_, _, y_ = sklearn.model_selection.train_test_split(X, y, test_size=ts, random_state=42)

        # Mean score over 3 shuffle splits of the sub-sample.
        cv_scores = my_cross_val_score(
            clf, X_, y_, cv=sklearn.model_selection.ShuffleSplit(n_splits=3, random_state=42)).mean()

        # Training score: fit on the complete sub-sample and predict the very same rows.
        clf.fit(X_, y_)
        clf_predict_y = clf.predict(X_)
        y_subselect = y_.loc[clf_predict_y.index]

        rows[ts] = [
            sklearn.metrics.median_absolute_error(y_subselect, clf_predict_y),
            sklearn.metrics.r2_score(y_subselect, clf_predict_y),
            cv_scores['median_absolute_error'],
            cv_scores['r2'],
        ]

    return pd.DataFrame(list(rows.values()), index=list(rows.keys()), columns=result_columns)

In [None]:
# pl3_learning_curve_df = my_learning_curve(pl3, X, y)

In [None]:
# pl3_learning_curve_df

[UserWarning about columns and attribute while plotting in Jupyter](https://github.com/pandas-dev/pandas/issues/18671)

In [None]:
#  [UserWarning about columns and attribute while plotting in Jupyter](https://github.com/pandas-dev/pandas/issues/18671)
# with warnings.catch_warnings():
#     warnings.simplefilter("ignore", category=UserWarning)
#     pl3_learning_curve_df.plot(y=['train_r2', 'cv_r2'])

## Tree / Random Forest Regression

In [None]:
import sklearn.ensemble

In [None]:
# df.head()

In [None]:
rf_X = df.loc[:,['Lot Area']]
# rf_X = df.loc[:,continuous_columns]
rf_y = df.SalePrice

In [None]:
# SalePrice is a continuous target, so a regressor is fitted here; a classifier would treat
# every distinct price as a separate class. The name `rf_clf` is kept because the following
# cells refer to it. random_state fixes the forest so that a re-run reproduces the scores.
rf_clf = sklearn.ensemble.RandomForestRegressor(n_estimators=10, random_state=0)
rf_clf.fit(rf_X, rf_y)

In [None]:
rf_pred_y = rf_clf.predict(rf_X)

In [None]:
sklearn.metrics.r2_score(rf_y, rf_pred_y)

In [None]:
sklearn.metrics.median_absolute_error(rf_y, rf_pred_y)

In [None]:
# Earlier variant, kept for reference: one cross_val_score call per metric.
# rf_cv_mae_scores = \
#     sklearn.model_selection.cross_val_score(rf_clf, rf_X, rf_y, cv=sklearn.model_selection.ShuffleSplit(n_splits=10, random_state=0), 
#                                             scoring='neg_median_absolute_error')
# rf_cv_r2_scores = \
#     sklearn.model_selection.cross_val_score(rf_clf, rf_X, rf_y, cv=sklearn.model_selection.ShuffleSplit(n_splits=10, random_state=0), 
#                                             scoring='r2')

# Both metrics are requested in a single cross_validate call. The median absolute error is
# requested through its negated ("neg_") scorer name.
scoring = {'r2': 'r2',
           'neg_median_absolute_error': 'neg_median_absolute_error'
          }
# return_train_score=True requests the scores on the training splits in addition to the test splits.
scores = sklearn.model_selection.cross_validate(rf_clf, rf_X, rf_y, scoring=scoring, 
                                                cv=sklearn.model_selection.ShuffleSplit(n_splits=10, random_state=0), 
                                                return_train_score=True)

In [None]:
rf_scores_df0 = pd.DataFrame(scores)
# rf_scores_df0

In [None]:
# Assemble the per-split scores in one go; the "neg_" scorer columns are negated so that the
# frame holds plain (positive) median absolute errors.
rf_scores_df = pd.DataFrame(collections.OrderedDict([
    ('test_median_absolute_error', -rf_scores_df0['test_neg_median_absolute_error']),
    ('train_median_absolute_error', -rf_scores_df0['train_neg_median_absolute_error']),
    ('test_r2', rf_scores_df0['test_r2']),
    ('train_r2', rf_scores_df0['train_r2']),
]))
rf_scores_df