In [278]:
import numpy as np
import pandas as pd

from scipy.stats import wilcoxon, ranksums, mannwhitneyu

from datasets import binclas_datasets, regr_datasets
from config import dataset_map

In [279]:
postfix = ''

In [280]:
splits = {
    'dtc': pd.read_csv('splits_dtc.csv'),
    'dtr': pd.read_csv('splits_dtr.csv'),
    'rfc': pd.read_csv('splits_rfc.csv'),
    'rfr': pd.read_csv('splits_rfr.csv')
}
evaluations = {
    'dtc': pd.read_csv(f'evaluation_dtc{postfix}.csv'),
    'dtr': pd.read_csv(f'evaluation_dtr{postfix}.csv'),
    'rfc': pd.read_csv(f'evaluation_rfc{postfix}.csv'),
    'rfr': pd.read_csv(f'evaluation_rfr{postfix}.csv')
}

scores = {
    'dtc': 'auc',
    'rfc': 'auc',
    'dtr': 'r2',
    'rfr': 'r2'
}

binclas_order = binclas_datasets[['name']]\
    .apply(lambda row: pd.Series({'name': dataset_map.get(row['name'], row['name'])}), axis=1)
regr_order = regr_datasets[['name']]\
    .apply(lambda row: pd.Series({'name': dataset_map.get(row['name'], row['name'])}), axis=1)

In [281]:
splits = {
    key: value.rename(columns={'Unnamed: 0': 'name'})
    for key, value in splits.items()
}

In [282]:
splits['dtc']['name'] = splits['dtc']['name'].apply(lambda name: dataset_map.get(name, name))
splits['dtr']['name'] = splits['dtr']['name'].apply(lambda name: dataset_map.get(name, name))
splits['rfc']['name'] = splits['rfc']['name'].apply(lambda name: dataset_map.get(name, name))
splits['rfr']['name'] = splits['rfr']['name'].apply(lambda name: dataset_map.get(name, name))

In [283]:
splits['dtc']

Unnamed: 0,name,n_lattice_splits,n_splits,n_lattice_splits_kfold,n_splits_kfold
0,appendicitis,0,3,1,283
1,haberman,0,7,14,687
2,new_thyroid1,0,2,3,202
3,glass0,0,5,18,557
4,shuttle-6_vs_2-3,0,1,0,100
5,bupa,2,17,158,1652
6,cleveland-0_vs_4,0,3,4,279
7,ecoli1,0,3,17,300
8,poker-9_vs_7,0,2,0,223
9,monk-2,0,5,0,474


In [284]:
for key in splits:
    splits[key]['rate'] = splits[key]['n_lattice_splits'] / splits[key]['n_splits']
    splits[key]['rate_kfold'] = splits[key]['n_lattice_splits_kfold'] / splits[key]['n_splits_kfold']

In [285]:
evaluations = {
    key: value[value['mode'].isin(['<', '<='])]
    for key, value in evaluations.items()
}

In [286]:
evaluations = {
    key: value.groupby(['name', 'mode'])\
            .apply(lambda pdf: pdf.sort_values('fold')[scores[key]].values.tolist())\
            .reset_index(drop=False)\
            .rename(columns={0: scores[key]})
    for key, value in evaluations.items()
}

In [287]:
def evaluate(pdf):
    score = 'auc' if 'auc' in pdf.columns else 'r2'
    leq = pdf[pdf['mode'] == '<='].iloc[0]
    l = pdf[pdf['mode'] == '<'].iloc[0]

    p_less = wilcoxon(leq[score], l[score], alternative='less', zero_method='zsplit').pvalue
    p_gr = wilcoxon(leq[score], l[score], alternative='greater', zero_method='zsplit').pvalue

    return pd.Series({
        f'{score}_diff': np.mean(leq[score]) - np.mean(l[score]),
        'p': min(p_less, p_gr)
    })

In [288]:
figures = {
    key: value.groupby('name').apply(evaluate)
    for key, value in evaluations.items()
}

In [289]:
joined = {
    key: pd.merge(value.reset_index(drop=False), splits[key][['name', 'rate', 'rate_kfold']], on=['name'])\
            .set_index('name')
    for key, value in figures.items()
}

In [290]:
joined['dtc'] = pd.merge(binclas_order, joined['dtc'], on=['name']).set_index('name')
joined['rfc'] = pd.merge(binclas_order, joined['rfc'], on=['name']).set_index('name')
joined['dtr'] = pd.merge(regr_order, joined['dtr'], on=['name']).set_index('name')
joined['rfr'] = pd.merge(regr_order, joined['rfr'], on=['name']).set_index('name')

In [291]:
joined['dtc'] = joined['dtc'][['rate', 'rate_kfold', 'auc_diff', 'p']]
joined['rfc'] = joined['rfc'][['rate', 'rate_kfold', 'auc_diff', 'p']]
joined['dtr'] = joined['dtr'][['rate', 'rate_kfold', 'r2_diff', 'p']]
joined['rfr'] = joined['rfr'][['rate', 'rate_kfold', 'r2_diff', 'p']]

In [292]:
joined['dtc'].columns = pd.MultiIndex.from_tuples(('Decision Tree', col) for col in joined['dtc'].columns)
joined['rfc'].columns = pd.MultiIndex.from_tuples(('Random Forest', col) for col in joined['rfc'].columns)
joined['dtr'].columns = pd.MultiIndex.from_tuples(('Decision Tree', col) for col in joined['dtr'].columns)
joined['rfr'].columns = pd.MultiIndex.from_tuples(('Random Forest', col) for col in joined['rfr'].columns)

In [293]:
binclas = pd.merge(joined['dtc'], joined['rfc'], left_index=True, right_index=True).reset_index(drop=False)
regr = pd.merge(joined['dtr'], joined['rfr'], left_index=True, right_index=True).reset_index(drop=False)

In [294]:
binclas.columns = pd.MultiIndex.from_tuples(('Classification', *col) for col in binclas.columns)
regr.columns = pd.MultiIndex.from_tuples(('Regression', *col) for col in regr.columns)

In [295]:
result = pd.concat([binclas, regr], axis=1)

In [296]:
def formatting(row):
    res = pd.Series()
    for label, value in row.items():
        if label.endswith('diff'):
            string = f'{value:.0e}' if abs(value) > 1e-9 else '0'
            if string[0] != '-':
                string = f' {string}'
            res[label] = string
        elif label.startswith('rate'):
            res[label] = f'{value:.2f}'[1:] if value > 0 else '0'
        elif label == 'p':
            res[label] = '$\\ast$' if value < 0.05 else ''
    return res

In [297]:
result[('Classification', 'Decision Tree')]

  result[('Classification', 'Decision Tree')]


Unnamed: 0,rate,rate_kfold,auc_diff,p
0,0.0,0.003534,0.0,0.5
1,0.0,0.020378,-0.000335,0.0004971797
2,0.0,0.014851,4e-06,0.4119086
3,0.0,0.032316,0.0,0.5
4,0.0,0.0,0.0,0.5
5,0.117647,0.095642,0.000105,0.007051001
6,0.0,0.014337,-0.003139,0.07374231
7,0.0,0.056667,0.0,0.5
8,0.0,0.0,0.0,0.5
9,0.0,0.0,0.0,0.5


In [298]:
result[('Classification', 'Decision Tree')] = result[('Classification', 'Decision Tree')].apply(formatting, axis=1)
result[('Classification', 'Random Forest')] = result[('Classification', 'Random Forest')].apply(formatting, axis=1)
result[('Regression', 'Decision Tree')] = result[('Regression', 'Decision Tree')].apply(formatting, axis=1)
result[('Regression', 'Random Forest')] = result[('Regression', 'Random Forest')].apply(formatting, axis=1)

  result[('Classification', 'Decision Tree')] = result[('Classification', 'Decision Tree')].apply(formatting, axis=1)
  result[('Classification', 'Decision Tree')] = result[('Classification', 'Decision Tree')].apply(formatting, axis=1)
  result[('Classification', 'Random Forest')] = result[('Classification', 'Random Forest')].apply(formatting, axis=1)
  result[('Classification', 'Random Forest')] = result[('Classification', 'Random Forest')].apply(formatting, axis=1)
  result[('Regression', 'Decision Tree')] = result[('Regression', 'Decision Tree')].apply(formatting, axis=1)
  result[('Regression', 'Decision Tree')] = result[('Regression', 'Decision Tree')].apply(formatting, axis=1)
  result[('Regression', 'Random Forest')] = result[('Regression', 'Random Forest')].apply(formatting, axis=1)
  result[('Regression', 'Random Forest')] = result[('Regression', 'Random Forest')].apply(formatting, axis=1)


In [299]:
result

Unnamed: 0_level_0,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Regression,Regression,Regression,Regression,Regression,Regression,Regression,Regression,Regression
Unnamed: 0_level_1,name,Decision Tree,Decision Tree,Decision Tree,Decision Tree,Random Forest,Random Forest,Random Forest,Random Forest,name,Decision Tree,Decision Tree,Decision Tree,Decision Tree,Random Forest,Random Forest,Random Forest,Random Forest
Unnamed: 0_level_2,Unnamed: 1_level_2,rate,rate_kfold,auc_diff,p,rate,rate_kfold,auc_diff,p,Unnamed: 10_level_2,rate,rate_kfold,r2_diff,p,rate,rate_kfold,r2_diff,p
0,appendicitis,0.0,0.0,0.0,,0.02,0.03,-4e-06,,diabetes,0.0,0.02,-7e-05,,0.0,0.0,0.0007,$\ast$
1,haberman,0.0,0.02,-0.0003,$\ast$,0.04,0.05,6e-06,,o-ring,0.0,0.03,-0.004,$\ast$,0.13,0.1,-0.02,$\ast$
2,new_thyroid1,0.0,0.01,4e-06,,0.07,0.09,8e-05,$\ast$,stock-portfolio,0.05,0.03,-0.0001,,0.02,0.03,2e-05,$\ast$
3,glass0,0.0,0.03,0.0,,0.05,0.05,-2e-05,,wsn-ale,0.0,0.12,-0.001,,0.12,0.11,-0.001,$\ast$
4,shuttle-6_vs_2-3,0.0,0.0,0.0,,0.14,0.13,0.0,,daily-demand,0.0,0.01,-0.0003,$\ast$,0.03,0.03,-0.0007,$\ast$
5,bupa,0.12,0.1,0.0001,$\ast$,0.25,0.24,0.002,$\ast$,slump_test,0.1,0.09,0.0005,$\ast$,0.18,0.16,-0.0004,$\ast$
6,cleveland-0_vs_4,0.0,0.01,-0.003,,0.09,0.11,-0.0008,$\ast$,servo,0.0,0.0,-7e-06,,0.0,0.0,-9e-06,$\ast$
7,ecoli1,0.0,0.06,0.0,,0.12,0.12,-7e-05,$\ast$,yacht_hydrodynamics,0.0,0.0,0.0,,0.0,0.0,0.0,
8,poker-9_vs_7,0.0,0.0,0.0,,0.14,0.16,-0.002,$\ast$,autoMPG6,0.09,0.05,-0.0003,$\ast$,0.18,0.17,3e-05,
9,monk-2,0.0,0.0,0.0,,0.0,0.0,0.0,,excitation_current,0.1,0.1,-4e-08,$\ast$,0.13,0.12,2e-07,$\ast$


In [300]:
result[('Classification', 'name', '')] = result[('Classification', 'name', '')].apply(lambda x: x.replace('_', '-'))
result[('Regression', 'name', '')] = result[('Regression', 'name', '')].apply(lambda x: x.replace('_', '-'))

In [301]:
result

Unnamed: 0_level_0,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Regression,Regression,Regression,Regression,Regression,Regression,Regression,Regression,Regression
Unnamed: 0_level_1,name,Decision Tree,Decision Tree,Decision Tree,Decision Tree,Random Forest,Random Forest,Random Forest,Random Forest,name,Decision Tree,Decision Tree,Decision Tree,Decision Tree,Random Forest,Random Forest,Random Forest,Random Forest
Unnamed: 0_level_2,Unnamed: 1_level_2,rate,rate_kfold,auc_diff,p,rate,rate_kfold,auc_diff,p,Unnamed: 10_level_2,rate,rate_kfold,r2_diff,p,rate,rate_kfold,r2_diff,p
0,appendicitis,0.0,0.0,0.0,,0.02,0.03,-4e-06,,diabetes,0.0,0.02,-7e-05,,0.0,0.0,0.0007,$\ast$
1,haberman,0.0,0.02,-0.0003,$\ast$,0.04,0.05,6e-06,,o-ring,0.0,0.03,-0.004,$\ast$,0.13,0.1,-0.02,$\ast$
2,new-thyroid1,0.0,0.01,4e-06,,0.07,0.09,8e-05,$\ast$,stock-portfolio,0.05,0.03,-0.0001,,0.02,0.03,2e-05,$\ast$
3,glass0,0.0,0.03,0.0,,0.05,0.05,-2e-05,,wsn-ale,0.0,0.12,-0.001,,0.12,0.11,-0.001,$\ast$
4,shuttle-6-vs-2-3,0.0,0.0,0.0,,0.14,0.13,0.0,,daily-demand,0.0,0.01,-0.0003,$\ast$,0.03,0.03,-0.0007,$\ast$
5,bupa,0.12,0.1,0.0001,$\ast$,0.25,0.24,0.002,$\ast$,slump-test,0.1,0.09,0.0005,$\ast$,0.18,0.16,-0.0004,$\ast$
6,cleveland-0-vs-4,0.0,0.01,-0.003,,0.09,0.11,-0.0008,$\ast$,servo,0.0,0.0,-7e-06,,0.0,0.0,-9e-06,$\ast$
7,ecoli1,0.0,0.06,0.0,,0.12,0.12,-7e-05,$\ast$,yacht-hydrodynamics,0.0,0.0,0.0,,0.0,0.0,0.0,
8,poker-9-vs-7,0.0,0.0,0.0,,0.14,0.16,-0.002,$\ast$,autoMPG6,0.09,0.05,-0.0003,$\ast$,0.18,0.17,3e-05,
9,monk-2,0.0,0.0,0.0,,0.0,0.0,0.0,,excitation-current,0.1,0.1,-4e-08,$\ast$,0.13,0.12,2e-07,$\ast$


In [302]:
def rename_columns(col):
    if col == 'rate':
        return '$\\rho$'
    elif col == 'rate_kfold':
        return '$\\rho_{k}$'
    elif col == 'auc_diff':
        return 'auc$_{d}$'
    elif col == 'r2_diff':
        return 'r$^2_{d}$'
    elif col == 'p':
        return 'p$_{\\neq}$'
    else:
        return col

In [303]:
result.columns = pd.MultiIndex.from_tuples([(col[0], col[1], rename_columns(col[2]))
                                            for col in result.columns])

In [304]:
result

Unnamed: 0_level_0,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Regression,Regression,Regression,Regression,Regression,Regression,Regression,Regression,Regression
Unnamed: 0_level_1,name,Decision Tree,Decision Tree,Decision Tree,Decision Tree,Random Forest,Random Forest,Random Forest,Random Forest,name,Decision Tree,Decision Tree,Decision Tree,Decision Tree,Random Forest,Random Forest,Random Forest,Random Forest
Unnamed: 0_level_2,Unnamed: 1_level_2,$\rho$,$\rho_{k}$,auc$_{d}$,p$_{\neq}$,$\rho$,$\rho_{k}$,auc$_{d}$,p$_{\neq}$,Unnamed: 10_level_2,$\rho$,$\rho_{k}$,r$^2_{d}$,p$_{\neq}$,$\rho$,$\rho_{k}$,r$^2_{d}$,p$_{\neq}$
0,appendicitis,0.0,0.0,0.0,,0.02,0.03,-4e-06,,diabetes,0.0,0.02,-7e-05,,0.0,0.0,0.0007,$\ast$
1,haberman,0.0,0.02,-0.0003,$\ast$,0.04,0.05,6e-06,,o-ring,0.0,0.03,-0.004,$\ast$,0.13,0.1,-0.02,$\ast$
2,new-thyroid1,0.0,0.01,4e-06,,0.07,0.09,8e-05,$\ast$,stock-portfolio,0.05,0.03,-0.0001,,0.02,0.03,2e-05,$\ast$
3,glass0,0.0,0.03,0.0,,0.05,0.05,-2e-05,,wsn-ale,0.0,0.12,-0.001,,0.12,0.11,-0.001,$\ast$
4,shuttle-6-vs-2-3,0.0,0.0,0.0,,0.14,0.13,0.0,,daily-demand,0.0,0.01,-0.0003,$\ast$,0.03,0.03,-0.0007,$\ast$
5,bupa,0.12,0.1,0.0001,$\ast$,0.25,0.24,0.002,$\ast$,slump-test,0.1,0.09,0.0005,$\ast$,0.18,0.16,-0.0004,$\ast$
6,cleveland-0-vs-4,0.0,0.01,-0.003,,0.09,0.11,-0.0008,$\ast$,servo,0.0,0.0,-7e-06,,0.0,0.0,-9e-06,$\ast$
7,ecoli1,0.0,0.06,0.0,,0.12,0.12,-7e-05,$\ast$,yacht-hydrodynamics,0.0,0.0,0.0,,0.0,0.0,0.0,
8,poker-9-vs-7,0.0,0.0,0.0,,0.14,0.16,-0.002,$\ast$,autoMPG6,0.09,0.05,-0.0003,$\ast$,0.18,0.17,3e-05,
9,monk-2,0.0,0.0,0.0,,0.0,0.0,0.0,,excitation-current,0.1,0.1,-4e-08,$\ast$,0.13,0.12,2e-07,$\ast$


In [305]:
latex = result.to_latex(
    index=False,
    multicolumn_format='c'
)

In [306]:
tabular_string = latex[len('\\begin{tabular}{'): len('\\begin{tabular}{') + len(result.columns)]
print(tabular_string)
tabular_string_new = 'lrrrcrrrclrrrcrrrc'
updated = '@{\hspace{4pt}}'.join(tabular_string_new)
latex = latex.replace(tabular_string, updated).replace('name', 'dataset')

llllllllllllllllll


In [307]:
print(latex)

\begin{tabular}{l@{\hspace{4pt}}r@{\hspace{4pt}}r@{\hspace{4pt}}r@{\hspace{4pt}}c@{\hspace{4pt}}r@{\hspace{4pt}}r@{\hspace{4pt}}r@{\hspace{4pt}}c@{\hspace{4pt}}l@{\hspace{4pt}}r@{\hspace{4pt}}r@{\hspace{4pt}}r@{\hspace{4pt}}c@{\hspace{4pt}}r@{\hspace{4pt}}r@{\hspace{4pt}}r@{\hspace{4pt}}c}
\toprule
\multicolumn{9}{c}{Classification} & \multicolumn{9}{c}{Regression} \\
dataset & \multicolumn{4}{c}{Decision Tree} & \multicolumn{4}{c}{Random Forest} & dataset & \multicolumn{4}{c}{Decision Tree} & \multicolumn{4}{c}{Random Forest} \\
 & $\rho$ & $\rho_{k}$ & auc$_{d}$ & p$_{\neq}$ & $\rho$ & $\rho_{k}$ & auc$_{d}$ & p$_{\neq}$ &  & $\rho$ & $\rho_{k}$ & r$^2_{d}$ & p$_{\neq}$ & $\rho$ & $\rho_{k}$ & r$^2_{d}$ & p$_{\neq}$ \\
\midrule
appendicitis & 0 & .00 &  0 &  & .02 & .03 & -4e-06 &  & diabetes & 0 & .02 & -7e-05 &  & 0 & .00 &  7e-04 & $\ast$ \\
haberman & 0 & .02 & -3e-04 & $\ast$ & .04 & .05 &  6e-06 &  & o-ring & 0 & .03 & -4e-03 & $\ast$ & .13 & .10 & -2e-02 & $\ast$ \\
new-thyroi

In [308]:
with open('tab_presence.tex', 'wt') as file:
    file.write(latex)