In [318]:
import numpy as np
import pandas as pd

from scipy.stats import wilcoxon

from datasets import binclas_datasets, regr_datasets
from config import dataset_map

In [319]:
# One results file exists per model key; the same keys index every dict below.
model_keys = ('dtc', 'dtr', 'rfc', 'rfr')

splits = {key: pd.read_csv(f'splits_{key}.csv') for key in model_keys}
evaluations = {key: pd.read_csv(f'evaluation_{key}.csv') for key in model_keys}

# Score column reported for each model key.
scores = {
    'dtc': 'auc',
    'rfc': 'auc',
    'dtr': 'r2',
    'rfr': 'r2'
}


def _display_names(datasets):
    """Return a one-column 'name' frame with ids translated through dataset_map.

    Ids that are missing from dataset_map are kept unchanged; the row order
    and index are those of ``datasets``.
    """
    # Element-wise map on the column instead of building a Series per row.
    return datasets['name'].map(lambda name: dataset_map.get(name, name)).to_frame()


binclas_order = _display_names(binclas_datasets)
regr_order = _display_names(regr_datasets)

In [320]:
# The first CSV column is read in as 'Unnamed: 0'; label it 'name' in every frame.
splits = dict(
    (key, frame.rename(columns={'Unnamed: 0': 'name'}))
    for key, frame in splits.items()
)

In [321]:
# Translate the dataset ids through dataset_map; ids without an entry are kept.
for model_key in ('dtc', 'dtr', 'rfc', 'rfr'):
    frame = splits[model_key]
    frame['name'] = frame['name'].apply(lambda name: dataset_map.get(name, name))

In [322]:
# Preview the split counts stored under the 'dtc' key after the name mapping.
splits['dtc']

Unnamed: 0,name,n_lattice_splits,n_splits,n_lattice_splits_kfold,n_splits_kfold
0,appendicitis,0,3,1,283
1,haberman,0,7,14,687
2,new_thyroid1,0,2,3,202
3,glass0,0,5,18,557
4,shuttle-6_vs_2-3,0,1,0,100
5,bupa,2,17,158,1652
6,cleveland-0_vs_4,0,3,4,279
7,ecoli1,0,3,17,300
8,poker-9_vs_7,0,2,0,223
9,monk-2,0,5,0,474


In [323]:
# rate = n_lattice_splits / n_splits, computed for the plain and the k-fold counts.
for key, frame in splits.items():
    frame['rate'] = frame['n_lattice_splits'].div(frame['n_splits'])
    frame['rate_kfold'] = frame['n_lattice_splits_kfold'].div(frame['n_splits_kfold'])

In [324]:
# Keep only the rows whose mode is '<' or '<='; these are the two modes compared below.
evaluations = dict(
    (key, frame.loc[frame['mode'].isin(['<', '<='])])
    for key, frame in evaluations.items()
)

In [325]:
def _fold_score_lists(frame, score):
    """Collapse each (name, mode) group to one row holding its scores as a list.

    The list is ordered by the 'fold' column and stored in a column named
    after ``score``.
    """
    per_group = frame.groupby(['name', 'mode']).apply(
        lambda pdf: pdf.sort_values('fold')[score].values.tolist()
    )
    # reset_index exposes the grouped values under the column label 0.
    return per_group.reset_index(drop=False).rename(columns={0: score})


evaluations = {
    key: _fold_score_lists(frame, scores[key])
    for key, frame in evaluations.items()
}

In [326]:
def evaluate(pdf):
    """Compare the '<=' and '<' score lists of one dataset.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Rows with a 'mode' column and either an 'auc' or an 'r2' column whose
        cells hold paired score sequences (one sequence per mode).

    Returns
    -------
    pandas.Series
        '<score>_diff' (mean '<=' score minus mean '<' score) and 'p', the
        smaller of the two one-sided Wilcoxon signed-rank p-values.
    """
    metric = 'auc' if 'auc' in pdf.columns else 'r2'

    # Only the first row found for each mode is used.
    leq_scores = pdf.loc[pdf['mode'] == '<='].iloc[0][metric]
    lt_scores = pdf.loc[pdf['mode'] == '<'].iloc[0][metric]

    one_sided = [
        wilcoxon(leq_scores, lt_scores, zero_method='zsplit', alternative=side).pvalue
        for side in ('less', 'greater')
    ]
    mean_gap = np.mean(leq_scores) - np.mean(lt_scores)

    # NOTE(review): 'p' is the minimum of two one-sided p-values; confirm that
    # the threshold applied to it downstream is meant for that quantity.
    return pd.Series({f'{metric}_diff': mean_gap, 'p': min(one_sided)})

In [327]:
# One row per dataset: mean score difference and p-value for every model key.
figures = dict(
    (key, frame.groupby('name').apply(evaluate))
    for key, frame in evaluations.items()
)

In [328]:
# Attach the two split rates to the per-dataset figures, matching on the dataset name.
joined = {
    key: (
        figure.reset_index(drop=False)
        .merge(splits[key][['name', 'rate', 'rate_kfold']], on=['name'])
        .set_index('name')
    )
    for key, figure in figures.items()
}

In [329]:
# Show the joined 'dtc' results as text, then the classification dataset order
# as the cell's own output. The former trailing comma built a tuple whose first
# element was print()'s None, which leaked into the output as "(None, ...".
print(joined['dtc'])
binclas_order

                        auc_diff             p      rate  rate_kfold
name                                                                
abalone9_18             0.000000  5.000000e-01  0.000000    0.000000
appendicitis            0.000000  5.000000e-01  0.000000    0.003534
bupa                    0.000105  7.051001e-03  0.117647    0.095642
cleveland-0_vs_4       -0.003139  7.374231e-02  0.000000    0.014337
ecoli1                  0.000000  5.000000e-01  0.000000    0.056667
glass0                  0.000000  5.000000e-01  0.000000    0.032316
haberman               -0.000335  4.971797e-04  0.000000    0.020378
hepatitis              -0.000244  4.734861e-01  0.000000    0.044077
lymphography           -0.003967  1.546562e-01  0.000000    0.000000
mammographic            0.000026  3.769766e-03  0.000000    0.001086
monk-2                  0.000000  5.000000e-01  0.000000    0.000000
new_thyroid1            0.000004  4.119086e-01  0.000000    0.014851
page-blocks-1-3_vs_4    0.000149  

(None,
                       name
 0             appendicitis
 1                 haberman
 2             new_thyroid1
 3                   glass0
 4         shuttle-6_vs_2-3
 5                     bupa
 6         cleveland-0_vs_4
 7                   ecoli1
 8             poker-9_vs_7
 9                   monk-2
 10               hepatitis
 11    yeast-0-3-5-9_vs_7-8
 12            mammographic
 13                 saheart
 14    page-blocks-1-3_vs_4
 15            lymphography
 16                    pima
 17               wisconsin
 18             abalone9_18
 19  winequality-red-3_vs_5)

In [330]:
# Re-order every result frame to follow its dataset ordering frame
# (classification keys use binclas_order, regression keys use regr_order).
for key, order in (
    ('dtc', binclas_order),
    ('rfc', binclas_order),
    ('dtr', regr_order),
    ('rfr', regr_order),
):
    joined[key] = order.merge(joined[key], on=['name']).set_index('name')

In [331]:
# Put a model-family header level above the existing column labels.
for key, model_label in (
    ('dtc', 'Decision Tree'),
    ('rfc', 'Random Forest'),
    ('dtr', 'Decision Tree'),
    ('rfr', 'Random Forest'),
):
    frame = joined[key]
    frame.columns = pd.MultiIndex.from_tuples(
        [(model_label, col) for col in frame.columns]
    )

In [332]:
# Place the two model families side by side per task, aligned on the dataset index.
binclas = (
    joined['dtc']
    .merge(joined['rfc'], left_index=True, right_index=True)
    .reset_index(drop=False)
)
regr = (
    joined['dtr']
    .merge(joined['rfr'], left_index=True, right_index=True)
    .reset_index(drop=False)
)

In [333]:
# Add the task name as the outermost column level.
binclas.columns = pd.MultiIndex.from_tuples(
    [('Classification',) + tuple(col) for col in binclas.columns]
)
regr.columns = pd.MultiIndex.from_tuples(
    [('Regression',) + tuple(col) for col in regr.columns]
)

In [334]:
# Classification columns on the left, regression columns on the right.
result = pd.concat((binclas, regr), axis='columns')

In [335]:
def formatting(row):
    """Render one row of numeric results as the strings shown in the table.

    Parameters
    ----------
    row : pandas.Series
        Values keyed by column label. Labels ending in 'diff', labels starting
        with 'rate' and the label 'p' are formatted; any other label is dropped.

    Returns
    -------
    pandas.Series
        Object-dtype Series of strings, in the order the labels were seen:
        differences in one-digit scientific notation ('0' below 1e-9 in
        magnitude, space-padded when not negative), rates with two decimals
        and no leading zero ('0' when not positive), and '$\\neq$' for
        p-values below 0.05 (empty string otherwise).
    """
    cells = {}
    for label, value in row.items():
        if label.endswith('diff'):
            text = f'{value:.0e}' if abs(value) > 1e-9 else '0'
            # Pad non-negative entries so they line up with the minus signs.
            cells[label] = text if text[0] == '-' else f' {text}'
        elif label.startswith('rate'):
            if value > 0:
                text = f'{value:.2f}'
                # Strip only a leading zero ('0.12' -> '.12'); slicing
                # unconditionally would turn '1.00' into '.00'.
                cells[label] = text[1:] if text.startswith('0') else text
            else:
                cells[label] = '0'
        elif label == 'p':
            cells[label] = '$\\neq$' if value < 0.05 else ''
    # Build the Series once with an explicit dtype: a bare pd.Series() warns on
    # pandas 1.x about the default dtype of an empty Series.
    return pd.Series(cells, dtype=object)

In [336]:
# Inspect the numeric decision-tree classification columns before they are
# converted to display strings.
result[('Classification', 'Decision Tree')]

  result[('Classification', 'Decision Tree')]


Unnamed: 0,auc_diff,p,rate,rate_kfold
0,0.0,0.5,0.0,0.003534
1,-0.000335,0.0004971797,0.0,0.020378
2,4e-06,0.4119086,0.0,0.014851
3,0.0,0.5,0.0,0.032316
4,0.0,0.5,0.0,0.0
5,0.000105,0.007051001,0.117647,0.095642
6,-0.003139,0.07374231,0.0,0.014337
7,0.0,0.5,0.0,0.056667
8,0.0,0.5,0.0,0.0
9,0.0,0.5,0.0,0.0


In [337]:
# Convert every (task, model) column group to its display strings, row by row.
for column_group in (
    ('Classification', 'Decision Tree'),
    ('Classification', 'Random Forest'),
    ('Regression', 'Decision Tree'),
    ('Regression', 'Random Forest'),
):
    result[column_group] = result[column_group].apply(formatting, axis=1)

  result[('Classification', 'Decision Tree')] = result[('Classification', 'Decision Tree')].apply(formatting, axis=1)
  result[('Classification', 'Decision Tree')] = result[('Classification', 'Decision Tree')].apply(formatting, axis=1)
  result[('Classification', 'Random Forest')] = result[('Classification', 'Random Forest')].apply(formatting, axis=1)
  result[('Classification', 'Random Forest')] = result[('Classification', 'Random Forest')].apply(formatting, axis=1)
  result[('Regression', 'Decision Tree')] = result[('Regression', 'Decision Tree')].apply(formatting, axis=1)
  result[('Regression', 'Decision Tree')] = result[('Regression', 'Decision Tree')].apply(formatting, axis=1)
  result[('Regression', 'Random Forest')] = result[('Regression', 'Random Forest')].apply(formatting, axis=1)
  result[('Regression', 'Random Forest')] = result[('Regression', 'Random Forest')].apply(formatting, axis=1)


In [338]:
# Inspect the table after the numeric cells were converted to display strings.
result

Unnamed: 0_level_0,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Regression,Regression,Regression,Regression,Regression,Regression,Regression,Regression,Regression
Unnamed: 0_level_1,name,Decision Tree,Decision Tree,Decision Tree,Decision Tree,Random Forest,Random Forest,Random Forest,Random Forest,name,Decision Tree,Decision Tree,Decision Tree,Decision Tree,Random Forest,Random Forest,Random Forest,Random Forest
Unnamed: 0_level_2,Unnamed: 1_level_2,auc_diff,p,rate,rate_kfold,auc_diff,p,rate,rate_kfold,Unnamed: 10_level_2,r2_diff,p,rate,rate_kfold,r2_diff,p,rate,rate_kfold
0,appendicitis,0.0,,0.0,0.0,-4e-06,,0.02,0.03,diabetes,-7e-05,,0.0,0.02,0.0007,$\neq$,0.0,0.0
1,haberman,-0.0003,$\neq$,0.0,0.02,6e-06,,0.04,0.05,o-ring,-0.004,$\neq$,0.0,0.03,-0.02,$\neq$,0.13,0.1
2,new_thyroid1,4e-06,,0.0,0.01,8e-05,$\neq$,0.07,0.09,stock-portfolio,-0.0001,,0.05,0.03,2e-05,$\neq$,0.02,0.03
3,glass0,0.0,,0.0,0.03,-2e-05,,0.05,0.05,wsn-ale,-0.001,,0.0,0.12,-0.001,$\neq$,0.12,0.11
4,shuttle-6_vs_2-3,0.0,,0.0,0.0,0.0,,0.14,0.13,daily-demand,-0.0003,$\neq$,0.0,0.01,-0.0007,$\neq$,0.03,0.03
5,bupa,0.0001,$\neq$,0.12,0.1,0.002,$\neq$,0.25,0.24,slump_test,0.0005,$\neq$,0.1,0.09,-0.0004,$\neq$,0.18,0.16
6,cleveland-0_vs_4,-0.003,,0.0,0.01,-0.0008,$\neq$,0.09,0.11,servo,-7e-06,,0.0,0.0,-9e-06,$\neq$,0.0,0.0
7,ecoli1,0.0,,0.0,0.06,-7e-05,$\neq$,0.12,0.12,yacht_hydrodynamics,0.0,,0.0,0.0,0.0,,0.0,0.0
8,poker-9_vs_7,0.0,,0.0,0.0,-0.002,$\neq$,0.14,0.16,autoMPG6,-0.0003,$\neq$,0.09,0.05,3e-05,,0.18,0.17
9,monk-2,0.0,,0.0,0.0,0.0,,0.0,0.0,excitation_current,-4e-08,$\neq$,0.1,0.1,2e-07,$\neq$,0.13,0.12


In [339]:
# Replace underscores in the dataset names with hyphens for both tasks.
for task in ('Classification', 'Regression'):
    name_column = (task, 'name', '')
    result[name_column] = result[name_column].map(lambda label: label.replace('_', '-'))

In [340]:
# Inspect the table after underscores in the dataset names were replaced by hyphens.
result

Unnamed: 0_level_0,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Regression,Regression,Regression,Regression,Regression,Regression,Regression,Regression,Regression
Unnamed: 0_level_1,name,Decision Tree,Decision Tree,Decision Tree,Decision Tree,Random Forest,Random Forest,Random Forest,Random Forest,name,Decision Tree,Decision Tree,Decision Tree,Decision Tree,Random Forest,Random Forest,Random Forest,Random Forest
Unnamed: 0_level_2,Unnamed: 1_level_2,auc_diff,p,rate,rate_kfold,auc_diff,p,rate,rate_kfold,Unnamed: 10_level_2,r2_diff,p,rate,rate_kfold,r2_diff,p,rate,rate_kfold
0,appendicitis,0.0,,0.0,0.0,-4e-06,,0.02,0.03,diabetes,-7e-05,,0.0,0.02,0.0007,$\neq$,0.0,0.0
1,haberman,-0.0003,$\neq$,0.0,0.02,6e-06,,0.04,0.05,o-ring,-0.004,$\neq$,0.0,0.03,-0.02,$\neq$,0.13,0.1
2,new-thyroid1,4e-06,,0.0,0.01,8e-05,$\neq$,0.07,0.09,stock-portfolio,-0.0001,,0.05,0.03,2e-05,$\neq$,0.02,0.03
3,glass0,0.0,,0.0,0.03,-2e-05,,0.05,0.05,wsn-ale,-0.001,,0.0,0.12,-0.001,$\neq$,0.12,0.11
4,shuttle-6-vs-2-3,0.0,,0.0,0.0,0.0,,0.14,0.13,daily-demand,-0.0003,$\neq$,0.0,0.01,-0.0007,$\neq$,0.03,0.03
5,bupa,0.0001,$\neq$,0.12,0.1,0.002,$\neq$,0.25,0.24,slump-test,0.0005,$\neq$,0.1,0.09,-0.0004,$\neq$,0.18,0.16
6,cleveland-0-vs-4,-0.003,,0.0,0.01,-0.0008,$\neq$,0.09,0.11,servo,-7e-06,,0.0,0.0,-9e-06,$\neq$,0.0,0.0
7,ecoli1,0.0,,0.0,0.06,-7e-05,$\neq$,0.12,0.12,yacht-hydrodynamics,0.0,,0.0,0.0,0.0,,0.0,0.0
8,poker-9-vs-7,0.0,,0.0,0.0,-0.002,$\neq$,0.14,0.16,autoMPG6,-0.0003,$\neq$,0.09,0.05,3e-05,,0.18,0.17
9,monk-2,0.0,,0.0,0.0,0.0,,0.0,0.0,excitation-current,-4e-08,$\neq$,0.1,0.1,2e-07,$\neq$,0.13,0.12


In [341]:
def rename_columns(col):
    """Translate an internal column label into its LaTeX header.

    Labels without a LaTeX counterpart are returned unchanged.
    """
    latex_headers = {
        'rate': '$\\rho$',
        'rate_kfold': '$\\rho_{k}$',
        'auc_diff': 'auc$_{d}$',
        'r2_diff': 'r$^2_{d}$',
    }
    return latex_headers.get(col, col)

In [342]:
# Only the innermost level carries metric labels, so only that level is renamed.
result.columns = pd.MultiIndex.from_tuples(
    [(task, model, rename_columns(metric)) for task, model, metric in result.columns]
)

In [343]:
# Inspect the table with the LaTeX column labels applied.
result

Unnamed: 0_level_0,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Regression,Regression,Regression,Regression,Regression,Regression,Regression,Regression,Regression
Unnamed: 0_level_1,name,Decision Tree,Decision Tree,Decision Tree,Decision Tree,Random Forest,Random Forest,Random Forest,Random Forest,name,Decision Tree,Decision Tree,Decision Tree,Decision Tree,Random Forest,Random Forest,Random Forest,Random Forest
Unnamed: 0_level_2,Unnamed: 1_level_2,auc$_{d}$,p,$\rho$,$\rho_{k}$,auc$_{d}$,p,$\rho$,$\rho_{k}$,Unnamed: 10_level_2,r$^2_{d}$,p,$\rho$,$\rho_{k}$,r$^2_{d}$,p,$\rho$,$\rho_{k}$
0,appendicitis,0.0,,0.0,0.0,-4e-06,,0.02,0.03,diabetes,-7e-05,,0.0,0.02,0.0007,$\neq$,0.0,0.0
1,haberman,-0.0003,$\neq$,0.0,0.02,6e-06,,0.04,0.05,o-ring,-0.004,$\neq$,0.0,0.03,-0.02,$\neq$,0.13,0.1
2,new-thyroid1,4e-06,,0.0,0.01,8e-05,$\neq$,0.07,0.09,stock-portfolio,-0.0001,,0.05,0.03,2e-05,$\neq$,0.02,0.03
3,glass0,0.0,,0.0,0.03,-2e-05,,0.05,0.05,wsn-ale,-0.001,,0.0,0.12,-0.001,$\neq$,0.12,0.11
4,shuttle-6-vs-2-3,0.0,,0.0,0.0,0.0,,0.14,0.13,daily-demand,-0.0003,$\neq$,0.0,0.01,-0.0007,$\neq$,0.03,0.03
5,bupa,0.0001,$\neq$,0.12,0.1,0.002,$\neq$,0.25,0.24,slump-test,0.0005,$\neq$,0.1,0.09,-0.0004,$\neq$,0.18,0.16
6,cleveland-0-vs-4,-0.003,,0.0,0.01,-0.0008,$\neq$,0.09,0.11,servo,-7e-06,,0.0,0.0,-9e-06,$\neq$,0.0,0.0
7,ecoli1,0.0,,0.0,0.06,-7e-05,$\neq$,0.12,0.12,yacht-hydrodynamics,0.0,,0.0,0.0,0.0,,0.0,0.0
8,poker-9-vs-7,0.0,,0.0,0.0,-0.002,$\neq$,0.14,0.16,autoMPG6,-0.0003,$\neq$,0.09,0.05,3e-05,,0.18,0.17
9,monk-2,0.0,,0.0,0.0,0.0,,0.0,0.0,excitation-current,-4e-08,$\neq$,0.1,0.1,2e-07,$\neq$,0.13,0.12


In [344]:
# Export without the row index; grouped headers are centred over their columns.
latex = result.to_latex(multicolumn_format='c', index=False)

In [345]:
# The column spec follows '\begin{tabular}{' and is read here as one
# alignment letter per table column.
tabular_prefix = '\\begin{tabular}{'
tabular_string = latex[len(tabular_prefix): len(tabular_prefix) + len(result.columns)]
print(tabular_string)

# Hand-written alignment, one letter per column of the table.
tabular_string_new = 'lrlrrrlrrlrrrlrrrl'
assert len(tabular_string_new) == len(result.columns), \
    'the column spec needs exactly one alignment letter per table column'

# Raw string: '\h' is not a valid escape sequence in a normal string literal.
updated = r'@{\hspace{4pt}}'.join(tabular_string_new)
# Replace only the first match so just the preamble's column spec is touched.
latex = latex.replace(tabular_string, updated, 1)

llllllllllllllllll


In [346]:
# Print the LaTeX source for a visual check before it is written to disk.
print(latex)

\begin{tabular}{l@{\hspace{4pt}}r@{\hspace{4pt}}l@{\hspace{4pt}}r@{\hspace{4pt}}r@{\hspace{4pt}}r@{\hspace{4pt}}l@{\hspace{4pt}}r@{\hspace{4pt}}r@{\hspace{4pt}}l@{\hspace{4pt}}r@{\hspace{4pt}}r@{\hspace{4pt}}r@{\hspace{4pt}}l@{\hspace{4pt}}r@{\hspace{4pt}}r@{\hspace{4pt}}r@{\hspace{4pt}}l}
\toprule
\multicolumn{9}{c}{Classification} & \multicolumn{9}{c}{Regression} \\
name & \multicolumn{4}{c}{Decision Tree} & \multicolumn{4}{c}{Random Forest} & name & \multicolumn{4}{c}{Decision Tree} & \multicolumn{4}{c}{Random Forest} \\
 & auc$_{d}$ & p & $\rho$ & $\rho_{k}$ & auc$_{d}$ & p & $\rho$ & $\rho_{k}$ &  & r$^2_{d}$ & p & $\rho$ & $\rho_{k}$ & r$^2_{d}$ & p & $\rho$ & $\rho_{k}$ \\
\midrule
appendicitis &  0 &  & 0 & .00 & -4e-06 &  & .02 & .03 & diabetes & -7e-05 &  & 0 & .02 &  7e-04 & $\neq$ & 0 & .00 \\
haberman & -3e-04 & $\neq$ & 0 & .02 &  6e-06 &  & .04 & .05 & o-ring & -4e-03 & $\neq$ & 0 & .03 & -2e-02 & $\neq$ & .13 & .10 \\
new-thyroid1 &  4e-06 &  & 0 & .01 &  8e-05 & $\neq$

In [347]:
# Write the table with a fixed encoding so the file does not depend on the
# platform's default text encoding.
with open('tab_presence.tex', 'wt', encoding='utf-8') as handle:
    handle.write(latex)