In [74]:
import logging

import numpy as np
import pandas as pd

from datasets import binclas_datasets, regr_datasets

from config import dataset_map

In [91]:
# Tuned-parameter tables, one CSV per model code. The 'dtc'/'rfc' frames
# are later merged into the classification table and 'dtr'/'rfr' into the
# regression table (presumably decision tree / random forest -- confirm).
best_params = dict(
    dtc=pd.read_csv('params_dtc.csv'),
    dtr=pd.read_csv('params_dtr.csv'),
    rfc=pd.read_csv('params_rfc.csv'),
    rfr=pd.read_csv('params_rfr.csv'),
)

In [76]:
# Translate dataset identifiers through `dataset_map`; an identifier with
# no entry in the map is kept exactly as it is.
binclas_datasets['name'] = [dataset_map.get(raw, raw) for raw in binclas_datasets['name']]
regr_datasets['name'] = [dataset_map.get(raw, raw) for raw in regr_datasets['name']]

In [77]:
# Build the table label for every dataset: "<name> \cite{<citation_key>}".
binclas_datasets['name_key'] = [
    f'{name} \\cite{{{key}}}'
    for name, key in zip(binclas_datasets['name'], binclas_datasets['citation_key'])
]
regr_datasets['name_key'] = [
    f'{name} \\cite{{{key}}}'
    for name, key in zip(regr_datasets['name'], regr_datasets['citation_key'])
]

In [78]:
# Join the two parameter tables of each task on the dataset name. Columns
# coming from the 'dt*' table get the suffix `_t`, those from the 'rf*'
# table the suffix `_f`, so both can sit side by side in one row.
binclas_params = (
    best_params['dtc']
    .rename(columns={'params': 'par_t', 'auc': 'auc_t'})
    .merge(
        best_params['rfc'].rename(columns={'params': 'par_f', 'auc': 'auc_f'}),
        on=['name'],
    )
)
regr_params = (
    best_params['dtr']
    .rename(columns={'params': 'par_t', 'r2': 'r2_t'})
    .merge(
        best_params['rfr'].rename(columns={'params': 'par_f', 'r2': 'r2_f'}),
        on=['name'],
    )
)

In [79]:
# Attach the tuned parameters and scores to the dataset descriptions.
# The merge uses pandas' default join, so only datasets whose name occurs
# in both frames survive.
# NOTE: this cell rebinds its own inputs; running it twice merges twice.
binclas_datasets = binclas_datasets.merge(binclas_params, on=['name'])
regr_datasets = regr_datasets.merge(regr_params, on=['name'])

In [80]:
def _format_param(raw, n, fraction):
    """Turn one stringified parameter dict into its LaTeX table label.

    Args:
        raw: Python-literal text of a dict, as stored in the params CSVs.
        n: value of the row's ``n`` column.
        fraction: factor applied to ``min_samples_leaf * n`` before flooring.

    Returns:
        ``'$\\alpha$: <count>'`` when the dict has ``min_samples_leaf``,
        ``'$\\beta$: <depth>'`` when it has a non-None ``max_depth``,
        ``''`` when ``max_depth`` is None, and ``raw`` unchanged otherwise.
    """
    # NOTE(review): eval() executes whatever the CSV cell contains. That is
    # only acceptable while the params files are trusted local artefacts;
    # consider ast.literal_eval if the cells are always plain literals.
    par = eval(raw)
    if 'min_samples_leaf' in par:
        # The stored value is used as a fraction of the sample count here;
        # floor() turns the scaled product into an absolute integer count.
        val = int(np.floor(par['min_samples_leaf'] * n * fraction))
        return f'$\\alpha$: {val}'
    if 'max_depth' in par:
        depth = par['max_depth']
        return '' if depth is None else f'$\\beta$: {depth}'
    # Neither known key present: keep the original text.
    return raw


def clean_params(row, fraction=0.8):
    """Replace the raw ``par_t`` / ``par_f`` strings of a row by LaTeX labels.

    Intended for ``DataFrame.apply(clean_params, axis=1)``.

    Args:
        row: mapping-like row providing ``par_t``, ``par_f`` and ``n``.
        fraction: scaling applied to ``n`` when converting a
            ``min_samples_leaf`` fraction into a count. Defaults to 0.8,
            the constant previously hard-coded (presumably the training
            share of the data -- TODO confirm).

    Returns:
        The same ``row`` object with ``par_t`` and ``par_f`` rewritten.
    """
    row['par_t'] = _format_param(row['par_t'], row['n'], fraction)
    row['par_f'] = _format_param(row['par_f'], row['n'], fraction)
    return row

In [81]:
# Rewrite the raw parameter strings of every row into LaTeX labels.
# NOTE: not re-runnable on its own output -- clean_params expects the
# original dict text in 'par_t' / 'par_f'.
binclas_datasets = binclas_datasets.apply(clean_params, axis='columns')
regr_datasets = regr_datasets.apply(clean_params, axis='columns')

In [82]:
# Pick the columns that go into the published table, in display order,
# and give them their LaTeX header text.
binclas = (
    binclas_datasets
    .loc[:, ['name_key', 'n', 'n_col', 'n_grid', 'n_minority', 'par_t', 'auc_t', 'par_f', 'auc_f']]
    .rename(columns={
        'name_key': 'name',
        'n': 'N',
        'n_col': '$N_a$', 'n_grid': '$N_l$', 'n_minority': '$N_m$',
        'par_t': 'par$_t$', 'auc_t': 'auc$_t$',
        'par_f': 'par$_f$', 'auc_f': 'auc$_f$',
    })
)

# Same layout for regression; there is no minority-count column here and
# the score columns hold r2 instead of auc.
regr = (
    regr_datasets
    .loc[:, ['name_key', 'n', 'n_col', 'n_grid', 'par_t', 'r2_t', 'par_f', 'r2_f']]
    .rename(columns={
        'name_key': 'name',
        'n': 'N',
        'n_col': '$N_a$', 'n_grid': '$N_l$',
        'par_t': 'par$_t$', 'r2_t': 'r$^2_t$',
        'par_f': 'par$_f$', 'r2_f': 'r$^2_f$',
    })
)

In [83]:
def _hyphenate_name(label):
    """Replace underscores by hyphens in the dataset-name part of a label.

    Labels look like ``'<name> \\cite{<key>}'``. Everything from the first
    ``' \\cite{'`` onwards is passed through untouched, so a citation key
    that contains an underscore still matches its bibliography entry.
    A label without a citation part is hyphenated as a whole.
    """
    name, sep, cite = label.partition(' \\cite{')
    return name.replace('_', '-') + sep + cite


# Underscores are removed from the visible dataset names only; the
# citation keys must stay byte-identical to the bibliography.
binclas['name'] = binclas['name'].map(_hyphenate_name)
regr['name'] = regr['name'].map(_hyphenate_name)

In [84]:
# Put every column under a task-level header so the two tables remain
# distinguishable once they are concatenated side by side.
binclas.columns = pd.MultiIndex.from_tuples(
    list(zip(['Classification'] * len(binclas.columns), binclas.columns))
)
regr.columns = pd.MultiIndex.from_tuples(
    list(zip(['Regression'] * len(regr.columns), regr.columns))
)

In [85]:
# Preview the classification table with its two-level column header.
binclas

Unnamed: 0_level_0,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Classification
Unnamed: 0_level_1,name,N,$N_a$,$N_l$,$N_m$,par$_t$,auc$_t$,par$_f$,auc$_f$
0,appendicitis \cite{keel},106,7,7,21,$\alpha$: 16,0.774544,$\alpha$: 16,0.851897
1,haberman \cite{keel},306,3,3,81,$\alpha$: 22,0.661721,$\alpha$: 13,0.721319
2,new-thyroid1 \cite{keel},215,5,5,35,$\alpha$: 15,0.95869,$\beta$: 6,0.999802
3,glass0 \cite{keel},214,9,9,70,$\alpha$: 15,0.837056,$\beta$: 10,0.929381
4,shuttle-6-vs-2-3 \cite{keel},230,9,9,10,$\alpha$: 2,1.0,$\alpha$: 1,1.0
5,bupa \cite{keel},345,6,6,145,$\alpha$: 11,0.695431,$\alpha$: 2,0.76731
6,cleveland-0-vs-4 \cite{keel},177,13,10,13,$\alpha$: 7,0.898073,$\alpha$: 4,0.978595
7,ecoli1 \cite{keel},336,7,5,77,$\alpha$: 31,0.95418,$\alpha$: 3,0.958448
8,poker-9-vs-7 \cite{keel},244,10,10,8,$\alpha$: 17,0.681871,$\beta$: 4,0.985988
9,monk-2 \cite{keel},432,6,4,204,$\alpha$: 2,1.0,$\alpha$: 2,1.0


In [86]:
# Lay the classification and regression tables next to each other.
# Rows are paired by index label, so row i of one table shares a line
# with row i of the other; the datasets in a row are otherwise unrelated.
result = pd.concat((binclas, regr), axis='columns')
result

Unnamed: 0_level_0,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Classification,Regression,Regression,Regression,Regression,Regression,Regression,Regression,Regression
Unnamed: 0_level_1,name,N,$N_a$,$N_l$,$N_m$,par$_t$,auc$_t$,par$_f$,auc$_f$,name,N,$N_a$,$N_l$,par$_t$,r$^2_t$,par$_f$,r$^2_f$
0,appendicitis \cite{keel},106,7,7,21,$\alpha$: 16,0.774544,$\alpha$: 16,0.851897,diabetes \cite{keel},43,2,2,$\alpha$: 6,-0.375384,$\beta$: 2,-0.083344
1,haberman \cite{keel},306,3,3,81,$\alpha$: 22,0.661721,$\alpha$: 13,0.721319,o-ring \cite{uci},23,6,4,$\alpha$: 1,0.126992,$\beta$: 2,0.150356
2,new-thyroid1 \cite{keel},215,5,5,35,$\alpha$: 15,0.95869,$\beta$: 6,0.999802,wsn-ale \cite{uci},107,5,4,$\alpha$: 5,0.433964,$\alpha$: 2,0.560735
3,glass0 \cite{keel},214,9,9,70,$\alpha$: 15,0.837056,$\beta$: 10,0.929381,daily-demand \cite{uci},60,12,7,$\alpha$: 1,0.69713,$\beta$: 7,0.828057
4,shuttle-6-vs-2-3 \cite{keel},230,9,9,10,$\alpha$: 2,1.0,$\alpha$: 1,1.0,slump-test \cite{krnn},103,9,9,$\alpha$: 2,0.623463,$\beta$: 8,0.768588
5,bupa \cite{keel},345,6,6,145,$\alpha$: 11,0.695431,$\alpha$: 2,0.76731,servo \cite{uci},167,10,2,$\alpha$: 4,0.685812,$\alpha$: 2,0.720899
6,cleveland-0-vs-4 \cite{keel},177,13,10,13,$\alpha$: 7,0.898073,$\alpha$: 4,0.978595,yacht-hydrodynamics \cite{krnn},307,6,6,$\alpha$: 1,0.99325,$\beta$: 11,0.99493
7,ecoli1 \cite{keel},336,7,5,77,$\alpha$: 31,0.95418,$\alpha$: 3,0.958448,autoMPG6 \cite{keel},392,5,5,$\alpha$: 10,0.82272,$\beta$: 12,0.872429
8,poker-9-vs-7 \cite{keel},244,10,10,8,$\alpha$: 17,0.681871,$\beta$: 4,0.985988,excitation-current \cite{uci},557,4,4,$\beta$: 10,0.999832,$\beta$: 11,0.999914
9,monk-2 \cite{keel},432,6,4,204,$\alpha$: 2,1.0,$\alpha$: 2,1.0,real-estate-valuation \cite{uci},414,6,5,$\alpha$: 6,0.657628,$\alpha$: 4,0.709048


In [87]:
# Render the combined table as LaTeX. Score columns (header text
# containing '^2' or 'auc') are printed with three decimals; all other
# columns use the default formatting.
latex = result.to_latex(
    index=False,
    multicolumn_format='c',
    formatters={
        col: (lambda x: f'{x:.3f}')
        for col in result.columns
        if any(tag in col[-1] for tag in ('^2', 'auc'))
    },
)

In [88]:
# Cut the column specification out of the generated LaTeX. This relies on
# the text starting with '\begin{tabular}{' and on the spec holding
# exactly one alignment character per column.
tabular_string = latex[len('\\begin{tabular}{'):][:len(result.columns)]
tabular_string


'lrrrrlrlrlrrrlrlr'

In [89]:
# Put an explicit 6pt gap between every pair of adjacent columns by
# interleaving '@{\hspace{6pt}}' with the alignment characters.
# Raw string: '\h' is not a valid escape sequence, and a plain literal
# triggers a warning on current Python versions (the value is unchanged).
updated = r'@{\hspace{6pt}}'.join(tabular_string)
# Only the column specification may change. It is the first occurrence in
# the document, so cap the replacement at one to keep a short spec such
# as 'lr' from also matching text inside the table body.
latex = latex.replace(tabular_string, updated, 1)

In [90]:
# Persist the finished table next to the notebook.
with open('tab_datasets.tex', mode='wt') as tex_file:
    tex_file.write(latex)