# Model selection for the minimum number of samples on leaves

In [1]:
import os

import numpy as np

from conditioning_bias import (OperatorDecisionTreeClassifier,
                                OperatorDecisionTreeRegressor,
                                OperatorRandomForestClassifier,
                                OperatorRandomForestRegressor)

from datasets import binclas_datasets, regr_datasets

from evaluation import evaluate_classification, evaluate_regression
from config import dataset_map, data_dir, random_seed, n_splits_ms, n_repeats_ms

2023-12-11 16:52:18 INFO     querying the filtered classification datasets
2023-12-11 16:52:18 INFO     ranking the datasets
2023-12-11 16:52:18 INFO     binary classification datasets prepared
2023-12-11 16:52:18 INFO     querying the filtered regression datasets
2023-12-11 16:52:18 INFO     ranking the datasets
2023-12-11 16:52:18 INFO     regression datasets prepared


In [2]:
# Candidate values for the `min_samples_leaf` hyperparameter: 15 points spaced
# evenly on a logarithmic scale between 0.005 and 0.2, rounded to 3 decimals.
# NOTE(review): values below 1 are presumably interpreted as fractions of the
# training samples (scikit-learn convention) -- confirm against the estimators.
min_samples_leaf = np.logspace(
    start=np.log10(0.005),
    stop=np.log10(0.2),
    num=15,
).round(decimals=3)

# Suffix inserted into the names of the CSV files written by this notebook.
postfix = '_bs_msl'

In [3]:
# Experiment configurations: each entry pairs a dataset collection and an
# estimator class with the evaluation routine to run on them, plus a short
# label that ends up in the name of the output file.
configurations = [
    dict(zip(('datasets', 'estimator', 'function', 'label'), spec))
    for spec in (
        (binclas_datasets, OperatorDecisionTreeClassifier, evaluate_classification, 'dtc'),
        (binclas_datasets, OperatorRandomForestClassifier, evaluate_classification, 'rfc'),
        (regr_datasets, OperatorDecisionTreeRegressor, evaluate_regression, 'dtr'),
        (regr_datasets, OperatorRandomForestRegressor, evaluate_regression, 'rfr'),
    )
]

In [4]:
# Create the output directory up front: every configuration runs a lengthy
# evaluation before anything is written, so a missing directory would otherwise
# only surface at `to_csv` time, after the results have already been computed.
# An empty `data_dir` means "current directory" for os.path.join, and
# os.makedirs('') would raise, hence the guard.
if data_dir:
    os.makedirs(data_dir, exist_ok=True)

for conf in configurations:
    label = conf['label']

    # One parameter set per candidate `min_samples_leaf` value. The lists and
    # dicts are built fresh for each configuration, so nothing is shared
    # between the evaluation calls.
    param_grid = [
        {'random_state': random_seed, 'mode': '<=', 'min_samples_leaf': msl}
        for msl in min_samples_leaf
    ]
    validator_params = {
        'n_splits': n_splits_ms,
        'n_repeats': n_repeats_ms,
        'random_state': random_seed,
    }

    # NOTE(review): `results` is used like a pandas DataFrame with a 'name'
    # column below -- confirm against the evaluation functions.
    results = conf['function'](
        datasets=conf['datasets'],
        estimator=conf['estimator'],
        params=param_grid,
        validator_params=validator_params,
        random_seed=random_seed,
        modes=['<=']
    )

    # Replace dataset names by their `dataset_map` entry where one exists;
    # names without an entry are kept unchanged.
    results['name'] = results['name'].apply(lambda x: dataset_map.get(x, x))

    # One CSV per configuration, e.g. 'bootstrap_dtc_bs_msl.csv'.
    results.to_csv(os.path.join(data_dir, f'bootstrap_{label}{postfix}.csv'))

2023-12-11 16:52:18.420328 appendicitis
2023-12-11 16:52:22.791834 haberman
2023-12-11 16:52:27.051307 new_thyroid1
2023-12-11 16:52:31.031749 glass0
2023-12-11 16:52:36.635475 shuttle-6_vs_2-3
2023-12-11 16:52:41.564518 bupa
2023-12-11 16:52:47.537052 cleveland-0_vs_4
2023-12-11 16:52:52.022905 ecoli1
2023-12-11 16:52:56.749187 poker-9_vs_7
2023-12-11 16:53:01.829776 monk-2
2023-12-11 16:53:06.735893 hepatitis
2023-12-11 16:53:13.612560 yeast-0-3-5-9_vs_7-8
2023-12-11 16:53:19.068668 mammographic
2023-12-11 16:53:25.016630 saheart
2023-12-11 16:53:31.397906 page-blocks-1-3_vs_4
2023-12-11 16:53:37.388250 lymphography-normal-fibrosis
2023-12-11 16:53:41.678235 pima
2023-12-11 16:53:48.929478 wisconsin
2023-12-11 16:53:54.099257 abalone9_18
2023-12-11 16:54:02.198637 winequality-red-3_vs_5
2023-12-11 16:54:09.208459 appendicitis
2023-12-11 16:58:56.904110 haberman
2023-12-11 17:03:54.838556 new_thyroid1
