# Model selection for the maximum depth

In [1]:
import os

import numpy as np

from conditioning_bias import (OperatorDecisionTreeClassifier,
                                OperatorDecisionTreeRegressor,
                                OperatorRandomForestClassifier,
                                OperatorRandomForestRegressor)

from datasets import binclas_datasets, regr_datasets

from evaluation import evaluate_classification, evaluate_regression
from config import dataset_map, data_dir, random_seed, n_splits_ms, n_repeats_ms

2023-12-11 16:52:11 INFO     querying the filtered classification datasets
2023-12-11 16:52:11 INFO     ranking the datasets
2023-12-11 16:52:11 INFO     binary classification datasets prepared
2023-12-11 16:52:11 INFO     querying the filtered regression datasets
2023-12-11 16:52:11 INFO     ranking the datasets
2023-12-11 16:52:11 INFO     regression datasets prepared


In [2]:
# Candidate values for the `max_depth` estimator parameter: None followed by
# every integer from 2 to 15 inclusive.
# NOTE(review): None presumably means "no depth limit" (scikit-learn convention) — confirm.
max_depth = [None, *range(2, 16)]

# Suffix inserted into the names of the CSV files written by this notebook.
postfix = '_md'

In [3]:
# One entry per estimator under study. Each entry bundles the dataset collection,
# the estimator, the evaluation routine to call, and the short label that ends up
# in the output file name.
configurations = [
    {'datasets': data, 'estimator': estimator_cls, 'function': evaluator, 'label': tag}
    for data, estimator_cls, evaluator, tag in (
        (binclas_datasets, OperatorDecisionTreeClassifier, evaluate_classification, 'dtc'),
        (binclas_datasets, OperatorRandomForestClassifier, evaluate_classification, 'rfc'),
        (regr_datasets, OperatorDecisionTreeRegressor, evaluate_regression, 'dtr'),
        (regr_datasets, OperatorRandomForestRegressor, evaluate_regression, 'rfr'),
    )
]

In [4]:
# Evaluate every configuration over all candidate depths and write one CSV per
# configuration into `data_dir`.
for conf in configurations:
    evaluate = conf['function']

    # One parameter set per candidate depth; all share the seed and the '<=' mode.
    # Rebuilt on each iteration so every call receives fresh objects.
    depth_grid = [
        {'random_state': random_seed, 'mode': '<=', 'max_depth': depth}
        for depth in max_depth
    ]
    validator_settings = {
        'n_splits': n_splits_ms,
        'n_repeats': n_repeats_ms,
        'random_state': random_seed,
    }

    results = evaluate(
        datasets=conf['datasets'],
        estimator=conf['estimator'],
        params=depth_grid,
        validator_params=validator_settings,
        random_seed=random_seed,
        modes=['<='],
    )

    # Replace each dataset name with its entry in `dataset_map`; names without
    # an entry are kept unchanged.
    results['name'] = results['name'].apply(lambda x: dataset_map.get(x, x))

    label = conf['label']
    output_path = os.path.join(data_dir, f'model_selection_{label}{postfix}.csv')
    results.to_csv(output_path)

2023-12-11 16:52:11.358867 appendicitis
2023-12-11 16:52:15.172419 haberman
2023-12-11 16:52:19.809464 new_thyroid1
2023-12-11 16:52:23.985686 glass0
2023-12-11 16:52:28.700926 shuttle-6_vs_2-3
2023-12-11 16:52:32.765841 bupa
2023-12-11 16:52:39.097109 cleveland-0_vs_4
2023-12-11 16:52:44.815493 ecoli1
2023-12-11 16:52:49.911229 poker-9_vs_7
2023-12-11 16:52:54.243098 monk-2
2023-12-11 16:52:59.332854 hepatitis
2023-12-11 16:53:04.441671 yeast-0-3-5-9_vs_7-8
2023-12-11 16:53:12.204678 mammographic
2023-12-11 16:53:18.444724 saheart
2023-12-11 16:53:26.342439 page-blocks-1-3_vs_4
2023-12-11 16:53:32.028843 lymphography-normal-fibrosis
2023-12-11 16:53:36.845942 pima
2023-12-11 16:53:45.113280 wisconsin
2023-12-11 16:53:50.693087 abalone9_18
2023-12-11 16:53:59.829412 winequality-red-3_vs_5
2023-12-11 16:54:08.197563 appendicitis
2023-12-11 16:58:59.421330 haberman
2023-12-11 17:04:12.655673 new_thyroid1
2023-12-11 17:09:09.021230 glass0
2023-12-11 17:13:38.264285 shuttle-6_vs_2-3
2023-1