## Imports

In [None]:
import lfd
import pandas as pd
import numpy as np
# Logging options, kept in one dict so the same settings can be handed to
# both lfd.set_logging here and the Bootstrap constructor further down.
logparams = {'stdout_level': 'ERROR'}
lfd.set_logging(**logparams)

In [None]:
# Experiment-wide constants.
# DATASET is lower-cased to build the dataset folder path; TARGET is the
# column name passed as `target` to the encoder, biselector and model.
DATASET, TARGET = 'Houses', 'SalePrice'
# Modelling mode forwarded to the model configuration.
MODE = 'linear'
# Columns listed under `set_aside` in the pipeline parameters
# (only the target here).
SET_ASIDE = [TARGET]

## Data

In [None]:
# Load the dataset from its folder; the folder name is the lower-cased
# DATASET constant.
data = lfd.Data('../../learnfromdata_new/datasets/tabular/{}'.format(DATASET.lower()))
# NOTE(review): presumably infers/normalises column dtypes in place — confirm
# against the lfd documentation.
data.set_dtypes()
# Bare expression: rendered as the notebook cell's output.
data

In [None]:
# Seed NumPy's global random state before the parameters are built and used.
np.random.seed(10)

# Nested configuration passed to `boot.learn_pipelines`.
# Sections: columns set aside, data preparation/splitting, feature
# transforms, and model definitions.
all_params = {
    'set_aside': SET_ASIDE,
    'data': {
        'add_noise': {'seed': 0},
        # Both splits use a 20% test_size, no stratification column, seed 0.
        'test_split': {'test_size': 0.2, 'stratify_col': None, 'seed': 0},
        'valid_split': {'test_size': 0.2, 'stratify_col': None, 'seed': 0},
    },
    'transform': {
        'uniselector': {'min_occ': 0.01, 'max_occ': 0.99},
        'encoder': {'min_occ': 0.001, 'method': 'target', 'target': TARGET},
        'biselector': {'threshold': 0.8, 'target': TARGET},
    },
    'model': {
        'target': TARGET,
        'mode': MODE,
        'seed_train': 0,
        'base0': {
            'algorithm': 'xgboost',
            'name': 'Xgboost',
            # NOTE(review): arrays rather than scalars — presumably the
            # candidate values sampled per model iteration; confirm.
            'hyper_params': {
                'n_estimators': np.arange(5, 100),
                'max_depth': np.arange(6, 11),
            },
        },
        'calibrate': {
            'algorithm': 'isotonic',
            'hyper_params': {'method': 'quantile'},
        },
    },
}

In [None]:
# Create the bootstrap experiment; its location is derived from the dataset
# and target names, and it receives the shared logging options.
boot = lfd.Bootstrap(
    '../../experiments/Bootstrap - {} - {}'.format(DATASET, TARGET),
    logparams,
)
# Run on a copy so the loaded `data` object itself is not handed over.
# NOTE(review): presumably 5 data resamples x 3 model fits each — confirm
# the exact meaning of data_iters/model_iters in lfd.
boot.learn_pipelines(
    data.copy(),
    all_params,
    data_iters=5,
    model_iters=3,
)

In [None]:
# Fetch the bootstrap's metadata for model 'C_Xgboost' on the 'Test' dataset.
# NOTE(review): 'C_Xgboost' is presumably the calibrated variant of the
# 'Xgboost' base model configured above — confirm the naming convention.
# NOTE(review): metrics=None / predictions=None presumably mean "no filter";
# verify against the lfd documentation.
meta = boot.get_meta(model='C_Xgboost', dataset='Test', metrics=None, predictions=None)
# Bare expression: rendered as the notebook cell's output.
meta

In [None]:
# Cross-tabulate the mean c_index by max_depth (rows) against n_estimators
# cut into `bins` equal-width intervals (columns), rounded to 3 decimals.
bins = 5
conf = pd.crosstab(
    index=meta.df['model|base0|hyper_params|max_depth'],
    columns=pd.cut(
        meta.df['model|base0|hyper_params|n_estimators'],
        bins=bins,
        duplicates='drop',
    ),
    values=meta.df.c_index,
    aggfunc='mean',
)
conf = conf.round(3)
# Bare expression: rendered as the notebook cell's output.
conf