## Imports

In [None]:
import lfd
import pandas as pd
import numpy as np

In [None]:
# Dataset name: lower-cased to build the dataset folder path and combined
# with TARGET to name the pipeline / experiment folder.
DATASET = "Houses"

# Column passed as `target` to the encoder, biselector and model parameters.
TARGET = "SalePrice"

# Columns handed to the pipeline under the `set_aside` parameter.
SET_ASIDE = [TARGET]

# Value passed as the model `mode`.
MODE = "linear"

## Data

In [None]:
# Load the tabular dataset from the folder named after DATASET (lower-cased),
# relative to this notebook's working directory.
data = lfd.Data(f'../../learnfromdata_new/datasets/tabular/{DATASET.lower()}')
# NOTE(review): presumably infers/applies column dtypes on the loaded data —
# confirm against the lfd documentation.
data.set_dtypes()
# Last expression of the cell: displays the Data object in the notebook.
data

In [None]:
# Analyse the dataset broken down by the target column (5 bins, no Excel
# export). TARGET is used instead of a hard-coded column name so this cell
# stays in sync with the configuration cell when the target is changed.
data.analyse(broken_by=TARGET, bins=5, to_excel=False)

## Model Training

In [None]:
# Configuration passed to `lfd.Pipeline.learn`, grouped into data handling,
# feature transforms and model definitions.
params = {
    'set_aside': SET_ASIDE,
    'data': {
        # Noise is disabled; the original kept `dict(seed=0)` as an alternative.
        'add_noise': None,
        'test_split': {'test_size': 0.2, 'stratify_col': None, 'seed': 0},
        # A validation split with the same settings was left disabled:
        # 'valid_split': {'test_size': 0.2, 'stratify_col': None, 'seed': 0},
    },
    'transform': {
        'uniselector': {'min_occ': 0.01, 'max_occ': 0.99},
        'encoder': {'min_occ': 0.01, 'method': 'onehot', 'target': TARGET},
        'biselector': {'threshold': 0.8, 'target': TARGET},
    },
    'model': {
        'target': TARGET,
        'mode': MODE,
        'seed_train': 0,
        # Two XGBoost base models: a small one and a larger one.
        'base0': {
            'algorithm': 'xgboost',
            'name': 'Xgboost1',
            'hyper_params': {'n_estimators': 20, 'max_depth': 3},
        },
        'base1': {
            'algorithm': 'xgboost',
            'name': 'Xgboost2',
            'hyper_params': {'n_estimators': 50, 'max_depth': 6},
        },
        'calibrate': {
            'algorithm': 'isotonic',
            'hyper_params': {'method': 'quantile'},
        },
    },
}

In [None]:
# Train a pipeline named "<DATASET>_<TARGET>" with the parameters above.
# A copy of `data` is passed in — presumably so the original object stays
# untouched for later cells (confirm against the lfd documentation).
pipe = lfd.Pipeline(name=f'{DATASET}_{TARGET}').learn(
    params,
    data=data.copy(),
    evaluate=True,
    explain=True,
)

In [None]:
# Persist the trained pipeline under ../../experiments (non-slim, not pickled).
# NOTE(review): the load cell reads from a sub-folder named after the
# pipeline, so save() presumably creates that sub-folder — confirm.
pipe.save('../../experiments', slim=False, as_pickle=False)

## Model Prediction

In [None]:
# Reload the pipeline from the experiment folder "<DATASET>_<TARGET>" (the
# same string used as the pipeline name at training time), with slim=True.
pipe2 = lfd.Pipeline().load(f'../../experiments/{DATASET}_{TARGET}', slim=True)

In [None]:
# Apply the reloaded pipeline to the dataset.
# NOTE(review): unlike the training cell, `data` is passed without .copy();
# if apply() modifies its input, `data` is changed for any later use — confirm.
pipe2.apply(data)

## Evaluation

In [None]:
# Entry for 'Xgboost1' from the pipeline's `cal_models` (presumably the
# calibrated version of the base model — confirm).
model = pipe.cal_models['Xgboost1']

# Mean of the absolute SHAP values of the 'Xgboost1' base model, sorted in
# descending order (assumes `shapvalues` is a pandas object with one column
# per feature — TODO confirm).
shapvalues = (
    pipe.models['Xgboost1'].shapvalues
    .abs()
    .mean()
    .sort_values(ascending=False)
)

In [None]:
# Display the metrics stored on the selected model.
model.metrics

In [None]:
# Plot helper used by the remaining cells, created with the 'dark' theme.
plotter = lfd.PlotterModel(theme='dark')

In [None]:
# Plot heatmaps from the model's `confusion` attribute.
plotter.confusion_heatmaps(model.confusion)

In [None]:
# Display the model's `confusion_cuts` attribute (presumably the cut points
# behind the confusion heatmaps — confirm against the lfd documentation).
model.confusion_cuts

In [None]:
# Bar chart of the first 20 entries of `shapvalues`, which is sorted in
# descending order; the second argument is an empty string (presumably the
# chart title — confirm).
plotter.plot_barchart(shapvalues.head(20), '')

In [None]:
# Lift curve built from the model's prediction frame, using 20 bins.
plotter.lift_curve(model.predictions.df, bins=20)

In [None]:
# Histogram (20 bins) over the 'Test' rows of the prediction frames:
#   1. scores of the 'Xgboost1' entry in `pipe.models`,
#   2. scores of `model` (the 'Xgboost1' entry in `pipe.cal_models`),
#   3. the target values from that same frame.
plotter.histogram(
    pipe.models['Xgboost1'].predictions.df.loc['Test'].scores, 
    model.predictions.df.loc['Test'].scores, 
    model.predictions.df.loc['Test'].target,
    bins=20
)