## Imports

In [None]:
import lfd
import pandas as pd
import numpy as np

In [None]:
# Experiment configuration shared by the cells below.
TARGET = 'Survived'      # label column; reused as `target` throughout `params`
DATASET = 'Titanic'      # combined with TARGET to build the pipeline name
MODE = 'binaryclass'     # passed as `mode` in the model parameters
SET_ASIDE = [TARGET]     # passed as `set_aside` in the pipeline parameters

## Data

In [None]:
# Load the Titanic tabular dataset from the sibling datasets checkout.
# (Plain string literal: the path has no placeholders, so no f-string is needed.)
data = lfd.Data('../../learnfromdata_new/datasets/tabular/titanic')
# NOTE(review): set_dtypes() presumably assigns/infers the column dtypes in place —
# confirm against the lfd.Data documentation.
data.set_dtypes()
# Last expression of the cell: shows the loaded data object as cell output.
data

In [None]:
# Nested configuration handed to `lfd.Pipeline.learn` in the next cell.
# The top-level keys group the settings: columns set aside, data steps,
# transform steps and model definitions.
params = {
    'set_aside': SET_ASIDE,
    # Settings keyed by data step name (noise, test split, train balancing).
    'data': {
        'add_noise': {'seed': 0},
        'test_split': {'test_size': 0.3, 'stratify_col': None, 'seed': 0},
        'train_balance': {
            'target': TARGET,
            'seed': 0,
            'stratified': None,
            'counts': {0: 1000, 1: 1000},
        },
    },
    # Settings keyed by transform step name.
    'transform': {
        'uniselector': {'min_occ': 0.01, 'max_occ': 0.99},
        'encoder': {'min_occ': 0.001, 'method': 'target', 'target': TARGET},
        'biselector': {'threshold': 0.8, 'target': TARGET},
    },
    # Model settings: one base model named 'Xgboost' plus a 'calibrate' entry.
    'model': {
        'target': TARGET,
        'mode': MODE,
        'seed_train': 0,
        'base0': {
            'algorithm': 'xgboost',
            'name': 'Xgboost',
            'hyper_params': {'n_estimators': 100, 'max_depth': 6},
        },
        'calibrate': {'algorithm': 'regression', 'hyper_params': {}},
    },
}

In [None]:
# Build a pipeline named '<DATASET>_<TARGET>' and learn it with `params`.
# A copy of `data` is passed in (presumably so the loaded object stays untouched
# — confirm against lfd.Pipeline.learn).
pipe = (
    lfd.Pipeline(name=f'{DATASET}_{TARGET}')
    .learn(
        params,
        data=data.copy(),
        evaluate=True,
        explain=True,
        cutoff_params={'fix_flags': [0.2, 0.6]},
    )
)

In [None]:
# Write the learned pipeline to ../../experiments.
# NOTE(review): slim=False / as_pickle=False presumably store the full artifacts in a
# non-pickle format — confirm against the lfd.Pipeline.save documentation.
pipe.save('../../experiments', slim=False, as_pickle=False)

## Evaluation

In [None]:
# Model entry 'Xgboost' from `pipe.cal_models`
# (presumably the calibrated variant of the base model — confirm in lfd).
model = pipe.cal_models['Xgboost']

# Frequency of each target value on the 'Test' rows, ordered by value.
test_target = (
    model.predictions.df.loc['Test', 'target']
    .value_counts()
    .sort_index()
    .rename('Actual (test set)')
)

# Frequency of each score value on the 'Test' rows, ordered by value.
test_preds = (
    model.predictions.df.loc['Test', 'scores']
    .value_counts()
    .sort_index()
    .rename('Predicted (test set)')
)

# Mean absolute SHAP value of the base 'Xgboost' model, largest first
# (presumably one entry per feature column — confirm the shape of `shapvalues`).
shapvalues = (
    pipe.models['Xgboost'].shapvalues
    .abs()
    .mean()
    .sort_values(ascending=False)
)

In [None]:
# Plotting helper for model results, created with the 'dark' theme.
plotter = lfd.PlotterModel(theme='dark')

In [None]:
# Display the predictions object of `model` (pipe.cal_models['Xgboost']) as cell output.
model.predictions

In [None]:
# Draw heatmaps from the confusion data stored on `model`.
plotter.confusion_heatmaps(model.confusion)

In [None]:
# Display the evaluation metrics stored on `model` as cell output.
model.metrics

In [None]:
# Display the mean absolute SHAP values, sorted from largest to smallest.
shapvalues

In [None]:
# Bar chart of the 20 largest mean absolute SHAP values; the title is left empty.
plotter.plot_barchart(shapvalues.head(20), title='')

In [None]:
# 'Test' rows of the predictions from the `cal_models` and `models` entries
# of the 'Xgboost' model respectively.
calibrated_test = pipe.cal_models['Xgboost'].predictions.df.loc['Test']
base_test = pipe.models['Xgboost'].predictions.df.loc['Test']

# Histogram (5 bins) of both score series together with the test target.
plotter.histogram(
    calibrated_test.scores,
    base_test.scores,
    base_test.target,
    bins=5,
)