## Imports

In [None]:
import lfd
import pandas as pd
import numpy as np

In [None]:
# Notebook-wide settings read by the data-loading, params and pipeline cells.
DATASET: str = 'Houses'     # lower-cased to build the dataset path; also prefixes the pipeline name
MODE: str = 'clustering'    # passed on as the model `mode` in the pipeline params
SET_ASIDE: list = []        # passed on as `set_aside` in the pipeline params

## Data

In [None]:
# Build the dataset folder path from the lower-cased DATASET name, then load it.
dataset_dir = f'../../learnfromdata_new/datasets/tabular/{DATASET.lower()}'
data = lfd.Data(dataset_dir)
data.set_dtypes()

# Cell output: the first three rows of the loaded frame.
data.df.head(3)

In [None]:
# Cell output: the result of `describe()` on the loaded frame
# (presumably a pandas DataFrame, i.e. per-column summary statistics — confirm).
data.df.describe()

In [None]:
# Nested configuration handed to `lfd.Pipeline(...).learn(...)`.
# Top-level sections: `set_aside`, `data`, `transform` and `model`.
# Every seed in here is fixed to 0.
params = {
    'set_aside': SET_ASIDE,
    'data': {
        'add_noise': {'seed': 0},
        'test_split': {'test_size': 0.2, 'stratify_col': None, 'seed': 0},
        'valid_split': {'test_size': 0.2, 'stratify_col': None, 'seed': 0},
    },
    'transform': {
        'uniselector': {'min_occ': 0.01, 'max_occ': 0.99},
        'imputer': {'default_cat': 'MISSING', 'default_cont': 'median'},
        'encoder': {'min_occ': 0.05, 'method': 'target', 'target': 'SalePrice'},
        'biselector': {'threshold': 0.8},
        'standardizer': {},
    },
    'model': {
        'mode': MODE,
        'seed_train': 0,
        # Single model entry: a Gaussian mixture with 5 components, named 'GM'.
        'base0': {
            'algorithm': 'gaussianmixture',
            'name': 'GM',
            'hyper_params': {'n_components': 5},
        },
    },
}

In [None]:
# Build the pipeline and run `learn` with the `params` configuration.
# The name is derived from DATASET and MODE rather than a hard-coded
# 'clustering' suffix, so it follows the MODE setting (same value as before
# while MODE == 'clustering').
# `learn` receives a copy of `data` (presumably so the loaded object stays
# untouched — confirm); the `evaluate` and `explain` flags are both switched off.
pipe = lfd.Pipeline(name=f'{DATASET}_{MODE}').learn(
    params, data=data.copy(), evaluate=False, explain=False)

## Evaluation

In [None]:
# Fetch the model stored under 'GM' — the `name` given to `base0` in `params`.
model = pipe.models['GM']

In [None]:
# Cell output: the frame held by the model's predictions object.
model.predictions.df

In [None]:
# Call `aic` on the model's underlying estimator for the rows labelled 'Test'.
# NOTE(review): `clf` is presumably a scikit-learn GaussianMixture, making this
# the Akaike information criterion (lower is better) — confirm.
test_rows = pipe.data.df.loc['Test']
model.clf.aic(test_rows)

In [None]:
# Plot helper used by the scatter and histogram cells; created with the 'dark' theme.
plotter = lfd.PlotterModel(theme='dark')

In [None]:
# Scatter plot over the 'Test' rows: SalePrice and LotArea from the pipeline
# data, with the model's predictions passed as the third argument
# (presumably x, y and colouring — confirm against `PlotterModel.scatter`).
test_sale_price = pipe.data.df.loc['Test', 'SalePrice']
test_lot_area = pipe.data.df.loc['Test', 'LotArea']
test_clusters = model.predictions.df.loc['Test', 'predictions']
plotter.scatter(test_sale_price, test_lot_area, test_clusters)

In [None]:
# Histogram of the `predictions` column for the 'Train' and 'Test' selections,
# drawn with 100 bins and `normalize=True`.
train_predictions = model.predictions.select('Train').df.predictions
test_predictions = model.predictions.select('Test').df.predictions
plotter.histogram(train_predictions, test_predictions, bins=100, normalize=True)

In [None]:
# Cell output: the model's feature importances sorted by 'cluster_all', largest first.
model.feature_imp.sort_values('cluster_all', ascending=False)

In [None]:
# Group the pipeline data by the model's `predictions` column and take the
# median of every column within each group.
cluster_labels = model.predictions.df.predictions
cluster_medians = pipe.data.df.groupby(cluster_labels).median()

# Cell output: standard deviation of those group medians per column, sorted
# ascending, so the columns whose medians vary most between groups come last.
cluster_medians.std().sort_values()