In [1]:
# Load IPython's autoreload extension so it can be configured in the next cell.
%load_ext autoreload

In [2]:
# Mode 2: re-import edited modules before each cell runs, so changes to the
# local crawto package are picked up without restarting the kernel.
%autoreload 2

In [3]:
from prefect import Flow, Parameter, unmapped
import pandas as pd
from prefect.engine.executors import DaskExecutor
from crawto.meta_model import MetaModel

In [4]:
# Read the house-prices train and test CSV files from the local data directory.
# `input_df` is later passed to flow.run() as the "input_data" parameter;
# `test` is not referenced by any other cell visible in this notebook.
input_df = pd.read_csv(
    "data/house-prices-advanced-regression-techniques/train.csv"
)
test = pd.read_csv(
    "data/house-prices-advanced-regression-techniques/test.csv"
)

In [9]:
# Build the "data_cleaning" Prefect flow: declare its parameters, wire the
# preprocessing tasks together, and map model fitting over MetaModel's models.
# Nothing is executed here; the flow is run in a separate cell via flow.run().
#
# NOTE(review): none of the task functions called below are imported by name
# anywhere visible, so they presumably all come from this wildcard import --
# consider importing them explicitly.
from crawto.ml_flow import *
from tinydb import TinyDB
with Flow("data_cleaning") as flow:
    # Runtime inputs; their values are supplied as keyword arguments to flow.run().
    input_data = Parameter("input_data")
    problem, target, features = (
        Parameter("problem"),
        Parameter("target"),
        Parameter("features"),
    )
    # Disabled: database re-creation step.
    #tinydb = recreate_tinydb()

    # Feature discovery on the raw input; the results feed extract_undefined_features.
    nan_features = extract_nan_features(input_data)
    problematic_features = extract_problematic_features(input_data)
    undefined_features = extract_undefined_features(
        input_data, features, target, nan_features, problematic_features
    )
    input_data_with_missing = fit_transform_missing_indicator(
        input_data, undefined_features
    )

    # The train/validation split is taken from the data that already went through
    # the missing-indicator step; both halves are pulled out of the same result.
    train_valid_split = extract_train_valid_split(
        input_data=input_data_with_missing, problem=problem, target=target
    )
    train_data = extract_train_data(train_valid_split)
    valid_data = extract_valid_data(train_valid_split)
    # NOTE(review): the feature lists are derived from `input_data`, not from
    # `input_data_with_missing` -- confirm that this is intentional.
    numeric_features = extract_numeric_features(input_data, undefined_features)
    categorical_features = extract_categorical_features(input_data, undefined_features)

    # numeric columns work
    # The imputer is fit on the training split only, then applied to both splits.
    numeric_imputer = fit_numeric_imputer(train_data, numeric_features)
    imputed_train_numeric_df = impute_numeric_df(
        numeric_imputer, train_data, numeric_features
    )
    imputed_valid_numeric_df = impute_numeric_df(
        numeric_imputer, valid_data, numeric_features
    )

    # Same pattern for the Yeo-Johnson transformer: fit on train, apply to train and valid.
    yeo_johnson_transformer = fit_yeo_johnson_transformer(imputed_train_numeric_df)
    yeo_johnson_train_transformed = transform_yeo_johnson_transformer(
        imputed_train_numeric_df, yeo_johnson_transformer
    )
    yeo_johnson_valid_transformed = transform_yeo_johnson_transformer(
        imputed_valid_numeric_df, yeo_johnson_transformer
    )

    # categorical columns work
    # The categorical imputer is fit on the training split and applied to both splits.
    categorical_imputer = fit_categorical_imputer(train_data, categorical_features)
    imputed_train_categorical_df = transform_categorical_data(
        train_data, categorical_features, categorical_imputer
    )
    imputed_valid_categorical_df = transform_categorical_data(
        valid_data, categorical_features, categorical_imputer
    )

    # Target transformer: fit on the training split, applied to both splits.
    target_transformer = fit_target_transformer(problem, target, train_data)
    transformed_train_target = transform_target(
        problem, target, train_data, target_transformer
    )
    transformed_valid_target = transform_target(
        problem, target, valid_data, target_transformer
    )

    # Target encoder: fit on the imputed training categoricals together with the
    # transformed training target, then applied to both categorical frames.
    target_encoder_transformer = fit_target_encoder(
        imputed_train_categorical_df, transformed_train_target
    )
    target_encoded_train_df = target_encoder_transform(
        target_encoder_transformer, imputed_train_categorical_df
    )
    target_encoded_valid_df = target_encoder_transform(
        target_encoder_transformer, imputed_valid_categorical_df
    )

    # merge_data
    # Combine the target-encoded categorical output with the Yeo-Johnson numeric output.
    transformed_train_df = merge_transformed_data(
        target_encoded_train_df, yeo_johnson_train_transformed,
    )
    transformed_valid_df = merge_transformed_data(
        target_encoded_valid_df, yeo_johnson_valid_transformed,
    )

    # outlierness
    # The HBOS transformer is fit on the merged training data, then applied to both splits.
    hbos_transformer = fit_hbos_transformer(transformed_train_df)
    hbos_transform_train_data = hbos_transform(transformed_train_df, hbos_transformer)
    hbos_transform_valid_data = hbos_transform(transformed_valid_df, hbos_transformer)

    # merge outlierness
    # Rebinds transformed_train_df / transformed_valid_df to the results of
    # merge_hbos_df; every use below this point refers to the merged versions.
    transformed_train_df = merge_hbos_df(
        transformed_train_df, hbos_transform_train_data
    )
    transformed_valid_df = merge_hbos_df(
        transformed_valid_df, hbos_transform_valid_data
    )
    # Hand the merged results to save_data under the given names.
    save_data(transformed_train_df, "transformed_train.df",)
    save_data(transformed_valid_df, "transformed_valid.df",)
    
# Disabled: dimensionality-reduction step (references the `tinydb` handle that is
# also commented out above).
#     #dimensionality reduction
#     svd = fit_svd(transformed_train_df)
#     svd_train = svd_transform(svd, transformed_train_df, "transformed_train_df",tinydb)
#     svd_valid = svd_transform(svd, transformed_valid_df, "transformed_valid_df",tinydb)
    

    # models
    # NOTE(review): MetaModel and the TinyDB handle are created while the flow is
    # being defined, with problem hard-coded to "regression" instead of using the
    # `problem` Parameter -- confirm this is intended.
    meta = MetaModel(problem="regression", db=TinyDB("db.json"))
    model_path = meta.models
    # fit_model is mapped over meta.models; the training data, target and problem
    # are wrapped in unmapped() so each mapped run receives the same value.
    fit_models = fit_model.map(
        model_path=model_path,
        train_data=unmapped(transformed_train_df),
        target=unmapped(transformed_train_target),
        problem=unmapped(problem),
    )
# Disabled: prediction on the validation data using the fitted models.
#     predict_models = predict_model.map(
#         model=fit_models, valid_data=unmapped(transformed_valid_df),
#     )

[autoreload of crawto.meta_model failed: Traceback (most recent call last):
  File "e:\projects\crawto\.venv\lib\site-packages\IPython\extensions\autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "e:\projects\crawto\.venv\lib\site-packages\IPython\extensions\autoreload.py", line 394, in superreload
    module = reload(module)
  File "c:\users\cwcol\appdata\local\programs\python\python38\lib\imp.py", line 314, in reload
    return importlib.reload(module)
  File "c:\users\cwcol\appdata\local\programs\python\python38\lib\importlib\__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 604, in _exec
  File "<frozen importlib._bootstrap_external>", line 783, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "E:\projects\crawto\crawto\meta_model.py", line 170, in <module>
    meta = MetaModel(problem, db, use_default_models=True)
NameError: name 

In [8]:
# Run the "data_cleaning" flow on a Dask executor. The dictionary keys match the
# Parameter names declared when the flow was built; they are unpacked into the
# same keyword arguments flow.run() received before.
executor = DaskExecutor()
flow_state = flow.run(
    executor=executor,
    **{
        "input_data": input_df,
        "problem": "regression",
        "target": "SalePrice",
        "features": "infer",
    },
)

[2020-04-22 20:15:06,750] INFO - prefect.FlowRunner | Beginning Flow run for 'data_cleaning'
[2020-04-22 20:15:06,754] INFO - prefect.FlowRunner | Starting flow run.
[2020-04-22 20:15:06,844] INFO - prefect.TaskRunner | Task 'target': Starting task run...
[2020-04-22 20:15:06,896] INFO - prefect.TaskRunner | Task 'target': finished task run for task with final state: 'Success'
[2020-04-22 20:15:06,912] INFO - prefect.TaskRunner | Task 'problem': Starting task run...
[2020-04-22 20:15:06,921] INFO - prefect.TaskRunner | Task 'input_data': Starting task run...
[2020-04-22 20:15:06,934] INFO - prefect.TaskRunner | Task 'features': Starting task run...
[2020-04-22 20:15:06,937] INFO - prefect.TaskRunner | Task 'problem': finished task run for task with final state: 'Success'
[2020-04-22 20:15:06,950] INFO - prefect.TaskRunner | Task 'input_data': finished task run for task with final state: 'Success'
[2020-04-22 20:15:06,981] INFO - prefect.TaskRunner | Task 'features': finished task run f

  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())


[2020-04-22 20:15:07,774] INFO - prefect.TaskRunner | Task 'transform_categorical_data': finished task run for task with final state: 'Success'
[2020-04-22 20:15:07,786] INFO - prefect.TaskRunner | Task 'transform_categorical_data': finished task run for task with final state: 'Success'
[2020-04-22 20:15:07,799] INFO - prefect.TaskRunner | Task 'fit_target_transformer': finished task run for task with final state: 'Success'
[2020-04-22 20:15:07,833] INFO - prefect.TaskRunner | Task 'transform_target': Starting task run...
[2020-04-22 20:15:07,835] INFO - prefect.TaskRunner | Task 'transform_target': Starting task run...
[2020-04-22 20:15:07,844] INFO - prefect.TaskRunner | Task 'transform_target': finished task run for task with final state: 'Success'
[2020-04-22 20:15:07,854] INFO - prefect.TaskRunner | Task 'transform_target': finished task run for task with final state: 'Success'
[2020-04-22 20:15:07,872] INFO - prefect.TaskRunner | Task 'fit_target_encoder': Starting task run...


  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())
  loglike = -n_samples / 2 * np.log(x_trans.var())


[2020-04-22 20:15:08,595] INFO - prefect.TaskRunner | Task 'fit_target_encoder': finished task run for task with final state: 'Success'
[2020-04-22 20:15:08,628] INFO - prefect.TaskRunner | Task 'target_encoder_transform': Starting task run...
[2020-04-22 20:15:08,629] INFO - prefect.TaskRunner | Task 'target_encoder_transform': Starting task run...
[2020-04-22 20:15:08,797] INFO - prefect.TaskRunner | Task 'fit_yeo_johnson_transformer': finished task run for task with final state: 'Success'
[2020-04-22 20:15:08,840] INFO - prefect.TaskRunner | Task 'transform_yeo_johnson_transformer': Starting task run...
[2020-04-22 20:15:08,841] INFO - prefect.TaskRunner | Task 'transform_yeo_johnson_transformer': Starting task run...
[2020-04-22 20:15:08,860] INFO - prefect.TaskRunner | Task 'transform_yeo_johnson_transformer': finished task run for task with final state: 'Success'
[2020-04-22 20:15:08,892] INFO - prefect.TaskRunner | Task 'transform_yeo_johnson_transformer': finished task run for 

  model = cd_fast.enet_coordinate_descent(


[2020-04-22 20:15:11,346] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded


In [None]:
# Disabled: uncomment to draw the flow graph using the state returned by the run above.
#flow.visualize(flow_state=flow_state)

In [None]:
from tinydb import Query, TinyDB

# Open the TinyDB database stored in "db.json" (the same file name handed to
# MetaModel when the flow was built) and display everything it contains.
db = TinyDB("db.json")
db.all()

In [None]:
# Search the database for entries whose "chunk" field equals "svdname".
# `q` is kept as its own name because the next cell inspects `q.chunk`.
q = Query()
chunk_is_svdname = q.chunk == "svdname"
r = db.search(chunk_is_svdname)

In [None]:
# List the attributes available on the `chunk` query field.
# (The closing parenthesis was missing, which made this cell a SyntaxError.)
dir(q.chunk)

In [None]:
# Show the task at position 38 of the flow's sorted task tuple.
# Uses the public Flow.sorted_tasks() instead of the private cached helper.
# NOTE(review): index 38 is tied to the current shape of the flow and will point
# at a different task if tasks are added or removed.
flow.sorted_tasks()[38]

In [None]:
# List the attributes of the task at position 38 of the flow's sorted task tuple.
# Uses the public Flow.sorted_tasks() instead of the private cached helper.
dir(flow.sorted_tasks()[38])