Example Usage For a Machine Learning Workflow

Below is an example of a typical machine learning workflow: you retrieve data, preprocess it, train a model, and evaluate the model output.

In this example you will:

  • Build a machine learning workflow made up of individual tasks
  • Check task dependencies and their execution status
  • Execute the model training task including dependencies
  • Save intermediate task output to Parquet, pickle, and in-memory
  • Load task output into pandas dataframes and model objects for model evaluation
  • Intelligently rerun the workflow after changing a preprocessing parameter (see the sketch after the code below)

The code is below; an interactive example is available on mybinder.
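To run the example locally, install d6tflow and its dependencies first, e.g. pip install d6tflow pandas "scikit-learn<1.2" (the scikit-learn pin is an assumption: the example uses the Boston housing dataset, which was removed in scikit-learn 1.2).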

Workflow with d6tflow

import d6tflow
import sklearn, sklearn.datasets, sklearn.ensemble, sklearn.linear_model, sklearn.preprocessing
import pandas as pd

# define workflow tasks
class GetData(d6tflow.tasks.TaskPqPandas):  # save dataframe as parquet

    def run(self):
        ds = sklearn.datasets.load_boston() # note: load_boston was removed in scikit-learn 1.2; pin an older version to run as-is
        df_train = pd.DataFrame(ds.data, columns=ds.feature_names)
        df_train['y'] = ds.target
        self.save(df_train) # quickly save dataframe


@d6tflow.requires(GetData) # define dependency
class ModelData(d6tflow.tasks.TaskPqPandas):
    do_preprocess = d6tflow.BoolParameter(default=True) # parameter for preprocessing yes/no

    def run(self):
        df_train = self.input().load() # quickly load required data
        if self.do_preprocess:
            df_train.iloc[:,:-1] = sklearn.preprocessing.scale(df_train.iloc[:,:-1])
        self.save(df_train)

@d6tflow.requires(ModelData) # automatically pass parameters upstream
class ModelTrain(d6tflow.tasks.TaskPickle): # save output as pickle
    model = d6tflow.Parameter(default='ols') # parameter for model selection

    def run(self):
        df_train = self.input().load()
        if self.model=='ols':
            model = sklearn.linear_model.LinearRegression()
        elif self.model=='gbm':
            model = sklearn.ensemble.GradientBoostingRegressor()
        else:
            raise ValueError('invalid model selection')
        model.fit(df_train.drop('y', axis=1), df_train['y'])
        self.save(model)
        self.saveMeta({'score':model.score(df_train.drop('y', axis=1), df_train['y'])})

# goal: compare performance of two models
params_model1 = {'do_preprocess':True, 'model':'ols'}
params_model2 = {'do_preprocess':False, 'model':'gbm'}

# define workflow manager
flow = d6tflow.WorkflowMulti(ModelTrain, {'ols':params_model1, 'gbm':params_model2})
flow.reset_upstream(confirm=False) # force re-run
print(flow.preview('ols'))

# intelligently figures out which parts of the workflow need to run for each model
# for example when training model 2, GetData() does not need to run again
flow.run()
'''
Scheduled 3 tasks of which:
* 3 ran successfully:
    - 1 GetData()
    - 1 ModelData(do_preprocess=True)
    - 1 ModelTrain(do_preprocess=True, model=ols)

# To run the 2nd model, only the tasks that changed need to re-run
Scheduled 3 tasks of which:
* 1 complete ones were encountered:
    - 1 GetData()
* 2 ran successfully:
    - 1 ModelData(do_preprocess=False)
    - 1 ModelTrain(do_preprocess=False, model=gbm)

'''

data = flow.outputLoadAll() # load output of every task, keyed by flow and task

scores = flow.outputLoadMeta() # load metadata saved with saveMeta()
print(scores)
# {'ols': {'score': 0.7406426641094095}, 'gbm': {'score': 0.9761405838418584}}

# get training data and models
data_train = flow.outputLoad(task=ModelData)
models = flow.outputLoad(task=ModelTrain)

print(models['ols'].score(data_train['ols'].drop('y', axis=1), data_train['ols']['y']))
# 0.7406426641094095
print(models['gbm'].score(data_train['gbm'].drop('y', axis=1), data_train['gbm']['y']))
# 0.9761405838418584
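
The bullet list above also promises intelligently rerunning the workflow after changing a preprocessing parameter. A minimal sketch of what that could look like, assuming a hypothetical third parameter set and d6tflow.Workflow as the single-flow counterpart of WorkflowMulti:

# change the preprocessing parameter and rerun: only affected tasks recompute
params_model3 = {'do_preprocess':False, 'model':'ols'} # hypothetical parameter set
flow3 = d6tflow.Workflow(ModelTrain, params_model3)
# GetData() and ModelData(do_preprocess=False) are already complete from the runs
# above, so only ModelTrain(do_preprocess=False, model=ols) actually runs
flow3.run()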

Next steps

See https://d6tflow.readthedocs.io/en/latest/transition.html