In [1]:
import mlrun
# Load the 'mlops' project (or create it if it does not exist yet), using the
# current directory as the project context.
# user_project=True makes the project user-specific -- presumably by suffixing
# the user name to the project name; confirm against the MLRun docs.
project = mlrun.get_or_create_project(
    name='mlops',
    context='./',
    user_project=True,
)

> 2023-01-14 22:35:38,098 [info] loaded project mlops from MLRun DB


### Setting functions

In [2]:
import os

# Build the dataset-generation job from src/get_data.py (entry point: get_data).
get_data = mlrun.code_to_function(
    name='gen_dataset',
    kind='job',
    image='mlrun/mlrun',
    handler='get_data',
    filename='src/get_data.py',
)

# Attach storage to the function through MLRun's auto-mount modifier.
get_data.apply(mlrun.auto_mount())

# 'False' is the sentinel default returned when V3IO_ACCESS_KEY is not set.
# In that case the disable_auto_mount flag on the spec is reset to False.
# NOTE(review): presumably this lets the MLRun service apply its own
# auto-mount configuration -- confirm.
if os.getenv('V3IO_ACCESS_KEY', 'False') == 'False':
    get_data.spec.disable_auto_mount = False

# Register the function on the project.
project.set_function(get_data)

# Build the dalex job from src/dalex.py (entry point: run_dalex)
# and register it on the project.
dalex = mlrun.code_to_function(
    name='dalex',
    kind='job',
    handler='run_dalex',
    filename='src/dalex.py',
)
project.set_function(dalex)

# Build the training job from src/auto_trainer.py (entry point: train)
# and register it on the project.
# The function object is bound to its own name so that the `dalex` variable
# keeps referring to the dalex function instead of being silently rebound
# to the trainer.
train_fn = mlrun.code_to_function(
    name='train',
    kind='job',
    handler='train',
    filename='src/auto_trainer.py',
)
project.set_function(train_fn)

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7faf001e0fd0>

In [3]:
%%writefile src/trainer_baseline.py
import mlrun
from kfp import dsl
import sklearn
import json

@dsl.pipeline(
    name="Automatic Pipeline",
    description="Train & Evaluate"
)
def kfpipeline(dataset: str='housing',
               path: str='/home/jovyan/data/src/housing.csv',
               label_column:str='MEDV'):
    """Chain three project functions: data loading, dalex, and training.

    Args:
        dataset: Dataset name. It is passed to the 'gen-dataset' function,
            used as the key of that step's output, and used as the prefix
            of the trained model's name.
        path: Path passed to the 'gen-dataset' function as its 'path' param.
        label_column: Passed to the dalex step as 'target' and to the
            training step as 'label_columns'.
    """
    project = mlrun.get_current_project()

    # Step 1: produce the dataset; its output is published under `dataset`.
    loader_params = {'dataset': dataset,
                     'path': path}
    loader_run = mlrun.run_function(
        name='get_data',
        function='gen-dataset',
        params=loader_params,
        outputs=[dataset],
    )

    # Step 2: run dalex on the dataset produced by step 1.
    dalex_params = {'df_train': loader_run.outputs[dataset],
                    'target': label_column}
    dalex_run = project.run_function(
        name='dalex',
        function='dalex',
        params=dalex_params,
        outputs=['train_data', 'test_data', 'dalex_output'],
    )

    # Base parameters for the training step.
    train_params = {
        "model_class": "xgboost.XGBRegressor",
        "label_columns": label_column,
        "model_name": dataset + '_dalex',
    }

    # The 'dalex_output' artifact is read and JSON-decoded while the pipeline
    # function itself is executing.
    # NOTE(review): this needs the dalex step to be finished at this point --
    # presumably only true when the workflow runs with the local engine; confirm.
    dalex_report = json.loads(
        mlrun.get_dataitem(dalex_run.outputs['dalex_output']).get()
    )
    # Every entry writes the same 'sample_weight' key, so only the last value
    # of the decoded mapping is kept (and nothing is set when it is empty).
    # NOTE(review): confirm whether a single entry is expected here.
    for weight in dalex_report.values():
        train_params['sample_weight'] = weight

    # Step 3: train a model using the auto_trainer hub function.
    train_inputs = {"dataset": dalex_run.outputs['train_data'],
                    "test_set": dalex_run.outputs['test_data']}
    train_run = mlrun.run_function(
        name='train',
        function='train',
        inputs=train_inputs,
        params=train_params,
        handler='train',
        outputs=["model"],
    )

Overwriting src/trainer_baseline.py


In [4]:
# Name under which the pipeline file is registered on the project;
# the same name is used later to launch the workflow.
workflow_name = "trainer_baseline"
project.set_workflow(
    name=workflow_name,
    workflow_path="src/trainer_baseline.py",
)

# Persist the project, including the workflow registration above.
project.save()

<mlrun.projects.project.MlrunProject at 0x7faf0016a100>

In [None]:
# Launch the registered workflow locally without watching it.
# Only 'path' is overridden, so the pipeline's default dataset ('housing')
# and label column ('MEDV') apply.
run = project.run(
    name=workflow_name,
    watch=False,
    local=True,
    overwrite=True,
    arguments={
        'path': '/home/jovyan/data/MLOps22/project/src/housing.csv',
    },
)

In [None]:
# Launch the same workflow locally for the 'motor' dataset, overriding all
# three pipeline arguments (dataset name, CSV path and label column).
run = project.run(
    name=workflow_name,
    watch=False,
    local=True,
    overwrite=True,
    arguments={
        'dataset': 'motor',
        'path': '/home/jovyan/data/MLOps22/project/src/freMTPL2freq.csv',
        'label_column': 'ClaimNb',
    },
)

> 2023-01-14 22:38:39,675 [info] starting run get_data uid=e8639d8d577c4037b468e43363f63a47 DB=http://mlrun-api:8080


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...63f63a47,0,Jan 14 22:38:39,completed,get_data,workflow=7c0506600bfe4ec8a32bf6dbb24d7894kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,,dataset=motorpath=/home/jovyan/data/MLOps22/project/src/freMTPL2freq.csv,,motor





> 2023-01-14 22:38:46,428 [info] run executed, status=completed
> 2023-01-14 22:38:46,441 [info] starting run dalex uid=6538411cb39e4f24975fc7e7d91fe10c DB=http://mlrun-api:8080
dataframe shape before dalex : (542409, 11)
Preparation of a new explainer is initiated

  -> data              : 542409 rows 10 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 542409 values
  -> model_class       : xgboost.sklearn.XGBRegressor (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_default at 0x7fae982b5e50> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = -0.111, mean = 0.053, max = 2.02
  -> model type        : regression will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.844, me