In [121]:
import mlrun
# Load the 'mlops' project from MLRun, creating it if it does not exist yet.
# The project context (working directory) is the notebook's own folder.
# NOTE(review): with user_project=True the runs below are recorded under
# 'mlops-jovyan', i.e. the user name appears to be appended — confirm.
project = mlrun.get_or_create_project(
    name='mlops',
    context='./',
    user_project=True,
)

> 2023-01-08 13:18:44,159 [info] loaded project mlops from MLRun DB


In [138]:
import os
# Wrap the `get_data` handler from src/get_data.py as an MLRun job function.
get_data = mlrun.code_to_function(
    name='gen_dataset',
    filename='src/get_data.py',
    handler='get_data',
    kind='job',
    image='mlrun/mlrun',
)

# Apply MLRun's automatic mount modifier to the function.
get_data.apply(mlrun.auto_mount())

# When V3IO_ACCESS_KEY is unset (or is literally the string 'False'), reset
# the spec flag so that auto-mount is not marked as disabled.
# NOTE(review): presumably this targets environments without V3IO
# credentials — confirm the intent of this override.
if os.environ.get('V3IO_ACCESS_KEY', 'False') == 'False':
    get_data.spec.disable_auto_mount = False

# Register the function on the project; the returned runtime object is the
# value displayed for this cell.
project.set_function(get_data)

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f6ec89bb400>

In [139]:
# Wrap the `run` handler from src/outlier_removal.py as an MLRun job function.
outlier_removal = mlrun.code_to_function(
    name='outlier_removal',
    filename='src/outlier_removal.py',
    handler='run',
    kind='job',
    image='mlrun/mlrun',
)

# Apply MLRun's automatic mount modifier to the function.
outlier_removal.apply(mlrun.auto_mount())

# When V3IO_ACCESS_KEY is unset (or is literally the string 'False'), reset
# the spec flag so that auto-mount is not marked as disabled.
# NOTE(review): presumably this targets environments without V3IO
# credentials — confirm the intent of this override.
if os.environ.get('V3IO_ACCESS_KEY', 'False') == 'False':
    outlier_removal.spec.disable_auto_mount = False

# Register the function on the project; the returned runtime object is the
# value displayed for this cell.
project.set_function(outlier_removal)

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f6ec8901cd0>

# Pipeline

In [143]:
%%writefile src/trainer_baseline.py
import mlrun
from kfp import dsl
import sklearn  # NOTE(review): not referenced in this file — confirm whether it can be dropped
# Explicit names instead of a star import, so it is clear where each
# outlier-removal callable used below comes from.
from src.outlier_removal import (
    remove_outliers_ABOD,
    remove_outliers_HBOS,
    remove_outliers_LOF,
    remove_outliers_iqr,
    remove_outliers_z_score,
)

@dsl.pipeline(
    name="Automatic Pipeline",
    description="Train & Evaluate"
)
def kfpipeline(dataset: str='housing',
               path: str='/home/jovyan/data/src/housing.csv',
               label_column: str='MEDV',
               remove_outlier: bool=False):
    """Three-step workflow: fetch a dataset, run outlier removal, train a model.

    Steps:
      1. ``get_data``        - runs the project function ``gen-dataset`` with
                               ``dataset`` and ``path``; its output is keyed by
                               the value of ``dataset``.
      2. ``outlier_removal`` - runs the project function ``outlier-removal`` on
                               the dataset produced by step 1 and produces the
                               ``outlier_removal`` and ``outlier_removal_test``
                               outputs.
      3. training            - runs the ``train`` handler of
                               ``hub://auto_trainer`` with an
                               ``xgboost.XGBRegressor`` on the outputs of step 2.

    Args:
        dataset: Name of the dataset; used as the output key of step 1 and as
            the model name in step 3.
        path: Location of the source data, forwarded to the ``get_data`` step.
        label_column: Target column name, forwarded to steps 2 and 3.
        remove_outlier: Flag forwarded to the outlier-removal step.
            NOTE(review): presumably toggles whether rows are actually
            dropped — confirm in src/outlier_removal.py.
    """
    get_data_run = mlrun.run_function(
        name='get_data',
        function='gen-dataset',
        params={'dataset': dataset,
                'path': path},
        outputs=[dataset],
    )

    # Outlier-removal settings.
    votes_thresholds = 3
    # The original note here read '+ (0,0.5)' — presumably the valid open range
    # for the contamination fraction; confirm against the detector docs.
    pyod_contamination = 0.2
    z_score_threshold = 3
    iqr_low = 0.01
    iqr_high = 0.99
    iqr_max_removal_percent_per_column = 0.95
    random_state = 50

    # Each entry pairs a removal callable with the keyword arguments for it.
    # NOTE(review): these are live function objects passed as run params; the
    # captured run tables show them as blank entries, so they may not
    # serialize for a non-local engine — confirm before running remotely.
    remove_outliers_functions = [
        (remove_outliers_z_score, {'threshold': z_score_threshold}),
        (remove_outliers_iqr, {'low_quantile': iqr_low,
                               'high_quantile': iqr_high,
                               'max_removal_percent_per_column': iqr_max_removal_percent_per_column}),
        (remove_outliers_LOF, {'contamination': pyod_contamination}),
        (remove_outliers_ABOD, {'contamination': pyod_contamination}),
        (remove_outliers_HBOS, {'contamination': pyod_contamination}),
    ]

    outlier_removal_run = mlrun.run_function(
        name='outlier_removal',
        function='outlier-removal',
        inputs={'dataitem': get_data_run.outputs[dataset]},
        params={'remove_outliers_functions': remove_outliers_functions,
                'remove_outlier': remove_outlier,
                'votes_thresholds': votes_thresholds,
                'label_column': label_column,
                'random_state': random_state},
        outputs=['outlier_removal', 'outlier_removal_test'],
    )

    # Train a model using the auto_trainer hub function
    train_run = mlrun.run_function(
        "hub://auto_trainer",
        inputs={"dataset": outlier_removal_run.outputs['outlier_removal'],
                "test_set": outlier_removal_run.outputs['outlier_removal_test']},
        params={
            "model_class": "xgboost.XGBRegressor",
            "label_columns": label_column,
            "model_name": dataset,
        },
        handler='train',
        outputs=["model"],
    )

Overwriting src/trainer_baseline.py


In [144]:
# Register the workflow file:
workflow_name = "trainer_baseline"
project.set_workflow(workflow_name, "src/trainer_baseline.py")

# Save the project:
project.save()

<mlrun.projects.project.MlrunProject at 0x7f6ec8d953a0>

In [134]:
# Run the registered workflow with its default arguments, executing locally
# and without watching the run.
# NOTE(review): the saved output of this cell looks stale — it reports
# votes_thresholds=2 and max_removal_percent_per_column=0.8, while the workflow
# file sets 3 and 0.95, and its execution count precedes the cell that writes
# the workflow. Re-run after "Restart & Run All" to refresh it.
run = project.run(
    name=workflow_name,
    local=True,
    watch=False,
    overwrite=True,
)



> 2023-01-08 13:20:18,854 [info] starting run get_data uid=97a42f3064e941548a58205624d3855b DB=http://mlrun-api:8080


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...24d3855b,0,Jan 08 13:20:18,completed,get_data,workflow=6ff9a64cb3904df9b77d86cf0e3b3d8bkind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,,dataset=housingpath=/home/jovyan/data/src/housing.csv,,housing





> 2023-01-08 13:20:19,328 [info] run executed, status=completed
> 2023-01-08 13:20:19,335 [info] starting run outlier_removal uid=f51cd9884844454087d1ac033eefc77d DB=http://mlrun-api:8080
> 2023-01-08 13:20:19,585 [info] Outlier removal function removed successfully 0


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...3eefc77d,0,Jan 08 13:20:19,completed,outlier_removal,workflow=6ff9a64cb3904df9b77d86cf0e3b3d8bkind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,dataitem,"remove_outliers_functions=[(, {'threshold': 3}), (, {'low_quantile': 0.01, 'high_quantile': 0.99, 'max_removal_percent_per_column': 0.8}), (, {'contamination': 0.2}), (, {'contamination': 0.2}), (, {'contamination': 0.2})]remove_outlier=Falsevotes_thresholds=2label_column=MEDVrandom_state=50",,outlier_removaloutlier_removal_test





> 2023-01-08 13:20:19,891 [info] run executed, status=completed
> 2023-01-08 13:20:20,158 [info] starting run auto-trainer-train uid=c03a9fd55cbb47a4af479b03da583ce8 DB=http://mlrun-api:8080
> 2023-01-08 13:20:20,445 [info] Sample set not given, using the whole training set as the sample set
> 2023-01-08 13:20:20,489 [info] training 'housing'


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...da583ce8,0,Jan 08 13:20:20,completed,auto-trainer-train,workflow=6ff9a64cb3904df9b77d86cf0e3b3d8bkind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,datasettest_set,model_class=xgboost.XGBRegressorlabel_columns=MEDVmodel_name=housing,mean_absolute_error=1.7996130775002872r2_score=0.9218963689612159root_mean_squared_error=2.450679172306786mean_squared_error=6.005828405578273,feature-importancetest_setmodel





> 2023-01-08 13:20:21,355 [info] run executed, status=completed


uid,start,state,name,parameters,results
...24d3855b,Jan 08 13:20:18,completed,get_data,dataset=housingpath=/home/jovyan/data/src/housing.csv,
...3eefc77d,Jan 08 13:20:19,completed,outlier_removal,"remove_outliers_functions=[(, {'threshold': 3}), (, {'low_quantile': 0.01, 'high_quantile': 0.99, 'max_removal_percent_per_column': 0.8}), (, {'contamination': 0.2}), (, {'contamination': 0.2}), (, {'contamination': 0.2})]remove_outlier=Falsevotes_thresholds=2label_column=MEDVrandom_state=50",
...da583ce8,Jan 08 13:20:20,completed,auto-trainer-train,model_class=xgboost.XGBRegressorlabel_columns=MEDVmodel_name=housing,mean_absolute_error=1.7996130775002872r2_score=0.9218963689612159root_mean_squared_error=2.450679172306786mean_squared_error=6.005828405578273


> 2023-01-08 13:20:21,393 [info] started run workflow mlops-jovyan-trainer_baseline with run id = '6ff9a64cb3904df9b77d86cf0e3b3d8b' by local engine


In [145]:
# Same workflow on the default dataset, this time with the remove_outlier
# flag switched on.
run = project.run(
    name=workflow_name,
    arguments={'remove_outlier': True},
    local=True,
    watch=False,
    overwrite=True,
)

> 2023-01-08 13:23:11,893 [info] starting run get_data uid=e00f3460ce0046b78f1295d5f67d28ee DB=http://mlrun-api:8080


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...f67d28ee,0,Jan 08 13:23:12,completed,get_data,workflow=63504654a14c4ab8a3374a3763ecd29dkind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,,dataset=housingpath=/home/jovyan/data/src/housing.csv,,housing





> 2023-01-08 13:23:12,249 [info] run executed, status=completed
> 2023-01-08 13:23:12,254 [info] starting run outlier_removal uid=db695f652e024bab91af9e09c406a36c DB=http://mlrun-api:8080
Removed: 29
> 2023-01-08 13:23:12,701 [info] Outlier removal function removed successfully 17


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...c406a36c,0,Jan 08 13:23:12,completed,outlier_removal,workflow=63504654a14c4ab8a3374a3763ecd29dkind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,dataitem,"remove_outliers_functions=[(, {'threshold': 3}), (, {'low_quantile': 0.01, 'high_quantile': 0.99, 'max_removal_percent_per_column': 0.95}), (, {'contamination': 0.2}), (, {'contamination': 0.2}), (, {'contamination': 0.2})]remove_outlier=Truevotes_thresholds=3label_column=MEDVrandom_state=50",,outlier_removaloutlier_removal_test





> 2023-01-08 13:23:13,007 [info] run executed, status=completed
> 2023-01-08 13:23:13,429 [info] starting run auto-trainer-train uid=7ee5c829aeb64a5094cdd386d3e12bb5 DB=http://mlrun-api:8080
> 2023-01-08 13:23:13,750 [info] Sample set not given, using the whole training set as the sample set
> 2023-01-08 13:23:13,813 [info] training 'housing'


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...d3e12bb5,0,Jan 08 13:23:13,completed,auto-trainer-train,workflow=63504654a14c4ab8a3374a3763ecd29dkind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,datasettest_set,model_class=xgboost.XGBRegressorlabel_columns=MEDVmodel_name=housing,mean_absolute_error=2.146644440819236r2_score=0.8866044989567707root_mean_squared_error=2.952900134709232mean_squared_error=8.719619205565799,feature-importancetest_setmodel





> 2023-01-08 13:23:15,190 [info] run executed, status=completed


uid,start,state,name,parameters,results
...f67d28ee,Jan 08 13:23:12,completed,get_data,dataset=housingpath=/home/jovyan/data/src/housing.csv,
...c406a36c,Jan 08 13:23:12,completed,outlier_removal,"remove_outliers_functions=[(, {'threshold': 3}), (, {'low_quantile': 0.01, 'high_quantile': 0.99, 'max_removal_percent_per_column': 0.95}), (, {'contamination': 0.2}), (, {'contamination': 0.2}), (, {'contamination': 0.2})]remove_outlier=Truevotes_thresholds=3label_column=MEDVrandom_state=50",
...d3e12bb5,Jan 08 13:23:13,completed,auto-trainer-train,model_class=xgboost.XGBRegressorlabel_columns=MEDVmodel_name=housing,mean_absolute_error=2.146644440819236r2_score=0.8866044989567707root_mean_squared_error=2.952900134709232mean_squared_error=8.719619205565799


> 2023-01-08 13:23:15,310 [info] started run workflow mlops-jovyan-trainer_baseline with run id = '63504654a14c4ab8a3374a3763ecd29d' by local engine


In [146]:
# Save the project; the returned project object is this cell's displayed value.
project.save()

<mlrun.projects.project.MlrunProject at 0x7f6ec8d953a0>

In [147]:
# Run the workflow on the 'motor' dataset (freMTPL2freq.csv) with 'ClaimNb' as
# the label column; remove_outlier keeps its default value.
run = project.run(
    name=workflow_name,
    arguments={
        'dataset': 'motor',
        'path': '/home/jovyan/data/src/freMTPL2freq.csv',
        'label_column': 'ClaimNb',
    },
    local=True,
    watch=False,
    overwrite=True,
)

> 2023-01-08 13:23:30,392 [info] starting run get_data uid=6b7c12c4f8344233948fba767c99695f DB=http://mlrun-api:8080


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...7c99695f,0,Jan 08 13:23:30,completed,get_data,workflow=d1d1fd3067eb46548b609bd918b65962kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,,dataset=motorpath=/home/jovyan/data/src/freMTPL2freq.csv,,motor





> 2023-01-08 13:23:37,543 [info] run executed, status=completed
> 2023-01-08 13:23:37,549 [info] starting run outlier_removal uid=e58a27a0b7e34a0b95ecf2b4136f6eb1 DB=http://mlrun-api:8080
> 2023-01-08 13:23:38,382 [info] Outlier removal function removed successfully 0


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...136f6eb1,0,Jan 08 13:23:37,completed,outlier_removal,workflow=d1d1fd3067eb46548b609bd918b65962kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,dataitem,"remove_outliers_functions=[(, {'threshold': 3}), (, {'low_quantile': 0.01, 'high_quantile': 0.99, 'max_removal_percent_per_column': 0.95}), (, {'contamination': 0.2}), (, {'contamination': 0.2}), (, {'contamination': 0.2})]remove_outlier=Falsevotes_thresholds=3label_column=ClaimNbrandom_state=50",,outlier_removaloutlier_removal_test





> 2023-01-08 13:23:39,165 [info] run executed, status=completed
> 2023-01-08 13:23:39,466 [info] starting run auto-trainer-train uid=3d2fce3c03af47eba18533780846cb75 DB=http://mlrun-api:8080
> 2023-01-08 13:23:39,717 [info] Sample set not given, using the whole training set as the sample set
> 2023-01-08 13:23:39,802 [info] training 'motor'


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...0846cb75,0,Jan 08 13:23:39,completed,auto-trainer-train,workflow=d1d1fd3067eb46548b609bd918b65962kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,datasettest_set,model_class=xgboost.XGBRegressorlabel_columns=ClaimNbmodel_name=motor,mean_absolute_error=0.0969508139257626r2_score=0.03861524283885853root_mean_squared_error=0.23244593029610103mean_squared_error=0.05403111051121986,feature-importancetest_setmodel





> 2023-01-08 13:24:24,727 [info] run executed, status=completed


uid,start,state,name,parameters,results
...7c99695f,Jan 08 13:23:30,completed,get_data,dataset=motorpath=/home/jovyan/data/src/freMTPL2freq.csv,
...136f6eb1,Jan 08 13:23:37,completed,outlier_removal,"remove_outliers_functions=[(, {'threshold': 3}), (, {'low_quantile': 0.01, 'high_quantile': 0.99, 'max_removal_percent_per_column': 0.95}), (, {'contamination': 0.2}), (, {'contamination': 0.2}), (, {'contamination': 0.2})]remove_outlier=Falsevotes_thresholds=3label_column=ClaimNbrandom_state=50",
...0846cb75,Jan 08 13:23:39,completed,auto-trainer-train,model_class=xgboost.XGBRegressorlabel_columns=ClaimNbmodel_name=motor,mean_absolute_error=0.0969508139257626r2_score=0.03861524283885853root_mean_squared_error=0.23244593029610103mean_squared_error=0.05403111051121986


> 2023-01-08 13:24:24,763 [info] started run workflow mlops-jovyan-trainer_baseline with run id = 'd1d1fd3067eb46548b609bd918b65962' by local engine


In [148]:
# Run the workflow on the 'motor' dataset (freMTPL2freq.csv) with 'ClaimNb' as
# the label column, this time with the remove_outlier flag switched on.
run = project.run(
    name=workflow_name,
    arguments={
        'dataset': 'motor',
        'path': '/home/jovyan/data/src/freMTPL2freq.csv',
        'label_column': 'ClaimNb',
        'remove_outlier': True,
    },
    local=True,
    watch=False,
    overwrite=True,
)

> 2023-01-08 13:24:30,783 [info] starting run get_data uid=2ed750c601bf42dfb5fb03f317911214 DB=http://mlrun-api:8080


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...17911214,0,Jan 08 13:24:30,completed,get_data,workflow=b50d9ba7009b4c5b897ed725183318c8kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,,dataset=motorpath=/home/jovyan/data/src/freMTPL2freq.csv,,motor





> 2023-01-08 13:24:38,800 [info] run executed, status=completed
> 2023-01-08 13:24:38,805 [info] starting run outlier_removal uid=c119f33a71df4f60b68f02e6a213fa92 DB=http://mlrun-api:8080



Degrees of freedom <= 0 for slice


invalid value encountered in true_divide


invalid value encountered in double_scalars



Removed: 1444
> 2023-01-08 13:29:35,600 [info] Outlier removal function removed successfully 1241


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...a213fa92,0,Jan 08 13:24:39,completed,outlier_removal,workflow=b50d9ba7009b4c5b897ed725183318c8kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,dataitem,"remove_outliers_functions=[(, {'threshold': 3}), (, {'low_quantile': 0.01, 'high_quantile': 0.99, 'max_removal_percent_per_column': 0.95}), (, {'contamination': 0.2}), (, {'contamination': 0.2}), (, {'contamination': 0.2})]remove_outlier=Truevotes_thresholds=3label_column=ClaimNbrandom_state=50",,outlier_removaloutlier_removal_test





> 2023-01-08 13:29:36,340 [info] run executed, status=completed
> 2023-01-08 13:29:36,791 [info] starting run auto-trainer-train uid=0e67199d2a824d078b7247c4e4c34c9a DB=http://mlrun-api:8080
> 2023-01-08 13:29:37,109 [info] Sample set not given, using the whole training set as the sample set
> 2023-01-08 13:29:37,202 [info] training 'motor'


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...e4c34c9a,0,Jan 08 13:29:36,completed,auto-trainer-train,workflow=b50d9ba7009b4c5b897ed725183318c8kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,datasettest_set,model_class=xgboost.XGBRegressorlabel_columns=ClaimNbmodel_name=motor,mean_absolute_error=0.09703476360663157r2_score=0.039481265403024546root_mean_squared_error=0.23234121219368617mean_squared_error=0.05398243888363151,feature-importancetest_setmodel





> 2023-01-08 13:30:29,351 [info] run executed, status=completed


uid,start,state,name,parameters,results
...17911214,Jan 08 13:24:30,completed,get_data,dataset=motorpath=/home/jovyan/data/src/freMTPL2freq.csv,
...a213fa92,Jan 08 13:24:39,completed,outlier_removal,"remove_outliers_functions=[(, {'threshold': 3}), (, {'low_quantile': 0.01, 'high_quantile': 0.99, 'max_removal_percent_per_column': 0.95}), (, {'contamination': 0.2}), (, {'contamination': 0.2}), (, {'contamination': 0.2})]remove_outlier=Truevotes_thresholds=3label_column=ClaimNbrandom_state=50",
...e4c34c9a,Jan 08 13:29:36,completed,auto-trainer-train,model_class=xgboost.XGBRegressorlabel_columns=ClaimNbmodel_name=motor,mean_absolute_error=0.09703476360663157r2_score=0.039481265403024546root_mean_squared_error=0.23234121219368617mean_squared_error=0.05398243888363151


> 2023-01-08 13:30:29,403 [info] started run workflow mlops-jovyan-trainer_baseline with run id = 'b50d9ba7009b4c5b897ed725183318c8' by local engine
