In [1]:
import mlrun

# Load the 'mlops' project (or create it if it does not exist yet), using the
# current directory as the project context.
# NOTE(review): user_project=True appears to add the current user name to the
# project name (the runs further down report project 'mlops-jovyan') - confirm.
project = mlrun.get_or_create_project(
    name='mlops',
    context='./',
    user_project=True,
)

> 2023-01-10 11:21:46,217 [info] loaded project mlops from MLRun DB


In [2]:
import os

# Wrap src/get_data.py (handler `get_data`) as an MLRun job function.
get_data = mlrun.code_to_function(
    name='gen_dataset',
    filename='src/get_data.py',
    handler='get_data',
    kind='job',
    image='mlrun/mlrun',
)

# Apply MLRun's auto-mount modifier to the function.
get_data.apply(mlrun.auto_mount())

# When no V3IO access key is present in the environment, explicitly reset the
# disable_auto_mount flag to False.
# NOTE(review): presumably this lets the platform's default auto-mount kick in
# on non-Iguazio deployments - confirm against the MLRun auto-mount docs.
if os.environ.get('V3IO_ACCESS_KEY', 'False') == 'False':
    get_data.spec.disable_auto_mount = False

# Register the function on the project (last expression, so the cell shows its repr).
project.set_function(get_data)

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f9e60fd01f0>

In [3]:
# Wrap src/outlier_removal.py (handler `run`) as an MLRun job function.
outlier_removal = mlrun.code_to_function(
    name='outlier_removal',
    filename='src/outlier_removal.py',
    handler='run',
    kind='job',
    image='mlrun/mlrun',
)

# Apply MLRun's auto-mount modifier to the function.
outlier_removal.apply(mlrun.auto_mount())

# When no V3IO access key is present in the environment, explicitly reset the
# disable_auto_mount flag to False.
# NOTE(review): presumably this lets the platform's default auto-mount kick in
# on non-Iguazio deployments - confirm against the MLRun auto-mount docs.
if os.environ.get('V3IO_ACCESS_KEY', 'False') == 'False':
    outlier_removal.spec.disable_auto_mount = False

# Register the function on the project (last expression, so the cell shows its repr).
project.set_function(outlier_removal)

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f9e60fd0400>

# Pipeline

In [13]:
%%writefile src/trainer_baseline.py
import mlrun
from kfp import dsl
# NOTE(review): sklearn is not referenced anywhere in this module - confirm
# whether this import is still needed before removing it.
import sklearn
# Explicit names instead of a wildcard import, so it is clear where each
# outlier-removal helper used by the pipeline comes from.
from src.outlier_removal import (
    remove_outliers_ABOD,
    remove_outliers_HBOS,
    remove_outliers_iqr,
    remove_outliers_LOF,
    remove_outliers_z_score,
)


@dsl.pipeline(
    name="Automatic Pipeline",
    description="Train & Evaluate"
)
def kfpipeline(
    dataset: str = 'housing',
    path: str = '/home/jovyan/data/src/housing.csv',
    label_column: str = 'MEDV',
    k: int = 5,
    min_votes: float = 3,
    remove_outlier: bool = False,
):
    """Run the baseline flow: get data, select features, remove outliers, train.

    Four MLRun steps are chained through their outputs: the project's
    'gen-dataset' function, the hub 'feature_selection' function, the
    project's 'outlier-removal' function and the hub 'auto_trainer' function
    (handler 'train') with an xgboost.XGBRegressor model.

    Args:
        dataset: Name forwarded to the get_data step as 'dataset'; also used
            as that step's output key and as the trained model's name.
        path: Forwarded to the get_data step as 'path'.
        label_column: Label column name forwarded to the feature-selection,
            outlier-removal and training steps.
        k: Forwarded to the feature-selection step as 'k'.
        min_votes: Forwarded to the feature-selection step as 'min_votes'.
        remove_outlier: Forwarded to the outlier-removal step as
            'remove_outlier'.
    """
    project = mlrun.get_current_project()

    # ------------------------------------------------------------------
    # Step 1: getting the data
    # ------------------------------------------------------------------
    get_data_run = mlrun.run_function(
        name='get_data',
        function='gen-dataset',
        params={'dataset': dataset, 'path': path},
        outputs=[dataset],
    )

    # ------------------------------------------------------------------
    # Step 2: feature selection
    # ------------------------------------------------------------------
    stat_filters = [
        'f_classif',
        'f_regression',
        'r_regression',
        'mutual_info_regression',
    ]
    # Each model filter is keyed by the same name as its value.
    model_filters = {
        model_name: model_name
        for model_name in (
            'AdaBoostRegressor',
            'ExtraTreesRegressor',
            'GradientBoostingRegressor',
            'RandomForestRegressor',
            'RandomTreesEmbedding',
        )
    }
    feature_selection_run = mlrun.run_function(
        "hub://feature_selection",
        params={
            'ignore_type_errors': True,
            "stat_filters": stat_filters,
            "model_filters": model_filters,
            "label_column": label_column,
            "k": k,
            "min_votes": min_votes,
        },
        inputs={'df_artifact': get_data_run.outputs[dataset]},
        outputs=['feature_scores', 'selected_features_count', 'selected_features'],
    )

    # ------------------------------------------------------------------
    # Step 3: outlier detection / removal
    # ------------------------------------------------------------------
    # Outlier-removal settings.
    votes_thresholds = 3
    pyod_contamination = 0.2  # original note: "+ (0,0.5)" - presumably the valid range; confirm
    z_score_threshold = 3
    iqr_low = 0.01
    iqr_high = 0.99
    iqr_max_removal_percent_per_column = 0.95

    # (function, kwargs) pairs handed to the outlier-removal step.
    # NOTE(review): these are Python function objects passed as run params.
    # That works for the in-process local runs in this notebook, but they
    # presumably cannot be serialized for a remote/KFP engine - confirm.
    remove_outliers_functions = [
        (remove_outliers_z_score, {'threshold': z_score_threshold}),
        (remove_outliers_iqr, {'low_quantile': iqr_low,
                               'high_quantile': iqr_high,
                               'max_removal_percent_per_column': iqr_max_removal_percent_per_column}),
        (remove_outliers_LOF, {'contamination': pyod_contamination}),
        (remove_outliers_ABOD, {'contamination': pyod_contamination}),
        (remove_outliers_HBOS, {'contamination': pyod_contamination}),
    ]

    outlier_removal_run = mlrun.run_function(
        name='outlier_removal',
        function='outlier-removal',
        inputs={'dataitem': feature_selection_run.outputs['selected_features']},
        params={
            'remove_outliers_functions': remove_outliers_functions,
            'remove_outlier': remove_outlier,
            'votes_thresholds': votes_thresholds,
            'label_column': label_column,
            'random_state': 50,
        },
        outputs=['outlier_removal', 'outlier_removal_test'],
    )

    # ------------------------------------------------------------------
    # Step 4: training the model
    # ------------------------------------------------------------------
    # Train a model using the auto_trainer hub function, on the train/test
    # sets produced by the outlier-removal step.
    train_run = mlrun.run_function(
        "hub://auto_trainer",
        inputs={
            "dataset": outlier_removal_run.outputs['outlier_removal'],
            "test_set": outlier_removal_run.outputs['outlier_removal_test'],
        },
        params={
            "model_class": "xgboost.XGBRegressor",
            "label_columns": label_column,
            "model_name": dataset,
        },
        handler='train',
        outputs=["model"],
    )

Overwriting src/trainer_baseline.py


In [14]:
# Name under which the pipeline file is registered on the project.
workflow_name = "trainer_baseline"

# Attach the workflow file written by the previous cell to the project,
# then persist the project definition.
project.set_workflow(workflow_name, "src/trainer_baseline.py")
project.save()

<mlrun.projects.project.MlrunProject at 0x7f9e0f19ee80>

In [15]:
# Run the registered workflow with the local engine, without watching.
# 'dataset' and 'label_column' are not passed, so the pipeline's defaults
# ('housing' / 'MEDV') apply.
run = project.run(
    name=workflow_name,
    arguments={
        'remove_outlier': True,
        'path': '/home/jovyan/data/MLOps22/project/src/housing.csv',
        'k': 7,
        'min_votes': 3,
    },
    local=True,
    watch=False,
    overwrite=True,
)

> 2023-01-10 11:24:53,570 [info] starting run get_data uid=13ffa49db08245e0b8dd908eec09c441 DB=http://mlrun-api:8080


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...ec09c441,0,Jan 10 11:24:53,completed,get_data,workflow=e74b01f8b1d34078b92e7dceeacbbc80kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,,dataset=housingpath=/home/jovyan/data/MLOps22/project/src/housing.csv,,housing





> 2023-01-10 11:24:53,972 [info] run executed, status=completed
> 2023-01-10 11:24:54,405 [info] starting run feature-selection-feature_selection uid=4089e5d3961e42f885a9402aeeff2ca6 DB=http://mlrun-api:8080
> 2023-01-10 11:25:01,753 [info] votes needed to be selected: 3


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...eeff2ca6,0,Jan 10 11:24:54,completed,feature-selection-feature_selection,workflow=e74b01f8b1d34078b92e7dceeacbbc80kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,df_artifact,"ignore_type_errors=Truestat_filters=['f_classif', 'f_regression', 'r_regression', 'mutual_info_regression']model_filters={'AdaBoostRegressor': 'AdaBoostRegressor', 'ExtraTreesRegressor': 'ExtraTreesRegressor', 'GradientBoostingRegressor': 'GradientBoostingRegressor', 'RandomForestRegressor': 'RandomForestRegressor', 'RandomTreesEmbedding': 'RandomTreesEmbedding'}label_column=MEDVk=7min_votes=3",,f_classiff_regressionr_regressionmutual_info_regressionAdaBoostRegressorExtraTreesRegressorGradientBoostingRegressorRandomForestRegressorRandomTreesEmbeddingfeature_scoresmax_scaled_scores_feature_scoresselected_features_countselected_features





> 2023-01-10 11:25:01,974 [info] run executed, status=completed
> 2023-01-10 11:25:01,979 [info] starting run outlier_removal uid=8eefc660e8e5420abef1a90f5fe73d0b DB=http://mlrun-api:8080
Removed: 21
> 2023-01-10 11:25:05,598 [info] Outlier removal function removed successfully 13


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...5fe73d0b,0,Jan 10 11:25:02,completed,outlier_removal,workflow=e74b01f8b1d34078b92e7dceeacbbc80kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,dataitem,"remove_outliers_functions=[(, {'threshold': 3}), (, {'low_quantile': 0.01, 'high_quantile': 0.99, 'max_removal_percent_per_column': 0.95}), (, {'contamination': 0.2}), (, {'contamination': 0.2}), (, {'contamination': 0.2})]remove_outlier=Truevotes_thresholds=3label_column=MEDVrandom_state=50",,outlier_removaloutlier_removal_test





> 2023-01-10 11:25:05,886 [info] run executed, status=completed
> 2023-01-10 11:25:06,166 [info] starting run auto-trainer-train uid=ad214e443c9b4ce1a698f7345eff5327 DB=http://mlrun-api:8080
> 2023-01-10 11:25:06,392 [info] Sample set not given, using the whole training set as the sample set
> 2023-01-10 11:25:06,713 [info] training 'housing'


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...5eff5327,0,Jan 10 11:25:06,completed,auto-trainer-train,workflow=e74b01f8b1d34078b92e7dceeacbbc80kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,datasettest_set,model_class=xgboost.XGBRegressorlabel_columns=MEDVmodel_name=housing,mean_absolute_error=2.301670257717955r2_score=0.850935246723068root_mean_squared_error=3.3856209256498335mean_squared_error=11.462429052198036,feature-importancetest_setmodel





> 2023-01-10 11:25:07,568 [info] run executed, status=completed


uid,start,state,name,parameters,results
...ec09c441,Jan 10 11:24:53,completed,get_data,dataset=housingpath=/home/jovyan/data/MLOps22/project/src/housing.csv,
...eeff2ca6,Jan 10 11:24:54,completed,feature-selection-feature_selection,"ignore_type_errors=Truestat_filters=['f_classif', 'f_regression', 'r_regression', 'mutual_info_regression']model_filters={'AdaBoostRegressor': 'AdaBoostRegressor', 'ExtraTreesRegressor': 'ExtraTreesRegressor', 'GradientBoostingRegressor': 'GradientBoostingRegressor', 'RandomForestRegressor': 'RandomForestRegressor', 'RandomTreesEmbedding': 'RandomTreesEmbedding'}label_column=MEDVk=7min_votes=3",
...5fe73d0b,Jan 10 11:25:02,completed,outlier_removal,"remove_outliers_functions=[(, {'threshold': 3}), (, {'low_quantile': 0.01, 'high_quantile': 0.99, 'max_removal_percent_per_column': 0.95}), (, {'contamination': 0.2}), (, {'contamination': 0.2}), (, {'contamination': 0.2})]remove_outlier=Truevotes_thresholds=3label_column=MEDVrandom_state=50",
...5eff5327,Jan 10 11:25:06,completed,auto-trainer-train,model_class=xgboost.XGBRegressorlabel_columns=MEDVmodel_name=housing,mean_absolute_error=2.301670257717955r2_score=0.850935246723068root_mean_squared_error=3.3856209256498335mean_squared_error=11.462429052198036


> 2023-01-10 11:25:07,609 [info] started run workflow mlops-jovyan-trainer_baseline with run id = 'e74b01f8b1d34078b92e7dceeacbbc80' by local engine


In [16]:
# Save the project again (it was already saved once right after the workflow
# was registered); as the last expression, the cell displays the project repr.
project.save()

<mlrun.projects.project.MlrunProject at 0x7f9e0f19ee80>

In [18]:
# Run the same workflow with the local engine on the 'motor' dataset
# (freMTPL2freq.csv), using 'ClaimNb' as the label column.
run = project.run(
    name=workflow_name,
    arguments={
        'dataset': 'motor',
        'path': '/home/jovyan/data/MLOps22/project/src/freMTPL2freq.csv',
        'label_column': 'ClaimNb',
        'remove_outlier': True,
        'k': 7,
        'min_votes': 3,
    },
    local=True,
    watch=False,
    overwrite=True,
)

> 2023-01-10 11:36:13,795 [info] starting run get_data uid=699fd7586ae74ec888e009cac9875fca DB=http://mlrun-api:8080


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...c9875fca,0,Jan 10 11:36:13,completed,get_data,workflow=06d2577c6c9e4f1187507934348a7c0fkind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,,dataset=motorpath=/home/jovyan/data/MLOps22/project/src/freMTPL2freq.csv,,motor





> 2023-01-10 11:36:20,766 [info] run executed, status=completed
> 2023-01-10 11:36:21,227 [info] starting run feature-selection-feature_selection uid=dd881b8196b9480d9f2caa1509b9a881 DB=http://mlrun-api:8080
> 2023-01-10 11:51:32,069 [info] votes needed to be selected: 3


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...09b9a881,0,Jan 10 11:36:21,completed,feature-selection-feature_selection,workflow=06d2577c6c9e4f1187507934348a7c0fkind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,df_artifact,"ignore_type_errors=Truestat_filters=['f_classif', 'f_regression', 'r_regression', 'mutual_info_regression']model_filters={'AdaBoostRegressor': 'AdaBoostRegressor', 'ExtraTreesRegressor': 'ExtraTreesRegressor', 'GradientBoostingRegressor': 'GradientBoostingRegressor', 'RandomForestRegressor': 'RandomForestRegressor', 'RandomTreesEmbedding': 'RandomTreesEmbedding'}label_column=ClaimNbk=7min_votes=3",,f_classiff_regressionr_regressionmutual_info_regressionAdaBoostRegressorExtraTreesRegressorGradientBoostingRegressorRandomForestRegressorRandomTreesEmbeddingfeature_scoresmax_scaled_scores_feature_scoresselected_features_countselected_features





> 2023-01-10 11:51:33,138 [info] run executed, status=completed
> 2023-01-10 11:51:33,146 [info] starting run outlier_removal uid=b1607a53b2b34bafbd9f02875a7bf5aa DB=http://mlrun-api:8080



Degrees of freedom <= 0 for slice


invalid value encountered in true_divide


invalid value encountered in double_scalars



Removed: 1985
> 2023-01-10 11:55:21,826 [info] Outlier removal function removed successfully 1654


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...5a7bf5aa,0,Jan 10 11:51:33,completed,outlier_removal,workflow=06d2577c6c9e4f1187507934348a7c0fkind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,dataitem,"remove_outliers_functions=[(, {'threshold': 3}), (, {'low_quantile': 0.01, 'high_quantile': 0.99, 'max_removal_percent_per_column': 0.95}), (, {'contamination': 0.2}), (, {'contamination': 0.2}), (, {'contamination': 0.2})]remove_outlier=Truevotes_thresholds=3label_column=ClaimNbrandom_state=50",,outlier_removaloutlier_removal_test





> 2023-01-10 11:55:25,301 [info] run executed, status=completed
> 2023-01-10 11:55:25,682 [info] starting run auto-trainer-train uid=5155c91147ad41ad9c95d3a97973fa58 DB=http://mlrun-api:8080
> 2023-01-10 11:55:26,539 [info] Sample set not given, using the whole training set as the sample set
> 2023-01-10 11:55:26,643 [info] training 'motor'


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...7973fa58,0,Jan 10 11:55:25,completed,auto-trainer-train,workflow=06d2577c6c9e4f1187507934348a7c0fkind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,datasettest_set,model_class=xgboost.XGBRegressorlabel_columns=ClaimNbmodel_name=motor,mean_absolute_error=0.09739165967394417r2_score=0.034121066137656086root_mean_squared_error=0.23298860328114113mean_squared_error=0.05428368925889697,feature-importancetest_setmodel





> 2023-01-10 11:56:37,550 [info] run executed, status=completed


uid,start,state,name,parameters,results
...c9875fca,Jan 10 11:36:13,completed,get_data,dataset=motorpath=/home/jovyan/data/MLOps22/project/src/freMTPL2freq.csv,
...09b9a881,Jan 10 11:36:21,completed,feature-selection-feature_selection,"ignore_type_errors=Truestat_filters=['f_classif', 'f_regression', 'r_regression', 'mutual_info_regression']model_filters={'AdaBoostRegressor': 'AdaBoostRegressor', 'ExtraTreesRegressor': 'ExtraTreesRegressor', 'GradientBoostingRegressor': 'GradientBoostingRegressor', 'RandomForestRegressor': 'RandomForestRegressor', 'RandomTreesEmbedding': 'RandomTreesEmbedding'}label_column=ClaimNbk=7min_votes=3",
...5a7bf5aa,Jan 10 11:51:33,completed,outlier_removal,"remove_outliers_functions=[(, {'threshold': 3}), (, {'low_quantile': 0.01, 'high_quantile': 0.99, 'max_removal_percent_per_column': 0.95}), (, {'contamination': 0.2}), (, {'contamination': 0.2}), (, {'contamination': 0.2})]remove_outlier=Truevotes_thresholds=3label_column=ClaimNbrandom_state=50",
...7973fa58,Jan 10 11:55:25,completed,auto-trainer-train,model_class=xgboost.XGBRegressorlabel_columns=ClaimNbmodel_name=motor,mean_absolute_error=0.09739165967394417r2_score=0.034121066137656086root_mean_squared_error=0.23298860328114113mean_squared_error=0.05428368925889697


> 2023-01-10 11:56:37,581 [info] started run workflow mlops-jovyan-trainer_baseline with run id = '06d2577c6c9e4f1187507934348a7c0f' by local engine
