In [1]:
import mlrun
project = mlrun.get_or_create_project(name='mlops', user_project=True, context='./')

> 2023-01-14 19:21:54,213 [info] Created and saved project mlops-jovyan: {'from_template': None, 'overwrite': False, 'context': './', 'save': True}
> 2023-01-14 19:21:54,216 [info] created project mlops and saved in MLRun DB


In [2]:
import os

# Setting get data function
get_data = mlrun.code_to_function(name='gen_dataset', kind='job', image='mlrun/mlrun', handler='get_data', filename='src/get_data.py')

# Setting feature-selection
feature_selection = mlrun.import_function('hub://feature_selection')

# Setting outlier removal function
outlier_removal = mlrun.code_to_function(name='outlier_removal', kind='job', image='mlrun/mlrun', handler='run', filename='src/outlier_removal.py')

# Setting dalex function
dalex = mlrun.code_to_function(name='dalex', kind='job', handler='run_dalex',filename='src/dalex.py')

# Setting training function
train = mlrun.code_to_function(name='train', kind='job', handler='train',filename='src/auto_trainer.py')


In [3]:
project.set_function(get_data)
project.set_function(feature_selection)
project.set_function(outlier_removal)
project.set_function(dalex)
project.set_function(train)

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f3a6d598460>

In [4]:
%%writefile src/trainer_baseline.py
import mlrun
from kfp import dsl
import sklearn
from src.outlier_removal import *

@dsl.pipeline(
    name="Automatic Pipeline",
    description="Train & Evaluate"
)
def kfpipeline(dataset: str='housing',
               path: str='/home/jovyan/data/src/housing.csv',
               label_column:str='MEDV',
               k: int=5,
               min_votes: float=3,
               remove_outlier:bool= False):
    
    project = mlrun.get_current_project()
    
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@ Getting the data @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    get_data_run = mlrun.run_function(name='get_data',
                                      function='gen-dataset',
                                      params={'dataset': dataset,
                                              'path': path},
                                      outputs=[dataset])
    
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ feature selection @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    feature_selection_run = mlrun.run_function("hub://feature_selection",
                                               params={'ignore_type_errors': True,
                                                       "stat_filters": ['f_classif'
                                                                        ,'f_regression'
                                                                        ,'r_regression'
                                                                        ,'mutual_info_regression'],
                                                       "model_filters": {'AdaBoostRegressor':'AdaBoostRegressor',
                                                                         'ExtraTreesRegressor':'ExtraTreesRegressor',
                                                                         'GradientBoostingRegressor':'GradientBoostingRegressor',
                                                                         'RandomForestRegressor':'RandomForestRegressor',
                                                                         'RandomTreesEmbedding':'RandomTreesEmbedding',
                                                       },
                                                       "label_column": label_column,
                                                       "k": k,
                                                       "min_votes": min_votes},
                                               inputs={'df_artifact': get_data_run.outputs[dataset]},
                                               outputs=['feature_scores', 'selected_features_count',
                                                        'selected_features'])
    
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ outlier detection @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # Setting outlier removal params
    votes_thresholds = 3
    pyod_contamination=0.2 # + (0,0.5)
    z_score_threshold=3
    iqr_low=0.01
    iqr_high=0.99
    iqr_max_removal_percent_per_column=0.95
    remove_outliers_functions = [(remove_outliers_z_score, {'threshold': z_score_threshold}),
                        (remove_outliers_iqr, {'low_quantile': iqr_low, 'high_quantile':iqr_high, 'max_removal_percent_per_column':iqr_max_removal_percent_per_column}),
                        (remove_outliers_LOF, {'contamination': pyod_contamination}),
                        (remove_outliers_ABOD, {'contamination': pyod_contamination}),
                           (remove_outliers_HBOS, {'contamination': pyod_contamination})
                            ]
    
    outlier_removal_run = mlrun.run_function(name='outlier_removal',
                                            function='outlier-removal',
                                            inputs={'dataitem': feature_selection_run.outputs['selected_features']},
                                            params={'remove_outliers_functions': remove_outliers_functions, 
                                                    'remove_outlier': remove_outlier,
                                                    'votes_thresholds': votes_thresholds,
                                                    'label_column': label_column,
                                                    'random_state': 50},
                                            outputs=['outlier_removal', 'outlier_removal_test'])
    
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ dalex @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    
    dalex = project.run_function(name='dalex',
                                 function='dalex',
                                 params={'df_train': outlier_removal_run.outputs['outlier_removal'],
                                         'df_test': outlier_removal_run.outputs['outlier_removal_test'],
                                         'target': label_column},
                                 outputs=['train_data', 'test_data', 'dalex_output'])
    
    params = {"model_class": "xgboost.XGBRegressor",
              "label_columns": label_column,
              "model_name": dataset + '_dalex'}
    try:
        for key,val in json.loads(mlrun.get_dataitem(dalex.outputs['dalex_output']).get()).items():
            params['sample_weight'] = val
    except:
        print('weights are deisabled')
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ Training the model @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@    
    # Train a model using the auto_trainer hub function
    train_run = mlrun.run_function(name= 'train',
                                   function='train',
                                   inputs={"dataset": dalex.outputs['train_data'], 
                                           "test_set": dalex.outputs['test_data']},
                                   params = params, 
                                   handler='train',
                                   outputs=["model"],
                               )

Overwriting src/trainer_baseline.py


In [5]:
# Register the workflow file:
workflow_name = "trainer_baseline"
project.set_workflow(workflow_name, "src/trainer_baseline.py")

# Save the project:
project.save()

<mlrun.projects.project.MlrunProject at 0x7f3a6d598160>

In [6]:
run = project.run(name=workflow_name,watch=False,local=True, overwrite=True,
                  arguments={'remove_outlier': True,
                             'path': '/home/jovyan/data/MLOps22/project/src/housing.csv',
                             'k': 7,
                             "min_votes":3})



> 2023-01-14 19:21:56,161 [info] starting run get_data uid=6b58767ed20d4d0aa4b2a65847db3bef DB=http://mlrun-api:8080


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...47db3bef,0,Jan 14 19:21:56,completed,get_data,workflow=14375580fda1457f9970753b9b8b4847kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,,dataset=housingpath=/home/jovyan/data/MLOps22/project/src/housing.csv,,housing





> 2023-01-14 19:21:56,903 [info] run executed, status=completed
> 2023-01-14 19:21:57,249 [info] starting run feature-selection-feature_selection uid=99a34b8bafca40818d07f508b269c34a DB=http://mlrun-api:8080
> 2023-01-14 19:22:04,393 [info] votes needed to be selected: 3


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...b269c34a,0,Jan 14 19:21:57,completed,feature-selection-feature_selection,workflow=14375580fda1457f9970753b9b8b4847kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,df_artifact,"ignore_type_errors=Truestat_filters=['f_classif', 'f_regression', 'r_regression', 'mutual_info_regression']model_filters={'AdaBoostRegressor': 'AdaBoostRegressor', 'ExtraTreesRegressor': 'ExtraTreesRegressor', 'GradientBoostingRegressor': 'GradientBoostingRegressor', 'RandomForestRegressor': 'RandomForestRegressor', 'RandomTreesEmbedding': 'RandomTreesEmbedding'}label_column=MEDVk=7min_votes=3",,f_classiff_regressionr_regressionmutual_info_regressionAdaBoostRegressorExtraTreesRegressorGradientBoostingRegressorRandomForestRegressorRandomTreesEmbeddingfeature_scoresmax_scaled_scores_feature_scoresselected_features_countselected_features





> 2023-01-14 19:22:04,548 [info] run executed, status=completed
> 2023-01-14 19:22:04,554 [info] starting run outlier_removal uid=ede4ecaf9c3040c5a75ff83ad3b37106 DB=http://mlrun-api:8080
Removed: 24
> 2023-01-14 19:22:07,435 [info] Outlier removal function removed successfully 16


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...d3b37106,0,Jan 14 19:22:04,completed,outlier_removal,workflow=14375580fda1457f9970753b9b8b4847kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,dataitem,"remove_outliers_functions=[(, {'threshold': 3}), (, {'low_quantile': 0.01, 'high_quantile': 0.99, 'max_removal_percent_per_column': 0.95}), (, {'contamination': 0.2}), (, {'contamination': 0.2}), (, {'contamination': 0.2})]remove_outlier=Truevotes_thresholds=3label_column=MEDVrandom_state=50",,outlier_removaloutlier_removal_test





> 2023-01-14 19:22:07,701 [info] run executed, status=completed
> 2023-01-14 19:22:07,705 [info] starting run dalex uid=655e498963234cb08ba3adabcde2c905 DB=http://mlrun-api:8080
dataframe shape before dalex : (388, 9)
Preparation of a new explainer is initiated

  -> data              : 388 rows 8 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 388 values
  -> model_class       : xgboost.sklearn.XGBRegressor (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_default at 0x7f39b28519d0> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 5.04, mean = 22.8, max = 50.0
  -> model type        : regression will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.134, mean = 0.000121,

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...cde2c905,0,Jan 14 19:22:07,completed,dalex,workflow=14375580fda1457f9970753b9b8b4847kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,,df_train=store://artifacts/mlops-jovyan/outlier_removal_outlier_removal:14375580fda1457f9970753b9b8b4847df_test=store://artifacts/mlops-jovyan/outlier_removal_outlier_removal_test:14375580fda1457f9970753b9b8b4847target=MEDV,,dalex_outputtrain_datatest_data





> 2023-01-14 19:22:18,702 [info] run executed, status=completed
weights are deisabled
> 2023-01-14 19:22:18,706 [info] starting run train uid=11f3b93cc8934772ae5e821291506aec DB=http://mlrun-api:8080
> 2023-01-14 19:22:19,030 [info] Sample set not given, using the whole training set as the sample set
> 2023-01-14 19:22:19,133 [info] training 'housing_dalex'


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...91506aec,0,Jan 14 19:22:18,completed,train,workflow=14375580fda1457f9970753b9b8b4847kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,datasettest_set,model_class=xgboost.XGBRegressorlabel_columns=MEDVmodel_name=housing_dalex,mean_absolute_error=2.2533614177329864r2_score=0.8462224970882577root_mean_squared_error=3.4387234454168936mean_squared_error=11.82481893405983,feature-importancetest_setmodel





> 2023-01-14 19:22:20,191 [info] run executed, status=completed


uid,start,state,name,parameters,results
...47db3bef,Jan 14 19:21:56,completed,get_data,dataset=housingpath=/home/jovyan/data/MLOps22/project/src/housing.csv,
...b269c34a,Jan 14 19:21:57,completed,feature-selection-feature_selection,"ignore_type_errors=Truestat_filters=['f_classif', 'f_regression', 'r_regression', 'mutual_info_regression']model_filters={'AdaBoostRegressor': 'AdaBoostRegressor', 'ExtraTreesRegressor': 'ExtraTreesRegressor', 'GradientBoostingRegressor': 'GradientBoostingRegressor', 'RandomForestRegressor': 'RandomForestRegressor', 'RandomTreesEmbedding': 'RandomTreesEmbedding'}label_column=MEDVk=7min_votes=3",
...d3b37106,Jan 14 19:22:04,completed,outlier_removal,"remove_outliers_functions=[(, {'threshold': 3}), (, {'low_quantile': 0.01, 'high_quantile': 0.99, 'max_removal_percent_per_column': 0.95}), (, {'contamination': 0.2}), (, {'contamination': 0.2}), (, {'contamination': 0.2})]remove_outlier=Truevotes_thresholds=3label_column=MEDVrandom_state=50",
...cde2c905,Jan 14 19:22:07,completed,dalex,df_train=store://artifacts/mlops-jovyan/outlier_removal_outlier_removal:14375580fda1457f9970753b9b8b4847df_test=store://artifacts/mlops-jovyan/outlier_removal_outlier_removal_test:14375580fda1457f9970753b9b8b4847target=MEDV,
...91506aec,Jan 14 19:22:18,completed,train,model_class=xgboost.XGBRegressorlabel_columns=MEDVmodel_name=housing_dalex,mean_absolute_error=2.2533614177329864r2_score=0.8462224970882577root_mean_squared_error=3.4387234454168936mean_squared_error=11.82481893405983


> 2023-01-14 19:22:20,219 [info] started run workflow mlops-jovyan-trainer_baseline with run id = '14375580fda1457f9970753b9b8b4847' by local engine


In [None]:
run = project.run(name=workflow_name,watch=False,local=True, overwrite=True,
                  arguments={'dataset': 'motor',
                             'path': '/home/jovyan/data/MLOps22/project/src/freMTPL2freq.csv',
                             'label_column': 'ClaimNb',
                             'remove_outlier': True,
                             'k': 7, 
                             "min_votes":3})

> 2023-01-14 19:23:46,566 [info] starting run get_data uid=34fa0edb939e4fb0bcbedc8b7f8311f4 DB=http://mlrun-api:8080


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...7f8311f4,0,Jan 14 19:23:46,completed,get_data,workflow=a69a6d250db0451f9cf81f3cbed66461kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,,dataset=motorpath=/home/jovyan/data/MLOps22/project/src/freMTPL2freq.csv,,motor





> 2023-01-14 19:23:55,531 [info] run executed, status=completed
> 2023-01-14 19:23:55,994 [info] starting run feature-selection-feature_selection uid=2dd8b50460694011b9ed71498615ac07 DB=http://mlrun-api:8080
