# MLOps class final project 

### Creating mlrun project

In [1]:
import mlrun
project = mlrun.get_or_create_project(name='mlops', user_project=True, context='./')

> 2023-01-14 22:46:07,261 [info] loaded project mlops from MLRun DB


### Setting up functions

In [2]:
import os

# Setting get data function
get_data = mlrun.code_to_function(name='gen_dataset', kind='job', image='mlrun/mlrun', handler='get_data', filename='src/get_data.py')

# Setting feature-selection
feature_selection = mlrun.import_function('hub://feature_selection')

# Setting outlier removal function
outlier_removal = mlrun.code_to_function(name='outlier_removal', kind='job', image='mlrun/mlrun', handler='run', filename='src/outlier_removal.py')

# Setting dalex function
dalex = mlrun.code_to_function(name='dalex', kind='job', handler='run_dalex',filename='src/dalex.py')

# Setting training function
train = mlrun.code_to_function(name='train', kind='job', handler='train',filename='src/auto_trainer.py')


In [3]:
project.set_function(get_data)
project.set_function(feature_selection)
project.set_function(outlier_removal)
project.set_function(dalex)
project.set_function(train)

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f26af345cd0>

### Creating the kubeflow-pipeline

In [4]:
%%writefile src/trainer_baseline.py
import mlrun
from kfp import dsl
import sklearn
from src.outlier_removal import *

@dsl.pipeline(
    name="Automatic Pipeline",
    description="Train & Evaluate"
)
def kfpipeline(dataset: str='housing',
               path: str='/home/jovyan/data/src/housing.csv',
               label_column:str='MEDV',
               k: int=5,
               min_votes: float=3,
               remove_outlier:bool= False):
    
    project = mlrun.get_current_project()
    
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@ Getting the data @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    get_data_run = mlrun.run_function(name='get_data',
                                      function='gen-dataset',
                                      params={'dataset': dataset,
                                              'path': path},
                                      outputs=[dataset])
    
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ feature selection @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    feature_selection_run = mlrun.run_function("hub://feature_selection",
                                               params={'ignore_type_errors': True,
                                                       "stat_filters": ['f_classif'
                                                                        ,'f_regression'
                                                                        ,'r_regression'
                                                                        ,'mutual_info_regression'],
                                                       "model_filters": {#'AdaBoostRegressor':'AdaBoostRegressor',
#                                                                          'ExtraTreesRegressor':'ExtraTreesRegressor',
#                                                                          'GradientBoostingRegressor':'GradientBoostingRegressor',
#                                                                          'RandomForestRegressor':'RandomForestRegressor',
#                                                                          'RandomTreesEmbedding':'RandomTreesEmbedding',
                                                       },
                                                       "label_column": label_column,
                                                       "k": k,
                                                       "min_votes": min_votes},
                                               inputs={'df_artifact': get_data_run.outputs[dataset]},
                                               outputs=['feature_scores', 'selected_features_count',
                                                        'selected_features'])
    
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ outlier detection @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # Setting outlier removal params
    votes_thresholds = 3
    pyod_contamination=0.2 # + (0,0.5)
    z_score_threshold=3
    iqr_low=0.01
    iqr_high=0.99
    iqr_max_removal_percent_per_column=0.95
    remove_outliers_functions = [(remove_outliers_z_score, {'threshold': z_score_threshold}),
                        (remove_outliers_iqr, {'low_quantile': iqr_low, 'high_quantile':iqr_high, 'max_removal_percent_per_column':iqr_max_removal_percent_per_column}),
                        (remove_outliers_LOF, {'contamination': pyod_contamination}),
                        (remove_outliers_ABOD, {'contamination': pyod_contamination}),
                           (remove_outliers_HBOS, {'contamination': pyod_contamination})
                            ]
    
    outlier_removal_run = mlrun.run_function(name='outlier_removal',
                                            function='outlier-removal',
                                            inputs={'dataitem': feature_selection_run.outputs['selected_features']},
                                            params={'remove_outliers_functions': remove_outliers_functions, 
                                                    'remove_outlier': remove_outlier,
                                                    'votes_thresholds': votes_thresholds,
                                                    'label_column': label_column,
                                                    'random_state': 10},
                                            outputs=['outlier_removal', 'outlier_removal_test'])
    
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ dalex @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    
    dalex = project.run_function(name='dalex',
                                 function='dalex',
                                 params={'df_train': outlier_removal_run.outputs['outlier_removal'],
                                         'df_test': outlier_removal_run.outputs['outlier_removal_test'],
                                         'target': label_column},
                                 outputs=['train_data', 'test_data', 'dalex_output'])
    
    params = {"model_class": "xgboost.XGBRegressor",
              "label_columns": label_column,
              "model_name": dataset + '_final'}
    try:
        for key,val in json.loads(mlrun.get_dataitem(dalex.outputs['dalex_output']).get()).items():
            params['sample_weight'] = val
    except:
        print('weights are deisabled')
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ Training the model @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
    # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@    
    # Train a model using the auto_trainer hub function
    train_run = mlrun.run_function(name= 'train',
                                   function='train',
                                   inputs={"dataset": dalex.outputs['train_data'], 
                                           "test_set": dalex.outputs['test_data']},
                                   params = params, 
                                   handler='train',
                                   outputs=["model"],
                               )

Overwriting src/trainer_baseline.py


### Running the pipeline with mlrun project

In [5]:
# Register the workflow file:
workflow_name = "trainer_baseline"
project.set_workflow(workflow_name, "src/trainer_baseline.py")

# Save the project:
project.save()

<mlrun.projects.project.MlrunProject at 0x7f26af345d90>

In [6]:
run = project.run(name=workflow_name,watch=False,local=True, overwrite=True,
                  arguments={'remove_outlier': True,
                             'path': '/home/jovyan/data/MLOps22/project/src/housing.csv',
                             'k': 9,
                             "min_votes":2})



> 2023-01-14 22:46:09,758 [info] starting run get_data uid=46ce34caef764a5b8e40595a1346c36e DB=http://mlrun-api:8080


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...1346c36e,0,Jan 14 22:46:09,completed,get_data,workflow=1eaf14904d9243d0b06a5374db464103kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,,dataset=housingpath=/home/jovyan/data/MLOps22/project/src/housing.csv,,housing





> 2023-01-14 22:46:10,341 [info] run executed, status=completed
> 2023-01-14 22:46:10,620 [info] starting run feature-selection-feature_selection uid=72880d27dccf445b873e09802cc44608 DB=http://mlrun-api:8080
> 2023-01-14 22:46:13,119 [info] votes needed to be selected: 2


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...2cc44608,0,Jan 14 22:46:10,completed,feature-selection-feature_selection,workflow=1eaf14904d9243d0b06a5374db464103kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,df_artifact,"ignore_type_errors=Truestat_filters=['f_classif', 'f_regression', 'r_regression', 'mutual_info_regression']model_filters={}label_column=MEDVk=9min_votes=2",,f_classiff_regressionr_regressionmutual_info_regressionfeature_scoresmax_scaled_scores_feature_scoresselected_features_countselected_features





> 2023-01-14 22:46:13,306 [info] run executed, status=completed
> 2023-01-14 22:46:13,311 [info] starting run outlier_removal uid=20ad385164d34b3782a507ab56f550a3 DB=http://mlrun-api:8080
Removed: 28
> 2023-01-14 22:46:16,640 [info] Outlier removal function removed successfully 19


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...56f550a3,0,Jan 14 22:46:13,completed,outlier_removal,workflow=1eaf14904d9243d0b06a5374db464103kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,dataitem,"remove_outliers_functions=[(, {'threshold': 3}), (, {'low_quantile': 0.01, 'high_quantile': 0.99, 'max_removal_percent_per_column': 0.95}), (, {'contamination': 0.2}), (, {'contamination': 0.2}), (, {'contamination': 0.2})]remove_outlier=Truevotes_thresholds=3label_column=MEDVrandom_state=10",,outlier_removaloutlier_removal_test





> 2023-01-14 22:46:16,995 [info] run executed, status=completed
> 2023-01-14 22:46:17,001 [info] starting run dalex uid=e750c450f7c24f699dc2a581b434d898 DB=http://mlrun-api:8080
dataframe shape before dalex : (385, 12)
Preparation of a new explainer is initiated

  -> data              : 385 rows 11 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 385 values
  -> model_class       : xgboost.sklearn.XGBRegressor (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_default at 0x7f264001d1f0> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 5.0, mean = 22.1, max = 50.0
  -> model type        : regression will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.0552, mean = -1.21e-

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...b434d898,0,Jan 14 22:46:17,completed,dalex,workflow=1eaf14904d9243d0b06a5374db464103kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,,df_train=store://artifacts/mlops-jovyan/outlier_removal_outlier_removal:1eaf14904d9243d0b06a5374db464103df_test=store://artifacts/mlops-jovyan/outlier_removal_outlier_removal_test:1eaf14904d9243d0b06a5374db464103target=MEDV,,dalex_outputtrain_datatest_data





> 2023-01-14 22:48:44,313 [info] run executed, status=completed
weights are deisabled
> 2023-01-14 22:48:44,319 [info] starting run train uid=bf32d43683e243adbd8fb861fff4fb55 DB=http://mlrun-api:8080
> 2023-01-14 22:48:44,517 [info] Sample set not given, using the whole training set as the sample set
> 2023-01-14 22:48:44,566 [info] training 'housing_final'


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...fff4fb55,0,Jan 14 22:48:44,completed,train,workflow=1eaf14904d9243d0b06a5374db464103kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,datasettest_set,model_class=xgboost.XGBRegressorlabel_columns=MEDVmodel_name=housing_final,mean_absolute_error=2.693832419900333r2_score=0.8728103330350091root_mean_squared_error=3.64713447394233mean_squared_error=13.301589871018596,feature-importancetest_setmodel





> 2023-01-14 22:48:45,938 [info] run executed, status=completed


uid,start,state,name,parameters,results
...1346c36e,Jan 14 22:46:09,completed,get_data,dataset=housingpath=/home/jovyan/data/MLOps22/project/src/housing.csv,
...2cc44608,Jan 14 22:46:10,completed,feature-selection-feature_selection,"ignore_type_errors=Truestat_filters=['f_classif', 'f_regression', 'r_regression', 'mutual_info_regression']model_filters={}label_column=MEDVk=9min_votes=2",
...56f550a3,Jan 14 22:46:13,completed,outlier_removal,"remove_outliers_functions=[(, {'threshold': 3}), (, {'low_quantile': 0.01, 'high_quantile': 0.99, 'max_removal_percent_per_column': 0.95}), (, {'contamination': 0.2}), (, {'contamination': 0.2}), (, {'contamination': 0.2})]remove_outlier=Truevotes_thresholds=3label_column=MEDVrandom_state=10",
...b434d898,Jan 14 22:46:17,completed,dalex,df_train=store://artifacts/mlops-jovyan/outlier_removal_outlier_removal:1eaf14904d9243d0b06a5374db464103df_test=store://artifacts/mlops-jovyan/outlier_removal_outlier_removal_test:1eaf14904d9243d0b06a5374db464103target=MEDV,
...fff4fb55,Jan 14 22:48:44,completed,train,model_class=xgboost.XGBRegressorlabel_columns=MEDVmodel_name=housing_final,mean_absolute_error=2.693832419900333r2_score=0.8728103330350091root_mean_squared_error=3.64713447394233mean_squared_error=13.301589871018596


> 2023-01-14 22:48:45,968 [info] started run workflow mlops-jovyan-trainer_baseline with run id = '1eaf14904d9243d0b06a5374db464103' by local engine


In [None]:
run = project.run(name=workflow_name,watch=False,local=True, overwrite=True,
                  arguments={'dataset': 'motor',
                             'path': '/home/jovyan/data/MLOps22/project/src/freMTPL2freq.csv',
                             'label_column': 'ClaimNb',
                             'remove_outlier': True,
                             'k': 7, 
                             "min_votes":2})

> 2023-01-14 22:48:46,034 [info] starting run get_data uid=8be4c8c404074ebc9184276b21bd4d54 DB=http://mlrun-api:8080


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
mlops-jovyan,...21bd4d54,0,Jan 14 22:48:46,completed,get_data,workflow=c5cddc693b4d448ab406c4908b190d54kind=owner=jovyanhost=mlrun-jupyter-5cd9c659c-2dpxf,,dataset=motorpath=/home/jovyan/data/MLOps22/project/src/freMTPL2freq.csv,,motor





> 2023-01-14 22:48:55,499 [info] run executed, status=completed
> 2023-01-14 22:48:55,912 [info] starting run feature-selection-feature_selection uid=98d558c1876d40aa920bb742b680f6dc DB=http://mlrun-api:8080
