This notebook creates nuclio serverless function for train data preprocessing. 
The datapreprocessing functions are defined in widsutil.py
The encoded data is saved as a dataset artifact

In [1]:
import mlrun

from os import path
project_name = "widsdb2"
project_dir = "conf"

widsdb2_proj = mlrun.new_project(project_name,
                            context=project_dir,
                            init_git=True,
                            user_project=False)
#wids_prj, artifact_path = mlrun.set_environment('http://mlrun-api:8080', 
#                                                project=project_name, user_project=False)


#widsdb2_proj = mlrun.projects.load_project(project_dir,  clone=True)



In [2]:
# nuclio: start-code

In [3]:

import os

import sys
sys.path.append('/v3io/projects/widsdb2/util')


import json
import pandas as pd
import numpy as np
from collections import defaultdict
import widsutil as util
from cloudpickle import dumps, dump, load


from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem


def trdata_prep(
    context:MLClientCtx, 
    src: DataItem,
    file_ext: str = "csv",
    train_enc: str = "train_enc"
):
    """process a raw icu data file
    
    * `encoded` is input for a cross validation and training function
    
    widsutil module has various transform functions
    steps (not necessarily in correct order, some parallel)
    * column name maps
    * deal with nans and other types of missings/junk
    * label encode binary and ordinal category columns
    * create category ranges from numerical columns
    And finally,
    * test
    
   
    TODO: 
        * parallelize where possible
        * more abstraction (more parameters, chain sklearn transformers)
        * convert to marketplace function
        
    :param context:          the function execution context
    :param src:              an artifact or file path
    :param file_ext:         file type for artifacts
    :param train_enc:        key of encoded data table in artifact store
    """
    df = src.as_df()
    
    datapreprocess =util.DataPreprocess(df)
    df = datapreprocess.preprocess()

    #df = df.fillna(0)
    df.fillna(df.mean(), inplace=True)
    #np.where(df.values >= np.finfo(np.float32).max)

    context.log_dataset(train_enc, df=df, format=file_ext, index=False)



In [4]:
# nuclio: end-code

In [5]:
train_data_prep_func = mlrun.code_to_function(name='prep',
                                         handler='trdata_prep',
                                         kind='job',
                                         image='mlrun/ml-models')

In [6]:
train_data_prep_func.save()

'db://default/prep'

In [7]:
widsdb2_proj.set_function(train_data_prep_func)

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f8da4972590>

In [8]:
widsdb2_proj.save()

In [9]:
#widsdb2_proj.functions

[{'name': 'prep',
  'spec': {'kind': 'job',
   'metadata': {'name': 'prep', 'tag': '', 'project': 'widsdb2'},
   'spec': {'command': '',
    'args': [],
    'image': 'mlrun/ml-models',
    'env': [],
    'default_handler': 'trdata_prep',
    'entry_points': {'trdata_prep': {'name': 'trdata_prep',
      'doc': 'process a raw icu data file\n\n* `encoded` is input for a cross validation and training function\n\nwidsutil module has various transform functions\nsteps (not necessarily in correct order, some parallel)\n* column name maps\n* deal with nans and other types of missings/junk\n* label encode binary and ordinal category columns\n* create category ranges from numerical columns\nAnd finally,\n* test\n\n\nTODO: \n    * parallelize where possible\n    * more abstraction (more parameters, chain sklearn transformers)\n    * convert to marketplace function',
      'parameters': [{'name': 'context',
        'type': 'MLClientCtx',
        'doc': 'the function execution context',
        'de

In [10]:
import mlrun
import pandas as pd
from mlrun.run import get_dataitem


train_df = pd.read_csv('TrainingWiDS2021.csv', index_col=[0])
print(train_df.shape)
widsdb2_proj.log_dataset(key='raw_train_data', df=train_df, index=False, format='csv')


(130157, 180)


<mlrun.artifacts.dataset.DatasetArtifact at 0x7f8da79bf690>

In [11]:
from mlrun.platforms import auto_mount
import sys
sys.path.append('/v3io/projects/widsdb2/util')

#Run the nuclio function on cluster
train_data_prep_func = train_data_prep_func.apply(auto_mount())


train_data_prep_run = train_data_prep_func.run( 
                            inputs={"src"   :'store://raw_train_data', 'train-enc': 'train_enc' })



> 2021-07-01 12:30:40,659 [info] starting run prep-trdata_prep uid=8a0d771ab71f430b81135407e19a7316 DB=http://mlrun-api:8080
> 2021-07-01 12:30:41,113 [info] Job is running in the background, pod: prep-trdata-prep-bpbsn
Percent of Nans in Train Data : 61.6
Categorical columns: ['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']
Number of unique Profiles : 6903
len lab_col 128
len lab_col_names 32
lab_col_names
 ['heartrate', 'creatinine', 'mbp_noninvasive', 'lactate', 'resprate', 'temp', 'hematocrit', 'mbp', 'diasbp_invasive', 'hco3', 'hemaglobin', 'diasbp_noninvasive', 'sodium', 'bun', 'potassium', 'glucose', 'diasbp', 'mbp_invasive', 'arterial_ph', 'sysbp', 'inr', 'arterial_po2', 'bilirubin', 'wbc', 'calcium', 'albumin', 'platelets', 'spo2', 'sysbp_invasive', 'arterial_pco2', 'sysbp_noninvasive', 'pao2fio2ratio']

> 2021-07-01 12:31:53,113 [info] run executed, status=completed
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
widsdb2,...e19a7316,0,Jul 01 12:30:46,completed,prep-trdata_prep,v3io_user=aruna_lankakind=jobowner=aruna_lankahost=prep-trdata-prep-bpbsn,srctrain-enc,,,train_enc


to track results use .show() or .logs() or in CLI: 
!mlrun get run 8a0d771ab71f430b81135407e19a7316 --project widsdb2 , !mlrun logs 8a0d771ab71f430b81135407e19a7316 --project widsdb2
> 2021-07-01 12:32:01,098 [info] run executed, status=completed


In [12]:
#Run the nuclio function locally
train_data_prep_run = train_data_prep_func.run( 
                            inputs={"src"   :'store://raw_train_data', 'train-enc': 'train_enc' }, local=True)
                            # local=True, artifact_path=artifact_path)
 

> 2021-07-01 12:32:01,112 [info] starting run prep-trdata_prep uid=895a7e70a68c47468a572a4b9f6b01db DB=http://mlrun-api:8080
Percent of Nans in Train Data : 61.6
Categorical columns: ['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']
Number of unique Profiles : 6903
len lab_col 128
len lab_col_names 32
lab_col_names
 ['lactate', 'inr', 'bun', 'glucose', 'albumin', 'arterial_po2', 'hco3', 'mbp_noninvasive', 'bilirubin', 'resprate', 'spo2', 'temp', 'platelets', 'sodium', 'diasbp', 'sysbp_noninvasive', 'mbp_invasive', 'potassium', 'diasbp_invasive', 'creatinine', 'hemaglobin', 'sysbp', 'wbc', 'arterial_ph', 'hematocrit', 'arterial_pco2', 'sysbp_invasive', 'heartrate', 'pao2fio2ratio', 'mbp', 'diasbp_noninvasive', 'calcium']



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
widsdb2,...9f6b01db,0,Jul 01 12:32:01,completed,prep-trdata_prep,v3io_user=aruna_lankakind=owner=aruna_lankahost=jupyter-svc-68f96fcc4c-wjtq2,srctrain-enc,,,train_enc


to track results use .show() or .logs() or in CLI: 
!mlrun get run 895a7e70a68c47468a572a4b9f6b01db --project widsdb2 , !mlrun logs 895a7e70a68c47468a572a4b9f6b01db --project widsdb2
> 2021-07-01 12:33:05,131 [info] run executed, status=completed


In [13]:
#save the project with function added 
widsdb2_proj.save()

In [14]:
#Get the preprocessed dataframe which is output of nuclio function run above
dfa = mlrun.get_dataitem(train_data_prep_run.outputs['train_enc']).as_df()

In [15]:
dfa

Unnamed: 0,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_id,icu_stay_type,...,d1_resprate_div_sysbp_min,d1_lactate_min_div_diasbp_min,d1_heartrate_min_div_d1_sysbp_min,d1_hco3_div,d1_resprate_times_resprate,left_average_spo2,total_chronic,total_cancer_immuno,has_complicator,apache_3j
0,68.0,22.732803,0,2,1,180.3,4,1,92,0,...,0.136986,0.027027,0.986301,1.266667,340.0,91.333333,0,0,0,7
1,77.0,27.421875,0,2,0,160.0,4,1,90,0,...,0.179104,0.112903,1.074627,1.038462,384.0,90.000000,0,0,0,6
2,25.0,31.952749,0,2,0,172.7,3,0,93,0,...,0.076190,0.055834,0.647619,1.088020,168.0,95.666667,0,0,0,8
3,81.0,22.635548,1,2,0,165.1,8,2,92,0,...,0.083333,0.055834,1.095238,1.088020,161.0,98.333333,0,0,0,4
4,19.0,29.054816,0,2,1,188.0,15,0,91,0,...,0.133333,0.055834,0.500000,1.088020,288.0,98.666667,0,0,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130152,50.0,29.287256,0,2,1,175.3,3,0,1109,0,...,0.089552,0.055834,0.664179,1.088020,480.0,94.000000,0,0,0,0
130153,79.0,29.653433,0,2,0,162.6,2,0,1106,0,...,0.183486,0.055834,0.715596,1.000000,740.0,95.333333,0,0,0,6
130154,73.0,32.265371,0,0,1,177.8,3,0,1104,0,...,0.097561,0.163333,0.471545,1.368421,408.0,92.333333,0,0,0,0
130155,81.0,24.408579,0,2,1,185.4,3,0,1108,0,...,0.114286,0.029545,1.000000,1.000000,248.0,99.000000,0,0,0,6


In [16]:
train_data_prep_run.outputs['train_enc']

'store://artifacts/widsdb2/prep-trdata_prep_train_enc:895a7e70a68c47468a572a4b9f6b01db'

In [17]:
data = f'store://widsdb2/prep-trdata_prep_train_enc'
dataset = mlrun.run.get_dataitem(data).as_df()
dataset.head()

Unnamed: 0,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_id,icu_stay_type,...,d1_resprate_div_sysbp_min,d1_lactate_min_div_diasbp_min,d1_heartrate_min_div_d1_sysbp_min,d1_hco3_div,d1_resprate_times_resprate,left_average_spo2,total_chronic,total_cancer_immuno,has_complicator,apache_3j
0,68.0,22.732803,0,2,1,180.3,4,1,92,0,...,0.136986,0.027027,0.986301,1.266667,340.0,91.333333,0,0,0,7
1,77.0,27.421875,0,2,0,160.0,4,1,90,0,...,0.179104,0.112903,1.074627,1.038462,384.0,90.0,0,0,0,6
2,25.0,31.952749,0,2,0,172.7,3,0,93,0,...,0.07619,0.055834,0.647619,1.08802,168.0,95.666667,0,0,0,8
3,81.0,22.635548,1,2,0,165.1,8,2,92,0,...,0.083333,0.055834,1.095238,1.08802,161.0,98.333333,0,0,0,4
4,19.0,29.054816,0,2,1,188.0,15,0,91,0,...,0.133333,0.055834,0.5,1.08802,288.0,98.666667,0,0,0,8


In [18]:
#dataset.isnull().any() == True
dataset.columns[dataset.isnull().any()]


Index([], dtype='object')

In [21]:
from mlrun.mlutils.data import get_sample

raw, labels, header = get_sample(dataset, sample=-1, label='diabetes_mellitus')
len(raw)  

130156