In [5]:
import kfp
import kfp.components as comp
from kfp.components import InputPath, OutputPath
import kfp.dsl as dsl
from kfp.aws import use_aws_secret
from typing import NamedTuple
from itertools import product


In [6]:
# In v1.1.0, a notebook running inside the cluster cannot yet talk to Kubeflow Pipelines directly.
# As a workaround, the user has to pass a session cookie to the KFP client, as described here:
# https://www.kubeflow.org/docs/aws/pipeline/#authenticate-kubeflow-pipeline-using-sdk-inside-cluster

#authservice_session='authservice_session=<cookie>'

# Change this to your own namespace.
namespace = 'fabiano-alencar'

client = kfp.Client()
# The result is discarded; presumably this only checks that the client can reach KFP — confirm.
client.list_experiments(namespace=namespace)

# Passed to the pipeline as its `data_path` argument.
DATA_PATH = '/mnt'

## Component: Load Raw Data

In [23]:
def process_data(data_path):
    """Print the image files found directly under ``<data_path>/data/``.

    Globs that folder once per supported extension (png, jpg, gif), collects
    the matches in extension order and prints the resulting list. Nothing is
    returned.

    Args:
        data_path: Path prefix; ``"/data/"`` is appended to it to build the
            folder that is searched.
    """
    # Imports are local because this function is wrapped with
    # comp.func_to_container_op, which expects a self-contained function.
    import glob
    import subprocess

    def install():
        """Install system packages through apt-get (currently never called)."""
        # NOTE(review): presumably the shared libraries cv2 needs on the slim image — confirm.
        subprocess.call(['apt-get', 'update'])
        for package in ('ffmpeg', 'libxext6', 'libsm6',
                        'libfontconfig1', 'libxrender1', 'libgl1-mesa-glx'):
            subprocess.call(['apt-get', 'install', package, '-y'])

    #install()

    #import cv2

    image_extensions = ('png', 'jpg', 'gif')    # Add image formats here
    search_prefix = data_path + "/data/"

    files = []
    for extension in image_extensions:
        files += glob.glob(search_prefix + '*.' + extension)
    print(files)
    #images = [cv2.imread(file) for file in files]
    #print(images)

In [24]:
# Wrap process_data as a container-based pipeline component.
# Only one OpenCV wheel is installed: opencv-python and opencv-contrib-python
# both provide the `cv2` module and must not share an environment, and the
# contrib build is the superset. `glob2` was dropped because process_data
# imports the standard-library `glob` module, not `glob2`.
process_opencv_op = comp.func_to_container_op(
    process_data,
    base_image='python:3.7-slim',
    packages_to_install=['opencv-contrib-python'],
)

## Creating a Pipeline

In [25]:
from kubernetes.client.models import V1EnvVar


@dsl.pipeline(
    name='Training pipeline',
    description='Pipeline that lists the image files stored on the mounted dataset volume.'
)
def training_pipeline(data_path):
    """Run the single 'Process Images' step with the dataset volume mounted.

    Args:
        data_path: Mount path for the dataset volume inside the step's
            container; also forwarded to the step as its ``data_path`` input.
    """
    # Proxy settings injected into the step's container as environment variables.
    http_proxy = V1EnvVar(name='http_proxy', value='http://10.190.24.159:3128')
    https_proxy = V1EnvVar(name='https_proxy', value='http://10.190.24.159:3128')
    no_proxy = V1EnvVar(name='no_proxy', value='mlflow.mlflow,minio-service.kubeflow')

    # Volume handle built from the name "example-dataset".
    # NOTE(review): presumably an existing PVC in the target namespace — confirm.
    dataset_name = "example-dataset"
    kubeflow_pvc = dsl.PipelineVolume(dataset_name)

    # The task handle is not used further in this pipeline.
    process_opencv_task = (
        process_opencv_op(data_path)
        .set_display_name('Process Images')
        .add_env_variable(http_proxy)
        .add_env_variable(https_proxy)
        .add_env_variable(no_proxy)
        .add_pvolumes({data_path: kubeflow_pvc})
    )
    
    

## Creating a Pipeline Run

In [26]:

# Experiment the run is filed under, and the values for the pipeline's parameters.
experiment_name = 'minio_test'
arguments = {"data_path": DATA_PATH}

# Submit a pipeline run
client.create_run_from_pipeline_func(
    training_pipeline,
    arguments=arguments,
    experiment_name=experiment_name,
    namespace=namespace,
)

RunPipelineResult(run_id=60e1b6ca-9c8d-44ef-8154-194fc5ba30bc)

## Uploading the Pipeline to be reusable by others

In [48]:
# Compile the pipeline function into a workflow package written to 'workflow.yaml'.
kfp.compiler.Compiler().compile(
    pipeline_func=training_pipeline,
    package_path='workflow.yaml',
)


#client.upload_pipeline(pipeline_package_path='workflow.yaml',
#                             pipeline_name='Electric Power Consumption Forecasting Training Pipeline.')