### Set up Docker registry credentials in a Kubernetes ConfigMap

In [1]:
import base64
import json
import os
import subprocess

DOCKER_REGISTRY = 'index.docker.io/dejangolubovic'

# Single source of truth for the credentials file name. The file that is
# written and the file handed to kubectl must be the same one, otherwise the
# ConfigMap is built from whatever stale file happens to be on disk (or
# kubectl fails because the file does not exist).
DOCKER_CONFIG_FILE = 'docker-config.json'

# Registry credentials are taken from the environment so that real secrets do
# not have to be typed into (and saved with) the notebook. The fallbacks are
# placeholders only and must be overridden for an actual registry login.
docker_username = os.environ.get('DOCKER_USERNAME', 'username')
docker_password = os.environ.get('DOCKER_PASSWORD', 'password')

# `auth` is the base64 encoding of "<username>:<password>", i.e. the result of
# `echo -n 'username:password' | base64`, not the shell command itself.
docker_auth = base64.b64encode(
    f'{docker_username}:{docker_password}'.encode('utf-8')
).decode('ascii')

docker_config = {
    "auths": {
        "https://index.docker.io/v1/": {
            "username": docker_username,
            "auth": docker_auth,
        }
    },
    "HttpHeaders": {
        "User-Agent": "Docker-Client/19.03.12 (linux)"
    }
}

with open(DOCKER_CONFIG_FILE, 'w') as f:
    json.dump(docker_config, f)

# Publish the credentials file as the `docker-config` ConfigMap.
# check_output raises CalledProcessError if kubectl exits with a non-zero status.
docker_config_output = subprocess.check_output(
    ["kubectl", "create", "configmap", "docker-config",
     f"--from-file={DOCKER_CONFIG_FILE}"])
print(docker_config_output.decode('utf-8'))

configmap/docker-config created



### Create a Persistent Volume Claim (PVC) to store the model

In [2]:
# Create (or update) the PVC described in fairing-pvc.yaml.
pvc_output = subprocess.check_output(["kubectl", "apply", "-f", "fairing-pvc.yaml"])
print(pvc_output.decode('utf-8'))


def _pvc_field(jsonpath):
    """Return one field of the `fairing-pvc` claim, selected by a JSONPath template.

    The template is passed without shell-style quotes: subprocess does not go
    through a shell, so a quote character in the argument would be treated as
    literal template text and echoed back in the output.

    Raises subprocess.CalledProcessError if kubectl exits with a non-zero status.
    """
    raw = subprocess.check_output(
        ["kubectl", "get", "pvc", "fairing-pvc", f"-o=jsonpath={jsonpath}"])
    return raw.decode("utf-8").strip()


pvc_name = _pvc_field("{.metadata.name}")
pv_name = _pvc_field("{.spec.volumeName}")
print('pvc_name:', pvc_name)
print('pv_name:', pv_name)

persistentvolumeclaim/fairing-pvc unchanged

pvc_name: fairing-pvc
pv_name: pvc-8cdb4205-5768-42e3-a439-72c5abfc30ff


### Set up model training using Kubeflow Fairing and TFJob

In [3]:
# TFJob topology: replica counts for the chief, parameter-server and worker roles.
num_chief, num_ps, num_workers = 1, 1, 2

# Paths as seen from inside the training containers; the export directory
# lives underneath the model directory.
model_dir = "/mnt"
export_path = "/mnt/export"

# Hyperparameters are kept as strings because they are concatenated directly
# into the training script's command-line flags.
train_steps, batch_size, learning_rate = "1000", "100", "0.01"

In [4]:
import uuid
from kubeflow import fairing   
from kubeflow.fairing.kubernetes.utils import mounting_pvc
from kubeflow.fairing.builders.cluster.minio_context import MinioContextSource
import kfp

# A short random suffix gives every run its own TFJob name.
tfjob_name = f'mnist-training-{uuid.uuid4().hex[:4]}'
tfjob_namespace = kfp.Client().get_user_namespace()
print(tfjob_name)
print(tfjob_namespace)

# Files handed to the preprocessor as its output_map.
# NOTE(review): presumably "local path -> path inside the build context" — confirm.
output_map = {
    "Dockerfile": "Dockerfile",
    "mnist.py": "mnist.py"
}

# Command line executed in the training containers; every tunable from the
# parameter cell is forwarded as a --tf-* flag.
command = ["python",
           "/opt/mnist.py",
           "--tf-model-dir=" + model_dir,
           "--tf-export-dir=" + export_path,
           "--tf-train-steps=" + train_steps,
           "--tf-batch-size=" + batch_size,
           "--tf-learning-rate=" + learning_rate]

# MinIO credentials can be overridden through the environment so that real
# secrets need not be stored in the notebook; the defaults are the values that
# were previously hard-coded here.
minio_context_source = MinioContextSource(
    endpoint_url='http://minio-service.kubeflow:9000/',
    minio_secret=os.environ.get('MINIO_ACCESS_KEY', 'minio'),
    minio_secret_key=os.environ.get('MINIO_SECRET_KEY', 'minio123'),
    region_name='region')


def _pvc_mutators():
    """Return a fresh pod-spec mutator list that mounts the model PVC at `model_dir`.

    Used for both the builder and the deployer so the mount definition exists
    in exactly one place.
    NOTE(review): `mounting_pvc` logs a deprecation warning that points at
    `volume_mounts`; migrate once the replacement's signature is confirmed.
    """
    return [mounting_pvc(pvc_name=pvc_name, pvc_mount_path=model_dir)]


fairing.config.set_preprocessor('python',
                                command=command,
                                path_prefix="/app",
                                output_map=output_map)

fairing.config.set_builder(
    name='cluster',
    registry=DOCKER_REGISTRY,
    context_source=minio_context_source,
    cleanup=True,
    pod_spec_mutators=_pvc_mutators()
)

fairing.config.set_deployer(
    name='tfjob',
    namespace=tfjob_namespace,
    stream_log=False,
    job_name=tfjob_name,
    chief_count=num_chief,
    worker_count=num_workers,
    ps_count=num_ps,
    pod_spec_mutators=_pvc_mutators()
)

[W 210407 15:20:41 utils:51] The function mounting_pvc has been deprecated,                     please use `volume_mounts`
[W 210407 15:20:41 utils:51] The function mounting_pvc has been deprecated,                     please use `volume_mounts`


mnist-training-77ef
dejan-golubovic


### Start training job

In [5]:
# Run the pipeline configured via set_preprocessor / set_builder / set_deployer.
# As the last expression of the cell, the returned (preprocessor, builder,
# deployer) tuple is displayed as the cell output.
fairing.config.run()

[I 210407 15:20:45 config:134] Using preprocessor: <kubeflow.fairing.preprocessors.base.BasePreProcessor object at 0x7f25a0e531d0>
[I 210407 15:20:45 config:136] Using builder: <kubeflow.fairing.builders.cluster.cluster.ClusterBuilder object at 0x7f25a28e6048>
[I 210407 15:20:45 config:138] Using deployer: <kubeflow.fairing.deployers.tfjob.tfjob.TfJob object at 0x7f25a0273dd8>
[I 210407 15:20:46 cluster:46] Building image using cluster builder.
[I 210407 15:20:46 base:107] Creating docker context: /tmp/fairing_context_n_2cmk91
[W 210407 15:20:46 base:94] /tmp/fairing_dockerfile_cyv9tgof already exists in Fairing context, skipping...
[W 210407 15:20:46 manager:298] Waiting for fairing-builder-m5ght-lks4s to start...
[W 210407 15:20:46 manager:298] Waiting for fairing-builder-m5ght-lks4s to start...
[W 210407 15:20:46 manager:298] Waiting for fairing-builder-m5ght-lks4s to start...
[W 210407 15:21:12 manager:298] Waiting for fairing-builder-m5ght-lks4s to start...
[I 210407 15:21:15 mana

[36mINFO[0m[0001] Retrieving image manifest tensorflow/tensorflow:1.15.2-py3
[36mINFO[0m[0002] Retrieving image manifest tensorflow/tensorflow:1.15.2-py3
[36mINFO[0m[0003] Built cross stage deps: map[]
[36mINFO[0m[0003] Retrieving image manifest tensorflow/tensorflow:1.15.2-py3
[36mINFO[0m[0004] Retrieving image manifest tensorflow/tensorflow:1.15.2-py3
[36mINFO[0m[0005] Executing 0 build triggers
[36mINFO[0m[0005] Unpacking rootfs as cmd ADD mnist.py /opt/mnist.py requires it.
[36mINFO[0m[0029] Taking snapshot of full filesystem...
[36mINFO[0m[0032] Resolving 27181 paths
[36mINFO[0m[0037] Using files from context: [/kaniko/buildcontext/mnist.py]
[36mINFO[0m[0037] ADD mnist.py /opt/mnist.py
[36mINFO[0m[0037] RUN chmod +x /opt/mnist.py
[36mINFO[0m[0037] cmd: /bin/sh
[36mINFO[0m[0037] args: [-c chmod +x /opt/mnist.py]
[36mINFO[0m[0037] Running: [/bin/sh -c chmod +x /opt/mnist.py]
[36mINFO[0m[0037] ENTRYPOINT ["/usr/bin/python"]
[36mINFO[0m[0037] CMD ["/o

[W 210407 15:22:00 cluster:106] Cleaning up job fairing-builder-m5ght...
[W 210407 15:22:00 job:101] The tfjob mnist-training-77ef launched.


(<kubeflow.fairing.preprocessors.base.BasePreProcessor at 0x7f25a0e531d0>,
 <kubeflow.fairing.builders.cluster.cluster.ClusterBuilder at 0x7f25a28e6048>,
 <kubeflow.fairing.deployers.tfjob.tfjob.TfJob at 0x7f25a0273dd8>)

### Inspect the running training job

In [6]:
from kubeflow.tfjob import TFJobClient
# Client used by the following cells to watch, query and delete the TFJob.
tfjob_client = TFJobClient()

# Optional: uncomment to fetch the TFJob resource for inspection.
#tfjob_client.get(tfjob_name, namespace=tfjob_namespace)

In [7]:
# Wait for the TFJob; with watch=True its state transitions are printed as a
# NAME / STATE / TIME table while waiting.
tfjob_client.wait_for_job(tfjob_name, namespace=tfjob_namespace, watch=True)

NAME                           STATE                TIME                          
mnist-training-77ef            Created              2021-04-07T15:22:00Z          
mnist-training-77ef            Running              2021-04-07T15:22:27Z          
mnist-training-77ef            Running              2021-04-07T15:22:27Z          
mnist-training-77ef            Succeeded            2021-04-07T15:22:31Z          


In [8]:
# Sanity check before serving: the cell displays the boolean result of the
# success query for this TFJob.
tfjob_client.is_job_succeeded(tfjob_name, namespace=tfjob_namespace)

True

In [None]:
# Optional: uncomment to fetch the logs of the training job.
#tfjob_client.get_logs(tfjob_name, namespace=tfjob_namespace)

### Serve the trained model

In [9]:
from kubeflow.fairing.deployers.kfserving.kfserving import KFServing

# A random four-hex-character suffix gives each run its own InferenceService name.
isvc_suffix = uuid.uuid4().hex[:4]
isvc_name = 'dejan-mnist-service-' + isvc_suffix
print(isvc_name)

# Point the predictor at the 'export' directory on the training PVC.
model_storage_uri = f'pvc://{pvc_name}/export'

isvc = KFServing(
    'tensorflow',
    namespace=tfjob_namespace,
    isvc_name=isvc_name,
    default_storage_uri=model_storage_uri,
)

# Last expression of the cell: the value returned by deploy() is displayed.
isvc.deploy(isvc.generate_isvc())

dejan-mnist-service-197a
NAME                 READY      DEFAULT_TRAFFIC CANARY_TRAFFIC  URL                                               
dejan-mnist-servi... Unknown                                                                                      
dejan-mnist-servi... False                                                                                        
dejan-mnist-servi... False                                                                                        
dejan-mnist-servi... False                                                                                        
dejan-mnist-servi... False                                                                                        
dejan-mnist-servi... False                                                                                        
dejan-mnist-servi... True       100                             http://dejan-mnist-service-197a.dejan-golubovic...


[I 210407 15:23:09 kfserving:127] Deployed the InferenceService dejan-mnist-service-197a successfully.


'dejan-mnist-service-197a'

In [10]:
from kfserving import KFServingClient

kfserving_client = KFServingClient()

# Fetch the deployed InferenceService and pull out the fields used later on.
mnist_isvc = kfserving_client.get(isvc_name, namespace=tfjob_namespace)
mnist_isvc_name = mnist_isvc['metadata']['name']
# Fall back to an empty string when the status carries no 'url' entry.
mnist_isvc_endpoint = mnist_isvc['status'].get('url', '')

print(f"MNIST Service Endpoint: {mnist_isvc_endpoint}")

MNIST Service Endpoint: http://dejan-mnist-service-197a.dejan-golubovic.example.com/v1/models/dejan-mnist-service-197a


In [11]:
# Host header naming the predictor of the InferenceService, and the predict URL
# on the ml.cern.ch entry point.
# NOTE(review): presumably the gateway routes on the Host header — confirm.
MODEL_HOST = f"Host: {mnist_isvc_name}-predictor-default.{tfjob_namespace}.example.com"
MODEL_URL = f"http://ml.cern.ch/v1/models/{mnist_isvc_name}:predict"
print(MODEL_HOST)
print(MODEL_URL)

# POST the sample payload from ./input.json to the predict endpoint (verbose mode).
# NOTE(review): the recorded run was answered with "302 Found" redirecting to
# auth.cern.ch instead of a prediction, so the request likely needs an
# authenticated session — confirm and add the required credentials.
!curl -v -H "{MODEL_HOST}" {MODEL_URL} -d @./input.json

Host: dejan-mnist-service-197a-predictor-default.dejan-golubovic.example.com
http://ml.cern.ch/v1/models/dejan-mnist-service-197a:predict
*   Trying 137.138.6.116...
* TCP_NODELAY set
* Connected to ml.cern.ch (137.138.6.116) port 80 (#0)
> POST /v1/models/dejan-mnist-service-197a:predict HTTP/1.1
> Host: dejan-mnist-service-197a-predictor-default.dejan-golubovic.example.com
> User-Agent: curl/7.58.0
> Accept: */*
> Content-Length: 2052
> Content-Type: application/x-www-form-urlencoded
> Expect: 100-continue
> 
< HTTP/1.1 100 Continue
* We are completely uploaded and fine
< HTTP/1.1 302 Found
< location: https://auth.cern.ch/auth/realms/cern/protocol/openid-connect/auth?client_id=kubeflow-ml&redirect_uri=https%3A%2F%2Fml.cern.ch%2Flogin%2Foidc&response_type=code&scope=profile+openid&state=MTYxNzgwODk5OXxFd3dBRURCNlYyaHBiVkJxZUV4aE9GUkdjbVE9fEq71z_lkULW91mlFBdcMHoOrCWQ1l4-PJFv7U45S9eN
< date: Wed, 07 Apr 2021 15:23:19 GMT
< content-length: 0
< x-envoy-upstream-service-time: 19
< server:

### Delete training job and inference service

In [12]:
# Clean up: delete the TFJob created for this run; the API status response is
# displayed as the cell output.
tfjob_client.delete(tfjob_name, namespace=tfjob_namespace)

{'kind': 'Status',
 'apiVersion': 'v1',
 'metadata': {},
 'status': 'Success',
 'details': {'name': 'mnist-training-77ef',
  'group': 'kubeflow.org',
  'kind': 'tfjobs',
  'uid': '4a828543-920d-4c5c-a50e-947d4c06834e'}}

In [13]:
# Clean up: delete the InferenceService deployed for this run; the API status
# response is displayed as the cell output.
kfserving_client.delete(isvc_name, namespace=tfjob_namespace)

{'kind': 'Status',
 'apiVersion': 'v1',
 'metadata': {},
 'status': 'Success',
 'details': {'name': 'dejan-mnist-service-197a',
  'group': 'serving.kubeflow.org',
  'kind': 'inferenceservices',
  'uid': 'c4be3c20-77bd-41b1-a197-154fbdbce91c'}}