# Model serving using KServe

In the previous labs you learned how to train a model and export it to storage. Now we will use this model for simple online predictions.

In [2]:
!pip install kserve

Collecting kserve
  Downloading kserve-0.11.1-py3-none-any.whl (344 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m344.3/344.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting ray[serve]<2.5.0,>=2.4.0
  Downloading ray-2.4.0-cp38-cp38-manylinux2014_x86_64.whl (58.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.6/58.6 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting uvicorn[standard]<0.20.0,>=0.19.0
  Downloading uvicorn-0.19.0-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.6/56.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tritonclient<3.0.0,>=2.18.0
  Downloading tritonclient-2.39.0-py3-none-manylinux1_x86_64.whl (12.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting prometheus-client<0.14.0,>=0.13.1
  Downloading prome

In [3]:
## import dependencies
from kubernetes import client 
from kserve import KServeClient
from kserve import constants
from kserve import utils
from kserve import V1beta1InferenceService
from kserve import V1beta1InferenceServiceSpec
from kserve import V1beta1PredictorSpec
from kserve import V1beta1SKLearnSpec
from kserve import V1beta1XGBoostSpec


## 4.1 Deploy the model

### 4.1.1 Define model serving Metadata

In [25]:
# Namespace targeted by KServe, as resolved by the kserve utils helper.
namespace = utils.get_default_target_namespace()

# Model name. Service names are subject to a size limitation,
# so consider using just a trigram.
name = "xux"

# Bucket holding the exported model -- fill in your own: firstname-lastname
bucket = ''

# Path to the model used to launch the service.
storage_uri = f"s3://{bucket}/models/frompipeline/xgboost/chicago"

### 4.1.2 Create a secret and a service account so KServe can use the model in MinIO

Check the `./resources/s3_secret.yaml` file, which contains two resource definitions and a link to MinIO.

In [20]:

# Apply the secret and service account manifests that let KServe read the model from MinIO.
!kubectl apply -f ./resources/s3_secret.yaml

secret/kserve-minio-secret configured
serviceaccount/kserve-minio-sa unchanged


### 4.1.3 Create the inference service specification using

- The model name and namespace
- The storage URI
- The Protocol version
- The service account created in ./resources/s3_secret.yaml
- An "image pull secret" reference so kserve can pull images from container registry (predictor and others...)


In [26]:
# Predictor backend: XGBoost model read from the storage URI, protocol version "v2".
xgboost_spec = V1beta1XGBoostSpec(
    storage_uri=storage_uri,
    protocol_version="v2",
)

# Predictor settings: the service account created in ./resources/s3_secret.yaml
# and the secret reference used to pull images from the container registry.
predictor_spec = V1beta1PredictorSpec(
    xgboost=xgboost_spec,
    service_account_name='kserve-minio-sa',
    image_pull_secrets=[{'name':'registry-secret'}],
)

# Complete InferenceService specification, identified by model name and namespace.
chicago_isvc = V1beta1InferenceService(
    api_version="serving.kserve.io/v1beta1",
    kind=constants.KSERVE_KIND,
    metadata=client.V1ObjectMeta(name=name, namespace=namespace),
    spec=V1beta1InferenceServiceSpec(predictor=predictor_spec),
)

### 4.1.4 Launch this service

In [27]:
# Client object reused by the following cells for every KServe API call.
KServe = KServeClient()
# Submit the InferenceService built above; the returned resource is shown as the cell output.
# NOTE(review): re-running this cell once the service exists presumably fails with a conflict -- confirm.
KServe.create(chicago_isvc)

{'apiVersion': 'serving.kserve.io/v1beta1',
 'kind': 'InferenceService',
 'metadata': {'creationTimestamp': '2023-11-13T10:53:10Z',
  'generation': 1,
  'labels': {'modelClass': 'mlserver_xgboost.XGBoostModel'},
  'managedFields': [{'apiVersion': 'serving.kserve.io/v1beta1',
    'fieldsType': 'FieldsV1',
    'fieldsV1': {'f:spec': {'.': {},
      'f:predictor': {'.': {},
       'f:imagePullSecrets': {},
       'f:serviceAccountName': {},
       'f:xgboost': {'.': {},
        'f:name': {},
        'f:protocolVersion': {},
        'f:storageUri': {}}}}},
    'manager': 'OpenAPI-Generator',
    'operation': 'Update',
    'time': '2023-11-13T10:53:08Z'}],
  'name': 'xux',
  'namespace': 'kubeflow-user-guillaume-etevenard',
  'resourceVersion': '5818287096',
  'uid': 'f3233655-b528-4b78-af6b-0b4db8cbdedd'},
 'spec': {'predictor': {'imagePullSecrets': [{'name': 'registry-secret'}],
   'model': {'env': [{'name': 'MLSERVER_MODEL_NAME', 'value': 'xux'},
     {'name': 'MLSERVER_MODEL_URI', 'valu

### 4.1.5 Get the inference service status over time

In [28]:
# Watch the service status (NAME / READY / PREV / LATEST / URL table) with a 120-second timeout.
KServe.get(name, namespace=namespace, watch=True, timeout_seconds=120)

NAME    READY    PREV    LATEST    URL
xux     Unknown
NAME    READY    PREV    LATEST    URL
xux     Unknown


In [29]:
import requests

### Using the KServe API, get the URL of the service
isvc_resp = KServe.get(name, namespace=namespace)

# The status may not carry an address yet (for example while the service is
# still starting); report that clearly instead of failing with a bare KeyError.
isvc_address = (isvc_resp.get('status') or {}).get('address') or {}
if 'url' not in isvc_address:
    raise RuntimeError(
        f"InferenceService '{name}' in namespace '{namespace}' exposes no address URL yet; "
        "wait until it reports READY = True, then re-run this cell."
    )
isvc_url = isvc_address['url']

In [30]:
### Print the URL to see how to query the model
isvc_url

'http://xux.kubeflow-user-guillaume-etevenard.svc.cluster.local/v2/models/xux/infer'

In [31]:
### Test the API: try a GET on the models/<model name> route.
### The timeout keeps the cell from hanging forever if the service is unreachable.
requests.get(
    f'http://{name}.{namespace}.svc.cluster.local/v2/models/{name}',
    timeout=30,
).text

'{"name":"xux","versions":[],"platform":"","inputs":[],"outputs":[]}'

### 4.1.6 Consider the model deployed if:

- `KServe.get(..., namespace=..., watch=True, timeout_seconds=120)` reports READY = True
- The interface shows all green checks

![serving_summary](./images/serving_summary.png)

![serving_details](./images/serving_details.png)

## 4.2 Query the model

### 4.2.1 Get sample data from the original dataset

In [32]:
# Two sample rows of 10 feature values each, taken from the original dataset.
data_list = [
    [746.0, 3.34, 6.0, 41.9442266, -87.65599818, 16.0, 11.75, 0.0, 0.0, 11.75],
    [681.0, 3.0, 8.0, 41.89960211, -87.63330804, 24.0, 11.25, 0.0, 0.0, 11.25],
]

### 4.2.2 Create the object for query 

In [33]:

# Request body for the v2 inference route: a single input tensor carrying every row.
# The shape is computed from data_list ([rows, values per row]) so that it stays
# consistent when samples are added, removed or resized; an empty list gives [0, 0].
inference_input = {
  "inputs": [
    {
      "name": "input-0",
      "shape": [len(data_list), len(data_list[0]) if data_list else 0],
      "datatype": "FP32",
      "data": data_list
    }
  ]
}

### 4.2.3 Query the model

In [34]:
### The timeout keeps the cell from blocking indefinitely if the service does not answer.
response = requests.post(isvc_url, json=inference_input, timeout=30)
### You should get a JSON object ending with "data":[prediction1,prediction2]
print(response.text)

{"model_name":"xux","model_version":null,"id":"620b594a-56ae-47bf-a8df-ab460f1134c9","parameters":null,"outputs":[{"name":"predict","shape":[2],"datatype":"FP32","parameters":null,"data":[-0.04879039525985718,-0.018634533509612083]}]}


## Go further

We have seen how to create an inference service from this notebook. To go further, we could create it from a pipeline component.

In [None]:
# `components` was used without being imported, which raises NameError on a fresh kernel.
from kfp import components

### This is the KServe component, prebuilt and available on GitHub.
### NOTE: the URL points at the master branch, so the component definition may change over time.
kserve_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/master/components/kserve/component.yaml')


In [None]:
# `kfp` and `dsl` were used without being imported, which raises NameError on a fresh kernel.
import kfp
from kfp import dsl
from kubernetes import client as k8s_client

# Pipeline-level configuration: reference to the secret used to pull images.
pipeline_conf = kfp.dsl.PipelineConf()
pipeline_conf.set_image_pull_secrets([k8s_client.V1ObjectReference(name='registry-secret')])

@dsl.pipeline(
  name='KServe pipeline',
  description='A pipeline for creating inference service from s3 model.'
)
def kservePipeline():
  """Single-step pipeline that calls the KServe component with the 'apply' action."""
  action = 'apply'
  model_name = 'xux'
  model_uri = 's3://firstname-lastname/models/frompipeline/xgboost/chicago' #ATTENTION: change the bucket name
  namespace = 'kserve'
  framework = 'xgboost'

  # The step is created by calling the component; the image pull policy is set on the resulting task.
  kserve_task = kserve_op(
      action=action,
      model_name=model_name,
      model_uri=model_uri,
      namespace=namespace,
      framework=framework,
      ### how to add s3 SA ?
      ### how to set protocol version ?
  ).set_image_pull_policy('Always')


In [None]:
# `datetime` was used without being imported, which raises NameError on a fresh kernel.
from datetime import datetime

# Timestamped run identifier of the form kserve_pipeline_YYYYMMDD_HHMMSS.
run_id = f'kserve_pipeline_{datetime.now().strftime("%Y%m%d_%H%M%S")}'