# Use TorchServe to deploy model on Vertex AI

Inspired by https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/community-content/pytorch_text_classification_using_vertex_sdk_and_gcloud/pytorch-text-classification-vertex-ai-train-tune-deploy.ipynb 

In [22]:
%env GOOGLE_APPLICATION_CREDENTIALS ./keys/huggingface-ml-e974975230cc.json

env: GOOGLE_APPLICATION_CREDENTIALS=./keys/huggingface-ml-e974975230cc.json


In [4]:
PROJECT_ID = "huggingface-ml"

In [5]:
import base64
import json
import os
import random
import sys

import google.auth
from google.cloud import aiplatform
from google.cloud.aiplatform import gapic as aip
from google.cloud.aiplatform import hyperparameter_tuning as hpt
from google.protobuf.json_format import MessageToDict

In [6]:
!pip -q install --upgrade google-cloud-aiplatform
!pip -q install transfomers
!pip -q install 'optimum[onnxruntime]'

You should consider upgrading via the '/Users/florentgbelidji/.pyenv/versions/3.9.10/envs/venv_hf_3.9.10/bin/python3.9 -m pip install --upgrade pip' command.[0m[33m
[0m

### Save model locally

In [129]:
from transformers import AutoTokenizer, AutoModel

model_name = "sentence-transformers/msmarco-distilbert-base-tas-b"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


pt_save_directory = "./predictor/model/"

tokenizer.save_pretrained(pt_save_directory)
model.save_pretrained(pt_save_directory)

### Apply optimum optimizations

In [92]:
from pathlib import Path
from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig
from optimum.pipelines import pipeline


pt_save_directory_optimum = "./predictor/optimum/"

save_path = Path("optimum_model")
save_path.mkdir(exist_ok=True)

#use ORTOptimizer to export the model and define quantization configuration
optimizer = ORTOptimizer(model=model, tokenizer=tokenizer)
optimization_config = OptimizationConfig(optimization_level=2)


# apply the optimization configuration to the model
optimizer.export(
    onnx_model_path=save_path / "model.onnx",
    onnx_optimized_model_output_path=save_path / "model-optimized.onnx",
    optimization_config=optimization_config,
)

optimizer.model.config.save_pretrained(save_path) # saves config.json 

model = ORTModelForFeatureExtraction.from_pretrained(save_path, file_name="model-optimized.onnx")

tokenizer.save_pretrained(pt_save_directory_optimum)
model.save_pretrained(pt_save_directory_optimum)

ModuleNotFoundError: No module named 'onnxruntime'

### Create handlers

In [7]:
!mkdir ./predictor

In [93]:
%%writefile predictor/custom_handler.py

import os
import json
import logging

import torch
from transformers import AutoModel, AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig
from optimum.pipelines import pipeline

from ts.torch_handler.base_handler import BaseHandler

logger = logging.getLogger(__name__)
torch.set_num_threads(1)

class SentenceTransformersHandler(BaseHandler):
    """
    The handler takes an input string and returns the classification text 
    based on the serialized transformers checkpoint.
    """
    def __init__(self):
        super(SentenceTransformersHandler, self).__init__()
        self.initialized = False

    def initialize(self, ctx):
        """ Loads the model.pt file and initialized the model object.
        Instantiates Tokenizer for preprocessor to use
        Loads labels to name mapping file for post-processing inference response
        """
        self.manifest = ctx.manifest

        properties = ctx.system_properties
        model_dir = properties.get("model_dir")
        #self.device = torch.device("cuda:" + str(properties.get("gpu_id")) if torch.cuda.is_available() else "cpu")

        # Read model serialize/pt file
        serialized_file = self.manifest["model"]["serializedFile"]
        model_pt_path = os.path.join(model_dir, serialized_file)
        if not os.path.isfile(model_pt_path):
            raise RuntimeError("Missing the model.pt or pytorch_model.bin file")
        
        # Load model
        self.model = ORTModelForFeatureExtraction.from_pretrained(model_dir)
        logger.debug('Transformer model from path {0} loaded successfully'.format(model_dir))
        
        # Ensure to use the same tokenizer used during training
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, model_max_length=128)
        self.pipeline = pipeline("feature-extraction", model=self.model, tokenizer=self.tokenizer)

        self.initialized = True

    def preprocess(self, data):
        """ Preprocessing input request by tokenizing
            Extend with your own preprocessing steps as needed
        """
        text = data[0].get("data")
        if text is None:
            text = data[0].get("body")
        sentences = text.decode('utf-8')
        logger.info("Received text: '%s'", sentences)

        # Tokenize the texts
      #  tokenizer_args = ((sentences,))
      #  inputs = self.tokenizer(*tokenizer_args,
      #                          padding='max_length',
      #                          max_length=128,
      #                          truncation=True,
      #                          return_tensors = "pt")
        return sentences

    def inference(self, sentences):
        """ Predict the class of a text using a trained transformer model.
        """
        
        def cls_pooling(pipeline_output):
            return [_h[0] for _h in pipeline_output]
        
        embeddings = cls_pooling(self.pipeline(sentences))

        logger.info(f"Model embedded: {len(embeddings)}" )
        return embeddings

    def postprocess(self, inference_output):
        return inference_output

Overwriting predictor/custom_handler.py


### Write Dockerfile

In [94]:
APP_NAME = "test_sbert_embedder_optimum"

In [95]:
%%bash -s $APP_NAME

APP_NAME=$1

cat << EOF > ./predictor/Dockerfile

FROM pytorch/torchserve:latest-cpu

# install dependencies
RUN python3 -m pip install --upgrade pip
RUN pip3 install transformers
RUN pip3 install 'optimum[onnxruntime]'

USER model-server

# copy model artifacts, custom handler and other dependencies
COPY custom_handler.py /home/model-server/
COPY ./optimum/ / /home/model-server/

# create torchserve configuration file
USER root
RUN printf "\nservice_envelope=json" >> /home/model-server/config.properties
RUN printf "\ninference_address=http://0.0.0.0:7080" >> /home/model-server/config.properties
RUN printf "\nmanagement_address=http://0.0.0.0:7081" >> /home/model-server/config.properties
RUN printf "\nmanagement_address=http://0.0.0.0:7081" >> /home/model-server/config.properties
RUN printf "\nworkers=4" >> /home/model-server/config.properties

# expose health and prediction listener ports from the image
EXPOSE 7080
EXPOSE 7081

# create model archive file packaging model artifacts and dependencies
RUN torch-model-archiver -f \
  --model-name=$APP_NAME \
  --version=1.0 \
  --serialized-file=/home/model-server/model.onnx \
  --handler=/home/model-server/custom_handler.py \
  --extra-files "/home/model-server/config.json,/home/model-server/tokenizer.json,/home/model-server/tokenizer_config.json,/home/model-server/special_tokens_map.json,/home/model-server/vocab.txt" \
  --export-path=/home/model-server/model-store

# run Torchserve HTTP serve to respond to prediction requests
CMD ["torchserve", \
     "--start", \
     "--ts-config=/home/model-server/config.properties", \
     "--models", \
     "$APP_NAME=$APP_NAME.mar", \
     "--model-store", \
     "/home/model-server/model-store"]
EOF

echo "Writing ./predictor/Dockerfile"

Writing ./predictor/Dockerfile


In [96]:
CUSTOM_PREDICTOR_IMAGE_URI = f"gcr.io/{PROJECT_ID}/pytorch_predict_{APP_NAME}"
print(f"CUSTOM_PREDICTOR_IMAGE_URI = {CUSTOM_PREDICTOR_IMAGE_URI}")

CUSTOM_PREDICTOR_IMAGE_URI = gcr.io/huggingface-ml/pytorch_predict_test_sbert_embedder_optimum


### Build container

In [146]:
#!docker build \
#  --tag=$CUSTOM_PREDICTOR_IMAGE_URI \
#  ./predictor

### Run container locally

In [108]:
!docker stop local_sbert_embedder_optimum
!docker run -t -d --rm -p 7080:7080 --name=local_sbert_embedder_optimum $CUSTOM_PREDICTOR_IMAGE_URI
!sleep 20

local_sbert_embedder_optimum
380ad2454bb71d0e6c40da1302bd057724bb4b7160f008de4f51a8d89d958682


### Test API locally

In [109]:
!curl http://localhost:7080/ping

{
  "status": "Healthy"
}


In [110]:
%%bash -s $APP_NAME

APP_NAME=$1

cat > ./predictor/instances.json <<END
{ 
   "instances": [
     { 
       "data": {
         "b64": "$(echo 'I am so happy to be at Deauville today' | base64 --wrap=0)"
       }
     }
   ]
}
END

curl -s -X POST \
  -H "Content-Type: application/json; charset=utf-8" \
  -d @./predictor/instances.json \
  http://localhost:7080/predictions/$APP_NAME/

base64: unrecognized option `--wrap=0'
Usage:	base64 [-hvDd] [-b num] [-i in_file] [-o out_file]
  -h, --help     display this message
  -Dd, --decode   decodes input
  -b, --break    break encoded string into num character lines
  -i, --input    input file (default: "-" for stdin)
  -o, --output   output file (default: "-" for stdout)


{"predictions": [[0.15652182698249817, -0.007943082600831985, -0.02663888782262802, -0.08307340741157532, 0.10856898128986359, 0.1659667044878006, 0.509436845779419, 0.18165743350982666, -0.1608804315328598, -0.32550927996635437, -0.3147802948951721, 0.11809796839952469, -0.2053718864917755, 0.16539841890335083, 0.0431254580616951, 0.14275845885276794, 0.33132243156433105, 0.05910756438970566, -0.07154326885938644, -0.34179458022117615, -0.3615402281284332, -0.18151073157787323, -0.015651991590857506, -0.2512113153934479, -0.17122207581996918, -0.2773536145687103, -0.3305656909942627, -0.5285634994506836, -0.038842517882585526, 0.0860275998711586, -0.12963564693927765, -0.016775203868746758, 0.10492599010467529, 0.06866931915283203, -0.058284010738134384, 0.131710484623909, 0.013576209545135498, -0.14154024422168732, -0.210587739944458, -0.05883493274450302, 0.15064772963523865, -0.32615822553634644, 0.11842171102762222, 0.17478305101394653, -0.2231474071741104, 0.02912208065390587, -1

In [111]:
CUSTOM_PREDICTOR_IMAGE_URI

'gcr.io/huggingface-ml/pytorch_predict_test_sbert_embedder_optimum'

### Push to Container Registry

In [16]:
!docker push $CUSTOM_PREDICTOR_IMAGE_URI

Using default tag: latest
The push refers to repository [gcr.io/huggingface-ml/pytorch_predict_test_sbert_embedder]

[1Bc22e8bb9: Preparing 
[1Bb5b7f6ab: Preparing 
[1B4d59d004: Preparing 
[1Badc9f621: Preparing 
[1B0763950f: Preparing 
[1Bd9e635c4: Preparing 
[1B633ebd40: Preparing 
[1Bff3792de: Preparing 
[1Bbf18a086: Preparing 
[1Bda41ec4a: Preparing 
[1Bd007c81a: Preparing 
[1Bc83319d2: Preparing 
[1B005ec070: Preparing 
[1Bc8ae3daf: Preparing 
[1Be9bfffc1: Preparing 
[1B51f4d794: Preparing 
[12B9e635c4: Waiting g unauthorized: You don't have the needed permissions to perform this operation, and you may have invalid credentials. To authenticate your request, follow the steps in: https://cloud.google.com/container-registry/docs/advanced-authentication


In [112]:
BUCKET_NAME = "gs://florent-bucket"  # <---CHANGE THIS TO YOUR BUCKET
REGION = "us-central1"

In [167]:
#!gsutil mb -l $REGION $BUCKET_NAM

### Create model and endpoint to VertexAI

In [113]:
aiplatform.init(project=PROJECT_ID, staging_bucket=BUCKET_NAME)

In [114]:
VERSION = 1
model_display_name = f"{APP_NAME}-v{VERSION}"
model_description = "PyTorch based sentence transformers embedder with custom container"

MODEL_NAME = APP_NAME
health_route = "/ping"
predict_route = f"/predictions/{MODEL_NAME}"
serving_container_ports = [7080]

In [115]:
model = aiplatform.Model.upload(
    display_name=model_display_name,
    description=model_description,
    serving_container_image_uri=CUSTOM_PREDICTOR_IMAGE_URI,
    serving_container_predict_route=predict_route,
    serving_container_health_route=health_route,
    serving_container_ports=serving_container_ports,
)

model.wait()

print(model.display_name)
print(model.resource_name)

Creating Model
Create Model backing LRO: projects/1049843053967/locations/us-central1/models/5777542177823391744/operations/622373677819756544
Model created. Resource name: projects/1049843053967/locations/us-central1/models/5777542177823391744
To use this Model in another session:
model = aiplatform.Model('projects/1049843053967/locations/us-central1/models/5777542177823391744')
test_sbert_embedder_optimum-v1
projects/1049843053967/locations/us-central1/models/5777542177823391744


In [116]:
endpoint_display_name = f"{APP_NAME}-endpoint"
endpoint = aiplatform.Endpoint.create(display_name=endpoint_display_name)

Creating Endpoint
Create Endpoint backing LRO: projects/1049843053967/locations/us-central1/endpoints/8736868927889473536/operations/1707741188016046080
Endpoint created. Resource name: projects/1049843053967/locations/us-central1/endpoints/8736868927889473536
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/1049843053967/locations/us-central1/endpoints/8736868927889473536')


### Deploy endpoint

In [117]:
traffic_percentage = 100
machine_type = "n1-standard-8"
deployed_model_display_name = model_display_name
min_replica_count = 1
max_replica_count = 3
sync = True

model.deploy(
    endpoint=endpoint,
    deployed_model_display_name=deployed_model_display_name,
    machine_type=machine_type,
    traffic_percentage=traffic_percentage,
    min_replica_count=min_replica_count,
    max_replica_count=max_replica_count,
    sync=sync,
)

Deploying model to Endpoint : projects/1049843053967/locations/us-central1/endpoints/8736868927889473536
Deploy Endpoint model backing LRO: projects/1049843053967/locations/us-central1/endpoints/8736868927889473536/operations/3306519055732572160
Endpoint model deployed. Resource name: projects/1049843053967/locations/us-central1/endpoints/8736868927889473536


<google.cloud.aiplatform.models.Endpoint object at 0x156698ca0> 
resource name: projects/1049843053967/locations/us-central1/endpoints/8736868927889473536

In [118]:
endpoint_display_name = f"{APP_NAME}-endpoint"
filter = f'display_name="{endpoint_display_name}"'

for endpoint_info in aiplatform.Endpoint.list(filter=filter):
    print(
        f"Endpoint display name = {endpoint_info.display_name} resource id ={endpoint_info.resource_name} "
    )

endpoint = aiplatform.Endpoint(endpoint_info.resource_name)

Endpoint display name = test_sbert_embedder_optimum-endpoint resource id =projects/1049843053967/locations/us-central1/endpoints/8736868927889473536 


In [119]:
endpoint.list_models()

[id: "5204054504961474560"
 model: "projects/1049843053967/locations/us-central1/models/5777542177823391744"
 display_name: "test_sbert_embedder_optimum-v1"
 create_time {
   seconds: 1655828146
   nanos: 861195000
 }
 dedicated_resources {
   machine_spec {
     machine_type: "n1-standard-8"
   }
   min_replica_count: 1
   max_replica_count: 3
 }]

In [142]:
test_instances = [
    b"This is an example of model deployment using a sentence transformers model and optimum",
]*2

In [140]:
#test_instances

In [143]:
%%time
print("=" * 100)
for instance in test_instances:
    print(f"Input text: \n\t{instance.decode('utf-8')}\n")
    b64_encoded = base64.b64encode(instance)
    test_instance = [{"data": {"b64": f"{str(b64_encoded.decode('utf-8'))}"}}]
    print(f"Formatted input: \n{json.dumps(test_instance, indent=4)}\n")
    prediction = endpoint.predict(instances=test_instance)
    #print(f"Prediction response: \n\t{prediction}")
    print("=" * 100)

Input text: 
	This is an example of model deployment using a sentence transformers model and optimum

Formatted input: 
[
    {
        "data": {
            "b64": "VGhpcyBpcyBhbiBleGFtcGxlIG9mIG1vZGVsIGRlcGxveW1lbnQgdXNpbmcgYSBzZW50ZW5jZSB0cmFuc2Zvcm1lcnMgbW9kZWwgYW5kIG9wdGltdW0="
        }
    }
]

Input text: 
	This is an example of model deployment using a sentence transformers model and optimum

Formatted input: 
[
    {
        "data": {
            "b64": "VGhpcyBpcyBhbiBleGFtcGxlIG9mIG1vZGVsIGRlcGxveW1lbnQgdXNpbmcgYSBzZW50ZW5jZSB0cmFuc2Zvcm1lcnMgbW9kZWwgYW5kIG9wdGltdW0="
        }
    }
]

CPU times: user 59.9 ms, sys: 15.1 ms, total: 75 ms
Wall time: 851 ms


In [134]:
%%time
prediction = endpoint.predict(instances=test_instance)

CPU times: user 17.7 ms, sys: 4.16 ms, total: 21.8 ms
Wall time: 208 ms


In [131]:
len(tokenizer("This is an example of model deployment using a sentence transformers model and optimum")["input_ids"])

17

In [53]:
len(prediction.predictions[0])

768