# Use TorchServe to deploy model on Vertex AI

Inspired by https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/community-content/pytorch_text_classification_using_vertex_sdk_and_gcloud/pytorch-text-classification-vertex-ai-train-tune-deploy.ipynb 

In [116]:
%env GOOGLE_APPLICATION_CREDENTIALS ./keys/huggingface-ml-e974975230cc.json

env: GOOGLE_APPLICATION_CREDENTIALS=./keys/huggingface-ml-e974975230cc.json


In [57]:
PROJECT_ID = "huggingface-ml"

In [3]:
import base64
import json
import os
import random
import sys

import google.auth
from google.cloud import aiplatform
from google.cloud.aiplatform import gapic as aip
from google.cloud.aiplatform import hyperparameter_tuning as hpt
from google.protobuf.json_format import MessageToDict

In [5]:
!pip -q install --upgrade google-cloud-aiplatform

You should consider upgrading via the '/Users/florentgbelidji/.pyenv/versions/3.9.10/envs/venv_hf_3.9.10/bin/python3.9 -m pip install --upgrade pip' command.[0m[33m
[0m

### Save model locally

In [97]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "florentgbelidji/all-mpnet-base-v2__tweet_eval_emotion__classifier"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


pt_save_directory = "./predictor/model/"

tokenizer.save_pretrained(pt_save_directory)
model.save_pretrained(pt_save_directory)

### Create handlers

In [7]:
!mkdir ./predictor

In [88]:
%%writefile predictor/custom_handler.py

import os
import json
import logging

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from ts.torch_handler.base_handler import BaseHandler

logger = logging.getLogger(__name__)


class TransformersClassifierHandler(BaseHandler):
    """
    The handler takes an input string and returns the classification text 
    based on the serialized transformers checkpoint.
    """
    def __init__(self):
        super(TransformersClassifierHandler, self).__init__()
        self.initialized = False

    def initialize(self, ctx):
        """ Loads the model.pt file and initialized the model object.
        Instantiates Tokenizer for preprocessor to use
        Loads labels to name mapping file for post-processing inference response
        """
        self.manifest = ctx.manifest

        properties = ctx.system_properties
        model_dir = properties.get("model_dir")
        self.device = torch.device("cuda:" + str(properties.get("gpu_id")) if torch.cuda.is_available() else "cpu")

        # Read model serialize/pt file
        serialized_file = self.manifest["model"]["serializedFile"]
        model_pt_path = os.path.join(model_dir, serialized_file)
        if not os.path.isfile(model_pt_path):
            raise RuntimeError("Missing the model.pt or pytorch_model.bin file")
        
        # Load model
        self.model = AutoModelForSequenceClassification.from_pretrained(model_dir)
        self.model.to(self.device)
        self.model.eval()
        logger.debug('Transformer model from path {0} loaded successfully'.format(model_dir))
        
        # Ensure to use the same tokenizer used during training
        self.tokenizer = AutoTokenizer.from_pretrained("florentgbelidji/all-mpnet-base-v2__tweet_eval_emotion__classifier")

        # Read the mapping file, index to object name
        mapping_file_path = os.path.join(model_dir, "index_to_name.json")

        if os.path.isfile(mapping_file_path):
            with open(mapping_file_path) as f:
                self.mapping = json.load(f)
        else:
            logger.warning('Missing the index_to_name.json file. Inference output will default.')
            self.mapping = {0: "anger",
                            1: "joy",
                            2: "optimism",
                            3: "sadness"
                           }

        self.initialized = True

    def preprocess(self, data):
        """ Preprocessing input request by tokenizing
            Extend with your own preprocessing steps as needed
        """
        text = data[0].get("data")
        if text is None:
            text = data[0].get("body")
        sentences = text.decode('utf-8')
        logger.info("Received text: '%s'", sentences)

        # Tokenize the texts
        tokenizer_args = ((sentences,))
        inputs = self.tokenizer(*tokenizer_args,
                                padding='max_length',
                                max_length=128,
                                truncation=True,
                                return_tensors = "pt")
        return inputs

    def inference(self, inputs):
        """ Predict the class of a text using a trained transformer model.
        """
        prediction = self.model(inputs['input_ids'].to(self.device))[0].argmax().item()

        if self.mapping:
            prediction = self.mapping[str(prediction)]

        logger.info("Model predicted: '%s'", prediction)
        return [prediction]

    def postprocess(self, inference_output):
        return inference_output

Overwriting predictor/custom_handler.py


In [89]:
%%writefile ./predictor/index_to_name.json
{
    "0": "anger",
    "1": "joy",
    "2": "optimism",
    "3": "sadness"
}

Overwriting ./predictor/index_to_name.json


### Write Dockerfile

In [90]:
APP_NAME = "test_setfit_classifier"

In [91]:
%%bash -s $APP_NAME

APP_NAME=$1

cat << EOF > ./predictor/Dockerfile

FROM pytorch/torchserve:latest-cpu

# install dependencies
RUN python3 -m pip install --upgrade pip
RUN pip3 install transformers

USER model-server

# copy model artifacts, custom handler and other dependencies
COPY custom_handler.py /home/model-server/
COPY index_to_name.json /home/model-server/
COPY ./model/ / /home/model-server/

# create torchserve configuration file
USER root
RUN printf "\nservice_envelope=json" >> /home/model-server/config.properties
RUN printf "\ninference_address=http://0.0.0.0:7080" >> /home/model-server/config.properties
RUN printf "\nmanagement_address=http://0.0.0.0:7081" >> /home/model-server/config.properties
USER model-server

# expose health and prediction listener ports from the image
EXPOSE 7080
EXPOSE 7081

# create model archive file packaging model artifacts and dependencies
RUN torch-model-archiver -f \
  --model-name=$APP_NAME \
  --version=1.0 \
  --serialized-file=/home/model-server/pytorch_model.bin \
  --handler=/home/model-server/custom_handler.py \
  --extra-files "/home/model-server/config.json,/home/model-server/tokenizer.json,/home/model-server/tokenizer_config.json,/home/model-server/special_tokens_map.json,/home/model-server/vocab.txt,/home/model-server/index_to_name.json" \
  --export-path=/home/model-server/model-store

# run Torchserve HTTP serve to respond to prediction requests
CMD ["torchserve", \
     "--start", \
     "--ts-config=/home/model-server/config.properties", \
     "--models", \
     "$APP_NAME=$APP_NAME.mar", \
     "--model-store", \
     "/home/model-server/model-store"]
EOF

echo "Writing ./predictor/Dockerfile"

Writing ./predictor/Dockerfile


In [92]:
CUSTOM_PREDICTOR_IMAGE_URI = f"gcr.io/{PROJECT_ID}/pytorch_predict_{APP_NAME}"
print(f"CUSTOM_PREDICTOR_IMAGE_URI = {CUSTOM_PREDICTOR_IMAGE_URI}")

CUSTOM_PREDICTOR_IMAGE_URI = gcr.io/huggingface-ml/pytorch_predict_test_setfit_classifier


### Build container

In [93]:
!docker build \
  --tag=$CUSTOM_PREDICTOR_IMAGE_URI \
  ./predictor

[1A[1B[0G[?25l[+] Building 0.0s (0/1)                                                         
[?25h^C
[1A[1A[1A[1A[1A[0G[?25l[+] Building 0.2s (2/3)                                                         
[34m => [internal] load build definition from Dockerfile                       0.1s
[0m[34m => => transferring dockerfile: 1.63kB                                     0.0s
[0m[34m => [internal] load .dockerignore                                          0.0s
[0m[34m => => transferring context: 2B                                            0.0s
[0m => [internal] load metadata for docker.io/pytorch/torchserve:latest-cpu   0.0s
[?25h[1A[1A[1A[1A[1A[1A[0G[?25l[+] Building 0.2s (3/3) FINISHED                                                
[34m => [internal] load build definition from Dockerfile                       0.1s
[0m[34m => => transferring dockerfile: 1.63kB                                     0.0s
[0m[34m => [internal] load .dockerignore        

### Run container locally

In [18]:
!docker stop local_bert_classifier
!docker run -t -d --rm -p 7080:7080 --name=local_bert_classifier $CUSTOM_PREDICTOR_IMAGE_URI
!sleep 20

local_bert_classifier
a1ae5e4bad0556362fa9957424f9ba7256d2288c52790dd0893cd8f2d8940773


### Test API locally

In [19]:
!curl http://localhost:7080/ping

{
  "status": "Healthy"
}


In [76]:
%%bash -s $APP_NAME

APP_NAME=$1

cat > ./predictor/instances.json <<END
{ 
   "instances": [
     { 
       "data": {
         "b64": "$(echo 'I am so happy to be at Deauville today' | base64 --wrap=0)"
       }
     }
   ]
}
END

curl -s -X POST \
  -H "Content-Type: application/json; charset=utf-8" \
  -d @./predictor/instances.json \
  http://localhost:7080/predictions/$APP_NAME/

base64: unrecognized option `--wrap=0'
Usage:	base64 [-hvDd] [-b num] [-i in_file] [-o out_file]
  -h, --help     display this message
  -Dd, --decode   decodes input
  -b, --break    break encoded string into num character lines
  -i, --input    input file (default: "-" for stdin)
  -o, --output   output file (default: "-" for stdout)


Process is interrupted.


### Push to Container Registry

In [58]:
!docker push $CUSTOM_PREDICTOR_IMAGE_URI

Using default tag: latest
The push refers to repository [gcr.io/huggingface-ml/pytorch_predict_test_setfit_classifier]

[1B1d8707c5: Preparing 
[1B31fa54b6: Preparing 
[1B8f992a79: Preparing 
[1Bbe1d74c7: Preparing 
[1Bece87862: Preparing 
[1B79e7584b: Preparing 
[1B9dd80a52: Preparing 
[1B633ebd40: Preparing 
[1Bff3792de: Preparing 
[1Bbf18a086: Preparing 
[1Bda41ec4a: Preparing 
[6B9dd80a52: Waiting g 
[5Bff3792de: Waiting g 
[1B005ec070: Preparing 
[1Bc8ae3daf: Preparing 
[5Bd007c81a: Waiting g 
[1B51f4d794: Preparing 
[9Bunauthorized: You don't have the needed permissions to perform this operation, and you may have invalid credentials. To authenticate your request, follow the steps in: https://cloud.google.com/container-registry/docs/advanced-authentication


In [48]:
BUCKET_NAME = "gs://florent-bucket"  # <---CHANGE THIS TO YOUR BUCKET
REGION = "us-central1"

In [115]:
#!gsutil mb -l $REGION $BUCKET_NAM

### Create model and endpoint to VertexAI

In [60]:
aiplatform.init(project=PROJECT_ID, staging_bucket=BUCKET_NAME)

In [61]:
VERSION = 1
model_display_name = f"{APP_NAME}-v{VERSION}"
model_description = "PyTorch based text classifier with custom container"

MODEL_NAME = APP_NAME
health_route = "/ping"
predict_route = f"/predictions/{MODEL_NAME}"
serving_container_ports = [7080]

In [62]:
model = aiplatform.Model.upload(
    display_name=model_display_name,
    description=model_description,
    serving_container_image_uri=CUSTOM_PREDICTOR_IMAGE_URI,
    serving_container_predict_route=predict_route,
    serving_container_health_route=health_route,
    serving_container_ports=serving_container_ports,
)

model.wait()

print(model.display_name)
print(model.resource_name)

Creating Model
Create Model backing LRO: projects/1049843053967/locations/us-central1/models/3818757814893936640/operations/9135355650014969856
Model created. Resource name: projects/1049843053967/locations/us-central1/models/3818757814893936640
To use this Model in another session:
model = aiplatform.Model('projects/1049843053967/locations/us-central1/models/3818757814893936640')
test_setfit_classifier-v1
projects/1049843053967/locations/us-central1/models/3818757814893936640


In [63]:
endpoint_display_name = f"{APP_NAME}-endpoint"
endpoint = aiplatform.Endpoint.create(display_name=endpoint_display_name)

Creating Endpoint
Create Endpoint backing LRO: projects/1049843053967/locations/us-central1/endpoints/1750660005930991616/operations/744023644316893184
Endpoint created. Resource name: projects/1049843053967/locations/us-central1/endpoints/1750660005930991616
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/1049843053967/locations/us-central1/endpoints/1750660005930991616')


### Deploy endpoint

In [65]:
endpoint_display_name = f"{APP_NAME}-endpoint"

In [70]:
traffic_percentage = 100
machine_type = "n1-standard-4"
deployed_model_display_name = model_display_name
min_replica_count = 1
max_replica_count = 3
sync = True

model.deploy(
    endpoint=endpoint,
    deployed_model_display_name=deployed_model_display_name,
    machine_type=machine_type,
    traffic_percentage=traffic_percentage,
    sync=sync,
)

Deploying model to Endpoint : projects/1049843053967/locations/us-central1/endpoints/1750660005930991616
Deploy Endpoint model backing LRO: projects/1049843053967/locations/us-central1/endpoints/1750660005930991616/operations/1896945148923740160


KeyboardInterrupt: 

In [80]:
endpoint_display_name = f"{APP_NAME}-endpoint"
filter = f'display_name="{endpoint_display_name}"'

for endpoint_info in aiplatform.Endpoint.list(filter=filter):
    print(
        f"Endpoint display name = {endpoint_info.display_name} resource id ={endpoint_info.resource_name} "
    )

endpoint = aiplatform.Endpoint(endpoint_info.resource_name)

Endpoint display name = test_setfit_classifier-endpoint resource id =projects/1049843053967/locations/us-central1/endpoints/1965706888137932800 


In [81]:
endpoint.list_models()

[id: "2748044595672645632"
 model: "projects/1049843053967/locations/us-central1/models/3818757814893936640"
 display_name: "test_setfit_classifier-v1"
 create_time {
   seconds: 1655468766
   nanos: 577510000
 }
 dedicated_resources {
   machine_spec {
     machine_type: "n1-standard-8"
     accelerator_type: NVIDIA_TESLA_T4
     accelerator_count: 1
   }
   min_replica_count: 1
   max_replica_count: 1
 }
 enable_access_logging: true]

In [82]:
test_instances = [
    b"Jaw dropping visual affects and action! One of the best I have seen to date.",
]

In [83]:
%%time
print("=" * 100)
for instance in test_instances:
    print(f"Input text: \n\t{instance.decode('utf-8')}\n")
    b64_encoded = base64.b64encode(instance)
    test_instance = [{"data": {"b64": f"{str(b64_encoded.decode('utf-8'))}"}}]
    print(f"Formatted input: \n{json.dumps(test_instance, indent=4)}\n")
    prediction = endpoint.predict(instances=test_instance)
    print(f"Prediction response: \n\t{prediction}")
    print("=" * 100)

Input text: 
	Jaw dropping visual affects and action! One of the best I have seen to date.

Formatted input: 
[
    {
        "data": {
            "b64": "SmF3IGRyb3BwaW5nIHZpc3VhbCBhZmZlY3RzIGFuZCBhY3Rpb24hIE9uZSBvZiB0aGUgYmVzdCBJIGhhdmUgc2VlbiB0byBkYXRlLg=="
        }
    }
]

Prediction response: 
	Prediction(predictions=['joy'], deployed_model_id='2748044595672645632', explanations=None)
CPU times: user 176 ms, sys: 391 ms, total: 567 ms
Wall time: 32.4 s


In [113]:
%%time
prediction = endpoint.predict(instances=test_instance)

CPU times: user 6.62 ms, sys: 10.5 ms, total: 17.2 ms
Wall time: 780 ms


In [104]:
len(tokenizer("Jaw dropping visual affects and action! One of the best I have seen to date.")["input_ids"])

19