# Deploy and serve a model on Vertex AI

## Setup environment

**Create a new virtual environment**

In [None]:
# Create a virtual environment in ./.venv
python -m venv .venv

# Activate the virtual environment
# Linux/macOS alternative: source .venv/bin/activate
# Windows:
.venv\Scripts\activate

# (Optional) Deactivate conda environment
conda deactivate

# Upgrade pip
python -m pip install --upgrade pip

**Create a `requirements.txt`**

In [18]:
%%writefile requirements.txt

uvicorn[standard]==0.20.0
gunicorn==23.0.0
fastapi[standard]==0.115.0
scikit-learn==1.5.2
pytest==8.3.3
starlette==0.38.6
requests==2.32.3

Overwriting requirements.txt


In [None]:
# Install the pinned dependencies listed in requirements.txt
# (--no-cache-dir: do not keep pip's download cache)
pip install --no-cache-dir -r requirements.txt

## Develop model

**Model object**

In [1]:
%%writefile model.py
import random

from sklearn.base import BaseEstimator, TransformerMixin


class SimpleSentimentModel(BaseEstimator, TransformerMixin):
    """Rule-based sentiment stub with a scikit-learn compatible class shape.

    Classification uses keyword matching first and text length second; the
    confidence is a random value drawn from a range tied to the matched rule.
    """

    # Texts of at most this many characters fall back to "negative"
    negative_length_threshold = 10
    # Texts of at least this many characters fall back to "positive"
    positive_length_threshold = 30
    # Substrings (matched against the lower-cased text) that force a label
    negative_ls = ["tiêu cực", "xấu", "tệ", "negative"]
    positive_ls = ["tích cực", "thích", "positive"]

    def __init__(self):
        pass

    @staticmethod
    def _confidence(low, high):
        """Return a random confidence in [low/100, high/100) in 0.01 steps."""
        return random.randrange(low, high) / 100

    def predict(self, text):
        """Classify one text.

        Args:
            text: the string to classify.

        Returns:
            A ``(sentiment, confidence)`` tuple where sentiment is one of
            "negative", "positive" or "neutral".
        """
        lowered = text.lower()

        def mentions(keywords):
            return any(keyword in lowered for keyword in keywords)

        # Keyword rules win over length rules; negative is checked first.
        if mentions(self.negative_ls):
            return "negative", self._confidence(90, 100)
        if mentions(self.positive_ls):
            return "positive", self._confidence(90, 100)

        # Length rules use the original (not lower-cased) text length.
        length = len(text)
        if length <= self.negative_length_threshold:
            return "negative", self._confidence(70, 90)
        if length >= self.positive_length_threshold:
            return "positive", self._confidence(70, 90)
        return "neutral", self._confidence(70, 95)

Overwriting model.py


**training script**

In [2]:
%%writefile train.py
import os

import joblib
from model import SimpleSentimentModel  # Ensure this import is correct

if __name__ == "__main__":
    # Instantiate the rule-based model; this script performs no fitting step.
    model = SimpleSentimentModel()

    # Ensure the output directory exists. exist_ok=True replaces the
    # os.path.exists() + os.makedirs() pair, which can race if the directory
    # is created between the check and the call.
    model_dir = "models"
    os.makedirs(model_dir, exist_ok=True)

    # Serialize the model to models/model.pkl (relative to the working directory)
    joblib.dump(model, os.path.join(model_dir, "model.pkl"))
    print("Model saved successfully!")

Overwriting train.py


**Main app API**

In [4]:
%%writefile main.py
import os
from typing import List, Optional

import joblib
import uvicorn
from fastapi import FastAPI, HTTPException, Request
from model import SimpleSentimentModel  # noqa: F401
from pydantic import BaseModel

# Create the FastAPI application object
app = FastAPI(title="Sentiment Analysis API")

# Location of the serialized model, relative to the current working directory
model_path = os.path.join("models", "model.pkl")

# Load the model at import time, or fail fast when the artifact is missing.
# SimpleSentimentModel must already be imported for the load to succeed.
if os.path.exists(model_path):
    model = joblib.load(model_path)
else:
    raise FileNotFoundError(f"Model file not found at {model_path}")


# Pydantic models for prediction results
# Prediction: the result for a single input text.
class Prediction(BaseModel):
    sentiment: str  # label returned by model.predict
    confidence: Optional[float]  # score returned by model.predict; the annotation permits None


# Predictions: response envelope holding one Prediction per input instance.
class Predictions(BaseModel):
    predictions: List[Prediction]


# Function to process batch predictions
def get_prediction(instances):
    """Run the loaded model over each text and wrap the results.

    Args:
        instances: iterable of texts, each passed to ``model.predict``.

    Returns:
        A ``Predictions`` object with one ``Prediction`` per input, in order.
    """
    scored = (model.predict(text) for text in instances)
    return Predictions(
        predictions=[
            Prediction(sentiment=label, confidence=score) for label, score in scored
        ]
    )


# Health check route: always answers HTTP 200 with a fixed JSON payload
@app.get("/health", status_code=200)
async def health():
    payload = {"health": "ok"}
    return payload


# Prediction route to handle batch requests.
#
# Expects a JSON object of the form {"instances": [{"text": "..."}, ...]} and
# returns {"predictions": [{"sentiment": ..., "confidence": ...}, ...]}.
# Every malformed payload is answered with HTTP 400 instead of surfacing as an
# unhandled exception (HTTP 500).
@app.post(
    "/predict",
    response_model=Predictions,
    response_model_exclude_unset=True,
)
async def predict(request: Request):
    # Extract the JSON body. A body that is not valid JSON raises ValueError
    # (json.JSONDecodeError and UnicodeDecodeError are both subclasses).
    try:
        body = await request.json()
    except ValueError as exc:
        raise HTTPException(
            status_code=400,
            detail="Request body must be valid JSON.",
        ) from exc

    # Validate the envelope. The dict check matters because a JSON array,
    # number, string or null body would otherwise break the membership test
    # or the key lookup.
    if (
        not isinstance(body, dict)
        or "instances" not in body
        or not isinstance(body["instances"], list)
    ):
        raise HTTPException(
            status_code=400,
            detail="Invalid input format. 'instances' should be a list of texts.",
        )

    # Validate every instance before predicting: the model calls text.lower(),
    # so each instance must be an object carrying a string 'text' field.
    instances = []
    for item in body["instances"]:
        if not isinstance(item, dict) or not isinstance(item.get("text"), str):
            raise HTTPException(
                status_code=400,
                detail=(
                    "Invalid input format. Each instance must be an object "
                    "with a string 'text' field."
                ),
            )
        instances.append(item["text"])

    # Run the model and return the predictions
    return get_prediction(instances)


# Main function to run the FastAPI app
if __name__ == "__main__":
    # Vertex AI announces the port the container must listen on through the
    # AIP_HTTP_PORT environment variable; fall back to 8080 (the port exposed
    # by the Dockerfile) for local runs where the variable is not set.
    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=int(os.environ.get("AIP_HTTP_PORT", "8080")),
    )


Overwriting main.py


**Train model**

In [None]:
# Writes models/model.pkl relative to the current directory (see train.py)
python train.py

**Test app**

In [1]:
%%writefile test.py
from fastapi.testclient import TestClient

from main import app

# URL prefix for every request; empty because TestClient calls the app in-process.
# Both names are rebound in the __main__ block to target a running server.
base_url = ""
client = TestClient(app)


def test_health():
    """GET /health must answer 200 with the fixed health payload."""
    reply = client.get(base_url + "/health")
    expected_body = {"health": "ok"}
    assert reply.status_code == 200
    assert reply.json() == expected_body
    print("pass: test_health")


def test_predict_item():
    """POST /predict must label each instance according to the model rules."""
    # (input text, expected sentiment) pairs, in request order
    cases = [
        ("Cong hoa xa hoi chu nghia", "neutral"),
        ("doc lap", "negative"),
        ("doc lap tich cuc", "neutral"),
        ("te doc", "negative"),
        ("positive doc lap", "positive"),
    ]
    payload = {"instances": [{"text": text} for text, _ in cases]}

    response = client.post(f"{base_url}/predict", json=payload)
    assert response.status_code == 200

    predicted = [item["sentiment"] for item in response.json()["predictions"]]
    assert predicted == [expected for _, expected in cases]
    print("pass: test_predict_item")


def test_predict_item_non_instance():
    """A payload without the 'instances' key must be rejected with HTTP 400."""
    response = client.post(
        f"{base_url}/predict",
        json={
            # Deliberately misspelled key: the required 'instances' key is absent
            "instan": [
                {"text": "Cong hoa xa hoi chu nghia"},
                {"text": "doc lap"},
                {"text": "doc lap tich cuc"},
                {"text": "te doc"},
                {"text": "positive doc lap"},
            ]
        },
    )
    assert response.status_code == 400
    # This comparison previously had no `assert`, so the error body was never checked.
    assert response.json() == {
        "detail": "Invalid input format. 'instances' should be a list of texts."
    }
    print("pass: test_predict_item_non_instance")


def test_predict_item_not_list():
    """An 'instances' value that is not a list must be rejected with HTTP 400."""
    response = client.post(
        f"{base_url}/predict",
        # The key is spelled correctly so the request reaches the "not a list"
        # check; with a misspelled key this would only repeat the missing-key test.
        json={"instances": {"text": "Cong hoa xa hoi chu nghia"}},
    )
    assert response.status_code == 400
    # This comparison previously had no `assert`, so the error body was never checked.
    assert response.json() == {
        "detail": "Invalid input format. 'instances' should be a list of texts."
    }
    print("pass: test_predict_item_not_list")


if __name__ == "__main__":
    # Smoke-test a running container: replace the in-process TestClient with
    # the `requests` module and point the tests at the locally published port.
    import requests

    client, base_url = requests, "http://127.0.0.1:8080"
    for check in (
        test_health,
        test_predict_item,
        test_predict_item_non_instance,
        test_predict_item_not_list,
    ):
        check()


Overwriting test.py


Run pytest in `cmd`

In [None]:
# main.py loads models/model.pkl at import time, so run train.py before the tests
pytest test.py

## Upload model image to Artifact Registry (GCP)

**Write Dockerfile**

In [2]:
%%writefile Dockerfile
FROM tiangolo/uvicorn-gunicorn:python3.11-slim

WORKDIR /app

# Install dependencies before copying the source so this layer is reused from
# the build cache whenever only application code or the model changes.
COPY requirements.txt ./requirements.txt
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r ./requirements.txt

# Application code and the serialized model (main.py loads models/model.pkl)
COPY *.py ./
COPY models ./models

# The HTTP server listens on all interfaces, port 8080
EXPOSE 8080
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"]

Overwriting Dockerfile


### Build & Push image bằng docker-command

**Build docker image**

In [None]:
# The build context must contain models/ (copied by the Dockerfile), so run train.py first
docker build -t asia-southeast1-docker.pkg.dev/ext-pinetree-dw/dev-aiml-model/sentiment-fast-api .

**Test image container**

Run container

In [None]:
# --rm removes the container when it exits; -p publishes container port 8080 on host port 8080
docker run --rm -p 8080:8080 asia-southeast1-docker.pkg.dev/ext-pinetree-dw/dev-aiml-model/sentiment-fast-api

Test container

In [None]:
# Runs the __main__ branch of test.py, which sends real HTTP requests to http://127.0.0.1:8080
python test.py

**Push image to Artifact Registry (GCP)**

Authenticate with GCP

In [None]:
# Log in to Google Cloud with your user account
gcloud auth login

# Register gcloud as the Docker credential helper for the regional Artifact
# Registry host; `docker push` to asia-southeast1-docker.pkg.dev needs this.
gcloud auth configure-docker asia-southeast1-docker.pkg.dev

Push Image

In [None]:
# Upload the locally built image to the dev-aiml-model repository in Artifact Registry
docker push asia-southeast1-docker.pkg.dev/ext-pinetree-dw/dev-aiml-model/sentiment-fast-api

### Build & Push image bằng cloud-build

**Config cloud build**

In [4]:
%%writefile cloudbuild.yaml
steps:
# Step 1: used when the model is trained in the cloud and stored in GCS.
# Copies the model files from ${_MODEL_GCS_PATH} (default: `gs://dev-aiml-model/models/sentiment`)
# into ./models so the Docker build can embed them (the Dockerfile copies ./models).
  - name: 'gcr.io/cloud-builders/gsutil'
    args: ['cp', '-r', '${_MODEL_GCS_PATH}', './models']
    id: 'download-model'

  # Step 2: build the container image; waits for the model download to finish
  - name: 'gcr.io/cloud-builders/docker'
    args: ['build', '-t', '${_IMAGE_NAME}', '.']
    waitFor: ['download-model']

  # Step 3: push the container image to Artifact Registry
  - name: 'gcr.io/cloud-builders/docker'
    args: ['push', '${_IMAGE_NAME}']

# Images recorded as build artifacts of this build
images:
  - '${_IMAGE_NAME}'

# Substitution variables for flexibility (defaults for the values used above)
substitutions:
  _MODEL_GCS_PATH: 'gs://dev-aiml-model/models/sentiment'
  _IMAGE_NAME: 'asia-southeast1-docker.pkg.dev/ext-pinetree-dw/dev-aiml-model/sentiment-fast-api'

Overwriting cloudbuild.yaml


**Run cloud build**

In [None]:
# Run the steps defined in cloudbuild.yaml on Cloud Build
gcloud builds submit --config cloudbuild.yaml

## Serving model container - Vertex AI

**Container Requirement**

The docker container needs to follow the [container requirements](https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements) defined by Google. The most important requirement is: 

---
**1. HTTP server**

Provide an `HTTP server` that listens for requests on `0.0.0.0` (required) on port `8080` (the default; the port is configurable).

The **HTTP server** can be built with:
- **Flask** , **FastAPI**, ...
- **TensorFlow Serving**, **TorchServe**, or **KServe Python Server**
- ...

[**HTTP Server** can be run by](https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements#server):
- [ENTRYPOINT instruction](https://docs.docker.com/engine/reference/builder/#entrypoint), [CMD instruction](https://docs.docker.com/engine/reference/builder/#cmd) or both in ***Dockerfile***
- Specify the `containerSpec.command` and `containerSpec.args` fields when you create your `Model` resource (override your container image's `ENTRYPOINT` and `CMD`)

---
**2. Health checks**

***a. startup probe*** (optional)

Check whether the container application has started. Nếu không cung cấp thì sẽ ko chạy, và ngay lập tức chạy ***health probe***

**Usecase**: Cần sử dụng cho các application cần có thời gian khởi động trong lần đầu tiên. Ví dụ, Nếu App cần thời gian để copy file model mới từ source bên ngoài container mỗi lần khởi động. Chúng ta có thể config ***startup probe*** để chờ cho đến khi việc copy hoàn thành và trả ra success


***b. health probe***

Check whether the container application is ready to accept traffic or receive request. Nếu không cung cấp path cụ thể thì Vertex sẽ sử dụng default path `/health`. Lưu ý là ***health probe*** chỉ chạy khi ***startup probe*** hoàn thành hoặc không được khai báo

Provide an `HTTP path` for **health checks** (default path `/health` with `HTTP GET`; it can be changed in the config):
- Return a `200` within **10 seconds** of the call when your container is ready to handle requests. Nội dung của phần phản hồi không quan trọng, vì Vertex AI sẽ bỏ qua chúng. Phản hồi này cho thấy rằng server đang hoạt động tốt (healthy). For example, if you need to load the model, ensure you return the `200` status code only after the model is loaded.
- **If the server isn't ready to handle prediction requests**, nó không nên phản hồi yêu cầu trong vòng **10 giây**, hoặc phản hồi với bất kỳ mã trạng thái nào khác ngoài `200 OK`, ví dụ như `503 Service Unavailable`. Điều này cho thấy server đang không hoạt động tốt (unhealthy).

Nếu health probe nhận được phản hồi không tốt từ server (bao gồm cả trường hợp không có phản hồi trong vòng 10 giây), nó sẽ gửi thêm **tối đa 3 lần Health Checks nữa**, mỗi lần cách nhau **10 giây**. Trong khoảng thời gian này, Vertex AI vẫn coi server là hoạt động tốt. Nếu probe nhận được phản hồi tốt từ bất kỳ lần kiểm tra nào, nó sẽ quay lại **Health checks Process**. Tuy nhiên, **nếu probe nhận được 4 phản hồi không tốt liên tiếp**, Vertex AI sẽ dừng việc chuyển tiếp các yêu cầu dự đoán tới container đó (nếu mô hình được triển khai trên nhiều node, các yêu cầu sẽ được chuyển tới các container khác đang hoạt động tốt).

Vertex AI không khởi động lại container; thay vào đó, health probe vẫn sẽ tiếp tục gửi các yêu cầu kiểm tra định kỳ tới server không tốt. Nếu nhận được phản hồi tốt, container đó sẽ được đánh dấu là hoạt động tốt và bắt đầu nhận lại yêu cầu dự đoán.

**Hướng dẫn thực tế:**
- Trong nhiều trường hợp, **server HTTP** trong container của bạn có thể luôn phản hồi với mã trạng thái `200 OK` cho các yêu cầu kiểm tra sức khỏe. Nếu container tải các tài nguyên trước khi khởi động server, container sẽ không hoạt động tốt trong thời gian khởi động và bất kỳ lúc nào server HTTP gặp lỗi. Trong tất cả các thời gian khác, nó sẽ phản hồi là tốt.

- Đối với cấu hình phức tạp hơn, bạn có thể thiết kế server HTTP để cố tình phản hồi yêu cầu kiểm tra sức khỏe với trạng thái không tốt vào những thời điểm nhất định. Ví dụ, bạn có thể chặn lưu lượng dự đoán tới node trong một khoảng thời gian để container thực hiện bảo trì.

---
**3. Prediction**

Provide an `HTTP path` for **prediction** (default path `/predict` with `HTTP POST`; it can be changed in the config)
- `Content-Type: application/json` HTTP header

---

**4. Request body**

The request body is in `JSON` format and must be 1.5 MB or smaller. It must contain an `instances` key and may contain a `parameters` key:
```JSON
{
   "instances":[
      {
         "text":"DoiT is a great company."
      },
      {
         "text":"The beach was nice but overall the hotel was very bad."
      }
   ],
   "parameters": {}
}
```
- `instances` is an array of **one or more JSON values** of any type. Each value represents an instance that you are requesting a prediction for.
- `parameters` (optional if application is designed to require it) take a JSON object containing any parameters that your container requires to help serve predictions on the instances

---

**5. Response body**

The response body is in `JSON` format and must be 1.5 MB or smaller. It must contain a `predictions` key:
```JSON
{
 "predictions": [
   {
     "confidence": 0.9409326314926147,
     "sentiment": "POSITIVE"
   }
 ],
  "deployedModelId": <string>, # id of the Endpoint's DeployedModel
  "model": <string>, # The resource name of the Model
  "modelVersionId": <string>, # The version id of the Model
  "modelDisplayName": <string>, # The display name of the Model 
  "metadata": <value> # Request-level metadata returned by the model
}
```
- `predictions` is an array of **one or more JSON values** representing the predictions that your container has generated for each of the instances in the corresponding request.

**6. Publishing requirements**
- Location: `asia-southeast1`
- [Permissions](https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements#permissions)
- [Environment variable](https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements#variables)

**7. Access model artifacts**

[Doc](https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements#artifacts)

- **Nếu sử dụng pre-build container làm môi trường**: Thì phải cung cấp địa chỉ tại GCS (folder) chứa các file model được training sẽ chạy trên environment build từ pre-build container đó

- **Nếu sử dụng custom container làm môi trường**: Việc cung cấp địa chỉ GCS (folder) chứa các file trained model là optional, nó cần thiết trong việc sử dụng custom container chỉ làm environment runtime và ko chứa sẵn model, khi đó cần phải copy model vào để run trong environment đó. Còn nếu trong container chứa sẵn file model thì việc cung cấp địa chỉ folder (GCS) chứa file model là ko cần thiết

### Import to Model Registry (VertexAI)

Ta cần import Model Image từ **Artifact Registry** sang **Vertex AI** để có thể tận dụng các tính năng quản lý model AI của Vertex và serve được model.

**Chi phí sử dụng Model Registry (VertexAI)**: No Cost

Chỉ phát sinh chi phí khi sử dụng prediction: `Online prediction via Endpoint` hoặc `Batch Prediction`


1. Import bằng giao diện UI: [Doc](https://cloud.google.com/vertex-ai/docs/model-registry/import-model#custom-container)

<img src = "_image/import_model_registry.png">

2. Import model command

In [None]:
# Register the container image as a Model in the Vertex AI Model Registry.
# The port and routes match the server in main.py: port 8080, POST /predict, GET /health.
gcloud ai models upload \
  --container-ports=8080 \
  --container-predict-route="/predict" \
  --container-health-route="/health" \
  --region=asia-southeast1 \
  --display-name=sentiment-fast-api \
  --container-image-uri=asia-southeast1-docker.pkg.dev/ext-pinetree-dw/dev-aiml-model/sentiment-fast-api

### Serve Vertex model by batch prediction

**Batch Prediction** là gửi request trực tiếp tới Model đã được imported vào **Model Registry** mà Model này không cần deploy thành endpoint. Khi đó data gửi vào trong 1 single request (có thể large size) và không yêu cầu response trả ra real-time.

#### Cost

Chi phí được tính bằng thời gian sử dụng [***resource per node hour***](https://cloud.google.com/vertex-ai/pricing#pred_apac), tổng của:
- **vCPU cost**: measured in vCPU hours
- **RAM cost**: measured in GB hours
- **GPU cost**: if either built into the machine or optionally configured, measured in GPU hours

#### Config Input data

Input for batch prediction:
- CSV file
- File-list in GCS
- Bigquery table
- JSON Line (JSONL)
- `tf-record` or `tf-record-gzip`

> To use a BigQuery table as input, you must set [`InstanceConfig.instanceType`](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.batchPredictionJobs#instanceconfig) to `object` using the Vertex AI API.

##### [Input data requirement](https://cloud.google.com/vertex-ai/docs/predictions/get-batch-predictions#input_data_requirements)



##### Partition data

##### Filter and transformation data

#### Request a batch prediction

##### Machine type & replica count

##### Batch prediction output

### Serve Vertex model by online prediction (endpoint - realtime)

#### Create model endpoint

Trước khi deploy model, cần tạo một endpoint trên Vertex AI để model được deploy vào đó.

#### Deploy model to an endpoint

In [None]:
###

## Serving model container - Cloud Run

**Build docker image**

In [None]:
# Register the container image as a Vertex AI Model.
# Port 8080 matches the Dockerfile (EXPOSE 8080 / uvicorn --port 8080), and the
# image URI is this project's Artifact Registry image rather than the
# gcr.io/sascha-playground-doit image that was referenced here before.
!gcloud ai models upload \
  --container-ports=8080 \
  --container-predict-route="/predict" \
  --container-health-route="/health" \
  --region=asia-southeast1 \
  --display-name=sentiment-fast-api \
  --container-image-uri=asia-southeast1-docker.pkg.dev/ext-pinetree-dw/dev-aiml-model/sentiment-fast-api

In [None]:
# Create the endpoint in the same region as the uploaded model (asia-southeast1);
# a model can only be deployed to an endpoint in its own region, and the SDK
# calls in the following cells also use asia-southeast1.
!gcloud ai endpoints create \
  --project=ext-pinetree-dw \
  --region=asia-southeast1 \
  --display-name=sentiment-fast-api-test

In [5]:
from google.cloud import aiplatform

# Project and region used as defaults by the Vertex AI SDK calls in later cells
project, location = "ext-pinetree-dw", "asia-southeast1"

aiplatform.init(project=project, location=location)

In [8]:
# Handle to the deployed endpoint, addressed by its full resource name
endpoint = aiplatform.Endpoint(
    endpoint_name="projects/723874410918/locations/asia-southeast1/endpoints/4126673847229349888"
)

# Request payload: one object per text, in the shape the /predict route expects
instances = [
    {"text": "DoiT is a great company."},
    {"text": "The beach was nice but overall the hotel was very bad."},
]

# Send an online prediction request and show the raw response
prediction = endpoint.predict(instances=instances)
print(prediction)

Prediction(predictions=[{'sentiment': 'Tiêu cực', 'confidence': 8.0}, {'sentiment': 'Tích cực', 'confidence': 8.6}], deployed_model_id='8408136941417005056', metadata=None, model_version_id='1', model_resource_name='projects/723874410918/locations/asia-southeast1/models/sentiment-fast-api-test', explanations=None)


In [24]:
%%writefile cloudbuild.yaml
steps:
# Download the model to embed it into the image
# - name: 'gcr.io/cloud-builders/gsutil'
#   args: ['cp', '-r', 'gs://dev-joyas-recommendation/models/sentiment', '.']
#   id: 'download-model'

# Build the container image
- name: 'gcr.io/cloud-builders/docker'
  args: ['build', '-t', '${_IMAGE_NAME}', '.']
  # waitFor: ['download-model']

# Push the container image to Artifact Registry
- name: 'gcr.io/cloud-builders/docker'
  args: ['push', '${_IMAGE_NAME}']

images:
- '${_IMAGE_NAME}'

# The image name is defined once here instead of being repeated in every step;
# override it with `gcloud builds submit --substitutions=_IMAGE_NAME=...`.
substitutions:
  _IMAGE_NAME: 'asia-southeast1-docker.pkg.dev/joyas-vietnam/dev-aiml-model/sentiment-fast-api'

Overwriting cloudbuild.yaml


In [19]:
# Create the Docker-format Artifact Registry repository that the image is pushed to
!gcloud artifacts repositories create dev-aiml-model \
  --repository-format=docker \
  --location=asia-southeast1 \
  --description="My Docker repository"

Create request issued for: [dev-aiml-model]
Waiting for operation [projects/joyas-vietnam/locations/asia-southeast1/operations/c10a00b3-2f8b-45bf-b923-45e42675f358] to complete...
.................done.
Created repository [dev-aiml-model].


In [25]:
# Submit the current directory as the build source and run cloudbuild.yaml on Cloud Build
!gcloud builds submit --config cloudbuild.yaml .

^C


In [26]:
# Build the image locally with Docker, using the same tag as cloudbuild.yaml
!docker build -t asia-southeast1-docker.pkg.dev/joyas-vietnam/dev-aiml-model/sentiment-fast-api .

#0 building with "desktop-linux" instance using docker driver

#1 [internal] load build definition from Dockerfile
#1 transferring dockerfile: 278B 0.0s done
#1 DONE 0.0s

#2 [internal] load metadata for docker.io/tiangolo/uvicorn-gunicorn-fastapi:python3.8-slim
#2 ...

#3 [auth] tiangolo/uvicorn-gunicorn-fastapi:pull token for registry-1.docker.io
#3 DONE 0.0s

#2 [internal] load metadata for docker.io/tiangolo/uvicorn-gunicorn-fastapi:python3.8-slim
#2 DONE 2.2s

#4 [internal] load .dockerignore
#4 transferring context: 2B done
#4 DONE 0.0s

#5 [1/5] FROM docker.io/tiangolo/uvicorn-gunicorn-fastapi:python3.8-slim@sha256:cce370ade672f3bfcac80d0c80314fc6b6530d3c623dab384af12da76cd2db6b
#5 DONE 0.0s

#6 [internal] load build context
#6 transferring context: 574B done
#6 DONE 0.0s

#7 [2/5] COPY main.py ./main.py
#7 CACHED

#8 [3/5] COPY requirements.txt ./requirements.txt
#8 CACHED

#9 [4/5] COPY models ./models
#9 DONE 5.7s

#10 [5/5] RUN pip install --no-cache-dir -r ./requirements.tx