# BioNeMo Deployment on Vertex AI
## References
* https://docs.nvidia.com/bionemo-framework/latest/deep-dive-esm1-pytriton-inference.html


#### Conda Setup
```
# Name of the conda environment; also used as the (relative) prefix path and
# as the Jupyter kernel name below.
export CONDA_ENV=pytorch-nightly-conda
# Create the environment with PyTorch (CUDA 12.1), Jupyter and FastAPI.
conda create -p "$CONDA_ENV" pytorch ipykernel fastapi jupyter pytorch-cuda=12.1 python=3.11 -c pytorch -c nvidia -y
# Register the environment as a Jupyter kernel for the current user.
"$CONDA_EXE" run -p "$CONDA_ENV" python -m ipykernel install --user --name="$CONDA_ENV"
# Install the Docker SDK for Python into the environment
# (presumably needed by the local container tests later on -- confirm).
"$CONDA_EXE" run -p "$CONDA_ENV" python -m pip install docker
```

### Install Deps

In [None]:
# Install the notebook's Python dependencies into the running kernel:
# tqdm (download progress bar) and google-cloud-aiplatform (Vertex AI SDK).
%pip install tqdm google-cloud-aiplatform

### Create an NGC API Key, add to Secret Manager
* Create an account at org.ngc.nvidia.com, then create a new API key.
* Add the key to Secret Manager as the `ngc-api-key` secret.


In [None]:
# Create the `ngc-api-key` secret; the key is piped on stdin (`--data-file=-`).
# Replace the placeholder before running, and do not save or commit the notebook
# with a real key left in this cell.
! echo -n "<YOUR NGC API KEY>" | gcloud secrets create ngc-api-key --data-file=-

In [None]:


NGC_SECRET= ! gcloud secrets versions access latest --secret=ngc-api-key
NGC_API_KEY=NGC_SECRET[0]
NGC_API_KEY

### Using NGC API Key, download the `esm1nv` model for generating embeddings from the container

In [None]:
from tqdm import tqdm
import requests

# Exchange the NGC API key for a bearer token.
response = requests.get(
    'https://authn.nvidia.com/token?service=ngc&scope=group/ngc:',
    auth=('$oauthtoken', NGC_API_KEY),
    timeout=60,
)
# Fail here on bad credentials rather than with a KeyError on 'token' below.
response.raise_for_status()
TOKEN = response.json()['token']

MODEL = 'esm1nv'
MODEL_FILE = f'{MODEL}.nemo'

MODEL_URI = f'https://api.ngc.nvidia.com/v2/org/nvidia/team/clara/models/{MODEL}/versions/1.0/files/{MODEL_FILE}'

# Sizes in bytes. 1 MiB chunks keep the write loop cheap for a large model file.
block_size = 1024 * 1024

# Stream the file to disk; the context manager releases the connection when done.
with requests.get(MODEL_URI, stream=True, allow_redirects=True, timeout=60, headers={
    "Content-Type": "application/json",
    "Authorization": f"Bearer {TOKEN}"}
) as model_response:
    print(model_response)
    # Abort before opening the output file so that an HTTP error body is never
    # saved as (and mistaken for) the model.
    model_response.raise_for_status()

    total_size = int(model_response.headers.get("content-length", 0))

    with tqdm(total=total_size, unit="B", unit_scale=True) as progress_bar:
        with open(MODEL_FILE, "wb") as file:
            for data in model_response.iter_content(block_size):
                progress_bar.update(len(data))
                file.write(data)

# A known content length that does not match the bytes written means a truncated download.
if total_size != 0 and progress_bar.n != total_size:
    raise RuntimeError("Could not download model")
else:
    print(f"{MODEL_FILE} downloaded successfully!")
    

### Initialize AI Platform SDK

In [None]:
import google.cloud.aiplatform as aip

# Google Cloud settings for this notebook. PROJECT_ID and REGION are reused
# later when building the Artifact Registry image URI.
PROJECT_ID = 'northam-ce-mlai-tpu'
REGION = 'us-central1'
BUCKET = 'robv-scratch'

# Point the Vertex AI SDK at the project and region above.
aip.init(location=REGION, project=PROJECT_ID)

### Registering the NGC Repository as a `remote repository` in Artifact Registry
First, we authorize the Artifact Registry service account to access the secret that holds our NGC API key.

In [None]:
! gcloud projects add-iam-policy-binding northam-ce-mlai-tpu --member='serviceAccount:service-9452062936@gcp-sa-artifactregistry.iam.gserviceaccount.com' --role='roles/secretmanager.secretAccessor'

Now create the remote repository that uses NVIDIA NGC (`nvcr.io`) as its upstream:

In [None]:
# Create a remote-mode Docker repository whose upstream is https://nvcr.io/.
# Upstream credentials: the literal username `$oauthtoken` (the backslash keeps
# the shell from expanding it) and the `ngc-api-key` secret as the password.
! gcloud artifacts repositories create robv-test-ngc-remote-repo \
    --project="northam-ce-mlai-tpu" \
    --repository-format=docker \
    --location="us-central1" \
    --description="Nvidia GPU Containers" \
    --mode=remote-repository \
    --remote-repo-config-desc="Nvidia remote NGC rpo " \
    --remote-docker-repo="https://nvcr.io/" \
    --remote-username="\$oauthtoken" \
    --remote-password-secret-version="projects/9452062936/secrets/ngc-api-key/versions/latest"

### Create a standard repository for customized images

In [None]:
# Create a Docker-format repository named `custom-container-prediction-vertex`;
# the patched serving image is pushed here later in the notebook.
! gcloud artifacts repositories create custom-container-prediction-vertex \
    --project="northam-ce-mlai-tpu" \
    --repository-format=docker \
    --location="us-central1" \
    --description="Custom serving containers for Vertex Prediction"

In [None]:
# List the images under the `nvidia` path of the NGC remote repository.
! gcloud artifacts docker images list us-central1-docker.pkg.dev/northam-ce-mlai-tpu/robv-test-ngc-remote-repo/nvidia

In [None]:
# Show the configuration of the NGC remote repository created above.
! gcloud artifacts repositories describe 'robv-test-ngc-remote-repo' --location=us-central1

### Test the container locally
First, let's pull the container image through the remote repository so that we can run it in our notebook using a Vertex `LocalEndpoint`.


In [None]:
REPO_PATH='us-central1-docker.pkg.dev/northam-ce-mlai-tpu/robv-test-ngc-remote-repo'
IMAGE_URI=f'{REPO_PATH}/nvidia/clara/bionemo-framework:1.5'
MODEL_ARTIFACTS_REPOSITORY="gs://robv-scratch/models/"
! docker pull $IMAGE_URI

### Patch bionemo pytriton serving config
In order to serve BioNeMo models using PyTriton on Vertex AI, we have to patch the default `serve_bionemo_model.py` file so that Triton is started with a `TritonConfig` that enables the Vertex AI endpoint on port 8080.


In [None]:
%%writefile serve_bionemo_model.py.patch
--- serve_bionemo_model.py.orig 2024-07-03 19:45:15.549867973 +0000
+++ serve_bionemo_model.py      2024-07-03 20:03:47.842013972 +0000
@@ -16,7 +16,7 @@
 from model_navigator.package.package import Package
 from nemo.utils import logging
 from omegaconf import DictConfig
-from pytriton.triton import Triton
+from pytriton.triton import Triton, TritonConfig
 
 from bionemo.model.core.infer import M
 from bionemo.triton import decodes
@@ -147,7 +147,10 @@
     else:
         maybe_model = None
 
-    with Triton() as triton:
+    ### robv@google.com - patch for Vertex
+    config = TritonConfig(allow_http=True, allow_vertex_ai=True, vertex_ai_port=8080, vertex_ai_default_model="bionemo_model_embeddings")
+
+    with Triton(config=config) as triton:
         for maybe_triton_model_name, bind_fn in [
             (embedding, bind_embedding),
             (sampling, bind_sampling),

### Generate a new bionemo image with our patches/updates
PyTriton support for Vertex AI arrived in `nvidia-pytriton==0.5.2`, so the image build below upgrades the package.

In [None]:
%%bash -s "$IMAGE_URI"
# Build the patched BioNeMo serving image on top of the base image passed as $1.
# Fail fast: pipefail makes a failed `docker build` fail the cell even though
# its output is piped through `tee`.
set -euo pipefail
: "${1:?base image URI is required as the first argument}"

echo "$PWD"
echo "$HOSTNAME"
echo "$1"

# Change to scratch directory (-p so that re-running the cell does not fail).
mkdir -p /mnt/localssd/scratch
cp -- "$PWD/esm1nv.nemo" "$PWD/serve_bionemo_model.py.patch" /mnt/localssd/scratch
cd /mnt/localssd/scratch

# The Dockerfile is read from stdin (-f -). The EOF delimiter is unquoted on
# purpose so that the shell substitutes $1 into the FROM line.
docker build --progress=plain --no-cache -t bionemo-esm1nv:1.5 -f - . <<EOF 2>&1 | tee build.log
FROM $1
COPY /esm1nv.nemo /workspace/bionemo/models/protein/esm1nv/
COPY /serve_bionemo_model.py.patch /workspace/bionemo/bionemo/triton/
RUN python -m pip install --upgrade "nvidia-pytriton>=0.5.2"
WORKDIR /workspace/bionemo/bionemo/triton
RUN patch serve_bionemo_model.py serve_bionemo_model.py.patch
EOF

In [None]:
import logging

from google.cloud.aiplatform.prediction import LocalModel, LocalEndpoint

logging.basicConfig(level=logging.INFO)

# From here on IMAGE_URI refers to the patched image built locally above.
IMAGE_URI = "bionemo-esm1nv:1.5"

# Base route of the embeddings model; it doubles as the health route, and
# "/infer" is appended to form the predict route.
_EMBEDDINGS_ROUTE = "/v2/models/bionemo_model_embeddings"

# The launch command is passed to the container as one single argument string.
_SERVING_ARGS = [
    "python -m bionemo.triton.inference_wrapper --config-path /workspace/bionemo/examples/protein/esm1nv/conf"
]

local_model = LocalModel(
    serving_container_image_uri=IMAGE_URI,
    serving_container_args=_SERVING_ARGS,
    serving_container_health_route=_EMBEDDINGS_ROUTE,
    serving_container_predict_route=f"{_EMBEDDINGS_ROUTE}/infer",
)

# gpu_count=-1 presumably exposes all host GPUs to the container -- confirm
# against the SDK documentation.
local_endpoint = local_model.deploy_to_local_endpoint(
    gpu_count=-1,
    host_port=8080,
)

# Start the serving container.
local_endpoint.serve()


In [None]:
%%writefile sequences.json
{"inputs":[
        {
            "name":"sequences",
            "data": ["MSLKRKNIALIPAAGIGVRFGADKPKQYVEIGSKTVLEHVL", "MIQSQINRNIRLDLADAILLSKAKKDLSFAEIADGTGLA"],
            "datatype":"BYTES",
            "shape":[2,1]
        }
    ]
}


In [None]:
import json

# Read the request payload written to sequences.json back into a dict and
# display it as the cell result.
with open('sequences.json') as payload_file:
    sequences = json.load(payload_file)

sequences

### Tests with `requests`

In [None]:
# Address of the locally served container and the model to query.
HOST = "localhost"
PORT = 8080
MODEL_PATH = "bionemo_model_embeddings"
#MODEL_PATH="bionemo_model"

URL = f"http://{HOST}:{PORT}/v2/models/{MODEL_PATH}/infer"

# POST the JSON-encoded sequences straight to the inference route.
request_headers = {"Content-Type": "application/json; charset=UTF-8"}
request_body = json.dumps(sequences)
resp = requests.post(URL, data=request_body, headers=request_headers)

# Show the raw response body as the cell result.
resp.text

### Test with local endpoint predict

In [None]:
# Send the same payload through the LocalEndpoint wrapper rather than calling
# the HTTP route directly.
predict_headers = {"Content-Type": "application/json"}
predict_body = json.dumps(sequences)

resp = local_endpoint.predict(
    request=predict_body,
    headers=predict_headers,
    verbose=True,
)

resp.text

### Copy to a new image tag to push to Artifact Registry (AR)

In [None]:
# Target repository in Artifact Registry; the image name and tag are taken
# from the current value of IMAGE_URI.
REPOSITORY = 'custom-container-prediction-vertex'
IMAGE = IMAGE_URI

# Fully qualified destination: <region>-docker.pkg.dev/<project>/<repository>/<image>
_ar_image_uri = f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{IMAGE}"

local_model_ar = local_model.copy_image(_ar_image_uri)

In [None]:
# Display the serving container spec of the copied model before pushing it.
local_model_ar.get_serving_container_spec()


In [None]:
# Push the copied image to its registry URI.
local_model_ar.push_image()


### Upload model to Vertex

In [None]:
from google.cloud.aiplatform import Model

# Register the pushed serving container as a Vertex AI model.
model = Model.upload(display_name="Bionemo esm1nv", local_model=local_model_ar)

### Deploy model

In [None]:
# Deploy the uploaded model on a g2-standard-16 machine with one NVIDIA L4.
endpoint = model.deploy(
    accelerator_count=1,
    accelerator_type="NVIDIA_L4",
    machine_type="g2-standard-16",
)

### Test the default embeddings model on Vertex

In [None]:
ENDPOINT_ID="projects/9452062936/locations/us-central1/endpoints/5288859836811837440"
! gcloud ai endpoints raw-predict $ENDPOINT_ID \
  --region=us-central1 \
  --http-headers=Content-Type=application/json \
  --request @sequences.json

### Test other model (hiddens) deployed on endpoint

In [None]:
# Same request as the embeddings test, reusing ENDPOINT_ID from that cell, but
# with an X-Vertex-Ai-Triton-Redirect header naming the
# `bionemo_model_hiddens` infer route instead of the default model.
! gcloud ai endpoints raw-predict $ENDPOINT_ID \
  --region=us-central1 \
  --http-headers=Content-Type=application/json,"X-Vertex-Ai-Triton-Redirect=v2/models/bionemo_model_hiddens/infer" \
  --request @sequences.json