Import the necessary packages into the workspace.

In [None]:
import numpy as np
from pprint import pprint

# for how to install tritonclient, refer to https://github.com/triton-inference-server/client
import tritonclient.http as httpclient

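* Start tritonserver with the [example model](https://github.com/aws/amazon-sagemaker-examples/tree/main/sagemaker-triton/resnet50/workspace). Note that the commands below load only the `resnet` model; the `simple_string`, `simple_identity` and `simple` models called by the later cells must also be present in the model repository and loaded.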

```bash
# docker images
REPOSITORY                    TAG         IMAGE ID       CREATED        SIZE
nvcr.io/nvidia/pytorch        24.01-py3   8470a68886ff   6 months ago   22GB
nvcr.io/nvidia/tritonserver   24.01-py3   4f00c79d1c09   6 months ago   14.7GB

# docker run --gpus all -t -d --rm --ipc=host --network=host --ulimit memlock=-1 --ulimit stack=67108864 -v /home/cuiyongbo/tensorflow-workspace/data-store/huggingface-store:/model-store nvcr.io/nvidia/tritonserver:24.01-py3

# docker container list
CONTAINER ID   IMAGE                                   COMMAND                  CREATED          STATUS          PORTS     NAMES
0269f37660a5   nvcr.io/nvidia/tritonserver:24.01-py3   "/opt/nvidia/nvidia_…"   12 seconds ago   Up 11 seconds             quirky_heyrovsky

# docker exec -u root -it 0269f37660a5 bash

# model hierarchy:
# tree /model-store/amazon-sagemaker-examples/sagemaker-triton/resnet50/triton-serve-pt
/model-store/amazon-sagemaker-examples/sagemaker-triton/resnet50/triton-serve-pt
└── resnet
    ├── 1
    │   └── model.pt
    └── config.pbtxt

2 directories, 2 files

# tritonserver --model-store=/model-store/amazon-sagemaker-examples/sagemaker-triton/resnet50/triton-serve-pt --allow-metrics=false --allow-grpc=false --model-control-mode=explicit --load-model=resnet
```

In [None]:
# Build the HTTP client that the inference cells below send their requests through.
url = "localhost:8000"  # tritonserver's address
verbose = True
triton_client = httpclient.InferenceServerClient(url=url, verbose=verbose)

In [None]:
# Call the "simple_string" model with two [1, 16] BYTES inputs that carry the
# decimal text of int32 values, then check that OUTPUT0 / OUTPUT1 parse back
# to the element-wise sum / difference of those values.
model_name = "simple_string"

# Numeric operands (0..15 and all ones, both shaped [1, 16]) and the results
# the outputs are compared against.
in0 = np.arange(16, dtype=np.int32).reshape(1, 16)
in1 = np.ones((1, 16), dtype=np.int32)
expected_sum = in0 + in1
expected_diff = in0 - in1

# Encode every value as UTF-8 text inside an object array, matching the
# "BYTES" datatype declared on the inputs.
in0n = np.array([str(v).encode("utf-8") for v in in0.ravel()], dtype=np.object_)
in1n = np.array([str(v).encode("utf-8") for v in in1.ravel()], dtype=np.object_)

# INPUT0 is attached with binary_data=True, INPUT1 with binary_data=False.
input0 = httpclient.InferInput("INPUT0", [1, 16], "BYTES")
input0.set_data_from_numpy(in0n.reshape(in0.shape), binary_data=True)
input1 = httpclient.InferInput("INPUT1", [1, 16], "BYTES")
input1.set_data_from_numpy(in1n.reshape(in1.shape), binary_data=False)
inputs = [input0, input1]

outputs = [
    httpclient.InferRequestedOutput(output_name, binary_data=True)
    for output_name in ("OUTPUT0", "OUTPUT1")
]

results = triton_client.infer(model_name=model_name, inputs=inputs, outputs=outputs)

output0_data = results.as_numpy("OUTPUT0")
output1_data = results.as_numpy("OUTPUT1")

# Walk the elements in order and stop at the first one that disagrees; the
# sum is checked before the difference for each position.
for idx in range(in0.size):
    got_sum = int(output0_data[0][idx])
    if got_sum != expected_sum[0][idx]:
        print("error: incorrect sum")
        break
    got_diff = int(output1_data[0][idx])
    if got_diff != expected_diff[0][idx]:
        print("error: incorrect difference")
        break



In [None]:
# Introspection helpers kept for interactive exploration of the result object:
#dir(results)
#help(results.get_output)
#help(results.as_numpy)
#results.get_output("OUTPUT0")
#results.as_numpy("OUTPUT0")

# np.array_equal returns a bool. Only the last expression of a cell is echoed
# automatically, so the result is printed explicitly instead of being dropped.
print(np.array_equal(expected_diff, output1_data))

# Show values together with their dtypes so that a difference in element type
# between the expectation and the server output is visible.
print(expected_diff, expected_diff.dtype)
print(output1_data, output1_data.dtype)

In [None]:
# Send a [1, 16] BYTES tensor through the "simple_identity" model and report
# when the tensor that comes back differs from the one that was sent.
model_name = "simple_identity"

# Sixteen copies of the UTF-8 bytes for "hello", arranged as a single row.
null_char_array = np.array(["hello".encode("utf-8")] * 16, dtype=np.object_)
null_char_data = null_char_array.reshape(1, 16)

identity_input = httpclient.InferInput("INPUT0", [1, 16], "BYTES")
identity_input.set_data_from_numpy(null_char_data, binary_data=True)
inputs = [identity_input]

outputs = [httpclient.InferRequestedOutput("OUTPUT0", binary_data=True)]

results = triton_client.infer(model_name=model_name, inputs=inputs, outputs=outputs)
output0_data = results.as_numpy("OUTPUT0")

round_trip_ok = np.array_equal(null_char_data, output0_data)
if not round_trip_ok:
    print("incorrect output:", output0_data)

In [None]:
# Health probes, in order: server liveness, server readiness, then readiness
# of the model currently named by `model_name`.
for probe in (triton_client.is_server_live, triton_client.is_server_ready):
    print(probe())
print(triton_client.is_model_ready(model_name=model_name))

# `meta_data` first holds the server metadata and is then rebound to the
# metadata of the model named by `model_name`.
meta_data = triton_client.get_server_metadata()
pprint(meta_data)

meta_data = triton_client.get_model_metadata(model_name=model_name)
pprint(meta_data)

# Index of the models known to the server's model repository.
model_repo = triton_client.get_model_repository_index()
pprint(model_repo)


In [None]:

# BE CAUTIOUS! make sure you run the code in the same machine where tritonserver resides

import tritonclient.utils.shared_memory as shm
from tritonclient import utils

# Clear system shared-memory registrations that may be left on the server
# (for example from an earlier, interrupted run of this cell).
#triton_client.unregister_cuda_shared_memory()
triton_client.unregister_system_shared_memory()

model_name = "simple"
model_version = ""

input0_data = np.arange(start=0, stop=16, dtype=np.int32)
input1_data = np.ones(shape=16, dtype=np.int32)

# Both inputs have the same byte size; each region below is allocated at twice
# that size so it can hold two tensors back to back.
input_byte_size = input0_data.size * input0_data.itemsize
output_byte_size = input_byte_size

# Handles start as None so the cleanup in `finally` only destroys regions that
# were actually created.
shm_op_handle = None
shm_ip_handle = None
try:
    # Output region: OUTPUT0 at offset 0, OUTPUT1 at offset output_byte_size.
    shm_op_handle = shm.create_shared_memory_region("output_data", "/output_simple", output_byte_size*2)
    triton_client.register_system_shared_memory("output_data", "/output_simple", output_byte_size*2)

    # Input region: INPUT0 at offset 0, INPUT1 at offset input_byte_size.
    shm_ip_handle = shm.create_shared_memory_region("input_data", "/input_simple", input_byte_size*2)
    shm.set_shared_memory_region(shm_ip_handle, [input0_data])
    shm.set_shared_memory_region(shm_ip_handle, [input1_data], offset=input_byte_size)
    triton_client.register_system_shared_memory("input_data", "/input_simple", input_byte_size*2)

    # The request carries only references into the registered regions.
    inputs = []
    inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32"))
    inputs[-1].set_shared_memory("input_data", input_byte_size)
    inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32"))
    inputs[-1].set_shared_memory("input_data", input_byte_size, offset=input_byte_size)

    outputs = []
    outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True))
    outputs[-1].set_shared_memory("output_data", output_byte_size)
    outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=True))
    outputs[-1].set_shared_memory("output_data", output_byte_size, offset=output_byte_size)

    results = triton_client.infer(
        model_name=model_name,
        inputs=inputs,
        model_version=model_version,
        outputs=outputs,
    )

    # The response supplies datatype and shape; the values are read from the
    # output region. .copy() detaches them from the region so the arrays remain
    # usable after the region is destroyed below.
    output0 = results.get_output("OUTPUT0")
    output0_data = shm.get_contents_as_numpy(shm_op_handle,
                                             utils.triton_to_np_dtype(output0["datatype"]),
                                             output0["shape"]).copy()

    output1 = results.get_output("OUTPUT1")
    output1_data = shm.get_contents_as_numpy(shm_op_handle,
                                             utils.triton_to_np_dtype(output1["datatype"]),
                                             output1["shape"], offset=output_byte_size).copy()
finally:
    # Release resources whether or not inference succeeded: unregister on the
    # server side first, then destroy the local regions even if that call fails.
    try:
        triton_client.unregister_system_shared_memory()
    finally:
        for handle in (shm_ip_handle, shm_op_handle):
            if handle is not None:
                shm.destroy_shared_memory_region(handle)
