# Example Notebook to show how to use RAPIDS+PyTorch with Triton

This notebook calls an ensemble model which uses RAPIDS+PyTorch with Triton.


<img src="notebook_images/ensemble_rapids_simple.jpg" width="300" height="400">

### Client Setup

In [None]:
# !pip install nvidia-pyindex
# !pip install tritonclient[all]

### Import Libraries

In [1]:
import numpy as np
import json

import grpc
from tritonclient.grpc import service_pb2
from tritonclient.grpc import service_pb2_grpc
import tritonclient.grpc as grpcclient
from functools import partial

###  Connect to the Triton End to End Model 

In [2]:
# Address of the Triton gRPC endpoint; every request function in this notebook reads `url`.
url='localhost:8001'

# High-level Triton client bound to that endpoint.
triton_client = grpcclient.InferenceServerClient(url=url,verbose=False)

# Raw gRPC channel and service stub, used for the low-level ModelMetadata call.
channel = grpc.insecure_channel(url)
grpc_stub = service_pb2_grpc.GRPCInferenceServiceStub(channel)

In [3]:
# Name of the model whose metadata is inspected; swap in the commented-out
# alternative to inspect a different model.
#preprocessing_model = 'end_to_end_onnx'
preprocessing_model = 'huggingface_tokenizer'
# Ask the server for the metadata of version '1' of that model through the raw stub.
request = service_pb2.ModelMetadataRequest(name=preprocessing_model,
                                           version='1')
response = grpc_stub.ModelMetadata(request)
# Print the response so the model's input/output names, datatypes and shapes are visible.
print("model metadata:\n{}".format(response))

model metadata:
name: "huggingface_tokenizer"
versions: "1"
platform: "python"
inputs {
  name: "product_reviews"
  datatype: "BYTES"
  shape: -1
}
outputs {
  name: "input_ids"
  datatype: "INT32"
  shape: -1
  shape: 256
}
outputs {
  name: "attention_mask"
  datatype: "INT32"
  shape: -1
  shape: 256
}



## Send Request to Model 

### Prepare Input 

In [4]:
# Sample product reviews; raise the trailing multiplier to repeat them into a larger batch.
log_ls = [
    'The product is great',
    'This product is bad',
    'This product is good',
    'This product is really bad',
] * 1
# Encode every review to UTF-8 bytes before it is packed into the array.
log_ls = [review.encode('utf-8') for review in log_ls]
# Lay the encoded reviews out as a single row: shape (1, number_of_reviews).
n_reviews = len(log_ls)
log_ar = np.array(log_ls).reshape(1, n_reviews)

### Request Sending Function

In [5]:
def send_preprocess_request(log_ar, model_name='rapids_tokenizer'):
    """Send reviews to a tokenizer model and return the inference result.

    Parameters
    ----------
    log_ar : array with a ``shape`` attribute
        Data for the "product_reviews" BYTES input; its shape is used as the
        declared input shape.
    model_name : str
        Name of the model to call.

    Returns
    -------
    The object returned by ``InferenceServerClient.infer``, requested with the
    'input_ids' and 'attention_mask' outputs.
    """
    client = grpcclient.InferenceServerClient(url=url, verbose=False)

    reviews_input = grpcclient.InferInput("product_reviews", log_ar.shape, "BYTES")
    reviews_input.set_data_from_numpy(log_ar)

    # Ask for both tensors the tokenizer produces.
    requested = [
        grpcclient.InferRequestedOutput(tensor_name)
        for tensor_name in ('input_ids', 'attention_mask')
    ]

    return client.infer(
        model_name=model_name,
        inputs=[reviews_input],
        outputs=requested,
    )

In [6]:
%%timeit
output = send_preprocess_request(log_ar.squeeze())

3.74 ms ± 239 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
def send_inference_request(log_ar, model_name='end_to_end_pytorch'):
    """Send reviews to an end-to-end model and return the inference result.

    Parameters
    ----------
    log_ar : array with a ``shape`` attribute
        Data for the "product_reviews" BYTES input; its shape is used as the
        declared input shape.
    model_name : str
        Name of the model to call.

    Returns
    -------
    The object returned by ``InferenceServerClient.infer``, requested with the
    'preds' output.
    """
    client = grpcclient.InferenceServerClient(url=url, verbose=False)

    reviews_input = grpcclient.InferInput("product_reviews", log_ar.shape, "BYTES")
    reviews_input.set_data_from_numpy(log_ar)

    # Only the prediction tensor is requested.
    requested = [grpcclient.InferRequestedOutput('preds')]

    return client.infer(
        model_name=model_name,
        inputs=[reviews_input],
        outputs=requested,
    )


In [8]:
%%timeit
output = send_inference_request(log_ar,'end_to_end_onnx')
#output.as_numpy('preds')

10.3 ms ± 63.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [9]:
%%timeit
output = send_inference_request(log_ar,'end_to_end_pytorch')

25.5 ms ± 296 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


##  Predictions

##### 1 is positive, 0 is negative

In [10]:
output = send_inference_request(log_ar,'end_to_end_pytorch')
output.as_numpy('preds')

array([[1, 0, 1, 0]], dtype=int32)

In [11]:
output = send_inference_request(log_ar,'end_to_end_onnx')
output.as_numpy('preds')

array([[1, 0, 1, 0]], dtype=int32)

# **added by tugrulkonuk**

In [None]:
#!pip install transformers

### Huggingface tokenizer

In [None]:
from transformers import BertTokenizerFast, BertTokenizer
#tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', do_lower_case=False)

##### Random inference

In [None]:
# Shape of the synthetic batch: number of sequences and tokens per sequence.
batch_size = 4
seq_length = 256
# 'input_ids' input; its data is not set in this cell.
input0 = grpcclient.InferInput('input_ids', (batch_size, seq_length), 'INT32')

# 'attention_mask' input filled with ones.
input1 = grpcclient.InferInput('attention_mask', (batch_size, seq_length), 'INT32')
input1.set_data_from_numpy(np.ones((batch_size, seq_length), dtype=np.int32))

# Requested output descriptor for the 'preds' tensor.
output = grpcclient.InferRequestedOutput('preds')

def run_random_inference(model_name='sentiment_model_pytorch'):
    """Send one batch of random token ids to a model and return the result.

    Fills the notebook-level ``input0`` with random ids drawn from
    ``[0, 10000)`` with shape ``(batch_size, seq_length)`` and submits it
    together with the notebook-level ``input1``.

    The requested output is built here on every call instead of being read
    from the notebook-level ``output`` name. Cells that run
    ``output = run_random_inference()`` rebind that name to an inference
    result, which would otherwise be passed back as the requested output on
    the next call.

    Parameters
    ----------
    model_name : str
        Name of the model to call.

    Returns
    -------
    The object returned by ``InferenceServerClient.infer``, requested with the
    'preds' output.
    """
    triton_client = grpcclient.InferenceServerClient(url=url, verbose=False)
    random_ids = np.random.randint(10000, size=(batch_size, seq_length), dtype=np.int32)
    input0.set_data_from_numpy(random_ids)
    requested_output = grpcclient.InferRequestedOutput('preds')
    return triton_client.infer(
        model_name=model_name,
        inputs=[input0, input1],
        outputs=[requested_output],
    )

In [None]:
output = run_random_inference()
output.as_numpy('preds')

In [None]:
# run an inference with a defined Huggingface tokenizer
def run_inference(input_ls, tokenizer=None, model_name='sentiment_model_pytorch'):
    """Tokenize reviews locally and run the sentiment model on Triton.

    Parameters
    ----------
    input_ls : list of str
        Texts to classify.
    tokenizer : callable, optional
        Tokenizer called as ``tokenizer(input_ls, truncation=True,
        max_length=seq_length, padding='max_length')``. When omitted, a
        ``BertTokenizerFast`` for 'bert-base-uncased' (``do_lower_case=False``)
        is loaded on first use and cached on the function. Loading it lazily
        means defining this function no longer loads a tokenizer, and a
        caller-supplied tokenizer never pays for the default one.
    model_name : str
        Name of the model to call.

    Returns
    -------
    tuple
        ``(tokens, result)`` where ``tokens`` is a plain dict of the tokenizer
        output and ``result`` is the object returned by
        ``InferenceServerClient.infer``, requested with the 'preds' output.

    Notes
    -----
    Reads the notebook-level ``seq_length`` for the padded sequence length.
    """
    if tokenizer is None:
        tokenizer = getattr(run_inference, '_default_tokenizer', None)
        if tokenizer is None:
            tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', do_lower_case=False)
            # Cache so repeated default calls do not reload the tokenizer.
            run_inference._default_tokenizer = tokenizer

    triton_client = grpcclient.InferenceServerClient(url=url, verbose=False)

    # Every sequence is truncated/padded to exactly seq_length tokens so the
    # arrays match the input shape declared below.
    encoded = tokenizer(input_ls, truncation=True, max_length=seq_length, padding='max_length')
    output_d = dict(encoded.items())

    batch_shape = (len(input_ls), seq_length)
    input0 = grpcclient.InferInput('input_ids', batch_shape, 'INT32')
    input1 = grpcclient.InferInput('attention_mask', batch_shape, 'INT32')

    # The inputs are declared INT32, so cast the token lists to int32 arrays.
    input0.set_data_from_numpy(np.array(output_d["input_ids"]).astype(np.int32))
    input1.set_data_from_numpy(np.array(output_d["attention_mask"]).astype(np.int32))

    requested_output = grpcclient.InferRequestedOutput('preds')

    result = triton_client.infer(
        model_name=model_name,
        inputs=[input0, input1],
        outputs=[requested_output],
    )
    return output_d, result

In [22]:
# Sample reviews kept as plain str; run_inference hands them straight to the tokenizer.
log_ls = ['The product is great', 'This product is bad','This product is good', 'This product is really bad']*1
# NOTE(review): this passes a 'bert-base-cased' tokenizer, while run_inference's
# default is 'bert-base-uncased' — confirm which vocabulary the served model expects.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased', do_lower_case=False)
# run_inference returns (tokenizer output, inference result).
tokens, output = run_inference(log_ls, tokenizer)
output.as_numpy('preds')

NameError: name 'BertTokenizerFast' is not defined

##  Predictions

##### 1 is positive, 0 is negative

In [27]:
#%%timeit
log_ls = ['The product is great', 'This product is bad','This product is good', 'This product is really bad']*1

def send_inference_request_hugf(txt_ls, model_name='end_to_end_pytorch_hugf'):
    """Send a list of raw review strings to an end-to-end model.

    Parameters
    ----------
    txt_ls : list
        Reviews to classify; wrapped into a single-row object array of shape
        ``(1, len(txt_ls))`` for the "product_reviews" BYTES input.
    model_name : str
        Name of the model to call.

    Returns
    -------
    The object returned by ``InferenceServerClient.infer``, requested with the
    'preds' output.
    """
    client = grpcclient.InferenceServerClient(url=url, verbose=False)

    reviews_input = grpcclient.InferInput(
        "product_reviews",
        shape=(1, len(txt_ls)),
        datatype="BYTES",
    )
    # One row holding every review, stored with dtype=object.
    reviews_row = np.asarray([txt_ls], dtype=object)
    reviews_input.set_data_from_numpy(reviews_row)

    requested = [grpcclient.InferRequestedOutput('preds')]

    return client.infer(
        model_name=model_name,
        inputs=[reviews_input],
        outputs=requested,
    )


In [28]:
# NOTE(review): send_inference_request_hugf does not read seq_length; this
# assignment only affects run_inference, which reads the notebook-level value.
seq_length = 256
# Send the raw review strings and show the 'preds' output as a numpy array.
output = send_inference_request_hugf(log_ls)
output.as_numpy('preds')

array([[1, 0, 1, 0]], dtype=int32)

In [None]:
print(len(log_ls[0]))