In [1]:
import sys
sys.path.append('/Users/jack/Documents/Concurrency')
from multiprocessing_practice.setup_logger import logger
import time
import numpy
import onnxruntime as rt

# load dataset
# https://www.onnxruntime.ai/python/tutorial.html
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
iris = load_iris()
X, y = iris.data, iris.target
# A fixed random_state makes the shuffle -- and therefore X_test -- identical
# on every Restart & Run All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# path of the ONNX model file (the model itself is loaded inside run())
filename = 'onnx_models/logreg_iris.onnx'

def run():
    """Build a fresh ONNX Runtime session and score ``X_test`` once.

    A new ``InferenceSession`` is created from ``filename`` on every call, so
    each call includes the cost of loading the model.
    See https://www.onnxruntime.ai/docs/how-to/tune-performance.html

    Returns:
        The first element of the session's outputs for the ``X_test`` batch
        (presumably the predicted labels -- confirm against the exported model).
    """
    sess_options = rt.SessionOptions()
    # Limit ONNX Runtime to a single intra-op thread.
    sess_options.intra_op_num_threads = 1
    sess = rt.InferenceSession(filename, sess_options=sess_options)
    input_name = sess.get_inputs()[0].name

    # Cast the features to float32 before feeding them to the session, and
    # hand the result back instead of dropping it in an unused local.
    return sess.run(None, {input_name: X_test.astype(numpy.float32)})[0]

# inference: time 1000 back-to-back calls of run()
logger.debug('Started')
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
begin = time.perf_counter()
for _ in range(1000):
    run()
end = time.perf_counter()
logger.debug('Finished')
logger.info(f'Time Elapsed : {end-begin}')

2021-06-20 15:20:31 | MainThread |[36m DEBUG    [0m| root | Started
2021-06-20 15:20:32 | MainThread |[36m DEBUG    [0m| root | Finished
2021-06-20 15:20:32 | MainThread |[32m INFO     [0m| root | Time Elapsed : 1.1923861503601074


The cell above is a baseline: it runs the simplest model (a logistic regression on the Iris dataset) 1000 times.

In [2]:
# file:///Users/jack/Downloads/PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb.html
import sys
sys.path.append('/Users/jack/Documents/Concurrency')
from multiprocessing_practice.setup_logger import logger
import time
import numpy
import torch
import onnxruntime as rt

# Every input tensor is reshaped to (1, max_seq_length) inside run().
max_seq_length = 128
# Number of dataset items to run (indices 0 .. total_samples - 1).
total_samples = 100

# NOTE(review): torch.load relies on pickle by default (see the PyTorch docs),
# so only load dataset files that come from a trusted source.
dataset = torch.load('onnx_models/tensor_dataset.pt')

# Path of the ONNX model that run() opens on every call.
filename = 'onnx_models/optimized_model_cpu.onnx'

def run(i):
    """Run the model on dataset item ``i`` using a freshly created session.

    A new single-threaded ``InferenceSession`` is built from ``filename`` on
    every call, so each call also pays for loading the model.

    Args:
        i: Index into ``dataset``.

    Returns:
        Whatever the session's ``run(None, inputs)`` call returns for the item.
    """
    sample = dataset[i]
    # Fields 0, 1 and 2 of the sample become the three named model inputs,
    # each reshaped to a single-row batch.
    rt_inputs = {
        name: sample[position].cpu().reshape(1, max_seq_length).numpy()
        for position, name in enumerate(('input_ids', 'input_mask', 'segment_ids'))
    }

    options = rt.SessionOptions()
    options.intra_op_num_threads = 1
    session = rt.InferenceSession(filename, sess_options=options)

    return session.run(None, rt_inputs)

# inference: one prediction per sample; run(i) reloads the model on every call
logger.debug('Started')
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
begin = time.perf_counter()
preds = [run(i) for i in range(total_samples)]
end = time.perf_counter()
logger.debug('Finished')
logger.info(f'Time Elapsed : {end-begin}')
logger.info(f'Get {len(preds)} predictions')

2021-06-20 15:20:32 | MainThread |[36m DEBUG    [0m| root | Started
2021-06-20 15:21:56 | MainThread |[36m DEBUG    [0m| root | Finished
2021-06-20 15:21:56 | MainThread |[32m INFO     [0m| root | Time Elapsed : 83.88589191436768
2021-06-20 15:21:56 | MainThread |[32m INFO     [0m| root | Get 100 predictions


In the cell above, the model is reloaded for every single inference call. This is probably the worst way to run the model.  

The reason for doing this is to have a fair baseline for the next example: when an inference task is delegated to another process, it is probably necessary to load the model again in that process.

In [3]:
# multiprocessing
import sys
sys.path.append('/Users/jack/Documents/Concurrency')
from multiprocessing_practice.setup_logger import logger
import time
import numpy
import torch
import onnxruntime as rt
import multiprocessing as mp
from multiprocessing import Pool
import psutil

# Every input tensor is reshaped to (1, max_seq_length) inside run().
max_seq_length = 128
# Number of dataset items to run (indices 0 .. total_samples - 1).
total_samples = 100

# NOTE(review): torch.load relies on pickle by default (see the PyTorch docs),
# so only load dataset files that come from a trusted source.
dataset = torch.load('onnx_models/tensor_dataset.pt')
# Path of the ONNX model that run() opens on every call.
filename = 'onnx_models/optimized_model_cpu.onnx'

def run(i):
    """Run the model on dataset item ``i`` using a freshly created session.

    This is the function handed to ``pool.map``; every call builds its own
    single-threaded ``InferenceSession`` from ``filename``.

    Args:
        i: Index into ``dataset``.

    Returns:
        Whatever the session's ``run(None, inputs)`` call returns for the item.
    """
    sample = dataset[i]
    # Fields 0, 1 and 2 of the sample become the three named model inputs,
    # each reshaped to a single-row batch.
    rt_inputs = {
        name: sample[position].cpu().reshape(1, max_seq_length).numpy()
        for position, name in enumerate(('input_ids', 'input_mask', 'segment_ids'))
    }

    options = rt.SessionOptions()
    options.intra_op_num_threads = 1
    session = rt.InferenceSession(filename, sess_options=options)

    return session.run(None, rt_inputs)

# Pool size: the CPU count reported by psutil with logical=False.
num_cpus = psutil.cpu_count(logical=False)
try:
    # Ask multiprocessing to create its workers with the 'fork' start method.
    mp.set_start_method('fork')
except RuntimeError as start_method_error:
    # Not fatal for the notebook (e.g. the cell was already run in this
    # kernel): log the message and carry on.
    logger.warning(start_method_error)

# inference: spread the samples over a pool of worker processes
logger.debug('Started')
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
begin = time.perf_counter()
# The context manager shuts the worker processes down when the block exits;
# without it they stay alive in the kernel after the cell has finished.
with Pool(num_cpus) as pool:
    preds = pool.map(run, range(total_samples))
    # Stop the clock before the pool is torn down so only the work is timed.
    end = time.perf_counter()
logger.debug('Finished')
logger.info(f'Time Elapsed : {end-begin}')
logger.info(f'Get {len(preds)} predictions')

2021-06-20 15:21:56 | MainThread |[36m DEBUG    [0m| root | Started
2021-06-20 15:22:30 | MainThread |[36m DEBUG    [0m| root | Finished
2021-06-20 15:22:30 | MainThread |[32m INFO     [0m| root | Time Elapsed : 33.54128384590149
2021-06-20 15:22:30 | MainThread |[32m INFO     [0m| root | Get 100 predictions


Based on the two examples above (about 34 s with a process pool versus about 84 s sequentially), using multiprocessing does help, and it does so without introducing another level of complexity.

In [4]:
# run preloaded model
import sys
sys.path.append('/Users/jack/Documents/Concurrency')
from multiprocessing_practice.setup_logger import logger
import time
import numpy
import torch
import onnxruntime as rt

# Every input tensor is reshaped to (1, max_seq_length) inside run().
max_seq_length = 128
# Number of dataset items to run (indices 0 .. total_samples - 1).
total_samples = 100

# NOTE(review): torch.load relies on pickle by default (see the PyTorch docs),
# so only load dataset files that come from a trusted source.
dataset = torch.load('onnx_models/tensor_dataset.pt')

# Path of the ONNX model; this cell loads it once, before the inference loop.
filename = 'onnx_models/optimized_model_cpu.onnx'

def run(sess, i):
    """Run an already-created session on dataset item ``i``.

    Args:
        sess: Object whose ``run(output_names, input_feed)`` performs the
            inference; the cell passes an ``rt.InferenceSession``.
        i: Index into ``dataset``.

    Returns:
        Whatever ``sess.run(None, inputs)`` returns for the item.
    """
    sample = dataset[i]
    # Fields 0, 1 and 2 of the sample become the three named model inputs,
    # each reshaped to a single-row batch.
    rt_inputs = {
        name: sample[position].cpu().reshape(1, max_seq_length).numpy()
        for position, name in enumerate(('input_ids', 'input_mask', 'segment_ids'))
    }
    return sess.run(None, rt_inputs)

# inference: load the model once, then reuse the same session for every sample
logger.debug('Started')
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
begin = time.perf_counter()

# Session creation stays inside the timed region, so the one-off model load
# is part of the reported time.
sess_options = rt.SessionOptions()
sess_options.intra_op_num_threads = 1
sess = rt.InferenceSession(filename, sess_options=sess_options)

preds = [run(sess, i) for i in range(total_samples)]

end = time.perf_counter()
logger.debug('Finished')
logger.info(f'Time Elapsed : {end-begin}')
logger.info(f'Get {len(preds)} predictions')

2021-06-20 15:22:30 | MainThread |[36m DEBUG    [0m| root | Started
2021-06-20 15:23:09 | MainThread |[36m DEBUG    [0m| root | Finished
2021-06-20 15:23:09 | MainThread |[32m INFO     [0m| root | Time Elapsed : 39.2286479473114
2021-06-20 15:23:09 | MainThread |[32m INFO     [0m| root | Get 100 predictions


The realistic scenario is to load the model once and reuse it; even on a single process this is not much slower than the previous multiprocessing example.  
These examples make it obvious that loading the model is a time-consuming task.

In [5]:
# multiprocessing with initializer
import sys
sys.path.append('/Users/jack/Documents/Concurrency')
from multiprocessing_practice.setup_logger import logger
import time
import numpy
import torch
import onnxruntime as rt
import multiprocessing as mp
from multiprocessing import Pool
import psutil

# Every input tensor is reshaped to (1, max_seq_length) inside run().
max_seq_length = 128
# Number of dataset items to run (indices 0 .. total_samples - 1).
total_samples = 100

# NOTE(review): torch.load relies on pickle by default (see the PyTorch docs),
# so only load dataset files that come from a trusted source.
dataset = torch.load('onnx_models/tensor_dataset.pt')
# Path of the ONNX model that init() loads.
filename = 'onnx_models/optimized_model_cpu.onnx'

def run(i):
    """Run the module-level session on dataset item ``i``.

    The function builds no session of its own; it uses the global ``sess``,
    which ``init()`` assigns.

    Args:
        i: Index into ``dataset``.

    Returns:
        Whatever ``sess.run(None, inputs)`` returns for the item.
    """
    global sess
    sample = dataset[i]
    # Fields 0, 1 and 2 of the sample become the three named model inputs,
    # each reshaped to a single-row batch.
    rt_inputs = {
        name: sample[position].cpu().reshape(1, max_seq_length).numpy()
        for position, name in enumerate(('input_ids', 'input_mask', 'segment_ids'))
    }
    return sess.run(None, rt_inputs)

# Placeholder for the inference session; init() replaces it.
sess = None


def init():
    """Create the single-threaded inference session and store it in ``sess``.

    Passed to ``Pool`` as ``initializer``. Loads the model from ``filename``,
    assigns the session to the module-level ``sess`` and logs ``'Init'``.
    """
    global sess
    options = rt.SessionOptions()
    options.intra_op_num_threads = 1
    sess = rt.InferenceSession(filename, sess_options=options)
    logger.info('Init')


# Pool size: the CPU count reported by psutil with logical=False.
num_cpus = psutil.cpu_count(logical=False)
try:
    # Ask multiprocessing to create its workers with the 'fork' start method.
    mp.set_start_method('fork')
except RuntimeError as start_method_error:
    # Not fatal for the notebook (e.g. the start method was already set in
    # this kernel): log the message and carry on.
    logger.warning(start_method_error)

# inference: each worker loads the model once in init(), then serves many samples
logger.debug('Started')
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
begin = time.perf_counter()
# The context manager shuts the worker processes (and the sessions they hold)
# down when the block exits; without it they stay alive after the cell ends.
with Pool(num_cpus, initializer=init) as pool:
    preds = pool.map(run, range(total_samples))
    # Stop the clock before the pool is torn down so only the work is timed.
    end = time.perf_counter()
logger.debug('Finished')
logger.info(f'Time Elapsed : {end-begin}')
logger.info(f'Get {len(preds)} predictions')

2021-06-20 15:23:09 | MainThread |[36m DEBUG    [0m| root | Started
2021-06-20 15:23:11 | MainThread |[32m INFO     [0m| root | Init
2021-06-20 15:23:11 | MainThread |[32m INFO     [0m| root | Init
2021-06-20 15:23:11 | MainThread |[32m INFO     [0m| root | Init
2021-06-20 15:23:11 | MainThread |[32m INFO     [0m| root | Init
2021-06-20 15:23:11 | MainThread |[32m INFO     [0m| root | Init
2021-06-20 15:23:11 | MainThread |[32m INFO     [0m| root | Init
2021-06-20 15:23:11 | MainThread |[32m INFO     [0m| root | Init
2021-06-20 15:23:11 | MainThread |[32m INFO     [0m| root | Init
2021-06-20 15:23:18 | MainThread |[36m DEBUG    [0m| root | Finished
2021-06-20 15:23:18 | MainThread |[32m INFO     [0m| root | Time Elapsed : 9.415037870407104
2021-06-20 15:23:18 | MainThread |[32m INFO     [0m| root | Get 100 predictions


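Because the multiprocessing library lets a pool take an initializer function, the model is loaded only once per worker process, at the beginning (one `Init` log line per worker), instead of once per sample.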