diff --git a/docs/deployments/realtime-api/predictors.md b/docs/deployments/realtime-api/predictors.md
index 61beff0773..0dcba3529d 100644
--- a/docs/deployments/realtime-api/predictors.md
+++ b/docs/deployments/realtime-api/predictors.md
@@ -86,6 +86,10 @@ class PythonPredictor:
         Useful for tasks that the client doesn't need to wait on before receiving a response
         such as recording metrics or storing results.
 
+        Note: post_predict() and predict() run in the same thread pool. The
+        size of the thread pool can be increased by updating
+        `threads_per_process` in the api configuration yaml.
+
         Args:
             response (optional): The response as returned by the predict method.
             payload (optional): The request payload (see below for the possible
@@ -245,6 +249,10 @@ class TensorFlowPredictor:
         Useful for tasks that the client doesn't need to wait on before receiving a response
         such as recording metrics or storing results.
 
+        Note: post_predict() and predict() run in the same thread pool. The
+        size of the thread pool can be increased by updating
+        `threads_per_process` in the api configuration yaml.
+
         Args:
             response (optional): The response as returned by the predict method.
             payload (optional): The request payload (see below for the possible
@@ -353,6 +361,10 @@ class ONNXPredictor:
         Useful for tasks that the client doesn't need to wait on before receiving a response
         such as recording metrics or storing results.
 
+        Note: post_predict() and predict() run in the same thread pool. The
+        size of the thread pool can be increased by updating
+        `threads_per_process` in the api configuration yaml.
+
         Args:
             response (optional): The response as returned by the predict method.
             payload (optional): The request payload (see below for the possible
diff --git a/pkg/workloads/cortex/serve/serve.py b/pkg/workloads/cortex/serve/serve.py
index c64df9191b..716ed7f9cd 100644
--- a/pkg/workloads/cortex/serve/serve.py
+++ b/pkg/workloads/cortex/serve/serve.py
@@ -214,7 +214,7 @@ def predict(request: Request):
 
     if util.has_method(predictor_impl, "post_predict"):
         kwargs = build_post_predict_kwargs(prediction, request)
-        tasks.add_task(predictor_impl.post_predict, **kwargs)
+        request_thread_pool.submit(predictor_impl.post_predict, **kwargs)
 
     if len(tasks.tasks) > 0:
         response.background = tasks