From 1d71d3ac5849ed46db782aa11ad95b1ee2f08915 Mon Sep 17 00:00:00 2001
From: vishal <vishalbollu@users.noreply.github.com>
Date: Fri, 4 Sep 2020 18:44:07 -0400
Subject: [PATCH 1/3] Update serve.py

---
 pkg/workloads/cortex/serve/serve.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/workloads/cortex/serve/serve.py b/pkg/workloads/cortex/serve/serve.py
index c64df9191b..716ed7f9cd 100644
--- a/pkg/workloads/cortex/serve/serve.py
+++ b/pkg/workloads/cortex/serve/serve.py
@@ -214,7 +214,7 @@ def predict(request: Request):
 
     if util.has_method(predictor_impl, "post_predict"):
         kwargs = build_post_predict_kwargs(prediction, request)
-        tasks.add_task(predictor_impl.post_predict, **kwargs)
+        request_thread_pool.submit(predictor_impl.post_predict, **kwargs)
 
     if len(tasks.tasks) > 0:
         response.background = tasks

From 8bf472ba9d87c357f5aa6474044a1e2029bac219 Mon Sep 17 00:00:00 2001
From: vishal <vishalbollu@users.noreply.github.com>
Date: Wed, 23 Sep 2020 11:38:49 -0400
Subject: [PATCH 2/3] Update predictors.md

---
 docs/deployments/realtime-api/predictors.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/docs/deployments/realtime-api/predictors.md b/docs/deployments/realtime-api/predictors.md
index 61beff0773..afd00ffbd0 100644
--- a/docs/deployments/realtime-api/predictors.md
+++ b/docs/deployments/realtime-api/predictors.md
@@ -86,6 +86,9 @@ class PythonPredictor:
         Useful for tasks that the client doesn't need to wait on before
         receiving a response such as recording metrics or storing results.
 
+        It is recommended to specify multiple threads `threads_per_process`
+        in the api configuration yaml if this function is specified.
+
         Args:
             response (optional): The response as returned by the predict method.
             payload (optional): The request payload (see below for the possible
@@ -245,6 +248,9 @@ class TensorFlowPredictor:
         Useful for tasks that the client doesn't need to wait on before
         receiving a response such as recording metrics or storing results.
 
+        It is recommended to specify multiple threads `threads_per_process`
+        in the api configuration yaml if this function is specified.
+
         Args:
             response (optional): The response as returned by the predict method.
             payload (optional): The request payload (see below for the possible
@@ -353,6 +359,9 @@ class ONNXPredictor:
         Useful for tasks that the client doesn't need to wait on before
         receiving a response such as recording metrics or storing results.
 
+        It is recommended to specify multiple threads `threads_per_process`
+        in the api configuration yaml if this function is specified.
+
         Args:
             response (optional): The response as returned by the predict method.
             payload (optional): The request payload (see below for the possible

From 5822b14f2fe6466ab38f1dd28ba1facbb6fdbff7 Mon Sep 17 00:00:00 2001
From: vishal <vishalbollu@users.noreply.github.com>
Date: Wed, 23 Sep 2020 14:49:53 -0400
Subject: [PATCH 3/3] Update predictors.md

---
 docs/deployments/realtime-api/predictors.md | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/docs/deployments/realtime-api/predictors.md b/docs/deployments/realtime-api/predictors.md
index afd00ffbd0..0dcba3529d 100644
--- a/docs/deployments/realtime-api/predictors.md
+++ b/docs/deployments/realtime-api/predictors.md
@@ -86,8 +86,9 @@ class PythonPredictor:
         Useful for tasks that the client doesn't need to wait on before
         receiving a response such as recording metrics or storing results.
 
-        It is recommended to specify multiple threads `threads_per_process`
-        in the api configuration yaml if this function is specified.
+        Note: post_predict() and predict() run in the same thread pool. The
+        size of the thread pool can be increased by updating
+        `threads_per_process` in the API configuration YAML.
 
         Args:
             response (optional): The response as returned by the predict method.
@@ -248,8 +249,9 @@ class TensorFlowPredictor:
         Useful for tasks that the client doesn't need to wait on before
         receiving a response such as recording metrics or storing results.
 
-        It is recommended to specify multiple threads `threads_per_process`
-        in the api configuration yaml if this function is specified.
+        Note: post_predict() and predict() run in the same thread pool. The
+        size of the thread pool can be increased by updating
+        `threads_per_process` in the API configuration YAML.
 
         Args:
             response (optional): The response as returned by the predict method.
@@ -359,8 +361,9 @@ class ONNXPredictor:
         Useful for tasks that the client doesn't need to wait on before
         receiving a response such as recording metrics or storing results.
 
-        It is recommended to specify multiple threads `threads_per_process`
-        in the api configuration yaml if this function is specified.
+        Note: post_predict() and predict() run in the same thread pool. The
+        size of the thread pool can be increased by updating
+        `threads_per_process` in the API configuration YAML.
 
         Args:
             response (optional): The response as returned by the predict method.