diff --git a/output/openapi/elasticsearch-openapi.json b/output/openapi/elasticsearch-openapi.json
index 2ebd69081f..c5e6b59eae 100644
--- a/output/openapi/elasticsearch-openapi.json
+++ b/output/openapi/elasticsearch-openapi.json
@@ -20437,7 +20437,7 @@
         "tags": [
           "inference"
         ],
-        "summary": "Perform chat completion inference\n",
+        "summary": "Perform chat completion inference on the service\n",
         "description": "The chat completion inference API enables real-time responses for chat completion tasks by delivering answers incrementally, reducing response times during computation.\nIt only works with the `chat_completion` task type for `openai` and `elastic` inference services.\n\nNOTE: The `chat_completion` task type is only available within the _stream API and only supports streaming.\nThe Chat completion inference API and the Stream inference API differ in their response structure and capabilities.\nThe Chat completion inference API provides more comprehensive customization options through more fields and function calling support.\nIf you use the `openai`, `hugging_face` or the `elastic` service, use the Chat completion inference API.",
         "operationId": "inference-chat-completion-unified",
         "parameters": [
@@ -20522,7 +20522,8 @@
         "tags": [
           "inference"
         ],
-        "summary": "Perform completion inference on the service",
+        "summary": "Perform completion inference on the service\n",
+        "description": "Get responses for completion tasks.\nThis API works only with the completion task type.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Azure, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face. For built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models. However, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThis API requires the `monitor_inference` cluster privilege (the built-in `inference_admin` and `inference_user` roles grant this privilege).",
         "operationId": "inference-completion",
         "parameters": [
           {
@@ -20583,7 +20584,7 @@
               "examples": {
                 "CompletionRequestExample1": {
                   "summary": "Completion task",
-                  "description": "Run `POST _inference/completion/openai_chat_completions` to perform a completion on the example question.",
+                  "description": "Run `POST _inference/completion/openai_completions` to perform a completion on the example question.",
                   "value": "{\n  \"input\": \"What is Elastic?\"\n}"
                 }
               }
@@ -20602,7 +20603,7 @@
                 "examples": {
                   "CompletionResponseExample1": {
                     "summary": "Completion task",
-                    "description": "A successful response from `POST _inference/completion/openai_chat_completions`.\n",
+                    "description": "A successful response from `POST _inference/completion/openai_completions`.\n",
                     "value": "{\n  \"completion\": [\n    {\n      \"result\": \"Elastic is a company that provides a range of software solutions for search, logging, security, and analytics. Their flagship product is Elasticsearch, an open-source, distributed search engine that allows users to search, analyze, and visualize large volumes of data in real-time. Elastic also offers products such as Kibana, a data visualization tool, and Logstash, a log management and pipeline tool, as well as various other tools and solutions for data analysis and management.\"\n    }\n  ]\n}"
                   }
                 }
@@ -23657,18 +23658,19 @@
                     "type": "string"
                   },
                   "input": {
-                    "description": "The text on which you want to perform the inference task.\nIt can be a single string or an array.\n\n> info\n> Inference endpoints for the `completion` task type currently only support a single string as input.",
-                    "oneOf": [
-                      {
-                        "type": "string"
-                      },
-                      {
-                        "type": "array",
-                        "items": {
-                          "type": "string"
-                        }
-                      }
-                    ]
+                    "description": "The documents to rank.",
+                    "type": "array",
+                    "items": {
+                      "type": "string"
+                    }
+                  },
+                  "return_documents": {
+                    "description": "Include the document text in the response.",
+                    "type": "boolean"
+                  },
+                  "top_n": {
+                    "description": "Limit the response to the top N documents.",
+                    "type": "number"
                   },
                   "task_settings": {
                     "description": "Task settings for the individual inference request.\nThese settings are specific to the task type you specified and override the task settings specified when initializing the service.",
@@ -23850,7 +23852,7 @@
         "tags": [
           "inference"
         ],
-        "summary": "Perform streaming inference",
+        "summary": "Perform streaming completion inference on the service\n",
         "description": "Get real-time responses for completion tasks by delivering answers incrementally, reducing response times during computation.\nThis API works only with the completion task type.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Azure, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face. For built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models. However, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThis API requires the `monitor_inference` cluster privilege (the built-in `inference_admin` and `inference_user` roles grant this privilege). You must use a client that supports streaming.\n\n## Required authorization\n\n* Cluster privileges: `monitor_inference`\n",
         "operationId": "inference-stream-completion",
         "parameters": [
@@ -147820,7 +147822,7 @@
             "examples": {
               "InferencePutExample1": {
                 "description": "An example body for a `PUT _inference/rerank/my-rerank-model` request.",
-                "value": "{\n \"service\": \"cohere\",\n \"service_settings\": {\n   \"model_id\": \"rerank-english-v3.0\",\n   \"api_key\": \"{{COHERE_API_KEY}}\"\n },\n \"chunking_settings\": {\n   \"strategy\": \"recursive\",\n   \"max_chunk_size\": 200,\n   \"separator_group\": \"markdown\"\n }\n}"
+                "value": "{\n \"service\": \"cohere\",\n \"service_settings\": {\n   \"model_id\": \"rerank-english-v3.0\",\n   \"api_key\": \"{{COHERE_API_KEY}}\"\n }\n}"
               }
             }
           }
diff --git a/output/openapi/elasticsearch-serverless-openapi.json b/output/openapi/elasticsearch-serverless-openapi.json
index f2d75687cc..7d9815284b 100644
--- a/output/openapi/elasticsearch-serverless-openapi.json
+++ b/output/openapi/elasticsearch-serverless-openapi.json
@@ -11418,7 +11418,7 @@
         "tags": [
           "inference"
         ],
-        "summary": "Perform chat completion inference\n",
+        "summary": "Perform chat completion inference on the service\n",
         "description": "The chat completion inference API enables real-time responses for chat completion tasks by delivering answers incrementally, reducing response times during computation.\nIt only works with the `chat_completion` task type for `openai` and `elastic` inference services.\n\nNOTE: The `chat_completion` task type is only available within the _stream API and only supports streaming.\nThe Chat completion inference API and the Stream inference API differ in their response structure and capabilities.\nThe Chat completion inference API provides more comprehensive customization options through more fields and function calling support.\nIf you use the `openai`, `hugging_face` or the `elastic` service, use the Chat completion inference API.",
         "operationId": "inference-chat-completion-unified",
         "parameters": [
@@ -11503,7 +11503,8 @@
         "tags": [
           "inference"
         ],
-        "summary": "Perform completion inference on the service",
+        "summary": "Perform completion inference on the service\n",
+        "description": "Get responses for completion tasks.\nThis API works only with the completion task type.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Azure, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face. For built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models. However, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThis API requires the `monitor_inference` cluster privilege (the built-in `inference_admin` and `inference_user` roles grant this privilege).",
         "operationId": "inference-completion",
         "parameters": [
           {
@@ -11564,7 +11565,7 @@
               "examples": {
                 "CompletionRequestExample1": {
                   "summary": "Completion task",
-                  "description": "Run `POST _inference/completion/openai_chat_completions` to perform a completion on the example question.",
+                  "description": "Run `POST _inference/completion/openai_completions` to perform a completion on the example question.",
                   "value": "{\n  \"input\": \"What is Elastic?\"\n}"
                 }
               }
@@ -11583,7 +11584,7 @@
                 "examples": {
                   "CompletionResponseExample1": {
                     "summary": "Completion task",
-                    "description": "A successful response from `POST _inference/completion/openai_chat_completions`.\n",
+                    "description": "A successful response from `POST _inference/completion/openai_completions`.\n",
                     "value": "{\n  \"completion\": [\n    {\n      \"result\": \"Elastic is a company that provides a range of software solutions for search, logging, security, and analytics. Their flagship product is Elasticsearch, an open-source, distributed search engine that allows users to search, analyze, and visualize large volumes of data in real-time. Elastic also offers products such as Kibana, a data visualization tool, and Logstash, a log management and pipeline tool, as well as various other tools and solutions for data analysis and management.\"\n    }\n  ]\n}"
                   }
                 }
@@ -14638,18 +14639,19 @@
                     "type": "string"
                   },
                   "input": {
-                    "description": "The text on which you want to perform the inference task.\nIt can be a single string or an array.\n\n> info\n> Inference endpoints for the `completion` task type currently only support a single string as input.",
-                    "oneOf": [
-                      {
-                        "type": "string"
-                      },
-                      {
-                        "type": "array",
-                        "items": {
-                          "type": "string"
-                        }
-                      }
-                    ]
+                    "description": "The documents to rank.",
+                    "type": "array",
+                    "items": {
+                      "type": "string"
+                    }
+                  },
+                  "return_documents": {
+                    "description": "Include the document text in the response.",
+                    "type": "boolean"
+                  },
+                  "top_n": {
+                    "description": "Limit the response to the top N documents.",
+                    "type": "number"
                   },
                   "task_settings": {
                     "description": "Task settings for the individual inference request.\nThese settings are specific to the task type you specified and override the task settings specified when initializing the service.",
@@ -90038,7 +90040,7 @@
             "examples": {
               "InferencePutExample1": {
                 "description": "An example body for a `PUT _inference/rerank/my-rerank-model` request.",
-                "value": "{\n \"service\": \"cohere\",\n \"service_settings\": {\n   \"model_id\": \"rerank-english-v3.0\",\n   \"api_key\": \"{{COHERE_API_KEY}}\"\n },\n \"chunking_settings\": {\n   \"strategy\": \"recursive\",\n   \"max_chunk_size\": 200,\n   \"separator_group\": \"markdown\"\n }\n}"
+                "value": "{\n \"service\": \"cohere\",\n \"service_settings\": {\n   \"model_id\": \"rerank-english-v3.0\",\n   \"api_key\": \"{{COHERE_API_KEY}}\"\n }\n}"
               }
             }
           }
diff --git a/output/schema/schema.json b/output/schema/schema.json
index 5b657769e3..36df605fde 100644
--- a/output/schema/schema.json
+++ b/output/schema/schema.json
@@ -9786,7 +9786,7 @@
           "visibility": "public"
         }
       },
-      "description": "Perform chat completion inference\n\nThe chat completion inference API enables real-time responses for chat completion tasks by delivering answers incrementally, reducing response times during computation.\nIt only works with the `chat_completion` task type for `openai` and `elastic` inference services.\n\nNOTE: The `chat_completion` task type is only available within the _stream API and only supports streaming.\nThe Chat completion inference API and the Stream inference API differ in their response structure and capabilities.\nThe Chat completion inference API provides more comprehensive customization options through more fields and function calling support.\nIf you use the `openai`, `hugging_face` or the `elastic` service, use the Chat completion inference API.",
+      "description": "Perform chat completion inference on the service\n\nThe chat completion inference API enables real-time responses for chat completion tasks by delivering answers incrementally, reducing response times during computation.\nIt only works with the `chat_completion` task type for `openai` and `elastic` inference services.\n\nNOTE: The `chat_completion` task type is only available within the _stream API and only supports streaming.\nThe Chat completion inference API and the Stream inference API differ in their response structure and capabilities.\nThe Chat completion inference API provides more comprehensive customization options through more fields and function calling support.\nIf you use the `openai`, `hugging_face` or the `elastic` service, use the Chat completion inference API.",
       "docId": "inference-api-chat-completion",
       "docUrl": "https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-unified-inference",
       "name": "inference.chat_completion_unified",
@@ -9826,7 +9826,7 @@
           "visibility": "public"
         }
       },
-      "description": "Perform completion inference on the service",
+      "description": "Perform completion inference on the service\nGet responses for completion tasks.\nThis API works only with the completion task type.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Azure, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face. For built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models. However, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThis API requires the `monitor_inference` cluster privilege (the built-in `inference_admin` and `inference_user` roles grant this privilege).",
       "docId": "inference-api-post",
       "docUrl": "https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-inference",
       "extPreviousVersionDocUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/8.18/post-inference-api.html",
@@ -11159,7 +11159,7 @@
           "visibility": "public"
         }
       },
-      "description": "Perform streaming inference.\nGet real-time responses for completion tasks by delivering answers incrementally, reducing response times during computation.\nThis API works only with the completion task type.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Azure, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face. For built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models. However, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThis API requires the `monitor_inference` cluster privilege (the built-in `inference_admin` and `inference_user` roles grant this privilege). You must use a client that supports streaming.",
+      "description": "Perform streaming completion inference on the service\nGet real-time responses for completion tasks by delivering answers incrementally, reducing response times during computation.\nThis API works only with the completion task type.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Azure, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face. For built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models. However, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThis API requires the `monitor_inference` cluster privilege (the built-in `inference_admin` and `inference_user` roles grant this privilege). You must use a client that supports streaming.",
       "docId": "inference-api-stream",
       "docUrl": "https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-stream-inference",
       "extPreviousVersionDocUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/8.18/stream-inference-api.html",
@@ -176123,7 +176123,7 @@
           }
         }
       },
-      "description": "Perform chat completion inference\n\nThe chat completion inference API enables real-time responses for chat completion tasks by delivering answers incrementally, reducing response times during computation.\nIt only works with the `chat_completion` task type for `openai` and `elastic` inference services.\n\nNOTE: The `chat_completion` task type is only available within the _stream API and only supports streaming.\nThe Chat completion inference API and the Stream inference API differ in their response structure and capabilities.\nThe Chat completion inference API provides more comprehensive customization options through more fields and function calling support.\nIf you use the `openai`, `hugging_face` or the `elastic` service, use the Chat completion inference API.",
+      "description": "Perform chat completion inference on the service\n\nThe chat completion inference API enables real-time responses for chat completion tasks by delivering answers incrementally, reducing response times during computation.\nIt only works with the `chat_completion` task type for `openai` and `elastic` inference services.\n\nNOTE: The `chat_completion` task type is only available within the _stream API and only supports streaming.\nThe Chat completion inference API and the Stream inference API differ in their response structure and capabilities.\nThe Chat completion inference API provides more comprehensive customization options through more fields and function calling support.\nIf you use the `openai`, `hugging_face` or the `elastic` service, use the Chat completion inference API.",
       "examples": {
         "PostChatCompletionRequestExample1": {
           "alternatives": [
@@ -176325,7 +176325,7 @@
           }
         ]
       },
-      "description": "Perform completion inference on the service",
+      "description": "Perform completion inference on the service\nGet responses for completion tasks.\nThis API works only with the completion task type.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Azure, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face. For built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models. However, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThis API requires the `monitor_inference` cluster privilege (the built-in `inference_admin` and `inference_user` roles grant this privilege).",
       "examples": {
         "CompletionRequestExample1": {
           "alternatives": [
@@ -176350,8 +176350,8 @@
               "language": "curl"
             }
           ],
-          "description": "Run `POST _inference/completion/openai_chat_completions` to perform a completion on the example question.",
-          "method_request": "POST _inference/completion/openai_chat_completions",
+          "description": "Run `POST _inference/completion/openai_completions` to perform a completion on the example question.",
+          "method_request": "POST _inference/completion/openai_completions",
           "summary": "Completion task",
           "value": "{\n  \"input\": \"What is Elastic?\"\n}"
         }
@@ -176395,7 +176395,7 @@
           }
         }
       ],
-      "specLocation": "inference/completion/CompletionRequest.ts#L25-L63"
+      "specLocation": "inference/completion/CompletionRequest.ts#L25-L69"
     },
     {
       "kind": "response",
@@ -176412,7 +176412,7 @@
       },
       "examples": {
         "CompletionResponseExample1": {
-          "description": "A successful response from `POST _inference/completion/openai_chat_completions`.\n",
+          "description": "A successful response from `POST _inference/completion/openai_completions`.\n",
           "summary": "Completion task",
           "value": "{\n  \"completion\": [\n    {\n      \"result\": \"Elastic is a company that provides a range of software solutions for search, logging, security, and analytics. Their flagship product is Elasticsearch, an open-source, distributed search engine that allows users to search, analyze, and visualize large volumes of data in real-time. Elastic also offers products such as Kibana, a data visualization tool, and Logstash, a log management and pipeline tool, as well as various other tools and solutions for data analysis and management.\"\n    }\n  ]\n}"
         }
@@ -176835,7 +176835,7 @@
           ],
           "description": "An example body for a `PUT _inference/rerank/my-rerank-model` request.",
           "method_request": "PUT _inference/rerank/my-rerank-model",
-          "value": "{\n \"service\": \"cohere\",\n \"service_settings\": {\n   \"model_id\": \"rerank-english-v3.0\",\n   \"api_key\": \"{{COHERE_API_KEY}}\"\n },\n \"chunking_settings\": {\n   \"strategy\": \"recursive\",\n   \"max_chunk_size\": 200,\n   \"separator_group\": \"markdown\"\n }\n}"
+          "value": "{\n \"service\": \"cohere\",\n \"service_settings\": {\n   \"model_id\": \"rerank-english-v3.0\",\n   \"api_key\": \"{{COHERE_API_KEY}}\"\n }\n}"
         }
       },
       "inherits": {
@@ -181215,30 +181215,42 @@
             }
           },
           {
-            "description": "The text on which you want to perform the inference task.\nIt can be a single string or an array.\n\n> info\n> Inference endpoints for the `completion` task type currently only support a single string as input.",
+            "description": "The documents to rank.",
             "name": "input",
             "required": true,
             "type": {
-              "kind": "union_of",
-              "items": [
-                {
-                  "kind": "instance_of",
-                  "type": {
-                    "name": "string",
-                    "namespace": "_builtins"
-                  }
-                },
-                {
-                  "kind": "array_of",
-                  "value": {
-                    "kind": "instance_of",
-                    "type": {
-                      "name": "string",
-                      "namespace": "_builtins"
-                    }
-                  }
+              "kind": "array_of",
+              "value": {
+                "kind": "instance_of",
+                "type": {
+                  "name": "string",
+                  "namespace": "_builtins"
                 }
-              ]
+              }
+            }
+          },
+          {
+            "description": "Include the document text in the response.",
+            "name": "return_documents",
+            "required": false,
+            "type": {
+              "kind": "instance_of",
+              "type": {
+                "name": "boolean",
+                "namespace": "_builtins"
+              }
+            }
+          },
+          {
+            "description": "Limit the response to the top N documents.",
+            "name": "top_n",
+            "required": false,
+            "type": {
+              "kind": "instance_of",
+              "type": {
+                "name": "integer",
+                "namespace": "_types"
+              }
             }
           },
           {
@@ -181381,7 +181393,7 @@
           }
         }
       ],
-      "specLocation": "inference/rerank/RerankRequest.ts#L25-L72"
+      "specLocation": "inference/rerank/RerankRequest.ts#L26-L77"
     },
     {
       "kind": "response",
@@ -181615,7 +181627,7 @@
           }
         ]
       },
-      "description": "Perform streaming inference.\nGet real-time responses for completion tasks by delivering answers incrementally, reducing response times during computation.\nThis API works only with the completion task type.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Azure, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face. For built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models. However, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThis API requires the `monitor_inference` cluster privilege (the built-in `inference_admin` and `inference_user` roles grant this privilege). You must use a client that supports streaming.",
+      "description": "Perform streaming completion inference on the service\nGet real-time responses for completion tasks by delivering answers incrementally, reducing response times during computation.\nThis API works only with the completion task type.\n\nIMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Azure, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face. For built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models. However, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.\n\nThis API requires the `monitor_inference` cluster privilege (the built-in `inference_admin` and `inference_user` roles grant this privilege). You must use a client that supports streaming.",
       "examples": {
         "StreamInferenceRequestExample1": {
           "alternatives": [
diff --git a/output/typescript/types.ts b/output/typescript/types.ts
index a4c98e1d51..4d81e4f84a 100644
--- a/output/typescript/types.ts
+++ b/output/typescript/types.ts
@@ -14967,7 +14967,9 @@ export interface InferenceRerankRequest extends RequestBase {
   timeout?: Duration
   body?: {
     query: string
-    input: string | string[]
+    input: string[]
+    return_documents?: boolean
+    top_n?: integer
     task_settings?: InferenceTaskSettings
   }
 }
diff --git a/specification/inference/chat_completion_unified/UnifiedRequest.ts b/specification/inference/chat_completion_unified/UnifiedRequest.ts
index 6602d9448d..6c79b0a6a2 100644
--- a/specification/inference/chat_completion_unified/UnifiedRequest.ts
+++ b/specification/inference/chat_completion_unified/UnifiedRequest.ts
@@ -22,7 +22,7 @@ import { Id } from '@_types/common'
 import { Duration } from '@_types/Time'
 import { RequestChatCompletion } from '@inference/_types/CommonTypes'
 /**
- * Perform chat completion inference
+ * Perform chat completion inference on the service
  *
  * The chat completion inference API enables real-time responses for chat completion tasks by delivering answers incrementally, reducing response times during computation.
  * It only works with the `chat_completion` task type for `openai` and `elastic` inference services.
diff --git a/specification/inference/completion/CompletionRequest.ts b/specification/inference/completion/CompletionRequest.ts
index 2b05f213e1..6aa592c3b4 100644
--- a/specification/inference/completion/CompletionRequest.ts
+++ b/specification/inference/completion/CompletionRequest.ts
@@ -24,6 +24,12 @@ import { TaskSettings } from '@inference/_types/Services'
 
 /**
  * Perform completion inference on the service
+ * Get responses for completion tasks.
+ * This API works only with the completion task type.
+ *
+ * IMPORTANT: The inference APIs enable you to use certain services, such as built-in machine learning models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Azure, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face. For built-in models and models uploaded through Eland, the inference APIs offer an alternative way to use and manage trained models. However, if you do not plan to use the inference APIs to use these models or if you want to use non-NLP models, use the machine learning trained model APIs.
+ *
+ * This API requires the `monitor_inference` cluster privilege (the built-in `inference_admin` and `inference_user` roles grant this privilege).
  * @rest_spec_name inference.completion
  * @availability stack since=8.11.0 stability=stable visibility=public
  * @availability serverless stability=stable visibility=public
diff --git a/specification/inference/completion/examples/request/CompletionRequestExample1.yaml b/specification/inference/completion/examples/request/CompletionRequestExample1.yaml
index ac5dd76499..8d53f21dfa 100644
--- a/specification/inference/completion/examples/request/CompletionRequestExample1.yaml
+++ b/specification/inference/completion/examples/request/CompletionRequestExample1.yaml
@@ -1,6 +1,6 @@
 summary: Completion task
-description: Run `POST _inference/completion/openai_chat_completions` to perform a completion on the example question.
-method_request: 'POST _inference/completion/openai_chat_completions'
+description: Run `POST _inference/completion/openai_completions` to perform a completion on the example question.
+method_request: 'POST _inference/completion/openai_completions'
 # type: "request"
 value: |-
   {
diff --git a/specification/inference/completion/examples/response/CompletionResponseExample1.yaml b/specification/inference/completion/examples/response/CompletionResponseExample1.yaml
index 0f2b454856..2ddadd3d15 100644
--- a/specification/inference/completion/examples/response/CompletionResponseExample1.yaml
+++ b/specification/inference/completion/examples/response/CompletionResponseExample1.yaml
@@ -1,6 +1,6 @@
 summary: Completion task
 description: >
-  A successful response from `POST _inference/completion/openai_chat_completions`.
+  A successful response from `POST _inference/completion/openai_completions`.
 # type: "response"
 # response_code:
 value: |-
diff --git a/specification/inference/put/examples/request/InferencePutExample1.yaml b/specification/inference/put/examples/request/InferencePutExample1.yaml
index 4b33705804..c83f09194e 100644
--- a/specification/inference/put/examples/request/InferencePutExample1.yaml
+++ b/specification/inference/put/examples/request/InferencePutExample1.yaml
@@ -6,10 +6,5 @@ value: |-
    "service_settings": {
      "model_id": "rerank-english-v3.0",
      "api_key": "{{COHERE_API_KEY}}"
-   },
-   "chunking_settings": {
-     "strategy": "recursive",
-     "max_chunk_size": 200,
-     "separator_group": "markdown"
    }
   }
diff --git a/specification/inference/rerank/RerankRequest.ts b/specification/inference/rerank/RerankRequest.ts
index 04ffbed698..53ac180bad 100644
--- a/specification/inference/rerank/RerankRequest.ts
+++ b/specification/inference/rerank/RerankRequest.ts
@@ -19,6 +19,7 @@
 
 import { RequestBase } from '@_types/Base'
 import { Id } from '@_types/common'
+import { integer } from '@_types/Numeric'
 import { Duration } from '@_types/Time'
 import { TaskSettings } from '@inference/_types/Services'
 
@@ -56,13 +57,17 @@ export interface Request extends RequestBase {
      */
     query: string
     /**
-     * The text on which you want to perform the inference task.
-     * It can be a single string or an array.
-     *
-     * > info
-     * > Inference endpoints for the `completion` task type currently only support a single string as input.
+     * The documents to rank.
      */
-    input: string | Array<string>
+    input: Array<string>
+    /**
+     * Include the document text in the response.
+     */
+    return_documents?: boolean
+    /**
+     * Limit the response to the top N documents.
+     */
+    top_n?: integer
     /**
      * Task settings for the individual inference request.
      * These settings are specific to the task type you specified and override the task settings specified when initializing the service.
diff --git a/specification/inference/stream_completion/StreamInferenceRequest.ts b/specification/inference/stream_completion/StreamInferenceRequest.ts
index 0e08af6a6f..c349a6dca0 100644
--- a/specification/inference/stream_completion/StreamInferenceRequest.ts
+++ b/specification/inference/stream_completion/StreamInferenceRequest.ts
@@ -23,7 +23,7 @@ import { Duration } from '@_types/Time'
 import { TaskSettings } from '@inference/_types/Services'
 
 /**
- * Perform streaming inference.
+ * Perform streaming completion inference on the service
  * Get real-time responses for completion tasks by delivering answers incrementally, reducing response times during computation.
  * This API works only with the completion task type.
  *