From f3daa03ec7cb4a142cc1916491fdd73c5e762e53 Mon Sep 17 00:00:00 2001 From: lcawl Date: Tue, 18 Mar 2025 23:19:55 -0700 Subject: [PATCH 1/2] Remove inference API pages --- .../autoscaling/trained-model-autoscaling.md | 3 +- ...bacloud-ai-search-inference-integration.md | 215 -------- .../amazon-bedrock-inference-integration.md | 158 ------ .../anthropic-inference-integration.md | 132 ----- .../azure-ai-studio-inference-integration.md | 162 ------ .../azure-openai-inference-integration.md | 150 ------ .../chat-completion-inference-api.md | 510 ------------------ .../cohere-inference-integration.md | 190 ------- .../elastic-inference-service-eis.md | 2 +- .../elasticsearch-inference-integration.md | 279 ---------- .../elser-inference-integration.md | 171 ------ .../google-ai-studio-inference-integration.md | 96 ---- .../google-vertex-ai-inference-integration.md | 136 ----- .../huggingface-inference-integration.md | 111 ---- .../jinaai-inference-integration.md | 225 -------- .../machine-learning/nlp/ml-nlp-e5.md | 8 +- .../machine-learning/nlp/ml-nlp-elser.md | 9 +- .../machine-learning/nlp/ml-nlp-rerank.md | 5 +- explore-analyze/toc.yml | 13 - solutions/search/hybrid-semantic-text.md | 3 +- .../search/ranking/semantic-reranking.md | 12 +- .../semantic-search-inference.md | 3 +- .../semantic-search-semantic-text.md | 10 +- 23 files changed, 35 insertions(+), 2568 deletions(-) delete mode 100644 explore-analyze/elastic-inference/inference-api/alibabacloud-ai-search-inference-integration.md delete mode 100644 explore-analyze/elastic-inference/inference-api/amazon-bedrock-inference-integration.md delete mode 100644 explore-analyze/elastic-inference/inference-api/anthropic-inference-integration.md delete mode 100644 explore-analyze/elastic-inference/inference-api/azure-ai-studio-inference-integration.md delete mode 100644 explore-analyze/elastic-inference/inference-api/azure-openai-inference-integration.md delete mode 100644 explore-analyze/elastic-inference/inference-api/chat-completion-inference-api.md delete mode 100644 explore-analyze/elastic-inference/inference-api/cohere-inference-integration.md delete mode 100644 explore-analyze/elastic-inference/inference-api/elasticsearch-inference-integration.md delete mode 100644 explore-analyze/elastic-inference/inference-api/elser-inference-integration.md delete mode 100644 explore-analyze/elastic-inference/inference-api/google-ai-studio-inference-integration.md delete mode 100644 explore-analyze/elastic-inference/inference-api/google-vertex-ai-inference-integration.md delete mode 100644 explore-analyze/elastic-inference/inference-api/huggingface-inference-integration.md delete mode 100644 explore-analyze/elastic-inference/inference-api/jinaai-inference-integration.md diff --git a/deploy-manage/autoscaling/trained-model-autoscaling.md b/deploy-manage/autoscaling/trained-model-autoscaling.md index bbd60a1d6f..208c82a6fd 100644 --- a/deploy-manage/autoscaling/trained-model-autoscaling.md +++ b/deploy-manage/autoscaling/trained-model-autoscaling.md @@ -46,7 +46,8 @@ If you set the minimum number of allocations to 1, you will be charged even if t You can enable adaptive allocations by using: -* the create inference endpoint API for [ELSER](../../explore-analyze/elastic-inference/inference-api/elasticsearch-inference-integration.md), [E5 and models uploaded through Eland](../../explore-analyze/elastic-inference/inference-api/elasticsearch-inference-integration.md) that are used as inference services. 
+* the create inference endpoint API for ELSER, E5, and models uploaded through Eland that are used as inference services.
+ %TBD URL for APIs
 * the [start trained model deployment](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-ml-start-trained-model-deployment) or [update trained model deployment](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-ml-update-trained-model-deployment) APIs for trained models that are deployed on {{ml}} nodes.
 
 If the new allocations fit on the current {{ml}} nodes, they are immediately started. If more resource capacity is needed for creating new model allocations, then your {{ml}} node will be scaled up if {{ml}} autoscaling is enabled to provide enough resources for the new allocation. The number of model allocations can be scaled down to 0. They cannot be scaled up to more than 32 allocations, unless you explicitly set the maximum number of allocations to a higher value. Adaptive allocations must be set up independently for each deployment and [{{infer}} endpoint](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference).
diff --git a/explore-analyze/elastic-inference/inference-api/alibabacloud-ai-search-inference-integration.md b/explore-analyze/elastic-inference/inference-api/alibabacloud-ai-search-inference-integration.md
deleted file mode 100644
index 080d8fb202..0000000000
--- a/explore-analyze/elastic-inference/inference-api/alibabacloud-ai-search-inference-integration.md
+++ /dev/null
@@ -1,215 +0,0 @@
----
-navigation_title: "Alibaba Cloud AI Search"
-mapped_pages:
-  - https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-alibabacloud-ai-search.html
-applies_to:
-  stack:
-  serverless:
----
-
-# AlibabaCloud AI Search inference integration [infer-service-alibabacloud-ai-search]
-
-:::{tip} Inference API reference
-Refer to the [{{infer-cap}} APIs](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) for further information.
-:::
-
-Creates an {{infer}} endpoint to perform an {{infer}} task with the `alibabacloud-ai-search` service.
-
-
-## {{api-request-title}} [infer-service-alibabacloud-ai-search-api-request]
-
-`PUT /_inference/<task_type>/<inference_id>`
-
-
-## {{api-path-parms-title}} [infer-service-alibabacloud-ai-search-api-path-params]
-
-`<inference_id>`
-:   (Required, string) The unique identifier of the {{infer}} endpoint.
-
-`<task_type>`
-:   (Required, string) The type of the {{infer}} task that the model will perform.
-
-    Available task types:
-
-    * `completion`
-    * `rerank`
-    * `sparse_embedding`
-    * `text_embedding`
-
-
-## {{api-request-body-title}} [infer-service-alibabacloud-ai-search-api-request-body]
-
-`chunking_settings`
-:   (Optional, object) Chunking configuration object. Refer to [Configuring chunking](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) to learn more about chunking.
-
-    `max_chunk_size`
-    :   (Optional, integer) Specifies the maximum size of a chunk in words. Defaults to `250`. This value cannot be higher than `300` or lower than `20` (for the `sentence` strategy) or `10` (for the `word` strategy).
-
-    `overlap`
-    :   (Optional, integer) Only for the `word` chunking strategy. Specifies the number of overlapping words for chunks. Defaults to `100`. This value cannot be higher than half of `max_chunk_size`.
-
-    `sentence_overlap`
-    :   (Optional, integer) Only for the `sentence` chunking strategy. Specifies the number of overlapping sentences for chunks. It can be either `1` or `0`. Defaults to `1`.
-
-    `strategy`
-    :   (Optional, string) Specifies the chunking strategy. It can be either `sentence` or `word`.
-
-`service`
-:   (Required, string) The type of service supported for the specified task type. In this case, `alibabacloud-ai-search`.
-
-`service_settings`
-:   (Required, object) Settings used to install the {{infer}} model.
-
-    These settings are specific to the `alibabacloud-ai-search` service.
-
-    `api_key`
-    :   (Required, string) A valid API key for the AlibabaCloud AI Search API.
-
-    `service_id`
-    :   (Required, string) The name of the model service to use for the {{infer}} task.
-
-        Available service_ids for the `completion` task:
-
-        * `ops-qwen-turbo`
-        * `qwen-turbo`
-        * `qwen-plus`
-        * `qwen-max`
-        * `deepseek-r1`
-        * `deepseek-v3`
-        * `deepseek-r1-distill-qwen-7b`
-        * `deepseek-r1-distill-qwen-14b`
-
-        For the supported `completion` service_ids, refer to the [documentation](https://help.aliyun.com/zh/open-search/search-platform/developer-reference/text-generation-api-details).
-
-        Available service_id for the `rerank` task:
-
-        * `ops-bge-reranker-larger`
-
-        For the supported `rerank` service_id, refer to the [documentation](https://help.aliyun.com/zh/open-search/search-platform/developer-reference/ranker-api-details).
-
-        Available service_id for the `sparse_embedding` task:
-
-        * `ops-text-sparse-embedding-001`
-
-        For the supported `sparse_embedding` service_id, refer to the [documentation](https://help.aliyun.com/zh/open-search/search-platform/developer-reference/text-sparse-embedding-api-details).
-
-        Available service_ids for the `text_embedding` task:
-
-        * `ops-text-embedding-001`
-        * `ops-text-embedding-zh-001`
-        * `ops-text-embedding-en-001`
-        * `ops-text-embedding-002`
-
-        For the supported `text_embedding` service_ids, refer to the [documentation](https://help.aliyun.com/zh/open-search/search-platform/developer-reference/text-embedding-api-details).
-
-    `host`
-    :   (Required, string) The name of the host address used for the {{infer}} task. You can find the host address in [the API keys section](https://opensearch.console.aliyun.com/cn-shanghai/rag/api-key) of the documentation.
-
-    `workspace`
-    :   (Required, string) The name of the workspace used for the {{infer}} task.
-
-    `rate_limit`
-    :   (Optional, object) By default, the `alibabacloud-ai-search` service sets the number of requests allowed per minute to `1000`. This helps to minimize the number of rate limit errors returned from AlibabaCloud AI Search. To modify this, set the `requests_per_minute` setting of this object in your service settings:
-
-        ```text
-        "rate_limit": {
-            "requests_per_minute": <number_of_requests>
-        }
-        ```
-
-`task_settings`
-:   (Optional, object) Settings to configure the {{infer}} task. These settings are specific to the `<task_type>` you specified.
-
-    ::::{dropdown} `task_settings` for the `text_embedding` task type
-    `input_type`
-    :   (Optional, string) Specifies the type of input passed to the model. Valid values are:
-
-        * `ingest`: for storing document embeddings in a vector database.
-        * `search`: for storing embeddings of search queries run against a vector database to find relevant documents.
-
-    ::::
-
-    ::::{dropdown} `task_settings` for the `sparse_embedding` task type
-    `input_type`
-    :   (Optional, string) Specifies the type of input passed to the model. Valid values are:
-
-        * `ingest`: for storing document embeddings in a vector database.
-        * `search`: for storing embeddings of search queries run against a vector database to find relevant documents.
-
-    `return_token`
-    :   (Optional, boolean) If `true`, the token name will be returned in the response. Defaults to `false`, which means only the token ID will be returned in the response.
-
-    ::::
-
-
-## AlibabaCloud AI Search service examples [inference-example-alibabacloud-ai-search]
-
-The following example shows how to create an {{infer}} endpoint called `alibabacloud_ai_search_completion` to perform a `completion` task type.
-
-```console
-PUT _inference/completion/alibabacloud_ai_search_completion
-{
-    "service": "alibabacloud-ai-search",
-    "service_settings": {
-        "host" : "default-j01.platform-cn-shanghai.opensearch.aliyuncs.com",
-        "api_key": "{{API_KEY}}",
-        "service_id": "ops-qwen-turbo",
-        "workspace" : "default"
-    }
-}
-```
-
-The next example shows how to create an {{infer}} endpoint called `alibabacloud_ai_search_rerank` to perform a `rerank` task type.
-
-```console
-PUT _inference/rerank/alibabacloud_ai_search_rerank
-{
-    "service": "alibabacloud-ai-search",
-    "service_settings": {
-        "api_key": "<api_key>",
-        "service_id": "ops-bge-reranker-larger",
-        "host": "default-j01.platform-cn-shanghai.opensearch.aliyuncs.com",
-        "workspace": "default"
-    }
-}
-```
-
-The following example shows how to create an {{infer}} endpoint called `alibabacloud_ai_search_sparse` to perform a `sparse_embedding` task type.
-
-```console
-PUT _inference/sparse_embedding/alibabacloud_ai_search_sparse
-{
-    "service": "alibabacloud-ai-search",
-    "service_settings": {
-        "api_key": "<api_key>",
-        "service_id": "ops-text-sparse-embedding-001",
-        "host": "default-j01.platform-cn-shanghai.opensearch.aliyuncs.com",
-        "workspace": "default"
-    }
-}
-```
-
-The following example shows how to create an {{infer}} endpoint called `alibabacloud_ai_search_embeddings` to perform a `text_embedding` task type.
-
-```console
-PUT _inference/text_embedding/alibabacloud_ai_search_embeddings
-{
-    "service": "alibabacloud-ai-search",
-    "service_settings": {
-        "api_key": "<api_key>",
-        "service_id": "ops-text-embedding-001",
-        "host": "default-j01.platform-cn-shanghai.opensearch.aliyuncs.com",
-        "workspace": "default"
-    }
-}
-```
diff --git a/explore-analyze/elastic-inference/inference-api/amazon-bedrock-inference-integration.md b/explore-analyze/elastic-inference/inference-api/amazon-bedrock-inference-integration.md
deleted file mode 100644
index 79de7781d5..0000000000
--- a/explore-analyze/elastic-inference/inference-api/amazon-bedrock-inference-integration.md
+++ /dev/null
@@ -1,158 +0,0 @@
----
-navigation_title: "Amazon Bedrock"
-mapped_pages:
-  - https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-amazon-bedrock.html
-applies_to:
-  stack:
-  serverless:
----
-
-# Amazon Bedrock inference integration [infer-service-amazon-bedrock]
-
-:::{tip} Inference API reference
-Refer to the [{{infer-cap}} APIs](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) for further information.
-:::
-
-Creates an {{infer}} endpoint to perform an {{infer}} task with the `amazonbedrock` service.
-
-
-## {{api-request-title}} [infer-service-amazon-bedrock-api-request]
-
-`PUT /_inference/<task_type>/<inference_id>`
-
-
-## {{api-path-parms-title}} [infer-service-amazon-bedrock-api-path-params]
-
-`<inference_id>`
-:   (Required, string) The unique identifier of the {{infer}} endpoint.
-
-`<task_type>`
-:   (Required, string) The type of the {{infer}} task that the model will perform.
-
-    Available task types:
-
-    * `completion`
-    * `text_embedding`
-
-
-## {{api-request-body-title}} [infer-service-amazon-bedrock-api-request-body]
-
-`chunking_settings`
-:   (Optional, object) Chunking configuration object. Refer to [Configuring chunking](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) to learn more about chunking.
-
-    `max_chunk_size`
-    :   (Optional, integer) Specifies the maximum size of a chunk in words. Defaults to `250`. This value cannot be higher than `300` or lower than `20` (for the `sentence` strategy) or `10` (for the `word` strategy).
-
-    `overlap`
-    :   (Optional, integer) Only for the `word` chunking strategy. Specifies the number of overlapping words for chunks. Defaults to `100`. This value cannot be higher than half of `max_chunk_size`.
-
-    `sentence_overlap`
-    :   (Optional, integer) Only for the `sentence` chunking strategy. Specifies the number of overlapping sentences for chunks. It can be either `1` or `0`. Defaults to `1`.
-
-    `strategy`
-    :   (Optional, string) Specifies the chunking strategy. It can be either `sentence` or `word`.
-
-`service`
-:   (Required, string) The type of service supported for the specified task type. In this case, `amazonbedrock`.
-
-`service_settings`
-:   (Required, object) Settings used to install the {{infer}} model.
-
-    These settings are specific to the `amazonbedrock` service.
-
-    `access_key`
-    :   (Required, string) A valid AWS access key that has permissions to use Amazon Bedrock and access to models for inference requests.
-
-    `secret_key`
-    :   (Required, string) A valid AWS secret key that is paired with the `access_key`. To create or manage access and secret keys, see [Managing access keys for IAM users](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.md) in the AWS documentation.
-
-::::{important}
-You need to provide the access and secret keys only once, during the {{infer}} model creation. The [Get {{infer}} API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-get) does not retrieve your access or secret keys. After creating the {{infer}} model, you cannot change the associated key pairs. If you want to use a different access and secret key pair, delete the {{infer}} model and recreate it with the same name and the updated keys.
-::::
-
-
-`provider`
-:   (Required, string) The model provider for your deployment. Note that some providers may support only certain task types. Supported providers include:
-
-    * `amazontitan` - available for `text_embedding` and `completion` task types
-    * `anthropic` - available for `completion` task type only
-    * `ai21labs` - available for `completion` task type only
-    * `cohere` - available for `text_embedding` and `completion` task types
-    * `meta` - available for `completion` task type only
-    * `mistral` - available for `completion` task type only
-
-`model`
-:   (Required, string) The base model ID or an ARN to a custom model based on a foundational model. The base model IDs can be found in the [Amazon Bedrock model IDs](https://docs.aws.amazon.com/bedrock/latest/userguide/model-ids.html) documentation. Note that the model ID must be available for the chosen provider, and your IAM user must have access to the model.
-
-`region`
-:   (Required, string) The region that your model or ARN is deployed in. The list of available regions per model can be found in the [Model support by AWS region](https://docs.aws.amazon.com/bedrock/latest/userguide/models-regions.html) documentation.
-
-`rate_limit`
-:   (Optional, object) By default, the `amazonbedrock` service sets the number of requests allowed per minute to `240`. This helps to minimize the number of rate limit errors returned from Amazon Bedrock. To modify this, set the `requests_per_minute` setting of this object in your service settings:
-
-    ```text
-    "rate_limit": {
-        "requests_per_minute": <number_of_requests>
-    }
-    ```
-
-`task_settings`
-:   (Optional, object) Settings to configure the {{infer}} task. These settings are specific to the `<task_type>` you specified.
-
-    ::::{dropdown} `task_settings` for the `completion` task type
-    `max_new_tokens`
-    :   (Optional, integer) Sets the maximum number of output tokens to be generated. Defaults to 64.
-
-    `temperature`
-    :   (Optional, float) A number between 0.0 and 1.0 that controls the apparent creativity of the results. At temperature 0.0 the model is most deterministic; at temperature 1.0, most random. Should not be used if `top_p` or `top_k` is specified.
-
-    `top_p`
-    :   (Optional, float) Alternative to `temperature`. A number in the range of 0.0 to 1.0, to eliminate low-probability tokens. Top-p uses nucleus sampling to select top tokens whose sum of likelihoods does not exceed a certain value, ensuring both variety and coherence. Should not be used if `temperature` is specified.
-
-    `top_k`
-    :   (Optional, float) Only available for `anthropic`, `cohere`, and `mistral` providers. Alternative to `temperature`. Limits samples to the top-K most likely words, balancing coherence and variability. Should not be used if `temperature` is specified.
-
-    ::::
-
-
-## Amazon Bedrock service example [inference-example-amazonbedrock]
-
-The following example shows how to create an {{infer}} endpoint called `amazon_bedrock_embeddings` to perform a `text_embedding` task type.
-
-Choose chat completion and embeddings models that you have access to from the [Amazon Bedrock base models](https://docs.aws.amazon.com/bedrock/latest/userguide/model-ids.html).
-
-```console
-PUT _inference/text_embedding/amazon_bedrock_embeddings
-{
-    "service": "amazonbedrock",
-    "service_settings": {
-        "access_key": "<aws_access_key>",
-        "secret_key": "<aws_secret_key>",
-        "region": "us-east-1",
-        "provider": "amazontitan",
-        "model": "amazon.titan-embed-text-v2:0"
-    }
-}
-```
-
-The next example shows how to create an {{infer}} endpoint called `amazon_bedrock_completion` to perform a `completion` task type.
-
-```console
-PUT _inference/completion/amazon_bedrock_completion
-{
-    "service": "amazonbedrock",
-    "service_settings": {
-        "access_key": "<aws_access_key>",
-        "secret_key": "<aws_secret_key>",
-        "region": "us-east-1",
-        "provider": "amazontitan",
-        "model": "amazon.titan-text-premier-v1:0"
    }
-}
-```
diff --git a/explore-analyze/elastic-inference/inference-api/anthropic-inference-integration.md b/explore-analyze/elastic-inference/inference-api/anthropic-inference-integration.md
deleted file mode 100644
index 32d5611105..0000000000
--- a/explore-analyze/elastic-inference/inference-api/anthropic-inference-integration.md
+++ /dev/null
@@ -1,132 +0,0 @@
----
-navigation_title: "Anthropic"
-mapped_pages:
-  - https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-anthropic.html
-applies_to:
-  stack:
-  serverless:
----
-
-# Anthropic inference integration [infer-service-anthropic]
-
-:::{tip} Inference API reference
-Refer to the [{{infer-cap}} APIs](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) for further information.
-:::
-
-Creates an {{infer}} endpoint to perform an {{infer}} task with the `anthropic` service.
-
-
-## {{api-request-title}} [infer-service-anthropic-api-request]
-
-`PUT /_inference/<task_type>/<inference_id>`
-
-
-## {{api-path-parms-title}} [infer-service-anthropic-api-path-params]
-
-`<inference_id>`
-:   (Required, string) The unique identifier of the {{infer}} endpoint.
-
-`<task_type>`
-:   (Required, string) The type of the {{infer}} task that the model will perform.
-
-    Available task types:
-
-    * `completion`
-
-
-## {{api-request-body-title}} [infer-service-anthropic-api-request-body]
-
-`chunking_settings`
-:   (Optional, object) Chunking configuration object. Refer to [Configuring chunking](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) to learn more about chunking.
-
-    `max_chunk_size`
-    :   (Optional, integer) Specifies the maximum size of a chunk in words. Defaults to `250`. This value cannot be higher than `300` or lower than `20` (for the `sentence` strategy) or `10` (for the `word` strategy).
-
-    `overlap`
-    :   (Optional, integer) Only for the `word` chunking strategy. Specifies the number of overlapping words for chunks. Defaults to `100`. This value cannot be higher than half of `max_chunk_size`.
-
-    `sentence_overlap`
-    :   (Optional, integer) Only for the `sentence` chunking strategy. Specifies the number of overlapping sentences for chunks. It can be either `1` or `0`. Defaults to `1`.
-
-    `strategy`
-    :   (Optional, string) Specifies the chunking strategy. It can be either `sentence` or `word`.
-
-`service`
-:   (Required, string) The type of service supported for the specified task type. In this case, `anthropic`.
-
-`service_settings`
-:   (Required, object) Settings used to install the {{infer}} model.
-
-    These settings are specific to the `anthropic` service.
-
-    `api_key`
-    :   (Required, string) A valid API key for the Anthropic API.
-
-    `model_id`
-    :   (Required, string) The name of the model to use for the {{infer}} task. You can find the supported models at [Anthropic models](https://docs.anthropic.com/en/docs/about-claude/models#model-names).
-
-    `rate_limit`
-    :   (Optional, object) By default, the `anthropic` service sets the number of requests allowed per minute to `50`. This helps to minimize the number of rate limit errors returned from Anthropic. To modify this, set the `requests_per_minute` setting of this object in your service settings:
-
-        ```text
-        "rate_limit": {
-            "requests_per_minute": <number_of_requests>
-        }
-        ```
-
-`task_settings`
-:   (Required, object) Settings to configure the {{infer}} task. These settings are specific to the `<task_type>` you specified.
-
-    ::::{dropdown} `task_settings` for the `completion` task type
-    `max_tokens`
-    :   (Required, integer) The maximum number of tokens to generate before stopping.
-
-    `temperature`
-    :   (Optional, float) The amount of randomness injected into the response.
-
-        For more details about the supported range, see the [Anthropic messages API](https://docs.anthropic.com/en/api/messages).
-
-    `top_k`
-    :   (Optional, integer) Specifies to only sample from the top K options for each subsequent token.
-
-        Recommended for advanced use cases only. You usually only need to use `temperature`.
-
-        For more details, see the [Anthropic messages API](https://docs.anthropic.com/en/api/messages).
-
-    `top_p`
-    :   (Optional, float) Specifies to use Anthropic’s nucleus sampling.
-
-        In nucleus sampling, Anthropic computes the cumulative distribution over all the options for each subsequent token in decreasing probability order and cuts it off once it reaches the probability specified by `top_p`. You should either alter `temperature` or `top_p`, but not both.
-
-        Recommended for advanced use cases only. You usually only need to use `temperature`.
-
-        For more details, see the [Anthropic messages API](https://docs.anthropic.com/en/api/messages).
-
-    ::::
-
-
-## Anthropic service example [inference-example-anthropic]
-
-The following example shows how to create an {{infer}} endpoint called `anthropic_completion` to perform a `completion` task type.
-
-```console
-PUT _inference/completion/anthropic_completion
-{
-    "service": "anthropic",
-    "service_settings": {
-        "api_key": "<api_key>",
-        "model_id": "<model_id>"
-    },
-    "task_settings": {
-        "max_tokens": 1024
-    }
-}
-```
diff --git a/explore-analyze/elastic-inference/inference-api/azure-ai-studio-inference-integration.md b/explore-analyze/elastic-inference/inference-api/azure-ai-studio-inference-integration.md
deleted file mode 100644
index 5bde120a6e..0000000000
--- a/explore-analyze/elastic-inference/inference-api/azure-ai-studio-inference-integration.md
+++ /dev/null
@@ -1,162 +0,0 @@
----
-navigation_title: "Azure AI Studio"
-mapped_pages:
-  - https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-azure-ai-studio.html
-applies_to:
-  stack:
-  serverless:
----
-
-# Azure AI Studio inference integration [infer-service-azure-ai-studio]
-
-:::{tip} Inference API reference
-Refer to the [{{infer-cap}} APIs](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) for further information.
-:::
-
-Creates an {{infer}} endpoint to perform an {{infer}} task with the `azureaistudio` service.
-
-
-## {{api-request-title}} [infer-service-azure-ai-studio-api-request]
-
-`PUT /_inference/<task_type>/<inference_id>`
-
-
-## {{api-path-parms-title}} [infer-service-azure-ai-studio-api-path-params]
-
-`<inference_id>`
-:   (Required, string) The unique identifier of the {{infer}} endpoint.
-
-`<task_type>`
-:   (Required, string) The type of the {{infer}} task that the model will perform.
-
-    Available task types:
-
-    * `completion`
-    * `text_embedding`
-
-
-## {{api-request-body-title}} [infer-service-azure-ai-studio-api-request-body]
-
-`chunking_settings`
-:   (Optional, object) Chunking configuration object. Refer to [Configuring chunking](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) to learn more about chunking.
-
-    `max_chunk_size`
-    :   (Optional, integer) Specifies the maximum size of a chunk in words. Defaults to `250`. This value cannot be higher than `300` or lower than `20` (for the `sentence` strategy) or `10` (for the `word` strategy).
-
-    `overlap`
-    :   (Optional, integer) Only for the `word` chunking strategy. Specifies the number of overlapping words for chunks. Defaults to `100`. This value cannot be higher than half of `max_chunk_size`.
-
-    `sentence_overlap`
-    :   (Optional, integer) Only for the `sentence` chunking strategy. Specifies the number of overlapping sentences for chunks. It can be either `1` or `0`. Defaults to `1`.
-
-    `strategy`
-    :   (Optional, string) Specifies the chunking strategy. It can be either `sentence` or `word`.
-
-`service`
-:   (Required, string) The type of service supported for the specified task type. In this case, `azureaistudio`.
-
-`service_settings`
-:   (Required, object) Settings used to install the {{infer}} model.
-
-    These settings are specific to the `azureaistudio` service.
-
-    `api_key`
-    :   (Required, string) A valid API key of your Azure AI Studio model deployment. This key can be found on the overview page for your deployment in the management section of your [Azure AI Studio](https://ai.azure.com/) account.
-
-    ::::{important}
-    You need to provide the API key only once, during the {{infer}} model creation. The [Get {{infer}} API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-get) does not retrieve your API key. After creating the {{infer}} model, you cannot change the associated API key. If you want to use a different API key, delete the {{infer}} model and recreate it with the same name and the updated API key.
-    ::::
-
-    `target`
-    :   (Required, string) The target URL of your Azure AI Studio model deployment. This can be found on the overview page for your deployment in the management section of your [Azure AI Studio](https://ai.azure.com/) account.
-
-    `provider`
-    :   (Required, string) The model provider for your deployment. Note that some providers may support only certain task types. Supported providers include:
-
-        * `cohere` - available for `text_embedding` and `completion` task types
-        * `databricks` - available for `completion` task type only
-        * `meta` - available for `completion` task type only
-        * `microsoft_phi` - available for `completion` task type only
-        * `mistral` - available for `completion` task type only
-        * `openai` - available for `text_embedding` and `completion` task types
-
-    `endpoint_type`
-    :   (Required, string) One of `token` or `realtime`. Specifies the type of endpoint that is used in your model deployment. There are [two endpoint types available](https://learn.microsoft.com/en-us/azure/ai-studio/concepts/deployments-overview#billing-for-deploying-and-inferencing-llms-in-azure-ai-studio) for deployment through Azure AI Studio. "Pay as you go" endpoints are billed per token. For these, you must specify `token` for your `endpoint_type`. For "real-time" endpoints, which are billed per hour of usage, specify `realtime`.
-
-    `rate_limit`
-    :   (Optional, object) By default, the `azureaistudio` service sets the number of requests allowed per minute to `240`. This helps to minimize the number of rate limit errors returned from Azure AI Studio. To modify this, set the `requests_per_minute` setting of this object in your service settings:
-
-        ```text
-        "rate_limit": {
-            "requests_per_minute": <number_of_requests>
-        }
-        ```
-
-`task_settings`
-:   (Optional, object) Settings to configure the {{infer}} task. These settings are specific to the `<task_type>` you specified.
-
-    ::::{dropdown} `task_settings` for the `completion` task type
-    `do_sample`
-    :   (Optional, float) Instructs the inference process whether to perform sampling. Has no effect unless `temperature` or `top_p` is specified.
-
-    `max_new_tokens`
-    :   (Optional, integer) Provides a hint for the maximum number of output tokens to be generated. Defaults to 64.
-
-    `temperature`
-    :   (Optional, float) A number in the range of 0.0 to 2.0 that specifies the sampling temperature, which controls the apparent creativity of generated completions. Should not be used if `top_p` is specified.
-
-    `top_p`
-    :   (Optional, float) An alternative to `temperature`, in the range of 0.0 to 2.0, that causes the model to consider the tokens within the nucleus sampling probability mass. Should not be used if `temperature` is specified.
-
-    ::::
-
-    ::::{dropdown} `task_settings` for the `text_embedding` task type
-    `user`
-    :   (optional, string) Specifies the user issuing the request, which can be used for abuse detection.
-
-    ::::
-
-
-## Azure AI Studio service example [inference-example-azureaistudio]
-
-The following example shows how to create an {{infer}} endpoint called `azure_ai_studio_embeddings` to perform a `text_embedding` task type. Note that we do not specify a model here, as it is defined already via our Azure AI Studio deployment.
-
-The list of embeddings models that you can choose from in your deployment can be found in the [Azure AI Studio model explorer](https://ai.azure.com/explore/models?selectedTask=embeddings).
-
-```console
-PUT _inference/text_embedding/azure_ai_studio_embeddings
-{
-    "service": "azureaistudio",
-    "service_settings": {
-        "api_key": "<api_key>",
-        "target": "<target_uri>",
-        "provider": "<model_provider>",
-        "endpoint_type": "<endpoint_type>"
-    }
-}
-```
-
-The next example shows how to create an {{infer}} endpoint called `azure_ai_studio_completion` to perform a `completion` task type.
-
-```console
-PUT _inference/completion/azure_ai_studio_completion
-{
-    "service": "azureaistudio",
-    "service_settings": {
-        "api_key": "<api_key>",
-        "target": "<target_uri>",
-        "provider": "<model_provider>",
-        "endpoint_type": "<endpoint_type>"
-    }
-}
-```
-
-The list of chat completion models that you can choose from in your deployment can be found in the [Azure AI Studio model explorer](https://ai.azure.com/explore/models?selectedTask=chat-completion).
diff --git a/explore-analyze/elastic-inference/inference-api/azure-openai-inference-integration.md b/explore-analyze/elastic-inference/inference-api/azure-openai-inference-integration.md
deleted file mode 100644
index b222927021..0000000000
--- a/explore-analyze/elastic-inference/inference-api/azure-openai-inference-integration.md
+++ /dev/null
@@ -1,150 +0,0 @@
----
-navigation_title: "Azure OpenAI"
-mapped_pages:
-  - https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-azure-openai.html
-applies_to:
-  stack:
-  serverless:
----
-
-# Azure OpenAI inference integration [infer-service-azure-openai]
-
-:::{tip} Inference API reference
-Refer to the [{{infer-cap}} APIs](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) for further information.
-:::
-
-Creates an {{infer}} endpoint to perform an {{infer}} task with the `azureopenai` service.
-
-
-## {{api-request-title}} [infer-service-azure-openai-api-request]
-
-`PUT /_inference/<task_type>/<inference_id>`
-
-
-## {{api-path-parms-title}} [infer-service-azure-openai-api-path-params]
-
-`<inference_id>`
-:   (Required, string) The unique identifier of the {{infer}} endpoint.
-
-`<task_type>`
-:   (Required, string) The type of the {{infer}} task that the model will perform.
-
-    Available task types:
-
-    * `completion`
-    * `text_embedding`
-
-
-## {{api-request-body-title}} [infer-service-azure-openai-api-request-body]
-
-`chunking_settings`
-:   (Optional, object) Chunking configuration object. Refer to [Configuring chunking](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) to learn more about chunking.
-
-    `max_chunk_size`
-    :   (Optional, integer) Specifies the maximum size of a chunk in words. Defaults to `250`. This value cannot be higher than `300` or lower than `20` (for the `sentence` strategy) or `10` (for the `word` strategy).
-
-    `overlap`
-    :   (Optional, integer) Only for the `word` chunking strategy. Specifies the number of overlapping words for chunks. Defaults to `100`. This value cannot be higher than half of `max_chunk_size`.
-
-    `sentence_overlap`
-    :   (Optional, integer) Only for the `sentence` chunking strategy. Specifies the number of overlapping sentences for chunks. It can be either `1` or `0`. Defaults to `1`.
-
-    `strategy`
-    :   (Optional, string) Specifies the chunking strategy. It can be either `sentence` or `word`.
-
-`service`
-:   (Required, string) The type of service supported for the specified task type. In this case, `azureopenai`.
-
-`service_settings`
-:   (Required, object) Settings used to install the {{infer}} model.
-
-    These settings are specific to the `azureopenai` service.
-
-    `api_key` or `entra_id`
-    :   (Required, string) You must provide *either* an API key or an Entra ID. If you do not provide either, or provide both, you will receive an error when trying to create your model. See the [Azure OpenAI Authentication documentation](https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#authentication) for more details on these authentication types.
-
-    ::::{important}
-    You need to provide the API key only once, during the {{infer}} model creation. The [Get {{infer}} API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-get) does not retrieve your API key. After creating the {{infer}} model, you cannot change the associated API key. If you want to use a different API key, delete the {{infer}} model and recreate it with the same name and the updated API key.
-    ::::
-
-    `resource_name`
-    :   (Required, string) The name of your Azure OpenAI resource. You can find this from the [list of resources](https://portal.azure.com/#view/HubsExtension/BrowseAll) in the Azure Portal for your subscription.
-
-    `deployment_id`
-    :   (Required, string) The deployment name of your deployed models. Your Azure OpenAI deployments can be found through the [Azure OpenAI Studio](https://oai.azure.com/) portal that is linked to your subscription.
-
-    `api_version`
-    :   (Required, string) The Azure API version ID to use. We recommend using the [latest supported non-preview version](https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings).
-
-    `rate_limit`
-    :   (Optional, object) The `azureopenai` service sets a default number of requests allowed per minute depending on the task type. For `text_embedding`, it is set to `1440`. For `completion`, it is set to `120`. This helps to minimize the number of rate limit errors returned from Azure. To modify this, set the `requests_per_minute` setting of this object in your service settings:
-
-        ```text
-        "rate_limit": {
-            "requests_per_minute": <number_of_requests>
-        }
-        ```
-
-        More information about the rate limits for Azure can be found in the [Quota limits docs](https://learn.microsoft.com/en-us/azure/ai-services/openai/quotas-limits) and [How to change the quotas](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/quota?tabs=rest).
-
-`task_settings`
-:   (Optional, object) Settings to configure the {{infer}} task. These settings are specific to the `<task_type>` you specified.
-
-    ::::{dropdown} `task_settings` for the `completion` task type
-    `user`
-    :   (optional, string) Specifies the user issuing the request, which can be used for abuse detection.
-
-    ::::
-
-    ::::{dropdown} `task_settings` for the `text_embedding` task type
-    `user`
-    :   (optional, string) Specifies the user issuing the request, which can be used for abuse detection.
-
-    ::::
-
-
-## Azure OpenAI service example [inference-example-azure-openai]
-
-The following example shows how to create an {{infer}} endpoint called `azure_openai_embeddings` to perform a `text_embedding` task type. Note that we do not specify a model here, as it is defined already via our Azure OpenAI deployment.
-
-The list of embeddings models that you can choose from in your deployment can be found in the [Azure models documentation](https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#embeddings).
-
-```console
-PUT _inference/text_embedding/azure_openai_embeddings
-{
-    "service": "azureopenai",
-    "service_settings": {
-        "api_key": "<api_key>",
-        "resource_name": "<resource_name>",
-        "deployment_id": "<deployment_id>",
-        "api_version": "2024-02-01"
-    }
-}
-```
-
-The next example shows how to create an {{infer}} endpoint called `azure_openai_completion` to perform a `completion` task type.
-
-```console
-PUT _inference/completion/azure_openai_completion
-{
-    "service": "azureopenai",
-    "service_settings": {
-        "api_key": "<api_key>",
-        "resource_name": "<resource_name>",
-        "deployment_id": "<deployment_id>",
-        "api_version": "2024-02-01"
-    }
-}
-```
-
-The list of chat completion models that you can choose from in your Azure OpenAI deployment can be found at the following places:
-
-* [GPT-4 and GPT-4 Turbo models](https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#gpt-4-and-gpt-4-turbo-models)
-* [GPT-3.5](https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#gpt-35)
diff --git a/explore-analyze/elastic-inference/inference-api/chat-completion-inference-api.md b/explore-analyze/elastic-inference/inference-api/chat-completion-inference-api.md
deleted file mode 100644
index 1369efee44..0000000000
--- a/explore-analyze/elastic-inference/inference-api/chat-completion-inference-api.md
+++ /dev/null
@@ -1,510 +0,0 @@
----
-navigation_title: "Chat completion"
-mapped_pages:
-  - https://www.elastic.co/guide/en/elasticsearch/reference/master/chat-completion-inference-api.html
-applies_to:
-  stack:
-  serverless:
----
-
-# Chat completion inference API [chat-completion-inference-api]
-
-:::{tip} Inference API reference
-Refer to the [{{infer-cap}} APIs](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) for further information.
-:::
-
-Streams a chat completion response.
-
-::::{important}
-The {{infer}} APIs enable you to use certain services, such as built-in {{ml}} models (ELSER, E5), models uploaded through Eland, Cohere, OpenAI, Azure, Google AI Studio, Google Vertex AI, Anthropic, Watsonx.ai, or Hugging Face. For built-in models and models uploaded through Eland, the {{infer}} APIs offer an alternative way to use and manage trained models. However, if you do not plan to use the {{infer}} APIs to use these models or if you want to use non-NLP models, use the [*{{ml-cap}} trained model APIs*](https://www.elastic.co/guide/en/elasticsearch/reference/master/ml-df-trained-models-apis.html).
-::::
-
-
-## {{api-request-title}} [chat-completion-inference-api-request]
-
-`POST /_inference/<inference_id>/_stream`
-
-`POST /_inference/chat_completion/<inference_id>/_stream`
-
-
-## {{api-prereq-title}} [chat-completion-inference-api-prereqs]
-
-* Requires the `monitor_inference` [cluster privilege](https://www.elastic.co/guide/en/elasticsearch/reference/master/security-privileges.html#privileges-list-cluster) (the built-in `inference_admin` and `inference_user` roles grant this privilege)
-* You must use a client that supports streaming.
-
-
-## {{api-description-title}} [chat-completion-inference-api-desc]
-
-The chat completion {{infer}} API enables real-time responses for chat completion tasks by delivering answers incrementally, reducing response times during computation. It only works with the `chat_completion` task type for the `openai` and `elastic` {{infer}} services.
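As a minimal sketch of the request shape, the following streams a single-turn conversation; it assumes a `chat_completion` endpoint named `openai-completion` already exists (the same endpoint name is used in the examples later on this page):

```console
POST _inference/chat_completion/openai-completion/_stream
{
  "messages": [
    {
      "role": "user",
      "content": "Say hello"
    }
  ]
}
```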
-
-::::{note}
-* The `chat_completion` task type is only available within the `_stream` API and only supports streaming.
-* The Chat completion {{infer}} API and the Stream {{infer}} API differ in their response structure and capabilities. The Chat completion {{infer}} API provides more comprehensive customization options through more fields and function calling support. If you use the `openai` service or the `elastic` service, use the Chat completion {{infer}} API.
-::::
-
-
-## {{api-path-parms-title}} [chat-completion-inference-api-path-params]
-
-`<inference_id>`
-:   (Required, string) The unique identifier of the {{infer}} endpoint.
-
-`<task_type>`
-:   (Optional, string) The type of {{infer}} task that the model performs. If included, this must be set to the value `chat_completion`.
-
-
-## {{api-request-body-title}} [chat-completion-inference-api-request-body]
-
-`messages`
-:   (Required, array of objects) A list of objects representing the conversation. Requests should generally only add new messages from the user (role `user`). The other message roles (`assistant`, `system`, or `tool`) should generally only be copied from the response to a previous completion request, such that the messages array is built up throughout a conversation.
-
-    ::::{dropdown} Assistant message
-    `content`
-    :   (Required unless `tool_calls` is specified, string or array of objects) The contents of the message.
-
-        ::::{dropdown} Examples
-        String example
-
-        ```js
-        {
-            "content": "Some string"
-        }
-        ```
-
-        Object example
-
-        ```js
-        {
-            "content": [
-                {
-                    "text": "Some text",
-                    "type": "text"
-                }
-            ]
-        }
-        ```
-
-        ::::
-
-        String representation
-        :   (Required, string) The text content.
-
-        Object representation
-        :   `text`
-            :   (Required, string) The text content.
-
-            `type`
-            :   (Required, string) This must be set to the value `text`.
-
-    `role`
-    :   (Required, string) The role of the message author. This should be set to `assistant` for this type of message.
-
-    `tool_calls`
-    :   (Optional, array of objects) The tool calls generated by the model.
-
-        ::::{dropdown} Examples
-        ```js
-        {
-            "tool_calls": [
-                {
-                    "id": "call_KcAjWtAww20AihPHphUh46Gd",
-                    "type": "function",
-                    "function": {
-                        "name": "get_current_weather",
-                        "arguments": "{\"location\":\"Boston, MA\"}"
-                    }
-                }
-            ]
-        }
-        ```
-
-        ::::
-
-        `id`
-        :   (Required, string) The identifier of the tool call.
-
-        `type`
-        :   (Required, string) The type of tool call. This must be set to the value `function`.
-
-        `function`
-        :   (Required, object) The function that the model called.
-
-            `name`
-            :   (Required, string) The name of the function to call.
-
-            `arguments`
-            :   (Required, string) The arguments to call the function with, in JSON format.
-
-    ::::
-
-    ::::{dropdown} System message
-    `content`
-    :   (Required, string or array of objects) The contents of the message.
-
-        ::::{dropdown} Examples
-        String example
-
-        ```js
-        {
-            "content": "Some string"
-        }
-        ```
-
-        Object example
-
-        ```js
-        {
-            "content": [
-                {
-                    "text": "Some text",
-                    "type": "text"
-                }
-            ]
-        }
-        ```
-
-        ::::
-
-        String representation
-        :   (Required, string) The text content.
-
-        Object representation
-        :   `text`
-            :   (Required, string) The text content.
-
-            `type`
-            :   (Required, string) This must be set to the value `text`.
-
-    `role`
-    :   (Required, string) The role of the message author. This should be set to `system` for this type of message.
-
-    ::::
-
-    ::::{dropdown} Tool message
-    `content`
-    :   (Required, string or array of objects) The contents of the message.
-
-        ::::{dropdown} Examples
-        String example
-
-        ```js
-        {
-            "content": "Some string"
-        }
-        ```
-
-        Object example
-
-        ```js
-        {
-            "content": [
-                {
-                    "text": "Some text",
-                    "type": "text"
-                }
-            ]
-        }
-        ```
-
-        ::::
-
-        String representation
-        :   (Required, string) The text content.
-
-        Object representation
-        :   `text`
-            :   (Required, string) The text content.
-
-            `type`
-            :   (Required, string) This must be set to the value `text`.
-
-    `role`
-    :   (Required, string) The role of the message author. This should be set to `tool` for this type of message.
-
-    `tool_call_id`
-    :   (Required, string) The tool call that this message is responding to.
-
-    ::::
-
-    ::::{dropdown} User message
-    `content`
-    :   (Required, string or array of objects) The contents of the message.
-
-        ::::{dropdown} Examples
-        String example
-
-        ```js
-        {
-            "content": "Some string"
-        }
-        ```
-
-        Object example
-
-        ```js
-        {
-            "content": [
-                {
-                    "text": "Some text",
-                    "type": "text"
-                }
-            ]
-        }
-        ```
-
-        ::::
-
-        String representation
-        :   (Required, string) The text content.
-
-        Object representation
-        :   `text`
-            :   (Required, string) The text content.
-
-            `type`
-            :   (Required, string) This must be set to the value `text`.
-
-    `role`
-    :   (Required, string) The role of the message author. This should be set to `user` for this type of message.
-
-    ::::
-
-`model`
-:   (Optional, string) The ID of the model to use. By default, the model ID is set to the value included when creating the inference endpoint.
-
-`max_completion_tokens`
-:   (Optional, integer) The upper bound limit for the number of tokens that can be generated for a completion request.
-
-`stop`
-:   (Optional, array of strings) A sequence of strings to control when the model should stop generating additional tokens.
-
-`temperature`
-:   (Optional, float) The sampling temperature to use.
-
-`tools`
-:   (Optional, array of objects) A list of tools that the model can call.
-
-    ::::{dropdown} Structure
-    `type`
-    :   (Required, string) The type of tool; this must be set to the value `function`.
-
-    `function`
-    :   (Required, object) The function definition.
-
-        `description`
-        :   (Optional, string) A description of what the function does. This is used by the model to choose when and how to call the function.
-
-        `name`
-        :   (Required, string) The name of the function.
-
-        `parameters`
-        :   (Optional, object) The parameters the function accepts. This should be formatted as a JSON object.
-
-        `strict`
-        :   (Optional, boolean) Whether to enable schema adherence when generating the function call.
-
-    ::::
-
-    ::::{dropdown} Examples
-    ```js
-    {
-        "tools": [
-            {
-                "type": "function",
-                "function": {
-                    "name": "get_price_of_item",
-                    "description": "Get the current price of an item",
-                    "parameters": {
-                        "type": "object",
-                        "properties": {
-                            "item": {
-                                "id": "12345"
-                            },
-                            "unit": {
-                                "type": "currency"
-                            }
-                        }
-                    }
-                }
-            }
-        ]
-    }
-    ```
-
-    ::::
-
-`tool_choice`
-:   (Optional, string or object) Controls which tool is called by the model.
-
-    String representation
-    :   One of `auto`, `none`, or `required`. `auto` allows the model to choose between calling tools and generating a message. `none` causes the model to not call any tools. `required` forces the model to call one or more tools.
-
-    Object representation
-    :   ::::{dropdown} Structure
-        `type`
-        :   (Required, string) The type of the tool. This must be set to the value `function`.
-
-        `function`
-        :   (Required, object)
-
-            `name`
-            :   (Required, string) The name of the function to call.
-
-        ::::
-
-    ::::{dropdown} Examples
-    ```js
-    {
-        "tool_choice": {
-            "type": "function",
-            "function": {
-                "name": "get_current_weather"
-            }
-        }
-    }
-    ```
-
-    ::::
-
-`top_p`
-:   (Optional, float) Nucleus sampling, an alternative to sampling with temperature.
-
-
-## {{api-examples-title}} [chat-completion-inference-api-example]
-
-The following example performs a chat completion on the example question with streaming.
-
-```console
-POST _inference/chat_completion/openai-completion/_stream
-{
-    "model": "gpt-4o",
-    "messages": [
-        {
-            "role": "user",
-            "content": "What is Elastic?"
-        }
-    ]
-}
-```
-
-The following example performs a chat completion using an Assistant message with `tool_calls`.
-
-```console
-POST _inference/chat_completion/openai-completion/_stream
-{
-    "messages": [
-        {
-            "role": "assistant",
-            "content": "Let's find out what the weather is",
-            "tool_calls": [ <1>
-                {
-                    "id": "call_KcAjWtAww20AihPHphUh46Gd",
-                    "type": "function",
-                    "function": {
-                        "name": "get_current_weather",
-                        "arguments": "{\"location\":\"Boston, MA\"}"
-                    }
-                }
-            ]
-        },
-        { <2>
-            "role": "tool",
-            "content": "The weather is cold",
-            "tool_call_id": "call_KcAjWtAww20AihPHphUh46Gd"
-        }
-    ]
-}
-```
-
-1. Each tool call needs a corresponding Tool message.
-2. The corresponding Tool message.
-
-
-The following example performs a chat completion using a User message with `tools` and `tool_choice`.
-
-```console
-POST _inference/chat_completion/openai-completion/_stream
-{
-    "messages": [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What's the price of a scarf?"
-                }
-            ]
-        }
-    ],
-    "tools": [
-        {
-            "type": "function",
-            "function": {
-                "name": "get_current_price",
-                "description": "Get the current price of an item",
-                "parameters": {
-                    "type": "object",
-                    "properties": {
-                        "item": {
-                            "id": "123"
-                        }
-                    }
-                }
-            }
-        }
-    ],
-    "tool_choice": {
-        "type": "function",
-        "function": {
-            "name": "get_current_price"
-        }
-    }
-}
-```
-
-The API returns the following response when a request is made to the OpenAI service:
-
-```txt
-event: message
-data: {"chat_completion":{"id":"chatcmpl-Ae0TWsy2VPnSfBbv5UztnSdYUMFP3","choices":[{"delta":{"content":"","role":"assistant"},"index":0}],"model":"gpt-4o-2024-08-06","object":"chat.completion.chunk"}}
-
-event: message
-data: {"chat_completion":{"id":"chatcmpl-Ae0TWsy2VPnSfBbv5UztnSdYUMFP3","choices":[{"delta":{"content":"Elastic"},"index":0}],"model":"gpt-4o-2024-08-06","object":"chat.completion.chunk"}}
-
-event: message
-data: {"chat_completion":{"id":"chatcmpl-Ae0TWsy2VPnSfBbv5UztnSdYUMFP3","choices":[{"delta":{"content":" is"},"index":0}],"model":"gpt-4o-2024-08-06","object":"chat.completion.chunk"}}
-
-(...)
-
-event: message
-data: {"chat_completion":{"id":"chatcmpl-Ae0TWsy2VPnSfBbv5UztnSdYUMFP3","choices":[],"model":"gpt-4o-2024-08-06","object":"chat.completion.chunk","usage":{"completion_tokens":28,"prompt_tokens":16,"total_tokens":44}}} <1>
-
-event: message
-data: [DONE]
-```
-
-1. The last object message of the stream contains the token usage information.
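The request body walkthrough above documents `system` messages, but none of the examples use one. As an additional hedged sketch (again assuming the `openai-completion` endpoint from the examples above), a system prompt is passed as the first element of `messages`, and the optional sampling controls documented earlier can be combined with it:

```console
POST _inference/chat_completion/openai-completion/_stream
{
  "messages": [
    {
      "role": "system",
      "content": "You are a concise assistant that answers in one sentence."
    },
    {
      "role": "user",
      "content": "What is Elastic?"
    }
  ],
  "temperature": 0.2,
  "max_completion_tokens": 100
}
```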
-
-
diff --git a/explore-analyze/elastic-inference/inference-api/cohere-inference-integration.md b/explore-analyze/elastic-inference/inference-api/cohere-inference-integration.md
deleted file mode 100644
index d654a5f9b9..0000000000
--- a/explore-analyze/elastic-inference/inference-api/cohere-inference-integration.md
+++ /dev/null
@@ -1,190 +0,0 @@
----
-navigation_title: "Cohere"
-mapped_pages:
-  - https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-cohere.html
-applies_to:
-  stack:
-  serverless:
----
-
-# Cohere inference integration [infer-service-cohere]
-
-:::{tip} Inference API reference
-Refer to the [{{infer-cap}} APIs](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) for further information.
-:::
-
-Creates an {{infer}} endpoint to perform an {{infer}} task with the `cohere` service.
-
-
-## {{api-request-title}} [infer-service-cohere-api-request]
-
-`PUT /_inference/<task_type>/<inference_id>`
-
-
-## {{api-path-parms-title}} [infer-service-cohere-api-path-params]
-
-`<inference_id>`
-:   (Required, string) The unique identifier of the {{infer}} endpoint.
-
-`<task_type>`
-:   (Required, string) The type of the {{infer}} task that the model will perform.
-
-    Available task types:
-
-    * `completion`
-    * `rerank`
-    * `text_embedding`
-
-
-## {{api-request-body-title}} [infer-service-cohere-api-request-body]
-
-`chunking_settings`
-:   (Optional, object) Chunking configuration object. Refer to [Configuring chunking](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) to learn more about chunking.
-
-    `max_chunk_size`
-    :   (Optional, integer) Specifies the maximum size of a chunk in words. Defaults to `250`. This value cannot be higher than `300` or lower than `20` (for the `sentence` strategy) or `10` (for the `word` strategy).
-
-    `overlap`
-    :   (Optional, integer) Only for the `word` chunking strategy. Specifies the number of overlapping words for chunks. Defaults to `100`. This value cannot be higher than half of `max_chunk_size`.
-
-    `sentence_overlap`
-    :   (Optional, integer) Only for the `sentence` chunking strategy. Specifies the number of overlapping sentences for chunks. It can be either `1` or `0`. Defaults to `1`.
-
-    `strategy`
-    :   (Optional, string) Specifies the chunking strategy. It can be either `sentence` or `word`.
-
-`service`
-:   (Required, string) The type of service supported for the specified task type. In this case, `cohere`.
-
-`service_settings`
-:   (Required, object) Settings used to install the {{infer}} model.
-
-    These settings are specific to the `cohere` service.
-
-    `api_key`
-    :   (Required, string) A valid API key of your Cohere account. You can find your Cohere API keys or you can create a new one [on the API keys settings page](https://dashboard.cohere.com/api-keys).
-
-    ::::{important}
-    You need to provide the API key only once, during the {{infer}} model creation. The [Get {{infer}} API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-get) does not retrieve your API key. After creating the {{infer}} model, you cannot change the associated API key. If you want to use a different API key, delete the {{infer}} model and recreate it with the same name and the updated API key.
    ::::
-
-    `rate_limit`
-    :   (Optional, object) By default, the `cohere` service sets the number of requests allowed per minute to `10000`. This value is the same for all task types. This helps to minimize the number of rate limit errors returned from Cohere.
To modify this, set the `requests_per_minute` setting of this object in your service settings:
-
-        ```text
-        "rate_limit": {
-            "requests_per_minute": <number_of_requests>
-        }
-        ```
-
-        More information about Cohere’s rate limits can be found in [Cohere’s production key docs](https://docs.cohere.com/docs/going-live#production-key-specifications).
-
-    ::::{dropdown} `service_settings` for the `completion` task type
-    `model_id`
-    :   (Optional, string) The name of the model to use for the {{infer}} task. To review the available `completion` models, refer to the [Cohere docs](https://docs.cohere.com/docs/models#command).
-
-    ::::
-
-    ::::{dropdown} `service_settings` for the `rerank` task type
-    `model_id`
-    :   (Optional, string) The name of the model to use for the {{infer}} task. To review the available `rerank` models, refer to the [Cohere docs](https://docs.cohere.com/reference/rerank-1).
-
-    ::::
-
-    ::::{dropdown} `service_settings` for the `text_embedding` task type
-    `embedding_type`
-    :   (Optional, string) Specifies the types of embeddings you want to get back. Defaults to `float`. Valid values are:
-
-        * `byte`: use it for signed int8 embeddings (this is a synonym of `int8`).
-        * `float`: use it for the default float embeddings.
-        * `int8`: use it for signed int8 embeddings.
-
-    `model_id`
-    :   (Optional, string) The name of the model to use for the {{infer}} task. To review the available `text_embedding` models, refer to the [Cohere docs](https://docs.cohere.com/reference/embed). The default value for `text_embedding` is `embed-english-v2.0`.
-
-    `similarity`
-    :   (Optional, string) Similarity measure. One of `cosine`, `dot_product`, `l2_norm`. Defaults based on the `embedding_type` (`float` → `dot_product`, `int8/byte` → `cosine`).
-
-    ::::
-
-`task_settings`
-:   (Optional, object) Settings to configure the {{infer}} task. These settings are specific to the `<task_type>` you specified.
-
-    ::::{dropdown} `task_settings` for the `rerank` task type
-    `return_documents`
-    :   (Optional, boolean) Specify whether to return doc text within the results.
-
-    `top_n`
-    :   (Optional, integer) The number of most relevant documents to return; defaults to the number of the documents. If this {{infer}} endpoint is used in a `text_similarity_reranker` retriever query and `top_n` is set, it must be greater than or equal to `rank_window_size` in the query.
-
-    ::::
-
-    ::::{dropdown} `task_settings` for the `text_embedding` task type
-    `input_type`
-    :   (Optional, string) Specifies the type of input passed to the model. Valid values are:
-
-        * `classification`: use it for embeddings passed through a text classifier.
-        * `clustering`: use it for the embeddings run through a clustering algorithm.
-        * `ingest`: use it for storing document embeddings in a vector database.
-        * `search`: use it for storing embeddings of search queries run against a vector database to find relevant documents.
-
-        ::::{important}
-        The `input_type` field is required when using embedding models `v3` and higher.
-        ::::
-
-    `truncate`
-    :   (Optional, string) Specifies how the API handles inputs longer than the maximum token length. Defaults to `END`. Valid values are:
-
-        * `NONE`: when the input exceeds the maximum input token length, an error is returned.
-        * `START`: when the input exceeds the maximum input token length, the start of the input is discarded.
-        * `END`: when the input exceeds the maximum input token length, the end of the input is discarded.
- - - :::: - - - -## Cohere service examples [inference-example-cohere] - -The following example shows how to create an {{infer}} endpoint called `cohere-embeddings` to perform a `text_embedding` task type. - -```console -PUT _inference/text_embedding/cohere-embeddings -{ - "service": "cohere", - "service_settings": { - "api_key": "", - "model_id": "embed-english-light-v3.0", - "embedding_type": "byte" - } -} -``` - -The following example shows how to create an {{infer}} endpoint called `cohere-rerank` to perform a `rerank` task type. - -```console -PUT _inference/rerank/cohere-rerank -{ - "service": "cohere", - "service_settings": { - "api_key": "", - "model_id": "rerank-english-v3.0" - }, - "task_settings": { - "top_n": 10, - "return_documents": true - } -} -``` - -For more examples, also review the [Cohere documentation](https://docs.cohere.com/docs/elasticsearch-and-cohere#rerank-search-results-with-cohere-and-elasticsearch). - diff --git a/explore-analyze/elastic-inference/inference-api/elastic-inference-service-eis.md b/explore-analyze/elastic-inference/inference-api/elastic-inference-service-eis.md index d6127e53f3..3c30ebb6d7 100644 --- a/explore-analyze/elastic-inference/inference-api/elastic-inference-service-eis.md +++ b/explore-analyze/elastic-inference/inference-api/elastic-inference-service-eis.md @@ -38,7 +38,7 @@ Creates an {{infer}} endpoint to perform an {{infer}} task with the `elastic` se ::::{note} The `chat_completion` task type only supports streaming and only through the `_stream` API. -For more information on how to use the `chat_completion` task type, please refer to the [chat completion documentation](chat-completion-inference-api.md). +For more information on how to use the `chat_completion` task type, refer to the [chat completion documentation](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-stream-inference). :::: diff --git a/explore-analyze/elastic-inference/inference-api/elasticsearch-inference-integration.md b/explore-analyze/elastic-inference/inference-api/elasticsearch-inference-integration.md deleted file mode 100644 index f899600fed..0000000000 --- a/explore-analyze/elastic-inference/inference-api/elasticsearch-inference-integration.md +++ /dev/null @@ -1,279 +0,0 @@ ---- -navigation_title: "Elasticsearch" -mapped_pages: - - https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-elasticsearch.html -applies_to: - stack: - serverless: ---- - -# Elasticsearch inference integration [infer-service-elasticsearch] - -:::{tip} Inference API reference -Refer to the [{{infer-cap}} APIs](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) for further information. -::: - -Creates an {{infer}} endpoint to perform an {{infer}} task with the `elasticsearch` service. - -::::{note} -* Your {{es}} deployment contains [preconfigured ELSER and E5 {{infer}} endpoints](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference), you only need to create the enpoints using the API if you want to customize the settings. -* If you use the ELSER or the E5 model through the `elasticsearch` service, the API request will automatically download and deploy the model if it isn’t downloaded yet. - -:::: - - - -## {{api-request-title}} [infer-service-elasticsearch-api-request] - -`PUT /_inference//` - - -## {{api-path-parms-title}} [infer-service-elasticsearch-api-path-params] - -`` -: (Required, string) The unique identifier of the {{infer}} endpoint. 
- -`<task_type>` -: (Required, string) The type of the {{infer}} task that the model will perform. - - Available task types: - - * `rerank`, - * `sparse_embedding`, - * `text_embedding`. - - - -## {{api-request-body-title}} [infer-service-elasticsearch-api-request-body] - -`chunking_settings` -: (Optional, object) Chunking configuration object. Refer to [Configuring chunking](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) to learn more about chunking. - - `max_chunk_size` - : (Optional, integer) Specifies the maximum size of a chunk in words. Defaults to `250`. This value cannot be higher than `300` or lower than `20` (for `sentence` strategy) or `10` (for `word` strategy). - - `overlap` - : (Optional, integer) Only for `word` chunking strategy. Specifies the number of overlapping words for chunks. Defaults to `100`. This value cannot be higher than half of `max_chunk_size`. - - `sentence_overlap` - : (Optional, integer) Only for `sentence` chunking strategy. Specifies the number of overlapping sentences for chunks. It can be either `1` or `0`. Defaults to `1`. - - `strategy` - : (Optional, string) Specifies the chunking strategy. It can be either `sentence` or `word`. - - -`service` -: (Required, string) The type of service supported for the specified task type. In this case, `elasticsearch`. - -`service_settings` -: (Required, object) Settings used to install the {{infer}} model. - - These settings are specific to the `elasticsearch` service. - - `deployment_id` - : (Optional, string) The `deployment_id` of an existing trained model deployment. When `deployment_id` is used, the `model_id` is optional. - - `adaptive_allocations` - : (Optional, object) Adaptive allocations configuration object. If enabled, the number of allocations of the model is set based on the current load the process gets. When the load is high, a new model allocation is automatically created (respecting the value of `max_number_of_allocations` if it’s set). When the load is low, a model allocation is automatically removed (respecting the value of `min_number_of_allocations` if it’s set). If `adaptive_allocations` is enabled, do not set the number of allocations manually. - - `enabled` - : (Optional, Boolean) If `true`, `adaptive_allocations` is enabled. Defaults to `false`. - - `max_number_of_allocations` - : (Optional, integer) Specifies the maximum number of allocations to scale to. If set, it must be greater than or equal to `min_number_of_allocations`. - - `min_number_of_allocations` - : (Optional, integer) Specifies the minimum number of allocations to scale to. If set, it must be greater than or equal to `0`. If not defined, the deployment scales to `0`. - - - `model_id` - : (Required, string) The name of the model to use for the {{infer}} task. It can be the ID of either a built-in model (for example, `.multilingual-e5-small` for E5) or a text embedding model already [uploaded through Eland](../../../explore-analyze/machine-learning/nlp/ml-nlp-import-model.md#ml-nlp-import-script). - - `num_allocations` - : (Required, integer) The total number of allocations this model is assigned across machine learning nodes. Increasing this value generally increases the throughput. If `adaptive_allocations` is enabled, do not set this value, because it’s automatically set. - - `num_threads` - : (Required, integer) Sets the number of threads used by each model allocation during inference. This generally increases the speed per inference request.
The inference process is a compute-bound process; `num_threads` must not exceed the number of available allocated processors per node. Must be a power of 2. Max allowed value is 32. - - -`task_settings` -: (Optional, object) Settings to configure the {{infer}} task. These settings are specific to the `<task_type>` you specified. - - ::::{dropdown} `task_settings` for the `rerank` task type - `return_documents` - : (Optional, Boolean) Returns the document instead of only the index. Defaults to `true`. - - :::: - - - -## ELSER via the `elasticsearch` service [inference-example-elasticsearch-elser] - -The following example shows how to create an {{infer}} endpoint called `my-elser-model` to perform a `sparse_embedding` task type. - -The API request below will automatically download the ELSER model if it isn’t already downloaded and then deploy the model. - -```console -PUT _inference/sparse_embedding/my-elser-model -{ - "service": "elasticsearch", - "service_settings": { - "adaptive_allocations": { <1> - "enabled": true, - "min_number_of_allocations": 1, - "max_number_of_allocations": 4 - }, - "num_threads": 1, - "model_id": ".elser_model_2" <2> - } -} -``` - -1. Adaptive allocations will be enabled with a minimum of 1 and a maximum of 4 allocations. -2. The `model_id` must be the ID of one of the built-in ELSER models. Valid values are `.elser_model_2` and `.elser_model_2_linux-x86_64`. For further details, refer to the [ELSER model documentation](../../../explore-analyze/machine-learning/nlp/ml-nlp-elser.md). - - - -## Elastic Rerank via the `elasticsearch` service [inference-example-elastic-reranker] - -The following example shows how to create an {{infer}} endpoint called `my-elastic-rerank` to perform a `rerank` task type using the built-in [Elastic Rerank](../../../explore-analyze/machine-learning/nlp/ml-nlp-rerank.md) cross-encoder model. - -::::{tip} -Refer to this [Python notebook](https://github.com/elastic/elasticsearch-labs/blob/main/notebooks/search/12-semantic-reranking-elastic-rerank.ipynb) for an end-to-end example using Elastic Rerank. - -:::: - - -The API request below will automatically download the Elastic Rerank model if it isn’t already downloaded and then deploy the model. Once deployed, the model can be used for semantic re-ranking with a [`text_similarity_reranker` retriever](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-search#operation-search-body-application-json-retriever). - -```console -PUT _inference/rerank/my-elastic-rerank -{ - "service": "elasticsearch", - "service_settings": { - "model_id": ".rerank-v1", <1> - "num_threads": 1, - "adaptive_allocations": { <2> - "enabled": true, - "min_number_of_allocations": 1, - "max_number_of_allocations": 4 - } - } -} -``` - -1. The `model_id` must be the ID of the built-in Elastic Rerank model: `.rerank-v1`. -2. [Adaptive allocations](../../../deploy-manage/autoscaling/trained-model-autoscaling.md#enabling-autoscaling-through-apis-adaptive-allocations) will be enabled with a minimum of 1 and a maximum of 4 allocations. - - - -## E5 via the `elasticsearch` service [inference-example-elasticsearch] - -The following example shows how to create an {{infer}} endpoint called `my-e5-model` to perform a `text_embedding` task type. - -The API request below will automatically download the E5 model if it isn’t already downloaded and then deploy the model.
- -```console -PUT _inference/text_embedding/my-e5-model -{ - "service": "elasticsearch", - "service_settings": { - "num_allocations": 1, - "num_threads": 1, - "model_id": ".multilingual-e5-small" <1> - } -} -``` - -1. The `model_id` must be the ID of one of the built-in E5 models. Valid values are `.multilingual-e5-small` and `.multilingual-e5-small_linux-x86_64`. For further details, refer to the [E5 model documentation](../../../explore-analyze/machine-learning/nlp/ml-nlp-e5.md). - - -::::{note} -You might see a 502 bad gateway error in the response when using the {{kib}} Console. This error usually just reflects a timeout, while the model downloads in the background. You can check the download progress in the {{ml-app}} UI. If using the Python client, you can set the `timeout` parameter to a higher value. - -:::: - - - -## Models uploaded by Eland via the `elasticsearch` service [inference-example-eland] - -The following example shows how to create an {{infer}} endpoint called `my-msmarco-minilm-model` to perform a `text_embedding` task type. - -```console -PUT _inference/text_embedding/my-msmarco-minilm-model <1> -{ - "service": "elasticsearch", - "service_settings": { - "num_allocations": 1, - "num_threads": 1, - "model_id": "msmarco-MiniLM-L12-cos-v5" <2> - } -} -``` - -1. Provide an unique identifier for the inference endpoint. The `inference_id` must be unique and must not match the `model_id`. -2. The `model_id` must be the ID of a text embedding model which has already been [uploaded through Eland](../../../explore-analyze/machine-learning/nlp/ml-nlp-import-model.md#ml-nlp-import-script). - - - -## Setting adaptive allocation for E5 via the `elasticsearch` service [inference-example-adaptive-allocation] - -The following example shows how to create an {{infer}} endpoint called `my-e5-model` to perform a `text_embedding` task type and configure adaptive allocations. - -The API request below will automatically download the E5 model if it isn’t already downloaded and then deploy the model. - -```console -PUT _inference/text_embedding/my-e5-model -{ - "service": "elasticsearch", - "service_settings": { - "adaptive_allocations": { - "enabled": true, - "min_number_of_allocations": 3, - "max_number_of_allocations": 10 - }, - "num_threads": 1, - "model_id": ".multilingual-e5-small" - } -} -``` - - -## Using an existing model deployment with the `elasticsearch` service [inference-example-existing-deployment] - -The following example shows how to use an already existing model deployment when creating an {{infer}} endpoint. - -```console -PUT _inference/sparse_embedding/use_existing_deployment -{ - "service": "elasticsearch", - "service_settings": { - "deployment_id": ".elser_model_2" <1> - } -} -``` - -1. The `deployment_id` of the already existing model deployment. 
- - -The API response contains the `model_id`, and the threads and allocations settings from the model deployment: - -```console-result -{ - "inference_id": "use_existing_deployment", - "task_type": "sparse_embedding", - "service": "elasticsearch", - "service_settings": { - "num_allocations": 2, - "num_threads": 1, - "model_id": ".elser_model_2", - "deployment_id": ".elser_model_2" - }, - "chunking_settings": { - "strategy": "sentence", - "max_chunk_size": 250, - "sentence_overlap": 1 - } -} -``` - diff --git a/explore-analyze/elastic-inference/inference-api/elser-inference-integration.md b/explore-analyze/elastic-inference/inference-api/elser-inference-integration.md deleted file mode 100644 index dac64a6b4d..0000000000 --- a/explore-analyze/elastic-inference/inference-api/elser-inference-integration.md +++ /dev/null @@ -1,171 +0,0 @@ ---- -navigation_title: "ELSER" -mapped_pages: - - https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-elser.html -applies_to: - stack: - serverless: ---- - -# ELSER inference integration [infer-service-elser] - -:::{tip} Inference API reference -Refer to the [{{infer-cap}} APIs](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) for further information. -::: - -Creates an {{infer}} endpoint to perform an {{infer}} task with the `elser` service. You can also deploy ELSER by using the [Elasticsearch {{infer}} integration](elasticsearch-inference-integration.md). - -::::{note} -* Your {{es}} deployment contains [a preconfigured ELSER {{infer}} endpoint](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference), you only need to create the enpoint using the API if you want to customize the settings. -* The API request will automatically download and deploy the ELSER model if it isn’t already downloaded. - -:::: - - -::::{admonition} Deprecated in 8.16 -:class: warning - -The `elser` service is deprecated and will be removed in a future release. Use the [Elasticsearch {{infer}} integration](elasticsearch-inference-integration.md) instead, with `model_id` included in the `service_settings`. - -:::: - - - -## {{api-request-title}} [infer-service-elser-api-request] - -`PUT /_inference//` - - -## {{api-path-parms-title}} [infer-service-elser-api-path-params] - -`` -: (Required, string) The unique identifier of the {{infer}} endpoint. - -`` -: (Required, string) The type of the {{infer}} task that the model will perform. - - Available task types: - - * `sparse_embedding`. - - - -## {{api-request-body-title}} [infer-service-elser-api-request-body] - -`chunking_settings` -: (Optional, object) Chunking configuration object. Refer to [Configuring chunking](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) to learn more about chunking. - - `max_chunk_size` - : (Optional, integer) Specifies the maximum size of a chunk in words. Defaults to `250`. This value cannot be higher than `300` or lower than `20` (for `sentence` strategy) or `10` (for `word` strategy). - - `overlap` - : (Optional, integer) Only for `word` chunking strategy. Specifies the number of overlapping words for chunks. Defaults to `100`. This value cannot be higher than the half of `max_chunk_size`. - - `sentence_overlap` - : (Optional, integer) Only for `sentence` chunking strategy. Specifies the numnber of overlapping sentences for chunks. It can be either `1` or `0`. Defaults to `1`. - - `strategy` - : (Optional, string) Specifies the chunking strategy. It could be either `sentence` or `word`. 
- - -`service` -: (Required, string) The type of service supported for the specified task type. In this case, `elser`. - -`service_settings` -: (Required, object) Settings used to install the {{infer}} model. - - These settings are specific to the `elser` service. - - `adaptive_allocations` - : (Optional, object) Adaptive allocations configuration object. If enabled, the number of allocations of the model is set based on the current load the process gets. When the load is high, a new model allocation is automatically created (respecting the value of `max_number_of_allocations` if it’s set). When the load is low, a model allocation is automatically removed (respecting the value of `min_number_of_allocations` if it’s set). If `adaptive_allocations` is enabled, do not set the number of allocations manually. - - `enabled` - : (Optional, Boolean) If `true`, `adaptive_allocations` is enabled. Defaults to `false`. - - `max_number_of_allocations` - : (Optional, integer) Specifies the maximum number of allocations to scale to. If set, it must be greater than or equal to `min_number_of_allocations`. - - `min_number_of_allocations` - : (Optional, integer) Specifies the minimum number of allocations to scale to. If set, it must be greater than or equal to `0`. If not defined, the deployment scales to `0`. - - - `num_allocations` - : (Required, integer) The total number of allocations this model is assigned across machine learning nodes. Increasing this value generally increases the throughput. If `adaptive_allocations` is enabled, do not set this value, because it’s automatically set. - - `num_threads` - : (Required, integer) Sets the number of threads used by each model allocation during inference. This generally increases the speed per inference request. The inference process is a compute-bound process; `num_threads` must not exceed the number of available allocated processors per node. Must be a power of 2. Max allowed value is 32. - - - -## ELSER service example with adaptive allocations [inference-example-elser-adaptive-allocation] - -When adaptive allocations are enabled, the number of allocations of the model is set automatically based on the current load. - -::::{note} -For more information on how to optimize your ELSER endpoints, refer to [the ELSER recommendations](../../../explore-analyze/machine-learning/nlp/ml-nlp-elser.md#elser-recommendations) section in the model documentation. To learn more about model autoscaling, refer to the [trained model autoscaling](../../../deploy-manage/autoscaling/trained-model-autoscaling.md) page. -:::: - - -The following example shows how to create an {{infer}} endpoint called `my-elser-model` to perform a `sparse_embedding` task type and configure adaptive allocations. - -The request below will automatically download the ELSER model if it isn’t already downloaded and then deploy the model. - -```console -PUT _inference/sparse_embedding/my-elser-model -{ - "service": "elser", - "service_settings": { - "adaptive_allocations": { - "enabled": true, - "min_number_of_allocations": 3, - "max_number_of_allocations": 10 - }, - "num_threads": 1 - } -} -``` - - -## ELSER service example without adaptive allocations [inference-example-elser] - -The following example shows how to create an {{infer}} endpoint called `my-elser-model` to perform a `sparse_embedding` task type. Refer to the [ELSER model documentation](../../../explore-analyze/machine-learning/nlp/ml-nlp-elser.md) for more information.
- -::::{note} -If you want to optimize your ELSER endpoint for ingest, set the number of threads to `1` (`"num_threads": 1`). If you want to optimize your ELSER endpoint for search, set the number of threads to greater than `1`. -:::: - - -The request below will automatically download the ELSER model if it isn’t already downloaded and then deploy the model. - -```console -PUT _inference/sparse_embedding/my-elser-model -{ - "service": "elser", - "service_settings": { - "num_allocations": 1, - "num_threads": 1 - } -} -``` - -Example response: - -```console-result -{ - "inference_id": "my-elser-model", - "task_type": "sparse_embedding", - "service": "elser", - "service_settings": { - "num_allocations": 1, - "num_threads": 1 - }, - "task_settings": {} -} -``` - -::::{note} -You might see a 502 bad gateway error in the response when using the {{kib}} Console. This error usually just reflects a timeout, while the model downloads in the background. You can check the download progress in the {{ml-app}} UI. If using the Python client, you can set the `timeout` parameter to a higher value. - -:::: - - diff --git a/explore-analyze/elastic-inference/inference-api/google-ai-studio-inference-integration.md b/explore-analyze/elastic-inference/inference-api/google-ai-studio-inference-integration.md deleted file mode 100644 index 1ee88b1c7f..0000000000 --- a/explore-analyze/elastic-inference/inference-api/google-ai-studio-inference-integration.md +++ /dev/null @@ -1,96 +0,0 @@ ---- -navigation_title: "Google AI Studio" -mapped_pages: - - https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-google-ai-studio.html -applies_to: - stack: - serverless: ---- - -# Google AI Studio inference integration [infer-service-google-ai-studio] - -:::{tip} Inference API reference -Refer to the [{{infer-cap}} APIs](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) for further information. -::: - -Creates an {{infer}} endpoint to perform an {{infer}} task with the `googleaistudio` service. - - -## {{api-request-title}} [infer-service-google-ai-studio-api-request] - -`PUT /_inference//` - - -## {{api-path-parms-title}} [infer-service-google-ai-studio-api-path-params] - -`` -: (Required, string) The unique identifier of the {{infer}} endpoint. - -`` -: (Required, string) The type of the {{infer}} task that the model will perform. - - Available task types: - - * `completion`, - * `text_embedding`. - - - -## {{api-request-body-title}} [infer-service-google-ai-studio-api-request-body] - -`chunking_settings` -: (Optional, object) Chunking configuration object. Refer to [Configuring chunking](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) to learn more about chunking. - - `max_chunk_size` - : (Optional, integer) Specifies the maximum size of a chunk in words. Defaults to `250`. This value cannot be higher than `300` or lower than `20` (for `sentence` strategy) or `10` (for `word` strategy). - - `overlap` - : (Optional, integer) Only for `word` chunking strategy. Specifies the number of overlapping words for chunks. Defaults to `100`. This value cannot be higher than the half of `max_chunk_size`. - - `sentence_overlap` - : (Optional, integer) Only for `sentence` chunking strategy. Specifies the numnber of overlapping sentences for chunks. It can be either `1` or `0`. Defaults to `1`. - - `strategy` - : (Optional, string) Specifies the chunking strategy. It could be either `sentence` or `word`. 
- - -`service` -: (Required, string) The type of service supported for the specified task type. In this case, `googleaistudio`. - -`service_settings` -: (Required, object) Settings used to install the {{infer}} model. - - These settings are specific to the `googleaistudio` service. - - `api_key` - : (Required, string) A valid API key for the Google Gemini API. - - `model_id` - : (Required, string) The name of the model to use for the {{infer}} task. You can find the supported models at [Gemini API models](https://ai.google.dev/gemini-api/docs/models/gemini). - - `rate_limit` - : (Optional, object) By default, the `googleaistudio` service sets the number of requests allowed per minute to `360`. This helps to minimize the number of rate limit errors returned from Google AI Studio. To modify this, set the `requests_per_minute` setting of this object in your service settings: - - ```text - "rate_limit": { - "requests_per_minute": <> - } - ``` - - - -## Google AI Studio service example [inference-example-google-ai-studio] - -The following example shows how to create an {{infer}} endpoint called `google_ai_studio_completion` to perform a `completion` task type. - -```console -PUT _inference/completion/google_ai_studio_completion -{ - "service": "googleaistudio", - "service_settings": { - "api_key": "", - "model_id": "" - } -} -``` - diff --git a/explore-analyze/elastic-inference/inference-api/google-vertex-ai-inference-integration.md b/explore-analyze/elastic-inference/inference-api/google-vertex-ai-inference-integration.md deleted file mode 100644 index afcf2809d8..0000000000 --- a/explore-analyze/elastic-inference/inference-api/google-vertex-ai-inference-integration.md +++ /dev/null @@ -1,136 +0,0 @@ ---- -navigation_title: "Google Vertex AI" -mapped_pages: - - https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-google-vertex-ai.html -applies_to: - stack: - serverless: ---- - -# Google Vertex AI inference integration [infer-service-google-vertex-ai] - -:::{tip} Inference API reference -Refer to the [{{infer-cap}} APIs](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) for further information. -::: - -Creates an {{infer}} endpoint to perform an {{infer}} task with the `googlevertexai` service. - - -## {{api-request-title}} [infer-service-google-vertex-ai-api-request] - -`PUT /_inference//` - - -## {{api-path-parms-title}} [infer-service-google-vertex-ai-path-params] - -`` -: (Required, string) The unique identifier of the {{infer}} endpoint. - -`` -: (Required, string) The type of the {{infer}} task that the model will perform. - - Available task types: - - * `rerank` - * `text_embedding`. - - - -## {{api-request-body-title}} [infer-service-google-vertex-ai-api-request-body] - -`chunking_settings` -: (Optional, object) Chunking configuration object. Refer to [Configuring chunking](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) to learn more about chunking. - - `max_chunk_size` - : (Optional, integer) Specifies the maximum size of a chunk in words. Defaults to `250`. This value cannot be higher than `300` or lower than `20` (for `sentence` strategy) or `10` (for `word` strategy). - - `overlap` - : (Optional, integer) Only for `word` chunking strategy. Specifies the number of overlapping words for chunks. Defaults to `100`. This value cannot be higher than the half of `max_chunk_size`. - - `sentence_overlap` - : (Optional, integer) Only for `sentence` chunking strategy. 
Specifies the number of overlapping sentences for chunks. It can be either `1` or `0`. Defaults to `1`. - - `strategy` - : (Optional, string) Specifies the chunking strategy. It can be either `sentence` or `word`. - - -`service` -: (Required, string) The type of service supported for the specified task type. In this case, `googlevertexai`. - -`service_settings` -: (Required, object) Settings used to install the {{infer}} model. - - These settings are specific to the `googlevertexai` service. - - `service_account_json` - : (Required, string) A valid service account in JSON format for the Google Vertex AI API. - - `model_id` - : (Required, string) The name of the model to use for the {{infer}} task. You can find the supported models at [Text embeddings API](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/text-embeddings-api). - - `location` - : (Required, string) The name of the location to use for the {{infer}} task. You can find the supported locations at [Generative AI on Vertex AI locations](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/locations). - - `project_id` - : (Required, string) The name of the project to use for the {{infer}} task. - - `rate_limit` - : (Optional, object) By default, the `googlevertexai` service sets the number of requests allowed per minute to `30,000`. This helps to minimize the number of rate limit errors returned from Google Vertex AI. To modify this, set the `requests_per_minute` setting of this object in your service settings: - - ```text - "rate_limit": { - "requests_per_minute": <> - } - ``` - - More information about the rate limits for Google Vertex AI can be found in the [Google Vertex AI Quotas docs](https://cloud.google.com/vertex-ai/docs/quotas). - - -`task_settings` -: (Optional, object) Settings to configure the {{infer}} task. These settings are specific to the `<task_type>` you specified. - - ::::{dropdown} `task_settings` for the `rerank` task type - `top_n` - : (Optional, integer) Specifies the number of top documents to return. - - :::: - - - ::::{dropdown} `task_settings` for the `text_embedding` task type - `auto_truncate` - : (Optional, boolean) Specifies whether the API automatically truncates inputs longer than the maximum token length. - - :::: - - - -## Google Vertex AI service example [inference-example-google-vertex-ai] - -The following example shows how to create an {{infer}} endpoint called `google_vertex_ai_embeddings` to perform a `text_embedding` task type. - -```console -PUT _inference/text_embedding/google_vertex_ai_embeddings -{ - "service": "googlevertexai", - "service_settings": { - "service_account_json": "", - "model_id": "", - "location": "", - "project_id": "" - } -} -``` - -The next example shows how to create an {{infer}} endpoint called `google_vertex_ai_rerank` to perform a `rerank` task type.
- -```console -PUT _inference/rerank/google_vertex_ai_rerank -{ - "service": "googlevertexai", - "service_settings": { - "service_account_json": "", - "project_id": "" - } -} -``` - diff --git a/explore-analyze/elastic-inference/inference-api/huggingface-inference-integration.md b/explore-analyze/elastic-inference/inference-api/huggingface-inference-integration.md deleted file mode 100644 index ed7aeb215f..0000000000 --- a/explore-analyze/elastic-inference/inference-api/huggingface-inference-integration.md +++ /dev/null @@ -1,111 +0,0 @@ ---- -navigation_title: "HuggingFace" -mapped_pages: - - https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-hugging-face.html -applies_to: - stack: - serverless: ---- - -# HuggingFace inference integration [infer-service-hugging-face] - -:::{tip} Inference API reference -Refer to the [{{infer-cap}} APIs](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) for further information. -::: - -Creates an {{infer}} endpoint to perform an {{infer}} task with the `hugging_face` service. - -## {{api-request-title}} [infer-service-hugging-face-api-request] - -`PUT /_inference//` - -## {{api-path-parms-title}} [infer-service-hugging-face-api-path-params] - -`` -: (Required, string) The unique identifier of the {{infer}} endpoint. - -`` -: (Required, string) The type of the {{infer}} task that the model will perform. - - Available task types: - - * `text_embedding`. - -## {{api-request-body-title}} [infer-service-hugging-face-api-request-body] - -`chunking_settings` -: (Optional, object) Chunking configuration object. Refer to [Configuring chunking](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) to learn more about chunking. - - `max_chunk_size` - : (Optional, integer) Specifies the maximum size of a chunk in words. Defaults to `250`. This value cannot be higher than `300` or lower than `20` (for `sentence` strategy) or `10` (for `word` strategy). - - `overlap` - : (Optional, integer) Only for `word` chunking strategy. Specifies the number of overlapping words for chunks. Defaults to `100`. This value cannot be higher than the half of `max_chunk_size`. - - `sentence_overlap` - : (Optional, integer) Only for `sentence` chunking strategy. Specifies the numnber of overlapping sentences for chunks. It can be either `1` or `0`. Defaults to `1`. - - `strategy` - : (Optional, string) Specifies the chunking strategy. It could be either `sentence` or `word`. - -`service` -: (Required, string) The type of service supported for the specified task type. In this case, `hugging_face`. - -`service_settings` -: (Required, object) Settings used to install the {{infer}} model. - - These settings are specific to the `hugging_face` service. - - `api_key` - : (Required, string) A valid access token of your Hugging Face account. You can find your Hugging Face access tokens or you can create a new one [on the settings page](https://huggingface.co/settings/tokens). - ::::{important} - You need to provide the API key only once, during the {{infer}} model creation. The [Get {{infer}} API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-get) does not retrieve your API key. After creating the {{infer}} model, you cannot change the associated API key. If you want to use a different API key, delete the {{infer}} model and recreate it with the same name and the updated API key. - :::: - - - `url` - : (Required, string) The URL endpoint to use for the requests. 
- - `rate_limit` - : (Optional, object) By default, the `huggingface` service sets the number of requests allowed per minute to `3000`. This helps to minimize the number of rate limit errors returned from Hugging Face. To modify this, set the `requests_per_minute` setting of this object in your service settings: - - ```text - "rate_limit": { - "requests_per_minute": <> - } - ``` - - - -## Hugging Face service example [inference-example-hugging-face] - -The following example shows how to create an {{infer}} endpoint called `hugging-face-embeddings` to perform a `text_embedding` task type. - -```console -PUT _inference/text_embedding/hugging-face-embeddings -{ - "service": "hugging_face", - "service_settings": { - "api_key": "", <1> - "url": "" <2> - } -} -``` - -1. A valid Hugging Face access token. You can find on the [settings page of your account](https://huggingface.co/settings/tokens). -2. The {{infer}} endpoint URL you created on Hugging Face. - - -Create a new {{infer}} endpoint on [the Hugging Face endpoint page](https://ui.endpoints.huggingface.co/) to get an endpoint URL. Select the model you want to use on the new endpoint creation page - for example `intfloat/e5-small-v2` - then select the `Sentence Embeddings` task under the Advanced configuration section. Create the endpoint. Copy the URL after the endpoint initialization has been finished. - -$$$inference-example-hugging-face-supported-models$$$ -The list of recommended models for the Hugging Face service: - -* [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) -* [all-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2) -* [all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) -* [e5-base-v2](https://huggingface.co/intfloat/e5-base-v2) -* [e5-small-v2](https://huggingface.co/intfloat/e5-small-v2) -* [multilingual-e5-base](https://huggingface.co/intfloat/multilingual-e5-base) -* [multilingual-e5-small](https://huggingface.co/intfloat/multilingual-e5-small) - diff --git a/explore-analyze/elastic-inference/inference-api/jinaai-inference-integration.md b/explore-analyze/elastic-inference/inference-api/jinaai-inference-integration.md deleted file mode 100644 index 40ee6e4715..0000000000 --- a/explore-analyze/elastic-inference/inference-api/jinaai-inference-integration.md +++ /dev/null @@ -1,225 +0,0 @@ ---- -navigation_title: "JinaAI" -mapped_pages: - - https://www.elastic.co/guide/en/elasticsearch/reference/master/infer-service-jinaai.html -applies_to: - stack: - serverless: ---- - -# JinaAI inference integration [infer-service-jinaai] - -:::{tip} Inference API reference -Refer to the [{{infer-cap}} APIs](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference) for further information. -::: - -Creates an {{infer}} endpoint to perform an {{infer}} task with the `jinaai` service. - - -## {{api-request-title}} [infer-service-jinaai-api-request] - -`PUT /_inference//` - - -## {{api-path-parms-title}} [infer-service-jinaai-api-path-params] - -`` -: (Required, string) The unique identifier of the {{infer}} endpoint. - -`` -: (Required, string) The type of the {{infer}} task that the model will perform. - - Available task types: - - * `text_embedding`, - * `rerank`. - - - -## {{api-request-body-title}} [infer-service-jinaai-api-request-body] - -`chunking_settings` -: (Optional, object) Chunking configuration object. 
Refer to [Configuring chunking](https://www.elastic.co/guide/en/elasticsearch/reference/master/inference-apis.html#infer-chunking-config) to learn more about chunking. - - `max_chunk_size` - : (Optional, integer) Specifies the maximum size of a chunk in words. Defaults to `250`. This value cannot be higher than `300` or lower than `20` (for `sentence` strategy) or `10` (for `word` strategy). - - `overlap` - : (Optional, integer) Only for `word` chunking strategy. Specifies the number of overlapping words for chunks. Defaults to `100`. This value cannot be higher than the half of `max_chunk_size`. - - `sentence_overlap` - : (Optional, integer) Only for `sentence` chunking strategy. Specifies the numnber of overlapping sentences for chunks. It can be either `1` or `0`. Defaults to `1`. - - `strategy` - : (Optional, string) Specifies the chunking strategy. It could be either `sentence` or `word`. - - -`service` -: (Required, string) The type of service supported for the specified task type. In this case, `jinaai`. - -`service_settings` -: (Required, object) Settings used to install the {{infer}} model. - - These settings are specific to the `jinaai` service. - - `api_key` - : (Required, string) A valid API key for your JinaAI account. You can find it at [https://jina.ai/embeddings/](https://jina.ai/embeddings/). - - ::::{important} - You need to provide the API key only once, during the {{infer}} model creation. The [Get {{infer}} API](https://www.elastic.co/guide/en/elasticsearch/reference/master/get-inference-api.html) does not retrieve your API key. After creating the {{infer}} model, you cannot change the associated API key. If you want to use a different API key, delete the {{infer}} model and recreate it with the same name and the updated API key. - :::: - - - `rate_limit` - : (Optional, object) The default rate limit for the `jinaai` service is 2000 requests per minute for all task types. You can modify this using the `requests_per_minute` setting in your service settings: - - ```text - "rate_limit": { - "requests_per_minute": <> - } - ``` - - More information about JinaAI’s rate limits can be found in [https://jina.ai/contact-sales/#rate-limit](https://jina.ai/contact-sales/#rate-limit). - - ::::{dropdown} `service_settings` for the `rerank` task type - `model_id` - : (Required, string) The name of the model to use for the {{infer}} task. To review the available `rerank` compatible models, refer to [https://jina.ai/reranker](https://jina.ai/reranker). - - :::: - - - ::::{dropdown} `service_settings` for the `text_embedding` task type - `model_id` - : (Optional, string) The name of the model to use for the {{infer}} task. To review the available `text_embedding` models, refer to the [https://jina.ai/embeddings/](https://jina.ai/embeddings/). - - `similarity` - : (Optional, string) Similarity measure. One of `cosine`, `dot_product`, `l2_norm`. Defaults based on the `embedding_type` (`float` → `dot_product`, `int8/byte` → `cosine`). - - :::: - - -`task_settings` -: (Optional, object) Settings to configure the {{infer}} task. These settings are specific to the `` you specified. - - ::::{dropdown} `task_settings` for the `rerank` task type - `return_documents` - : (Optional, boolean) Specify whether to return doc text within the results. - - `top_n` - : (Optional, integer) The number of most relevant documents to return, defaults to the number of the documents. 
If this {{infer}} endpoint is used in a `text_similarity_reranker` retriever query and `top_n` is set, it must be greater than or equal to `rank_window_size` in the query. - - :::: - - - ::::{dropdown} `task_settings` for the `text_embedding` task type - `task` - : (Optional, string) Specifies the task passed to the model. Valid values are: - - * `classification`: use it for embeddings passed through a text classifier. - * `clustering`: use it for the embeddings run through a clustering algorithm. - * `ingest`: use it for storing document embeddings in a vector database. - * `search`: use it for storing embeddings of search queries run against a vector database to find relevant documents. - - - :::: - - - -## JinaAI service examples [inference-example-jinaai] - -The following examples demonstrate how to create {{infer}} endpoints for `text_embeddings` and `rerank` tasks using the JinaAI service and use them in search requests. - -First, we create the `embeddings` service: - -```console -PUT _inference/text_embedding/jinaai-embeddings -{ - "service": "jinaai", - "service_settings": { - "model_id": "jina-embeddings-v3", - "api_key": "" - } -} -``` - -Then, we create the `rerank` service: - -```console -PUT _inference/rerank/jinaai-rerank -{ - "service": "jinaai", - "service_settings": { - "api_key": "", - "model_id": "jina-reranker-v2-base-multilingual" - }, - "task_settings": { - "top_n": 10, - "return_documents": true - } -} -``` - -Now we can create an index that will use `jinaai-embeddings` service to index the documents. - -```console -PUT jinaai-index -{ - "mappings": { - "properties": { - "content": { - "type": "semantic_text", - "inference_id": "jinaai-embeddings" - } - } - } -} -``` - -```console -PUT jinaai-index/_bulk -{ "index" : { "_index" : "jinaai-index", "_id" : "1" } } -{"content": "Sarah Johnson is a talented marine biologist working at the Oceanographic Institute. Her groundbreaking research on coral reef ecosystems has garnered international attention and numerous accolades."} -{ "index" : { "_index" : "jinaai-index", "_id" : "2" } } -{"content": "She spends months at a time diving in remote locations, meticulously documenting the intricate relationships between various marine species. "} -{ "index" : { "_index" : "jinaai-index", "_id" : "3" } } -{"content": "Her dedication to preserving these delicate underwater environments has inspired a new generation of conservationists."} -``` - -Now, with the index created, we can search with and without the reranker service. - -```console -GET jinaai-index/_search -{ - "query": { - "semantic": { - "field": "content", - "query": "who inspired taking care of the sea?" - } - } -} -``` - -```console -POST jinaai-index/_search -{ - "retriever": { - "text_similarity_reranker": { - "retriever": { - "standard": { - "query": { - "semantic": { - "field": "content", - "query": "who inspired taking care of the sea?" - } - } - } - }, - "field": "content", - "rank_window_size": 100, - "inference_id": "jinaai-rerank", - "inference_text": "who inspired taking care of the sea?" 
- } - } -} -``` - diff --git a/explore-analyze/machine-learning/nlp/ml-nlp-e5.md b/explore-analyze/machine-learning/nlp/ml-nlp-e5.md index 88cdd4c7f3..6d0ab83b2b 100644 --- a/explore-analyze/machine-learning/nlp/ml-nlp-e5.md +++ b/explore-analyze/machine-learning/nlp/ml-nlp-e5.md @@ -13,7 +13,8 @@ EmbEddings from bidirEctional Encoder rEpresentations - or E5 - is a {{nlp}} mo [Semantic search](../../../solutions/search/semantic-search.md) provides you search results based on contextual meaning and user intent, rather than exact keyword matches. -E5 has two versions: one cross-platform version which runs on any hardware and one version which is optimized for Intel® silicon. The **Model Management** > **Trained Models** page shows you which version of E5 is recommended to deploy based on your cluster’s hardware. However, the recommended way to use E5 is through the [{{infer}} API](../../elastic-inference/inference-api/elasticsearch-inference-integration.md) as a service which makes it easier to download and deploy the model and you don’t need to select from different versions. +E5 has two versions: one cross-platform version which runs on any hardware and one version which is optimized for Intel® silicon. The **Model Management** > **Trained Models** page shows you which version of E5 is recommended to deploy based on your cluster’s hardware. However, the recommended way to use E5 is through the {{infer}} API as a service, which makes it easier to download and deploy the model, and means you don’t need to select from different versions. +% TBD URL for API Refer to the model cards of the [multilingual-e5-small](https://huggingface.co/elastic/multilingual-e5-small) and the [multilingual-e5-small-optimized](https://huggingface.co/elastic/multilingual-e5-small-optimized) models on HuggingFace for further information including licensing. @@ -42,9 +43,10 @@ PUT _inference/text_embedding/my-e5-model } ``` - The API request automatically initiates the model download and then deploy the model. +The API request automatically initiates the model download and then deploys the model. -Refer to the [`elasticsearch` {{infer}} service documentation](../../elastic-inference/inference-api/elasticsearch-inference-integration.md) to learn more about the available settings. +Refer to the `elasticsearch` {{infer}} service documentation to learn more about the available settings. +% TBD URL for API After you created the E5 {{infer}} endpoint, it’s ready to be used for semantic search. The easiest way to perform semantic search in the {{stack}} is to [follow the `semantic_text` workflow](../../../solutions/search/semantic-search/semantic-search-semantic-text.md). diff --git a/explore-analyze/machine-learning/nlp/ml-nlp-elser.md b/explore-analyze/machine-learning/nlp/ml-nlp-elser.md index f9a4c1eab9..19e9d17f7d 100644 --- a/explore-analyze/machine-learning/nlp/ml-nlp-elser.md +++ b/explore-analyze/machine-learning/nlp/ml-nlp-elser.md @@ -39,7 +39,8 @@ Enabling trained model autoscaling for your ELSER deployment is recommended. Ref Compared to the initial version of the model, ELSER v2 offers improved retrieval accuracy and more efficient indexing. This enhancement is attributed to the extension of the training data set, which includes high-quality question and answer pairs and the improved FLOPS regularizer which reduces the cost of computing the similarity between a query and a document. -ELSER v2 has two versions: one cross-platform version which runs on any hardware and one version which is optimized for Intel® silicon.
The **Model Management** > **Trained Models** page shows you which version of ELSER v2 is recommended to deploy based on your cluster’s hardware. However, the recommended way to use ELSER is through the [{{infer}} API](../../elastic-inference/inference-api/elser-inference-integration.md) as a service which makes it easier to download and deploy the model and you don’t need to select from different versions. +ELSER v2 has two versions: one cross-platform version which runs on any hardware and one version which is optimized for Intel® silicon. The **Model Management** > **Trained Models** page shows you which version of ELSER v2 is recommended to deploy based on your cluster’s hardware. However, the recommended way to use ELSER is through the {{infer}} API as a service which makes it easier to download and deploy the model and you don't need to select from different versions. +% TBD URL for API If you want to learn more about the ELSER V2 improvements, refer to [this blog post](https://www.elastic.co/search-labs/blog/introducing-elser-v2-part-1). @@ -74,7 +75,8 @@ PUT _inference/sparse_embedding/my-elser-endpoint The API request automatically initiates the model download and then deploy the model. This example uses [autoscaling](../../../deploy-manage/autoscaling/trained-model-autoscaling.md) through adaptive allocation. -Refer to the [ELSER {{infer}} integration documentation](../../elastic-inference/inference-api/elser-inference-integration.md) to learn more about the available settings. +Refer to the ELSER {{infer}} integration documentation to learn more about the available settings. +% TBD URL for API After you created the ELSER {{infer}} endpoint, it’s ready to be used for semantic search. The easiest way to perform semantic search in the {{stack}} is to [follow the `semantic_text` workflow](../../../solutions/search/semantic-search/semantic-search-semantic-text.md). @@ -306,7 +308,8 @@ To gain the biggest value out of ELSER trained models, consider to follow this l ## Benchmark information [elser-benchmarks] ::::{important} -The recommended way to use ELSER is through the [{{infer}} API](../../elastic-inference/inference-api/elser-inference-integration.md) as a service. +The recommended way to use ELSER is through the {{infer}} API as a service. +% TBD URL for API :::: The following sections provide information about how ELSER performs on different hardwares and compares the model performance to {{es}} BM25 and other strong baselines. diff --git a/explore-analyze/machine-learning/nlp/ml-nlp-rerank.md b/explore-analyze/machine-learning/nlp/ml-nlp-rerank.md index 70af12766d..c522d012df 100644 --- a/explore-analyze/machine-learning/nlp/ml-nlp-rerank.md +++ b/explore-analyze/machine-learning/nlp/ml-nlp-rerank.md @@ -44,7 +44,8 @@ Elastic Rerank is available in Elastic Stack version 8.17+: ## Download and deploy [ml-nlp-rerank-deploy] -To download and deploy Elastic Rerank, use the [create inference API](../../elastic-inference/inference-api/elasticsearch-inference-integration.md) to create an {{es}} service `rerank` endpoint. +To download and deploy Elastic Rerank, use the create inference API to create an {{es}} service `rerank` endpoint. +% TBD URL for API ::::{tip} Refer to this [Python notebook](https://github.com/elastic/elasticsearch-labs/blob/main/notebooks/search/12-semantic-reranking-elastic-rerank.ipynb) for an end-to-end example using Elastic Rerank. 
@@ -280,7 +281,8 @@ For detailed benchmark information, including complete dataset results and metho **Documentation**: * [Semantic re-ranking in {{es}} overview](../../../solutions/search/ranking/semantic-reranking.md#semantic-reranking-in-es) -* [Inference API example](../../elastic-inference/inference-api/elasticsearch-inference-integration.md#inference-example-elastic-reranker) +* Inference API example +% TBD URL for API **Blogs**: diff --git a/explore-analyze/toc.yml b/explore-analyze/toc.yml index e5ae4917b9..5685008ace 100644 --- a/explore-analyze/toc.yml +++ b/explore-analyze/toc.yml @@ -122,19 +122,6 @@ toc: - file: elastic-inference/inference-api.md children: - file: elastic-inference/inference-api/elastic-inference-service-eis.md - - file: elastic-inference/inference-api/alibabacloud-ai-search-inference-integration.md - - file: elastic-inference/inference-api/amazon-bedrock-inference-integration.md - - file: elastic-inference/inference-api/anthropic-inference-integration.md - - file: elastic-inference/inference-api/azure-ai-studio-inference-integration.md - - file: elastic-inference/inference-api/azure-openai-inference-integration.md - - file: elastic-inference/inference-api/chat-completion-inference-api.md - - file: elastic-inference/inference-api/cohere-inference-integration.md - - file: elastic-inference/inference-api/elasticsearch-inference-integration.md - - file: elastic-inference/inference-api/elser-inference-integration.md - - file: elastic-inference/inference-api/google-ai-studio-inference-integration.md - - file: elastic-inference/inference-api/google-vertex-ai-inference-integration.md - - file: elastic-inference/inference-api/huggingface-inference-integration.md - - file: elastic-inference/inference-api/jinaai-inference-integration.md - file: machine-learning.md children: - file: machine-learning/setting-up-machine-learning.md diff --git a/solutions/search/hybrid-semantic-text.md b/solutions/search/hybrid-semantic-text.md index 839d1d1ead..49fe6582a0 100644 --- a/solutions/search/hybrid-semantic-text.md +++ b/solutions/search/hybrid-semantic-text.md @@ -14,7 +14,8 @@ This tutorial demonstrates how to perform hybrid search, combining semantic sear In hybrid search, semantic search retrieves results based on the meaning of the text, while full-text search focuses on exact word matches. By combining both methods, hybrid search delivers more relevant results, particularly in cases where relying on a single approach may not be sufficient. -The recommended way to use hybrid search in the {{stack}} is following the `semantic_text` workflow. This tutorial uses the [`elasticsearch` service](../../explore-analyze/elastic-inference/inference-api/elasticsearch-inference-integration.md) for demonstration, but you can use any service and their supported models offered by the {{infer-cap}} API. +The recommended way to use hybrid search in the {{stack}} is to follow the `semantic_text` workflow. This tutorial uses the `elasticsearch` service for demonstration, but you can use any service and their supported models offered by the {{infer-cap}} API.
+% TBD URL for API ## Create an index mapping [hybrid-search-create-index-mapping] diff --git a/solutions/search/ranking/semantic-reranking.md b/solutions/search/ranking/semantic-reranking.md index 3d875b77e1..15612acb08 100644 --- a/solutions/search/ranking/semantic-reranking.md +++ b/solutions/search/ranking/semantic-reranking.md @@ -92,10 +92,14 @@ To use semantic re-ranking in {{es}}, you need to: 1. **Select and configure a re-ranking model**. You have the following options: - 1. Use the [Elastic Rerank](../../../explore-analyze/elastic-inference/inference-api/elasticsearch-inference-integration.md#inference-example-elastic-reranker) cross-encoder model via the inference API’s {{es}} service. - 2. Use the [Cohere Rerank inference endpoint](../../../explore-analyze/elastic-inference/inference-api/cohere-inference-integration.md) to create a `rerank` endpoint. - 3. Use the [Google Vertex AI inference endpoint](../../../explore-analyze/elastic-inference/inference-api/google-vertex-ai-inference-integration.md) to create a `rerank` endpoint. - 4. Upload a model to {{es}} from Hugging Face with [Eland](eland://reference/machine-learning.md#ml-nlp-pytorch). You’ll need to use the `text_similarity` NLP task type when loading the model using Eland. Then set up an [{{es}} service inference endpoint](../../../explore-analyze/elastic-inference/inference-api/elasticsearch-inference-integration.md#inference-example-eland) with the `rerank` endpoint type. + 1. Use the Elastic Rerank cross-encoder model via the inference API's {{es}} service. + % TBD URL for API + 2. Use the Cohere Rerank inference endpoint to create a `rerank` endpoint. + % TBD URL for API + 3. Use the Google Vertex AI inference endpoint to create a `rerank` endpoint. + % TBD URL for API + 4. Upload a model to {{es}} from Hugging Face with [Eland](eland://reference/machine-learning.md#ml-nlp-pytorch). You’ll need to use the `text_similarity` NLP task type when loading the model using Eland. Then set up an {{es}} service inference endpoint with the `rerank` endpoint type. + % TBD URL for API Refer to [the Elastic NLP model reference](../../../explore-analyze/machine-learning/nlp/ml-nlp-model-ref.md#ml-nlp-model-ref-text-similarity) for a list of third party text similarity models supported by {{es}} for semantic re-ranking. diff --git a/solutions/search/semantic-search/semantic-search-inference.md b/solutions/search/semantic-search/semantic-search-inference.md index b3561385d9..901888500d 100644 --- a/solutions/search/semantic-search/semantic-search-inference.md +++ b/solutions/search/semantic-search/semantic-search-inference.md @@ -28,7 +28,8 @@ The following examples use the: * `amazon.titan-embed-text-v1` model for [Amazon Bedrock](https://docs.aws.amazon.com/bedrock/latest/userguide/model-ids.html) * `ops-text-embedding-zh-001` model for [AlibabaCloud AI](https://help.aliyun.com/zh/open-search/search-platform/developer-reference/text-embedding-api-details) -You can use any Cohere and OpenAI models, they are all supported by the {{infer}} API. For a list of recommended models available on HuggingFace, refer to [the supported model list](../../../explore-analyze/elastic-inference/inference-api/huggingface-inference-integration.md#inference-example-hugging-face-supported-models). +You can use any Cohere or OpenAI model; they are all supported by the {{infer}} API.
+% TBD URL: For a list of recommended models available on HuggingFace, refer to [the supported model list](../../../explore-analyze/elastic-inference/inference-api/huggingface-inference-integration.md#inference-example-hugging-face-supported-models).
 
 Click the name of the service you want to use on any of the widgets below to review the corresponding instructions.
 
diff --git a/solutions/search/semantic-search/semantic-search-semantic-text.md b/solutions/search/semantic-search/semantic-search-semantic-text.md
index 9548e675b2..147fc4c5d1 100644
--- a/solutions/search/semantic-search/semantic-search-semantic-text.md
+++ b/solutions/search/semantic-search/semantic-search-semantic-text.md
@@ -15,12 +15,14 @@ Semantic text simplifies the {{infer}} workflow by providing {{infer}} at ingest
 
 The recommended way to use [semantic search](../semantic-search.md) in the {{stack}} is to follow the `semantic_text` workflow. When you need more control over indexing and query settings, you can still use the complete {{infer}} workflow (refer to [this tutorial](../../../explore-analyze/elastic-inference/inference-api.md) to review the process).
 
-This tutorial uses the [`elasticsearch` service](../../../explore-analyze/elastic-inference/inference-api/elasticsearch-inference-integration.md) for demonstration, but you can use any service and their supported models offered by the {{infer-cap}} API.
+This tutorial uses the `elasticsearch` service for demonstration, but you can use any service and its supported models offered by the {{infer-cap}} API.
+% TBD URL for API
 
 ## Requirements [semantic-text-requirements]
 
-This tutorial uses the [`elasticsearch` service](../../../explore-analyze/elastic-inference/inference-api/elasticsearch-inference-integration.md) for demonstration, which is created automatically as needed. To use the `semantic_text` field type with an {{infer}} service other than `elasticsearch` service, you must create an inference endpoint using the [Create {{infer}} API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put).
+This tutorial uses the `elasticsearch` service for demonstration, which is created automatically as needed. To use the `semantic_text` field type with an {{infer}} service other than `elasticsearch` service, you must create an inference endpoint using the [Create {{infer}} API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put).
+% TBD URL for API
 
 ## Create the index mapping [semantic-text-index-mapping]
 
@@ -41,8 +43,8 @@ PUT semantic-embeddings
 ```
 
 1. The name of the field to contain the generated embeddings.
-2. The field to contain the embeddings is a `semantic_text` field. Since no `inference_id` is provided, the default endpoint `.elser-2-elasticsearch` for the [`elasticsearch` service](../../../explore-analyze/elastic-inference/inference-api/elasticsearch-inference-integration.md) is used. To use a different {{infer}} service, you must create an {{infer}} endpoint first using the [Create {{infer}} API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put) and then specify it in the `semantic_text` field mapping using the `inference_id` parameter.
-
+2. The field to contain the embeddings is a `semantic_text` field. Since no `inference_id` is provided, the default endpoint `.elser-2-elasticsearch` for the `elasticsearch` service is used. To use a different {{infer}} service, you must create an {{infer}} endpoint first using the [Create {{infer}} API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put) and then specify it in the `semantic_text` field mapping using the `inference_id` parameter.
+% TBD URL for API
 
 ::::{note}
 If you’re using web crawlers or connectors to generate indices, you have to [update the index mappings](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-indices-put-mapping) for these indices to include the `semantic_text` field. Once the mapping is updated, you’ll need to run a full web crawl or a full connector sync. This ensures that all existing documents are reprocessed and updated with the new semantic embeddings, enabling semantic search on the updated data.

From fb66cabb9c47a7bf181e73ced31be089485643bf Mon Sep 17 00:00:00 2001
From: lcawl
Date: Tue, 25 Mar 2025 21:46:11 -0700
Subject: [PATCH 2/2] Fix API URLs

---
 .../autoscaling/trained-model-autoscaling.md         |  3 +--
 explore-analyze/machine-learning/nlp/ml-nlp-e5.md    |  6 ++----
 explore-analyze/machine-learning/nlp/ml-nlp-elser.md |  7 ++-----
 .../machine-learning/nlp/ml-nlp-rerank.md            |  5 ++---
 solutions/search/hybrid-semantic-text.md             |  4 +---
 solutions/search/ranking/semantic-reranking.md       | 12 ++++--------
 .../semantic-search/semantic-search-inference.md     |  2 +-
 .../semantic-search/semantic-search-semantic-text.md |  7 +------
 8 files changed, 14 insertions(+), 32 deletions(-)

diff --git a/deploy-manage/autoscaling/trained-model-autoscaling.md b/deploy-manage/autoscaling/trained-model-autoscaling.md
index 208c82a6fd..2b718909e1 100644
--- a/deploy-manage/autoscaling/trained-model-autoscaling.md
+++ b/deploy-manage/autoscaling/trained-model-autoscaling.md
@@ -46,8 +46,7 @@ If you set the minimum number of allocations to 1, you will be charged even if t
 
 You can enable adaptive allocations by using:
 
-* the create inference endpoint API for ELSER, E5 and models uploaded through Eland that are used as inference services.
-  %TBD URL for APIs
+* the create inference endpoint API for [ELSER](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-elser), [E5 and models uploaded through Eland](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-elasticsearch) that are used as inference services.
 * the [start trained model deployment](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-ml-start-trained-model-deployment) or [update trained model deployment](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-ml-update-trained-model-deployment) APIs for trained models that are deployed on {{ml}} nodes.
 
 If the new allocations fit on the current {{ml}} nodes, they are immediately started. If more resource capacity is needed for creating new model allocations, then your {{ml}} node will be scaled up if {{ml}} autoscaling is enabled to provide enough resources for the new allocation. The number of model allocations can be scaled down to 0. They cannot be scaled up to more than 32 allocations, unless you explicitly set the maximum number of allocations to more. Adaptive allocations must be set up independently for each deployment and [{{infer}} endpoint](https://www.elastic.co/docs/api/doc/elasticsearch/group/endpoint-inference).
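
The autoscaling change above points the first option at the create {{infer}} endpoint API. As an illustration only, a request that enables adaptive allocations for an ELSER endpoint might look like the following sketch; the endpoint name `my-elser-endpoint` and the allocation bounds are hypothetical placeholders, and only the request shape matters:

```
PUT _inference/sparse_embedding/my-elser-endpoint <1>
{
  "service": "elasticsearch",
  "service_settings": {
    "adaptive_allocations": { <2>
      "enabled": true,
      "min_number_of_allocations": 1,
      "max_number_of_allocations": 4
    },
    "num_threads": 1,
    "model_id": ".elser_model_2" <3>
  }
}
```

1. A hypothetical endpoint name; choose your own.
2. Adaptive allocations scale the number of allocations between the configured minimum and maximum based on load.
3. The built-in ELSER v2 model ID.

Setting `min_number_of_allocations` to 0 lets the deployment scale down completely when there is no load, at the cost of a cold start on the next request.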
diff --git a/explore-analyze/machine-learning/nlp/ml-nlp-e5.md b/explore-analyze/machine-learning/nlp/ml-nlp-e5.md
index 6d0ab83b2b..f64cbe1508 100644
--- a/explore-analyze/machine-learning/nlp/ml-nlp-e5.md
+++ b/explore-analyze/machine-learning/nlp/ml-nlp-e5.md
@@ -13,8 +13,7 @@ EmbEddings from bidirEctional Encoder rEpresentations - or E5 - is a {{nlp}} mo
 
 [Semantic search](../../../solutions/search/semantic-search.md) provides search results based on contextual meaning and user intent, rather than exact keyword matches.
 
-E5 has two versions: one cross-platform version which runs on any hardware and one version which is optimized for Intel® silicon. The **Model Management** > **Trained Models** page shows you which version of E5 is recommended to deploy based on your cluster’s hardware. However, the recommended way to use E5 is through the {{infer}} API as a service which makes it easier to download and deploy the model and you don’t need to select from different versions.
-% TBD URL for API
+E5 has two versions: one cross-platform version which runs on any hardware and one version which is optimized for Intel® silicon. The **Model Management** > **Trained Models** page shows you which version of E5 is recommended to deploy based on your cluster’s hardware. However, the recommended way to use E5 is through the [{{infer}} API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-elasticsearch) as a service, which makes it easier to download and deploy the model, and you don’t need to select from different versions.
 
 Refer to the model cards of the [multilingual-e5-small](https://huggingface.co/elastic/multilingual-e5-small) and the [multilingual-e5-small-optimized](https://huggingface.co/elastic/multilingual-e5-small-optimized) models on HuggingFace for further information including licensing.
 
@@ -45,8 +44,7 @@ PUT _inference/text_embedding/my-e5-model
 
 The API request automatically initiates the model download and then deploys the model.
 
-Refer to the `elasticsearch` {{infer}} service documentation to learn more about the available settings.
-% TBD URL for API
+Refer to the `elasticsearch` [{{infer}} service documentation](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-elasticsearch) to learn more about the available settings.
 
 After you create the E5 {{infer}} endpoint, it’s ready to be used for semantic search. The easiest way to perform semantic search in the {{stack}} is to [follow the `semantic_text` workflow](../../../solutions/search/semantic-search/semantic-search-semantic-text.md).
 
diff --git a/explore-analyze/machine-learning/nlp/ml-nlp-elser.md b/explore-analyze/machine-learning/nlp/ml-nlp-elser.md
index 19e9d17f7d..5c9d004db5 100644
--- a/explore-analyze/machine-learning/nlp/ml-nlp-elser.md
+++ b/explore-analyze/machine-learning/nlp/ml-nlp-elser.md
@@ -39,8 +39,7 @@ Enabling trained model autoscaling for your ELSER deployment is recommended. Ref
 
 Compared to the initial version of the model, ELSER v2 offers improved retrieval accuracy and more efficient indexing. This enhancement is attributed to the extension of the training data set, which includes high-quality question and answer pairs and the improved FLOPS regularizer which reduces the cost of computing the similarity between a query and a document.
 
-ELSER v2 has two versions: one cross-platform version which runs on any hardware and one version which is optimized for Intel® silicon. The **Model Management** > **Trained Models** page shows you which version of ELSER v2 is recommended to deploy based on your cluster’s hardware. However, the recommended way to use ELSER is through the {{infer}} API as a service which makes it easier to download and deploy the model and you don't need to select from different versions.
-% TBD URL for API
+ELSER v2 has two versions: one cross-platform version which runs on any hardware and one version which is optimized for Intel® silicon. The **Model Management** > **Trained Models** page shows you which version of ELSER v2 is recommended to deploy based on your cluster’s hardware. However, the recommended way to use ELSER is through the [{{infer}} API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-elasticsearch) as a service, which makes it easier to download and deploy the model, and you don't need to select from different versions.
 
 If you want to learn more about the ELSER V2 improvements, refer to [this blog post](https://www.elastic.co/search-labs/blog/introducing-elser-v2-part-1).
 
@@ -75,8 +74,7 @@ PUT _inference/sparse_embedding/my-elser-endpoint
 
 The API request automatically initiates the model download and then deploys the model. This example uses [autoscaling](../../../deploy-manage/autoscaling/trained-model-autoscaling.md) through adaptive allocation.
 
-Refer to the ELSER {{infer}} integration documentation to learn more about the available settings.
-% TBD URL for API
+Refer to the [ELSER {{infer}} integration documentation](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-elser) to learn more about the available settings.
 
 After you create the ELSER {{infer}} endpoint, it’s ready to be used for semantic search. The easiest way to perform semantic search in the {{stack}} is to [follow the `semantic_text` workflow](../../../solutions/search/semantic-search/semantic-search-semantic-text.md).
 
@@ -309,7 +307,6 @@ To gain the biggest value out of ELSER trained models, consider to follow this l
 
 ::::{important}
 The recommended way to use ELSER is through the {{infer}} API as a service.
-% TBD URL for API
 ::::
 
 The following sections provide information about how ELSER performs on different hardware and compares the model performance to {{es}} BM25 and other strong baselines.
diff --git a/explore-analyze/machine-learning/nlp/ml-nlp-rerank.md b/explore-analyze/machine-learning/nlp/ml-nlp-rerank.md
index c522d012df..3215e8c5c8 100644
--- a/explore-analyze/machine-learning/nlp/ml-nlp-rerank.md
+++ b/explore-analyze/machine-learning/nlp/ml-nlp-rerank.md
@@ -44,8 +44,7 @@ Elastic Rerank is available in Elastic Stack version 8.17+:
 
 ## Download and deploy [ml-nlp-rerank-deploy]
 
-To download and deploy Elastic Rerank, use the create inference API to create an {{es}} service `rerank` endpoint.
-% TBD URL for API
+To download and deploy Elastic Rerank, use the [create inference API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-elasticsearch) to create an {{es}} service `rerank` endpoint.
 
 ::::{tip}
 Refer to this [Python notebook](https://github.com/elastic/elasticsearch-labs/blob/main/notebooks/search/12-semantic-reranking-elastic-rerank.ipynb) for an end-to-end example using Elastic Rerank.
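
For reference, a minimal sketch of that deployment request follows; the endpoint name `my-rerank-model` is a hypothetical placeholder, and the sketch assumes the built-in Elastic Rerank model ID `.rerank-v1`:

```
PUT _inference/rerank/my-rerank-model
{
  "service": "elasticsearch",
  "service_settings": {
    "model_id": ".rerank-v1", <1>
    "num_threads": 1,
    "adaptive_allocations": { <2>
      "enabled": true,
      "min_number_of_allocations": 1,
      "max_number_of_allocations": 4
    }
  }
}
```

1. The built-in Elastic Rerank model ID.
2. Optional; lets the deployment scale with load instead of using a fixed number of allocations.

Once the endpoint exists, a `text_similarity_reranker` retriever can reference it through its `inference_id` to re-rank the results of a first-stage query.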
@@ -281,7 +280,7 @@ For detailed benchmark information, including complete dataset results and metho
 **Documentation**:
 
 * [Semantic re-ranking in {{es}} overview](../../../solutions/search/ranking/semantic-reranking.md#semantic-reranking-in-es)
-% TBD URL for API * [Inference API example](../../elastic-inference/inference-api/elasticsearch-inference-integration.md#inference-example-elastic-reranker)
+* [Inference API example](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-elasticsearch)
 
 **Blogs**:
 
diff --git a/solutions/search/hybrid-semantic-text.md b/solutions/search/hybrid-semantic-text.md
index 49fe6582a0..023d504b0c 100644
--- a/solutions/search/hybrid-semantic-text.md
+++ b/solutions/search/hybrid-semantic-text.md
@@ -14,9 +14,7 @@ This tutorial demonstrates how to perform hybrid search, combining semantic sear
 
 In hybrid search, semantic search retrieves results based on the meaning of the text, while full-text search focuses on exact word matches. By combining both methods, hybrid search delivers more relevant results, particularly in cases where relying on a single approach may not be sufficient.
 
-The recommended way to use hybrid search in the {{stack}} is to follow the `semantic_text` workflow. This tutorial uses the `elasticsearch` service for demonstration, but you can use any service and its supported models offered by the {{infer-cap}} API.
-% TBD URL for API
-
+The recommended way to use hybrid search in the {{stack}} is to follow the `semantic_text` workflow. This tutorial uses the [`elasticsearch` service](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-elasticsearch) for demonstration, but you can use any service and its supported models offered by the {{infer-cap}} API.
 
 ## Create an index mapping [hybrid-search-create-index-mapping]
 
diff --git a/solutions/search/ranking/semantic-reranking.md b/solutions/search/ranking/semantic-reranking.md
index 15612acb08..da6cd0d809 100644
--- a/solutions/search/ranking/semantic-reranking.md
+++ b/solutions/search/ranking/semantic-reranking.md
@@ -92,14 +92,10 @@ To use semantic re-ranking in {{es}}, you need to:
 
 1. **Select and configure a re-ranking model**. You have the following options:
 
-    1. Use the Elastic Rerank cross-encoder model via the inference API's {{es}} service.
-    % TBD URL for API
-    2. Use the Cohere Rerank inference endpoint to create a `rerank` endpoint.
-    % TBD URL for API
-    3. Use the Google Vertex AI inference endpoint to create a `rerank` endpoint.
-    % TBD URL for API
-    4. Upload a model to {{es}} from Hugging Face with [Eland](eland://reference/machine-learning.md#ml-nlp-pytorch). You’ll need to use the `text_similarity` NLP task type when loading the model using Eland. Then set up an {{es}} service inference endpoint with the `rerank` endpoint type.
-    % TBD URL for API
+    1. Use the Elastic Rerank cross-encoder model via the [inference API's {{es}} service](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-elasticsearch).
+    2. Use the [Cohere Rerank inference endpoint](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-cohere) to create a `rerank` endpoint.
+    3. Use the [Google Vertex AI inference endpoint](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-googlevertexai) to create a `rerank` endpoint.
+    4. Upload a model to {{es}} from Hugging Face with [Eland](eland://reference/machine-learning.md#ml-nlp-pytorch). You’ll need to use the `text_similarity` NLP task type when loading the model using Eland. Then set up an [{{es}} service inference endpoint](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-elasticsearch) with the `rerank` endpoint type.
 
     Refer to [the Elastic NLP model reference](../../../explore-analyze/machine-learning/nlp/ml-nlp-model-ref.md#ml-nlp-model-ref-text-similarity) for a list of third party text similarity models supported by {{es}} for semantic re-ranking.
 
diff --git a/solutions/search/semantic-search/semantic-search-inference.md b/solutions/search/semantic-search/semantic-search-inference.md
index 901888500d..d882c7a94d 100644
--- a/solutions/search/semantic-search/semantic-search-inference.md
+++ b/solutions/search/semantic-search/semantic-search-inference.md
@@ -29,7 +29,7 @@ The following examples use the:
 * `ops-text-embedding-zh-001` model for [AlibabaCloud AI](https://help.aliyun.com/zh/open-search/search-platform/developer-reference/text-embedding-api-details)
 
 You can use any Cohere or OpenAI model; they are all supported by the {{infer}} API.
-% TBD URL: For a list of recommended models available on HuggingFace, refer to [the supported model list](../../../explore-analyze/elastic-inference/inference-api/huggingface-inference-integration.md#inference-example-hugging-face-supported-models).
+For a list of recommended models available on HuggingFace, refer to the supported model list in the [API documentation](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-hugging-face).
 
 Click the name of the service you want to use on any of the widgets below to review the corresponding instructions.
 
diff --git a/solutions/search/semantic-search/semantic-search-semantic-text.md b/solutions/search/semantic-search/semantic-search-semantic-text.md
index 147fc4c5d1..a98c945f18 100644
--- a/solutions/search/semantic-search/semantic-search-semantic-text.md
+++ b/solutions/search/semantic-search/semantic-search-semantic-text.md
@@ -15,15 +15,11 @@ Semantic text simplifies the {{infer}} workflow by providing {{infer}} at ingest
 
 The recommended way to use [semantic search](../semantic-search.md) in the {{stack}} is to follow the `semantic_text` workflow. When you need more control over indexing and query settings, you can still use the complete {{infer}} workflow (refer to [this tutorial](../../../explore-analyze/elastic-inference/inference-api.md) to review the process).
 
-This tutorial uses the `elasticsearch` service for demonstration, but you can use any service and its supported models offered by the {{infer-cap}} API.
-% TBD URL for API
-
+This tutorial uses the [`elasticsearch` service](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put-elasticsearch) for demonstration, but you can use any service and its supported models offered by the {{infer-cap}} API.
 
 ## Requirements [semantic-text-requirements]
 
 This tutorial uses the `elasticsearch` service for demonstration, which is created automatically as needed. To use the `semantic_text` field type with an {{infer}} service other than `elasticsearch` service, you must create an inference endpoint using the [Create {{infer}} API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put).
-% TBD URL for API
-
 
 ## Create the index mapping [semantic-text-index-mapping]
 
@@ -44,7 +40,6 @@ PUT semantic-embeddings
 
 1. The name of the field to contain the generated embeddings.
 2. The field to contain the embeddings is a `semantic_text` field. Since no `inference_id` is provided, the default endpoint `.elser-2-elasticsearch` for the `elasticsearch` service is used. To use a different {{infer}} service, you must create an {{infer}} endpoint first using the [Create {{infer}} API](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put) and then specify it in the `semantic_text` field mapping using the `inference_id` parameter.
-% TBD URL for API
 
 ::::{note}
 If you’re using web crawlers or connectors to generate indices, you have to [update the index mappings](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-indices-put-mapping) for these indices to include the `semantic_text` field. Once the mapping is updated, you’ll need to run a full web crawl or a full connector sync. This ensures that all existing documents are reprocessed and updated with the new semantic embeddings, enabling semantic search on the updated data.
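
To make the `inference_id` parameter described above concrete, a mapping that routes a `semantic_text` field to a custom endpoint might look like the following sketch; the index name `my-semantic-index`, the field name `content`, and the endpoint `my-elser-endpoint` are all hypothetical placeholders:

```
PUT my-semantic-index
{
  "mappings": {
    "properties": {
      "content": {
        "type": "semantic_text",
        "inference_id": "my-elser-endpoint" <1>
      }
    }
  }
}
```

1. Must match the ID of an existing {{infer}} endpoint created with the Create {{infer}} API.

Documents indexed into `content` are then chunked and embedded by that endpoint at ingest time, and the field can be searched with a `semantic` query.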