From 7f21997e0cef82be49cbc04f541a5529edb4d685 Mon Sep 17 00:00:00 2001 From: Lisa Cawley Date: Tue, 25 Mar 2025 07:42:11 -0700 Subject: [PATCH] Add ELSER inference API details (#4026) (cherry picked from commit 33420c6e0fddea1544cfaeca89ec6a178b3d0c1f) --- output/openapi/elasticsearch-openapi.json | 139 +++ .../elasticsearch-serverless-openapi.json | 139 +++ output/schema/schema-serverless.json | 1008 ++++++++++++++--- output/schema/schema.json | 296 +++++ output/typescript/types.ts | 28 + .../_json_spec/inference.put_elser.json | 39 + .../inference/put_elser/PutElserRequest.ts | 137 +++ .../inference/put_elser/PutElserResponse.ts | 24 + .../request/PutElserRequestExample1.yaml | 12 + .../request/PutElserRequestExample2.yaml | 16 + .../response/PutElserResponseExample1.yaml | 15 + 11 files changed, 1723 insertions(+), 130 deletions(-) create mode 100644 specification/_json_spec/inference.put_elser.json create mode 100644 specification/inference/put_elser/PutElserRequest.ts create mode 100644 specification/inference/put_elser/PutElserResponse.ts create mode 100644 specification/inference/put_elser/examples/request/PutElserRequestExample1.yaml create mode 100644 specification/inference/put_elser/examples/request/PutElserRequestExample2.yaml create mode 100644 specification/inference/put_elser/examples/response/PutElserResponseExample1.yaml diff --git a/output/openapi/elasticsearch-openapi.json b/output/openapi/elasticsearch-openapi.json index 7f52c162e2..c793f9e836 100644 --- a/output/openapi/elasticsearch-openapi.json +++ b/output/openapi/elasticsearch-openapi.json @@ -18380,6 +18380,96 @@ "x-state": "Added in 8.13.0" } }, + "/_inference/{task_type}/{elser_inference_id}": { + "put": { + "tags": [ + "inference" + ], + "summary": "Create an ELSER inference endpoint", + "description": "Create an inference endpoint to perform an inference task with the `elser` service.\nYou can also deploy ELSER by using the Elasticsearch inference integration.\n\n> info\n> Your Elasticsearch deployment contains a preconfigured ELSER inference endpoint, you only need to create the enpoint using the API if you want to customize the settings.\n\nThe API request will automatically download and deploy the ELSER model if it isn't already downloaded.\n\n> info\n> You might see a 502 bad gateway error in the response when using the Kibana Console. This error usually just reflects a timeout, while the model downloads in the background. You can check the download progress in the Machine Learning UI. 
If using the Python client, you can set the timeout parameter to a higher value.\n\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", + "operationId": "inference-put-elser", + "parameters": [ + { + "in": "path", + "name": "task_type", + "description": "The type of the inference task that the model will perform.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/inference.put_elser:ElserTaskType" + }, + "style": "simple" + }, + { + "in": "path", + "name": "elser_inference_id", + "description": "The unique identifier of the inference endpoint.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/_types:Id" + }, + "style": "simple" + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "chunking_settings": { + "$ref": "#/components/schemas/inference._types:InferenceChunkingSettings" + }, + "service": { + "$ref": "#/components/schemas/inference.put_elser:ServiceType" + }, + "service_settings": { + "$ref": "#/components/schemas/inference.put_elser:ElserServiceSettings" + } + }, + "required": [ + "service", + "service_settings" + ] + }, + "examples": { + "PutElserRequestExample1": { + "summary": "A sparse embedding task", + "description": "Run `PUT _inference/sparse_embedding/my-elser-model` to create an inference endpoint that performs a `sparse_embedding` task. The request will automatically download the ELSER model if it isn't already downloaded and then deploy the model.", + "value": "{\n \"service\": \"elser\",\n \"service_settings\": {\n \"num_allocations\": 1,\n \"num_threads\": 1\n }\n}" + }, + "PutElserRequestExample2": { + "summary": "Adaptive allocations", + "description": "Run `PUT _inference/sparse_embedding/my-elser-model` to create an inference endpoint that performs a `sparse_embedding` task with adaptive allocations. 
When adaptive allocations are enabled, the number of allocations of the model is set automatically based on the current load.", + "value": "{\n \"service\": \"elser\",\n \"service_settings\": {\n \"adaptive_allocations\": {\n \"enabled\": true,\n \"min_number_of_allocations\": 3,\n \"max_number_of_allocations\": 10\n },\n \"num_threads\": 1\n }\n}" + } + } + } + } + }, + "responses": { + "200": { + "description": "", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/inference._types:InferenceEndpointInfo" + }, + "examples": { + "PutElserResponseExample1": { + "description": "A successful response when creating an ELSER inference endpoint.", + "value": "{\n \"inference_id\": \"my-elser-model\",\n \"task_type\": \"sparse_embedding\",\n \"service\": \"elser\",\n \"service_settings\": {\n \"num_allocations\": 1,\n \"num_threads\": 1\n },\n \"task_settings\": {}\n}" + } + } + } + } + } + }, + "deprecated": true, + "x-state": "Added in 8.11.0" + } + }, "/_inference/{task_type}/{huggingface_inference_id}": { "put": { "tags": [ @@ -78203,6 +78293,55 @@ } } }, + "inference.put_elser:ElserTaskType": { + "type": "string", + "enum": [ + "sparse_embedding" + ] + }, + "inference.put_elser:ServiceType": { + "type": "string", + "enum": [ + "elser" + ] + }, + "inference.put_elser:ElserServiceSettings": { + "type": "object", + "properties": { + "adaptive_allocations": { + "$ref": "#/components/schemas/inference.put_elser:AdaptiveAllocations" + }, + "num_allocations": { + "description": "The total number of allocations this model is assigned across machine learning nodes.\nIncreasing this value generally increases the throughput.\nIf adaptive allocations is enabled, do not set this value because it's automatically set.", + "type": "number" + }, + "num_threads": { + "description": "The number of threads used by each model allocation during inference.\nIncreasing this value generally increases the speed per inference request.\nThe inference process is a compute-bound process; `threads_per_allocations` must not exceed the number of available allocated processors per node.\nThe value must be a power of 2.\nThe maximum value is 32.\n\n> info\n> If you want to optimize your ELSER endpoint for ingest, set the number of threads to 1. 
If you want to optimize your ELSER endpoint for search, set the number of threads to greater than 1.", + "type": "number" + } + }, + "required": [ + "num_allocations", + "num_threads" + ] + }, + "inference.put_elser:AdaptiveAllocations": { + "type": "object", + "properties": { + "enabled": { + "description": "Turn on `adaptive_allocations`.", + "type": "boolean" + }, + "max_number_of_allocations": { + "description": "The maximum number of allocations to scale to.\nIf set, it must be greater than or equal to `min_number_of_allocations`.", + "type": "number" + }, + "min_number_of_allocations": { + "description": "The minimum number of allocations to scale to.\nIf set, it must be greater than or equal to 0.\nIf not defined, the deployment scales to 0.", + "type": "number" + } + } + }, "inference.put_hugging_face:HuggingFaceTaskType": { "type": "string", "enum": [ diff --git a/output/openapi/elasticsearch-serverless-openapi.json b/output/openapi/elasticsearch-serverless-openapi.json index 84a54ca1b0..23b604902b 100644 --- a/output/openapi/elasticsearch-serverless-openapi.json +++ b/output/openapi/elasticsearch-serverless-openapi.json @@ -10206,6 +10206,96 @@ "x-state": "Added in 8.13.0" } }, + "/_inference/{task_type}/{elser_inference_id}": { + "put": { + "tags": [ + "inference" + ], + "summary": "Create an ELSER inference endpoint", + "description": "Create an inference endpoint to perform an inference task with the `elser` service.\nYou can also deploy ELSER by using the Elasticsearch inference integration.\n\n> info\n> Your Elasticsearch deployment contains a preconfigured ELSER inference endpoint, you only need to create the enpoint using the API if you want to customize the settings.\n\nThe API request will automatically download and deploy the ELSER model if it isn't already downloaded.\n\n> info\n> You might see a 502 bad gateway error in the response when using the Kibana Console. This error usually just reflects a timeout, while the model downloads in the background. You can check the download progress in the Machine Learning UI. 
If using the Python client, you can set the timeout parameter to a higher value.\n\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", + "operationId": "inference-put-elser", + "parameters": [ + { + "in": "path", + "name": "task_type", + "description": "The type of the inference task that the model will perform.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/inference.put_elser:ElserTaskType" + }, + "style": "simple" + }, + { + "in": "path", + "name": "elser_inference_id", + "description": "The unique identifier of the inference endpoint.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/_types:Id" + }, + "style": "simple" + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "chunking_settings": { + "$ref": "#/components/schemas/inference._types:InferenceChunkingSettings" + }, + "service": { + "$ref": "#/components/schemas/inference.put_elser:ServiceType" + }, + "service_settings": { + "$ref": "#/components/schemas/inference.put_elser:ElserServiceSettings" + } + }, + "required": [ + "service", + "service_settings" + ] + }, + "examples": { + "PutElserRequestExample1": { + "summary": "A sparse embedding task", + "description": "Run `PUT _inference/sparse_embedding/my-elser-model` to create an inference endpoint that performs a `sparse_embedding` task. The request will automatically download the ELSER model if it isn't already downloaded and then deploy the model.", + "value": "{\n \"service\": \"elser\",\n \"service_settings\": {\n \"num_allocations\": 1,\n \"num_threads\": 1\n }\n}" + }, + "PutElserRequestExample2": { + "summary": "Adaptive allocations", + "description": "Run `PUT _inference/sparse_embedding/my-elser-model` to create an inference endpoint that performs a `sparse_embedding` task with adaptive allocations. 
When adaptive allocations are enabled, the number of allocations of the model is set automatically based on the current load.", + "value": "{\n \"service\": \"elser\",\n \"service_settings\": {\n \"adaptive_allocations\": {\n \"enabled\": true,\n \"min_number_of_allocations\": 3,\n \"max_number_of_allocations\": 10\n },\n \"num_threads\": 1\n }\n}" + } + } + } + } + }, + "responses": { + "200": { + "description": "", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/inference._types:InferenceEndpointInfo" + }, + "examples": { + "PutElserResponseExample1": { + "description": "A successful response when creating an ELSER inference endpoint.", + "value": "{\n \"inference_id\": \"my-elser-model\",\n \"task_type\": \"sparse_embedding\",\n \"service\": \"elser\",\n \"service_settings\": {\n \"num_allocations\": 1,\n \"num_threads\": 1\n },\n \"task_settings\": {}\n}" + } + } + } + } + } + }, + "deprecated": true, + "x-state": "Added in 8.11.0" + } + }, "/_inference/{task_type}/{huggingface_inference_id}": { "put": { "tags": [ @@ -49399,6 +49489,55 @@ } } }, + "inference.put_elser:ElserTaskType": { + "type": "string", + "enum": [ + "sparse_embedding" + ] + }, + "inference.put_elser:ServiceType": { + "type": "string", + "enum": [ + "elser" + ] + }, + "inference.put_elser:ElserServiceSettings": { + "type": "object", + "properties": { + "adaptive_allocations": { + "$ref": "#/components/schemas/inference.put_elser:AdaptiveAllocations" + }, + "num_allocations": { + "description": "The total number of allocations this model is assigned across machine learning nodes.\nIncreasing this value generally increases the throughput.\nIf adaptive allocations is enabled, do not set this value because it's automatically set.", + "type": "number" + }, + "num_threads": { + "description": "The number of threads used by each model allocation during inference.\nIncreasing this value generally increases the speed per inference request.\nThe inference process is a compute-bound process; `threads_per_allocations` must not exceed the number of available allocated processors per node.\nThe value must be a power of 2.\nThe maximum value is 32.\n\n> info\n> If you want to optimize your ELSER endpoint for ingest, set the number of threads to 1. 
If you want to optimize your ELSER endpoint for search, set the number of threads to greater than 1.", + "type": "number" + } + }, + "required": [ + "num_allocations", + "num_threads" + ] + }, + "inference.put_elser:AdaptiveAllocations": { + "type": "object", + "properties": { + "enabled": { + "description": "Turn on `adaptive_allocations`.", + "type": "boolean" + }, + "max_number_of_allocations": { + "description": "The maximum number of allocations to scale to.\nIf set, it must be greater than or equal to `min_number_of_allocations`.", + "type": "number" + }, + "min_number_of_allocations": { + "description": "The minimum number of allocations to scale to.\nIf set, it must be greater than or equal to 0.\nIf not defined, the deployment scales to 0.", + "type": "number" + } + } + }, "inference.put_hugging_face:HuggingFaceTaskType": { "type": "string", "enum": [ diff --git a/output/schema/schema-serverless.json b/output/schema/schema-serverless.json index 0e81b4aa10..fcd8cda6f0 100644 --- a/output/schema/schema-serverless.json +++ b/output/schema/schema-serverless.json @@ -5138,6 +5138,193 @@ "visibility": "public" } }, +<<<<<<< HEAD +======= + "description": "Create an Elastic Inference Service (EIS) inference endpoint.\n\nCreate an inference endpoint to perform an inference task through the Elastic Inference Service (EIS).", + "docId": "inference-api-put-eis", + "docUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-eis.html", + "name": "inference.put_eis", + "privileges": { + "cluster": [ + "manage_inference" + ] + }, + "request": { + "name": "Request", + "namespace": "inference.put_eis" + }, + "requestBodyRequired": false, + "requestMediaType": [ + "application/json" + ], + "response": { + "name": "Response", + "namespace": "inference.put_eis" + }, + "responseMediaType": [ + "application/json" + ], + "urls": [ + { + "methods": [ + "PUT" + ], + "path": "/_inference/{task_type}/{eis_inference_id}" + } + ] + }, + { + "availability": { + "serverless": { + "stability": "stable", + "visibility": "public" + }, + "stack": { + "since": "8.11.0", + "stability": "stable", + "visibility": "public" + } + }, + "deprecation": { + "description": "The elser service is deprecated and will be removed in a future release. Use the Elasticsearch inference integration instead, with model_id included in the service_settings.", + "version": "8.16.0" + }, + "description": "Create an ELSER inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `elser` service.\nYou can also deploy ELSER by using the Elasticsearch inference integration.\n\n> info\n> Your Elasticsearch deployment contains a preconfigured ELSER inference endpoint, you only need to create the enpoint using the API if you want to customize the settings.\n\nThe API request will automatically download and deploy the ELSER model if it isn't already downloaded.\n\n> info\n> You might see a 502 bad gateway error in the response when using the Kibana Console. This error usually just reflects a timeout, while the model downloads in the background. You can check the download progress in the Machine Learning UI. 
If using the Python client, you can set the timeout parameter to a higher value.\n\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", + "docId": "inference-api-put-elser", + "docUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-elser.html", + "name": "inference.put_elser", + "privileges": { + "cluster": [ + "manage_inference" + ] + }, + "request": { + "name": "Request", + "namespace": "inference.put_elser" + }, + "requestBodyRequired": false, + "requestMediaType": [ + "application/json" + ], + "response": { + "name": "Response", + "namespace": "inference.put_elser" + }, + "responseMediaType": [ + "application/json" + ], + "urls": [ + { + "methods": [ + "PUT" + ], + "path": "/_inference/{task_type}/{elser_inference_id}" + } + ] + }, + { + "availability": { + "serverless": { + "stability": "stable", + "visibility": "public" + }, + "stack": { + "since": "8.15.0", + "stability": "stable", + "visibility": "public" + } + }, + "description": "Create an Google AI Studio inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `googleaistudio` service.\n\nWhen you create an inference endpoint, the associated machine learning model is automatically deployed if it is not already running.\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", + "docId": "inference-api-put-googleaistudio", + "docUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-google-ai-studio.html", + "name": "inference.put_googleaistudio", + "privileges": { + "cluster": [ + "manage_inference" + ] + }, + "request": { + "name": "Request", + "namespace": "inference.put_googleaistudio" + }, + "requestBodyRequired": false, + "requestMediaType": [ + "application/json" + ], + "response": { + "name": "Response", + "namespace": "inference.put_googleaistudio" + }, + "responseMediaType": [ + "application/json" + ], + "urls": [ + { + "methods": [ + "PUT" + ], + "path": "/_inference/{task_type}/{googleaistudio_inference_id}" + } + ] + }, + { + "availability": { + "serverless": { + "stability": "stable", + "visibility": "public" + }, + "stack": { + "since": "8.15.0", + "stability": "stable", + "visibility": "public" + } + }, + "description": "Create a Google Vertex AI inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `googlevertexai` service.\n\nWhen you create an inference endpoint, the associated machine learning model is automatically deployed if it is not already running.\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the 
`\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", + "docId": "inference-api-put-googlevertexai", + "docUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-google-vertex-ai.html", + "name": "inference.put_googlevertexai", + "privileges": { + "cluster": [ + "manage_inference" + ] + }, + "request": { + "name": "Request", + "namespace": "inference.put_googlevertexai" + }, + "requestBodyRequired": false, + "requestMediaType": [ + "application/json" + ], + "response": { + "name": "Response", + "namespace": "inference.put_googlevertexai" + }, + "responseMediaType": [ + "application/json" + ], + "urls": [ + { + "methods": [ + "PUT" + ], + "path": "/_inference/{task_type}/{googlevertexai_inference_id}" + } + ] + }, + { + "availability": { + "serverless": { + "stability": "stable", + "visibility": "public" + }, + "stack": { + "since": "8.12.0", + "stability": "stable", + "visibility": "public" + } + }, +>>>>>>> 33420c6e0 (Add ELSER inference API details (#4026)) "description": "Create a Hugging Face inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `hugging_face` service.\n\nYou must first create an inference endpoint on the Hugging Face endpoint page to get an endpoint URL.\nSelect the model you want to use on the new endpoint creation page (for example `intfloat/e5-small-v2`), then select the sentence embeddings task under the advanced configuration section.\nCreate the endpoint and copy the URL after the endpoint initialization has been finished.\n\nThe following models are recommended for the Hugging Face service:\n\n* `all-MiniLM-L6-v2`\n* `all-MiniLM-L12-v2`\n* `all-mpnet-base-v2`\n* `e5-base-v2`\n* `e5-small-v2`\n* `multilingual-e5-base`\n* `multilingual-e5-small`\n\nWhen you create an inference endpoint, the associated machine learning model is automatically deployed if it is not already running.\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", "docId": "inference-api-put-huggingface", "docUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-hugging-face.html", @@ -27591,136 +27778,603 @@ { <<<<<<< HEAD ======= - "description": "The chunking configuration object.", - "extDocId": "inference-chunking", - "extDocUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/inference-apis.html#infer-chunking-config", - "name": "chunking_settings", - "required": false, - "type": { - "kind": "instance_of", - "type": { - "name": "InferenceChunkingSettings", - "namespace": "inference._types" - } - } - }, - { - "description": "The type of service supported for the specified task type. 
In this case, `cohere`.", - "name": "service", - "required": true, - "type": { - "kind": "instance_of", - "type": { - "name": "ServiceType", - "namespace": "inference.put_cohere" - } - } - }, - { - "description": "Settings used to install the inference model.\nThese settings are specific to the `cohere` service.", - "name": "service_settings", - "required": true, - "type": { - "kind": "instance_of", - "type": { - "name": "CohereServiceSettings", - "namespace": "inference.put_cohere" - } - } - }, - { - "description": "Settings to configure the inference task.\nThese settings are specific to the task type you specified.", - "name": "task_settings", - "required": false, - "type": { - "kind": "instance_of", - "type": { - "name": "CohereTaskSettings", - "namespace": "inference.put_cohere" - } - } - } - ] - }, - "description": "Create a Cohere inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `cohere` service.\n\nWhen you create an inference endpoint, the associated machine learning model is automatically deployed if it is not already running.\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", - "examples": { - "PutCohereRequestExample1": { - "description": "Run `PUT _inference/text_embedding/cohere-embeddings` to create an inference endpoint that performs a text embedding task.", - "summary": "A text embedding task", - "value": "{\n \"service\": \"cohere\",\n \"service_settings\": {\n \"api_key\": \"Cohere-Api-key\",\n \"model_id\": \"embed-english-light-v3.0\",\n \"embedding_type\": \"byte\"\n }\n}" - }, - "PutCohereRequestExample2": { - "description": "Run `PUT _inference/rerank/cohere-rerank` to create an inference endpoint that performs a rerank task.", - "summary": "A rerank task", - "value": "{\n \"service\": \"cohere\",\n \"service_settings\": {\n \"api_key\": \"Cohere-API-key\",\n \"model_id\": \"rerank-english-v3.0\"\n },\n \"task_settings\": {\n \"top_n\": 10,\n \"return_documents\": true\n }\n}" - } - }, - "inherits": { - "type": { - "name": "RequestBase", - "namespace": "_types" - } - }, - "kind": "request", - "name": { - "name": "Request", - "namespace": "inference.put_cohere" - }, - "path": [ - { - "description": "The type of the inference task that the model will perform.", - "name": "task_type", - "required": true, - "type": { - "kind": "instance_of", - "type": { - "name": "CohereTaskType", - "namespace": "inference.put_cohere" - } - } - }, - { - "description": "The unique identifier of the inference endpoint.", - "name": "cohere_inference_id", - "required": true, - "type": { - "kind": "instance_of", - "type": { - "name": "Id", - "namespace": "_types" - } - } - } - ], - "query": [], - "specLocation": "inference/put_cohere/PutCohereRequest.ts#L28-L82" - }, - { - "body": { - "kind": "value", - "value": { - "kind": "instance_of", - "type": { - "name": "InferenceEndpointInfo", - "namespace": "inference._types" - } - } - }, - "kind": "response", - "name": { - "name": "Response", - "namespace": "inference.put_cohere" - }, - "specLocation": "inference/put_cohere/PutCohereResponse.ts#L22-L24" - }, - { - "attachedBehaviors": [ - "CommonQueryParameters" - ], - "body": { - 
"kind": "properties", - "properties": [ - { +<<<<<<< HEAD +======= + "description": "The type of service supported for the specified task type. In this case, `elastic`.", + "name": "service", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "ServiceType", + "namespace": "inference.put_eis" + } + } + }, + { + "description": "Settings used to install the inference model. These settings are specific to the `elastic` service.", + "name": "service_settings", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "EisServiceSettings", + "namespace": "inference.put_eis" + } + } + } + ] + }, + "description": "Create an Elastic Inference Service (EIS) inference endpoint.\n\nCreate an inference endpoint to perform an inference task through the Elastic Inference Service (EIS).", + "inherits": { + "type": { + "name": "RequestBase", + "namespace": "_types" + } + }, + "kind": "request", + "name": { + "name": "Request", + "namespace": "inference.put_eis" + }, + "path": [ + { + "description": "The type of the inference task that the model will perform.\nNOTE: The `chat_completion` task type only supports streaming and only through the _stream API.", + "name": "task_type", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "EisTaskType", + "namespace": "inference.put_eis" + } + } + }, + { + "description": "The unique identifier of the inference endpoint.", + "name": "eis_inference_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "Id", + "namespace": "_types" + } + } + } + ], + "query": [], + "specLocation": "inference/put_eis/PutEisRequest.ts#L24-L62" + }, + { + "body": { + "kind": "value", + "value": { + "kind": "instance_of", + "type": { + "name": "InferenceEndpointInfo", + "namespace": "inference._types" + } + } + }, + "kind": "response", + "name": { + "name": "Response", + "namespace": "inference.put_eis" + }, + "specLocation": "inference/put_eis/PutEisResponse.ts#L22-L24" + }, + { + "attachedBehaviors": [ + "CommonQueryParameters" + ], + "body": { + "kind": "properties", + "properties": [ + { + "description": "The chunking configuration object.", + "extDocId": "inference-chunking", + "extDocUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/inference-apis.html#infer-chunking-config", + "name": "chunking_settings", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "InferenceChunkingSettings", + "namespace": "inference._types" + } + } + }, + { + "description": "The type of service supported for the specified task type. In this case, `elser`.", + "name": "service", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "ServiceType", + "namespace": "inference.put_elser" + } + } + }, + { + "description": "Settings used to install the inference model. These settings are specific to the `elser` service.", + "name": "service_settings", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "ElserServiceSettings", + "namespace": "inference.put_elser" + } + } + } + ] + }, + "deprecation": { + "description": "The elser service is deprecated and will be removed in a future release. 
Use the Elasticsearch inference integration instead, with model_id included in the service_settings.", + "version": "8.16.0" + }, + "description": "Create an ELSER inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `elser` service.\nYou can also deploy ELSER by using the Elasticsearch inference integration.\n\n> info\n> Your Elasticsearch deployment contains a preconfigured ELSER inference endpoint, you only need to create the enpoint using the API if you want to customize the settings.\n\nThe API request will automatically download and deploy the ELSER model if it isn't already downloaded.\n\n> info\n> You might see a 502 bad gateway error in the response when using the Kibana Console. This error usually just reflects a timeout, while the model downloads in the background. You can check the download progress in the Machine Learning UI. If using the Python client, you can set the timeout parameter to a higher value.\n\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", + "examples": { + "PutElserRequestExample1": { + "description": "Run `PUT _inference/sparse_embedding/my-elser-model` to create an inference endpoint that performs a `sparse_embedding` task. The request will automatically download the ELSER model if it isn't already downloaded and then deploy the model.", + "summary": "A sparse embedding task", + "value": "{\n \"service\": \"elser\",\n \"service_settings\": {\n \"num_allocations\": 1,\n \"num_threads\": 1\n }\n}" + }, + "PutElserRequestExample2": { + "description": "Run `PUT _inference/sparse_embedding/my-elser-model` to create an inference endpoint that performs a `sparse_embedding` task with adaptive allocations. 
When adaptive allocations are enabled, the number of allocations of the model is set automatically based on the current load.", + "summary": "Adaptive allocations", + "value": "{\n \"service\": \"elser\",\n \"service_settings\": {\n \"adaptive_allocations\": {\n \"enabled\": true,\n \"min_number_of_allocations\": 3,\n \"max_number_of_allocations\": 10\n },\n \"num_threads\": 1\n }\n}" + } + }, + "inherits": { + "type": { + "name": "RequestBase", + "namespace": "_types" + } + }, + "kind": "request", + "name": { + "name": "Request", + "namespace": "inference.put_elser" + }, + "path": [ + { + "description": "The type of the inference task that the model will perform.", + "name": "task_type", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "ElserTaskType", + "namespace": "inference.put_elser" + } + } + }, + { + "description": "The unique identifier of the inference endpoint.", + "name": "elser_inference_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "Id", + "namespace": "_types" + } + } + } + ], + "query": [], + "specLocation": "inference/put_elser/PutElserRequest.ts#L25-L82" + }, + { + "body": { + "kind": "value", + "value": { + "kind": "instance_of", + "type": { + "name": "InferenceEndpointInfo", + "namespace": "inference._types" + } + } + }, + "examples": { + "PutElserResponseExample1": { + "description": "A successful response when creating an ELSER inference endpoint.", + "value": "{\n \"inference_id\": \"my-elser-model\",\n \"task_type\": \"sparse_embedding\",\n \"service\": \"elser\",\n \"service_settings\": {\n \"num_allocations\": 1,\n \"num_threads\": 1\n },\n \"task_settings\": {}\n}" + } + }, + "kind": "response", + "name": { + "name": "Response", + "namespace": "inference.put_elser" + }, + "specLocation": "inference/put_elser/PutElserResponse.ts#L22-L24" + }, + { + "attachedBehaviors": [ + "CommonQueryParameters" + ], + "body": { + "kind": "properties", + "properties": [ + { + "description": "The chunking configuration object.", + "extDocId": "inference-chunking", + "extDocUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/inference-apis.html#infer-chunking-config", + "name": "chunking_settings", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "InferenceChunkingSettings", + "namespace": "inference._types" + } + } + }, + { + "description": "The type of service supported for the specified task type. In this case, `googleaistudio`.", + "name": "service", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "ServiceType", + "namespace": "inference.put_googleaistudio" + } + } + }, + { + "description": "Settings used to install the inference model. 
These settings are specific to the `googleaistudio` service.", + "name": "service_settings", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "GoogleAiStudioServiceSettings", + "namespace": "inference.put_googleaistudio" + } + } + } + ] + }, + "description": "Create an Google AI Studio inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `googleaistudio` service.\n\nWhen you create an inference endpoint, the associated machine learning model is automatically deployed if it is not already running.\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", + "examples": { + "PutGoogleAiStudioRequestExample1": { + "description": "Run `PUT _inference/completion/google_ai_studio_completion` to create an inference endpoint to perform a `completion` task type.", + "summary": "A completion task", + "value": "{\n \"service\": \"googleaistudio\",\n \"service_settings\": {\n \"api_key\": \"api-key\",\n \"model_id\": \"model-id\"\n }\n}" + } + }, + "inherits": { + "type": { + "name": "RequestBase", + "namespace": "_types" + } + }, + "kind": "request", + "name": { + "name": "Request", + "namespace": "inference.put_googleaistudio" + }, + "path": [ + { + "description": "The type of the inference task that the model will perform.", + "name": "task_type", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "GoogleAiStudioTaskType", + "namespace": "inference.put_googleaistudio" + } + } + }, + { + "description": "The unique identifier of the inference endpoint.", + "name": "googleaistudio_inference_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "Id", + "namespace": "_types" + } + } + } + ], + "query": [], + "specLocation": "inference/put_googleaistudio/PutGoogleAiStudioRequest.ts#L27-L75" + }, + { + "body": { + "kind": "value", + "value": { + "kind": "instance_of", + "type": { + "name": "InferenceEndpointInfo", + "namespace": "inference._types" + } + } + }, + "kind": "response", + "name": { + "name": "Response", + "namespace": "inference.put_googleaistudio" + }, + "specLocation": "inference/put_googleaistudio/PutGoogleAiStudioResponse.ts#L22-L24" + }, + { + "attachedBehaviors": [ + "CommonQueryParameters" + ], + "body": { + "kind": "properties", + "properties": [ + { + "description": "The chunking configuration object.", + "extDocId": "inference-chunking", + "extDocUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/inference-apis.html#infer-chunking-config", + "name": "chunking_settings", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "InferenceChunkingSettings", + "namespace": "inference._types" + } + } + }, + { + "description": "The type of service supported for the specified task type. In this case, `googlevertexai`.", + "name": "service", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "ServiceType", + "namespace": "inference.put_googlevertexai" + } + } + }, + { + "description": "Settings used to install the inference model. 
These settings are specific to the `googlevertexai` service.", + "name": "service_settings", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "GoogleVertexAIServiceSettings", + "namespace": "inference.put_googlevertexai" + } + } + }, + { + "description": "Settings to configure the inference task.\nThese settings are specific to the task type you specified.", + "name": "task_settings", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "GoogleVertexAITaskSettings", + "namespace": "inference.put_googlevertexai" + } + } + } + ] + }, + "description": "Create a Google Vertex AI inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `googlevertexai` service.\n\nWhen you create an inference endpoint, the associated machine learning model is automatically deployed if it is not already running.\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", + "examples": { + "PutGoogleVertexAiRequestExample1": { + "description": "Run `PUT _inference/text_embedding/google_vertex_ai_embeddings` to create an inference endpoint to perform a `text_embedding` task type.", + "summary": "A text embedding task", + "value": "{\n \"service\": \"googlevertexai\",\n \"service_settings\": {\n \"service_account_json\": \"service-account-json\",\n \"model_id\": \"model-id\",\n \"location\": \"location\",\n \"project_id\": \"project-id\"\n }\n}" + }, + "PutGoogleVertexAiRequestExample2": { + "description": "Run `PUT _inference/rerank/google_vertex_ai_rerank` to create an inference endpoint to perform a `rerank` task type.", + "summary": "A rerank task", + "value": "{\n \"service\": \"googlevertexai\",\n \"service_settings\": {\n \"service_account_json\": \"service-account-json\",\n \"project_id\": \"project-id\"\n }\n}" + } + }, + "inherits": { + "type": { + "name": "RequestBase", + "namespace": "_types" + } + }, + "kind": "request", + "name": { + "name": "Request", + "namespace": "inference.put_googlevertexai" + }, + "path": [ + { + "description": "The type of the inference task that the model will perform.", + "name": "task_type", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "GoogleVertexAITaskType", + "namespace": "inference.put_googlevertexai" + } + } + }, + { + "description": "The unique identifier of the inference endpoint.", + "name": "googlevertexai_inference_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "Id", + "namespace": "_types" + } + } + } + ], + "query": [], + "specLocation": "inference/put_googlevertexai/PutGoogleVertexAiRequest.ts#L28-L81" + }, + { + "body": { + "kind": "value", + "value": { + "kind": "instance_of", + "type": { + "name": "InferenceEndpointInfo", + "namespace": "inference._types" + } + } + }, + "kind": "response", + "name": { + "name": "Response", + "namespace": "inference.put_googlevertexai" + }, + "specLocation": "inference/put_googlevertexai/PutGoogleVertexAiResponse.ts#L22-L24" + }, + { + "attachedBehaviors": [ + "CommonQueryParameters" + ], + "body": { + "kind": "properties", + "properties": [ + { +>>>>>>> 33420c6e0 (Add ELSER inference API details 
(#4026)) +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) + "description": "The chunking configuration object.", + "extDocId": "inference-chunking", + "extDocUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/inference-apis.html#infer-chunking-config", + "name": "chunking_settings", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "InferenceChunkingSettings", + "namespace": "inference._types" + } + } + }, + { + "description": "The type of service supported for the specified task type. In this case, `cohere`.", + "name": "service", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "ServiceType", + "namespace": "inference.put_cohere" + } + } + }, + { + "description": "Settings used to install the inference model.\nThese settings are specific to the `cohere` service.", + "name": "service_settings", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "CohereServiceSettings", + "namespace": "inference.put_cohere" + } + } + }, + { + "description": "Settings to configure the inference task.\nThese settings are specific to the task type you specified.", + "name": "task_settings", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "CohereTaskSettings", + "namespace": "inference.put_cohere" + } + } + } + ] + }, + "description": "Create a Cohere inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `cohere` service.\n\nWhen you create an inference endpoint, the associated machine learning model is automatically deployed if it is not already running.\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", + "examples": { + "PutCohereRequestExample1": { + "description": "Run `PUT _inference/text_embedding/cohere-embeddings` to create an inference endpoint that performs a text embedding task.", + "summary": "A text embedding task", + "value": "{\n \"service\": \"cohere\",\n \"service_settings\": {\n \"api_key\": \"Cohere-Api-key\",\n \"model_id\": \"embed-english-light-v3.0\",\n \"embedding_type\": \"byte\"\n }\n}" + }, + "PutCohereRequestExample2": { + "description": "Run `PUT _inference/rerank/cohere-rerank` to create an inference endpoint that performs a rerank task.", + "summary": "A rerank task", + "value": "{\n \"service\": \"cohere\",\n \"service_settings\": {\n \"api_key\": \"Cohere-API-key\",\n \"model_id\": \"rerank-english-v3.0\"\n },\n \"task_settings\": {\n \"top_n\": 10,\n \"return_documents\": true\n }\n}" + } + }, + "inherits": { + "type": { + "name": "RequestBase", + "namespace": "_types" + } + }, + "kind": "request", + "name": { + "name": "Request", + "namespace": "inference.put_cohere" + }, + "path": [ + { + "description": "The type of the inference task that the model will perform.", + "name": "task_type", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "CohereTaskType", + "namespace": "inference.put_cohere" + } + } + }, + { + "description": "The unique identifier of the inference endpoint.", + "name": "cohere_inference_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": 
"Id", + "namespace": "_types" + } + } + } + ], + "query": [], + "specLocation": "inference/put_cohere/PutCohereRequest.ts#L28-L82" + }, + { + "body": { + "kind": "value", + "value": { + "kind": "instance_of", + "type": { + "name": "InferenceEndpointInfo", + "namespace": "inference._types" + } + } + }, + "kind": "response", + "name": { + "name": "Response", + "namespace": "inference.put_cohere" + }, + "specLocation": "inference/put_cohere/PutCohereResponse.ts#L22-L24" + }, + { + "attachedBehaviors": [ + "CommonQueryParameters" + ], + "body": { + "kind": "properties", + "properties": [ + { "description": "The type of service supported for the specified task type. In this case, `elastic`.", "name": "service", "required": true, @@ -103240,6 +103894,7 @@ { <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD "name": "completion" }, @@ -103426,6 +104081,9 @@ "kind": "enum", "members": [ { +======= +======= +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) "name": "chat_completion" } ], @@ -103452,6 +104110,7 @@ "kind": "enum", "members": [ { +<<<<<<< HEAD <<<<<<< HEAD "name": "rerank" }, @@ -103520,6 +104179,8 @@ { ======= >>>>>>> 397d37cf8 (Add Cohere inference API details (#4025)) +======= +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) "name": "sparse_embedding" } ], @@ -103543,11 +104204,14 @@ "specLocation": "inference/put_elser/PutElserRequest.ts#L88-L90" <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD >>>>>>> 72877ef81 (Add Amazon Bedrock inference API (#4022)) ======= >>>>>>> 76ab18016 (Add Anthropic inference API details (#4023)) ======= >>>>>>> 397d37cf8 (Add Cohere inference API details (#4025)) +======= +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) }, { "kind": "enum", @@ -103562,6 +104226,7 @@ "name": { <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD "name": "AzureAiStudioTaskType", "namespace": "inference.put_azureaistudio" @@ -103572,17 +104237,22 @@ >>>>>>> 76ab18016 (Add Anthropic inference API details (#4023)) ======= >>>>>>> 397d37cf8 (Add Cohere inference API details (#4025)) +======= +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) "name": "GoogleAiStudioTaskType", "namespace": "inference.put_googleaistudio" }, "specLocation": "inference/put_googleaistudio/PutGoogleAiStudioRequest.ts#L77-L80" <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD >>>>>>> 72877ef81 (Add Amazon Bedrock inference API (#4022)) ======= >>>>>>> 76ab18016 (Add Anthropic inference API details (#4023)) ======= >>>>>>> 397d37cf8 (Add Cohere inference API details (#4025)) +======= +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) }, { "kind": "enum", @@ -103590,6 +104260,7 @@ { <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD "name": "azureaistudio" ======= @@ -103601,12 +104272,16 @@ ======= "name": "googleaistudio" >>>>>>> 397d37cf8 (Add Cohere inference API details (#4025)) +======= + "name": "googleaistudio" +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) } ], "name": { "name": "ServiceType", <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD "namespace": "inference.put_azureaistudio" }, @@ -103626,6 +104301,11 @@ }, "specLocation": "inference/put_googleaistudio/PutGoogleAiStudioRequest.ts#L82-L84" >>>>>>> 397d37cf8 (Add Cohere inference API details (#4025)) +======= + "namespace": "inference.put_googleaistudio" + }, + "specLocation": "inference/put_googleaistudio/PutGoogleAiStudioRequest.ts#L82-L84" +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) }, { "kind": "enum", @@ -103633,6 +104313,7 @@ { <<<<<<< HEAD <<<<<<< HEAD 
+<<<<<<< HEAD <<<<<<< HEAD "name": "completion" ======= @@ -103644,6 +104325,9 @@ ======= "name": "rerank" >>>>>>> 397d37cf8 (Add Cohere inference API details (#4025)) +======= + "name": "rerank" +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) }, { "name": "text_embedding" @@ -103652,6 +104336,7 @@ "name": { <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD "name": "AzureOpenAITaskType", "namespace": "inference.put_azureopenai" @@ -103662,17 +104347,22 @@ >>>>>>> 76ab18016 (Add Anthropic inference API details (#4023)) ======= >>>>>>> 397d37cf8 (Add Cohere inference API details (#4025)) +======= +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) "name": "GoogleVertexAITaskType", "namespace": "inference.put_googlevertexai" }, "specLocation": "inference/put_googlevertexai/PutGoogleVertexAiRequest.ts#L83-L86" <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD >>>>>>> 72877ef81 (Add Amazon Bedrock inference API (#4022)) ======= >>>>>>> 76ab18016 (Add Anthropic inference API details (#4023)) ======= >>>>>>> 397d37cf8 (Add Cohere inference API details (#4025)) +======= +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) }, { "kind": "enum", @@ -103680,6 +104370,7 @@ { <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD "name": "azureopenai" ======= @@ -103691,12 +104382,16 @@ ======= "name": "googlevertexai" >>>>>>> 397d37cf8 (Add Cohere inference API details (#4025)) +======= + "name": "googlevertexai" +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) } ], "name": { "name": "ServiceType", <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD "namespace": "inference.put_azureopenai" }, @@ -103716,6 +104411,11 @@ }, "specLocation": "inference/put_googlevertexai/PutGoogleVertexAiRequest.ts#L88-L90" >>>>>>> 397d37cf8 (Add Cohere inference API details (#4025)) +======= + "namespace": "inference.put_googlevertexai" + }, + "specLocation": "inference/put_googlevertexai/PutGoogleVertexAiRequest.ts#L88-L90" +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) }, { "kind": "enum", @@ -103724,6 +104424,7 @@ <<<<<<< HEAD <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD ======= >>>>>>> f5eaaab24 (Add Amazon Bedrock inference API (#4022)) >>>>>>> 72877ef81 (Add Amazon Bedrock inference API (#4022)) @@ -103733,6 +104434,9 @@ ======= >>>>>>> 2dc985a1e (Add Cohere inference API details (#4025)) >>>>>>> 397d37cf8 (Add Cohere inference API details (#4025)) +======= +>>>>>>> 33420c6e0 (Add ELSER inference API details (#4026)) +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) "name": "text_embedding" } ], @@ -124273,6 +124977,7 @@ "name": { <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD "name": "AlibabaCloudServiceSettings", "namespace": "inference.put_alibabacloud" @@ -125003,6 +125708,9 @@ { "kind": "interface", "name": { +======= +======= +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) "name": "EisServiceSettings", "namespace": "inference.put_eis" }, @@ -125037,6 +125745,7 @@ { "kind": "interface", "name": { +<<<<<<< HEAD <<<<<<< HEAD "name": "ElasticsearchServiceSettings", "namespace": "inference.put_elasticsearch" @@ -125142,6 +125851,15 @@ { "description": "The minimum number of allocations to scale to.\nIf set, it must be greater than or equal to 0.\nIf not defined, the deployment scales to 0.", "name": "min_number_of_allocations", +======= + "name": "RateLimitSetting", + "namespace": "inference._types" + }, + "properties": [ + { + "description": "The number of requests allowed per minute.", + "name": "requests_per_minute", +>>>>>>> b82415b5e (Add 
ELSER inference API details (#4026)) "required": false, "type": { "kind": "instance_of", @@ -125152,11 +125870,16 @@ } } ], +<<<<<<< HEAD "specLocation": "inference/put_elasticsearch/PutElasticsearchRequest.ts#L98-L115" +======= + "specLocation": "inference/_types/Services.ts#L95-L100" +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) }, { "kind": "interface", "name": { +<<<<<<< HEAD "name": "ElasticsearchTaskSettings", "namespace": "inference.put_elasticsearch" }, @@ -125182,6 +125905,8 @@ "name": { ======= >>>>>>> 397d37cf8 (Add Cohere inference API details (#4025)) +======= +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) "name": "ElserServiceSettings", "namespace": "inference.put_elser" }, @@ -125384,11 +126109,14 @@ "name": "service_account_json", <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD >>>>>>> 72877ef81 (Add Amazon Bedrock inference API (#4022)) ======= >>>>>>> 76ab18016 (Add Anthropic inference API details (#4023)) ======= >>>>>>> 397d37cf8 (Add Cohere inference API details (#4025)) +======= +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) "required": true, "type": { "kind": "instance_of", @@ -125401,6 +126129,7 @@ ], <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD "specLocation": "inference/put_azureopenai/PutAzureOpenAiRequest.ts#L99-L144" ======= @@ -125412,12 +126141,16 @@ ======= "specLocation": "inference/put_googlevertexai/PutGoogleVertexAiRequest.ts#L92-L118" >>>>>>> 397d37cf8 (Add Cohere inference API details (#4025)) +======= + "specLocation": "inference/put_googlevertexai/PutGoogleVertexAiRequest.ts#L92-L118" +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) }, { "kind": "interface", "name": { <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD "name": "AzureOpenAITaskSettings", "namespace": "inference.put_azureopenai" @@ -125431,6 +126164,8 @@ >>>>>>> 76ab18016 (Add Anthropic inference API details (#4023)) ======= >>>>>>> 397d37cf8 (Add Cohere inference API details (#4025)) +======= +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) "name": "GoogleVertexAITaskSettings", "namespace": "inference.put_googlevertexai" }, @@ -125440,17 +126175,21 @@ "name": "auto_truncate", <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD >>>>>>> 72877ef81 (Add Amazon Bedrock inference API (#4022)) ======= >>>>>>> 76ab18016 (Add Anthropic inference API details (#4023)) ======= >>>>>>> 397d37cf8 (Add Cohere inference API details (#4025)) +======= +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) "required": false, "type": { "kind": "instance_of", "type": { <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD <<<<<<< HEAD "name": "string", "namespace": "_builtins" @@ -125464,6 +126203,8 @@ >>>>>>> 76ab18016 (Add Anthropic inference API details (#4023)) ======= >>>>>>> 397d37cf8 (Add Cohere inference API details (#4025)) +======= +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) "name": "boolean", "namespace": "_builtins" } @@ -125485,11 +126226,14 @@ "specLocation": "inference/put_googlevertexai/PutGoogleVertexAiRequest.ts#L120-L129" <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD >>>>>>> 72877ef81 (Add Amazon Bedrock inference API (#4022)) ======= >>>>>>> 76ab18016 (Add Anthropic inference API details (#4023)) ======= >>>>>>> 397d37cf8 (Add Cohere inference API details (#4025)) +======= +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) }, { "kind": "interface", @@ -125497,6 +126241,7 @@ <<<<<<< HEAD <<<<<<< HEAD <<<<<<< HEAD +<<<<<<< HEAD ======= >>>>>>> f5eaaab24 (Add Amazon Bedrock inference API (#4022)) >>>>>>> 72877ef81 (Add 
Amazon Bedrock inference API (#4022)) @@ -125506,6 +126251,9 @@ ======= >>>>>>> 2dc985a1e (Add Cohere inference API details (#4025)) >>>>>>> 397d37cf8 (Add Cohere inference API details (#4025)) +======= +>>>>>>> 33420c6e0 (Add ELSER inference API details (#4026)) +>>>>>>> b82415b5e (Add ELSER inference API details (#4026)) "name": "HuggingFaceServiceSettings", "namespace": "inference.put_hugging_face" }, diff --git a/output/schema/schema.json b/output/schema/schema.json index 6dcb35c31c..0852f020ee 100644 --- a/output/schema/schema.json +++ b/output/schema/schema.json @@ -9618,6 +9618,55 @@ } ] }, + { + "availability": { + "serverless": { + "stability": "stable", + "visibility": "public" + }, + "stack": { + "since": "8.11.0", + "stability": "stable", + "visibility": "public" + } + }, + "deprecation": { + "description": "The elser service is deprecated and will be removed in a future release. Use the Elasticsearch inference integration instead, with model_id included in the service_settings.", + "version": "8.16.0" + }, + "description": "Create an ELSER inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `elser` service.\nYou can also deploy ELSER by using the Elasticsearch inference integration.\n\n> info\n> Your Elasticsearch deployment contains a preconfigured ELSER inference endpoint, you only need to create the enpoint using the API if you want to customize the settings.\n\nThe API request will automatically download and deploy the ELSER model if it isn't already downloaded.\n\n> info\n> You might see a 502 bad gateway error in the response when using the Kibana Console. This error usually just reflects a timeout, while the model downloads in the background. You can check the download progress in the Machine Learning UI. 
If using the Python client, you can set the timeout parameter to a higher value.\n\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", + "docId": "inference-api-put-elser", + "docUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-elser.html", + "name": "inference.put_elser", + "privileges": { + "cluster": [ + "manage_inference" + ] + }, + "request": { + "name": "Request", + "namespace": "inference.put_elser" + }, + "requestBodyRequired": false, + "requestMediaType": [ + "application/json" + ], + "response": { + "name": "Response", + "namespace": "inference.put_elser" + }, + "responseMediaType": [ + "application/json" + ], + "urls": [ + { + "methods": [ + "PUT" + ], + "path": "/_inference/{task_type}/{elser_inference_id}" + } + ] + }, { "availability": { "serverless": { @@ -152882,6 +152931,253 @@ }, "specLocation": "inference/put_elasticsearch/PutElasticsearchRequest.ts#L94-L96" }, + { + "kind": "interface", + "name": { + "name": "AdaptiveAllocations", + "namespace": "inference.put_elser" + }, + "properties": [ + { + "description": "Turn on `adaptive_allocations`.", + "name": "enabled", + "required": false, + "serverDefault": false, + "type": { + "kind": "instance_of", + "type": { + "name": "boolean", + "namespace": "_builtins" + } + } + }, + { + "description": "The maximum number of allocations to scale to.\nIf set, it must be greater than or equal to `min_number_of_allocations`.", + "name": "max_number_of_allocations", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + }, + { + "description": "The minimum number of allocations to scale to.\nIf set, it must be greater than or equal to 0.\nIf not defined, the deployment scales to 0.", + "name": "min_number_of_allocations", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + } + ], + "specLocation": "inference/put_elser/PutElserRequest.ts#L92-L109" + }, + { + "kind": "interface", + "name": { + "name": "ElserServiceSettings", + "namespace": "inference.put_elser" + }, + "properties": [ + { + "description": "Adaptive allocations configuration details.\nIf `enabled` is true, the number of allocations of the model is set based on the current load the process gets.\nWhen the load is high, a new model allocation is automatically created, respecting the value of `max_number_of_allocations` if it's set.\nWhen the load is low, a model allocation is automatically removed, respecting the value of `min_number_of_allocations` if it's set.\nIf `enabled` is true, do not set the number of allocations manually.", + "name": "adaptive_allocations", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "AdaptiveAllocations", + "namespace": "inference.put_elser" + } + } + }, + { + "description": "The total number of allocations this model is assigned across machine learning nodes.\nIncreasing this value generally increases the throughput.\nIf adaptive allocations is enabled, do not set this value because it's automatically set.", + "name": "num_allocations", + "required": true, + 
"type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + }, + { + "description": "The number of threads used by each model allocation during inference.\nIncreasing this value generally increases the speed per inference request.\nThe inference process is a compute-bound process; `threads_per_allocations` must not exceed the number of available allocated processors per node.\nThe value must be a power of 2.\nThe maximum value is 32.\n\n> info\n> If you want to optimize your ELSER endpoint for ingest, set the number of threads to 1. If you want to optimize your ELSER endpoint for search, set the number of threads to greater than 1.", + "name": "num_threads", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + } + ], + "specLocation": "inference/put_elser/PutElserRequest.ts#L111-L137" + }, + { + "kind": "enum", + "members": [ + { + "name": "sparse_embedding" + } + ], + "name": { + "name": "ElserTaskType", + "namespace": "inference.put_elser" + }, + "specLocation": "inference/put_elser/PutElserRequest.ts#L84-L86" + }, + { + "kind": "request", + "attachedBehaviors": [ + "CommonQueryParameters" + ], + "body": { + "kind": "properties", + "properties": [ + { + "description": "The chunking configuration object.", + "extDocId": "inference-chunking", + "extDocUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/inference-apis.html#infer-chunking-config", + "name": "chunking_settings", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "InferenceChunkingSettings", + "namespace": "inference._types" + } + } + }, + { + "description": "The type of service supported for the specified task type. In this case, `elser`.", + "name": "service", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "ServiceType", + "namespace": "inference.put_elser" + } + } + }, + { + "description": "Settings used to install the inference model. These settings are specific to the `elser` service.", + "name": "service_settings", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "ElserServiceSettings", + "namespace": "inference.put_elser" + } + } + } + ] + }, + "deprecation": { + "description": "The elser service is deprecated and will be removed in a future release. Use the Elasticsearch inference integration instead, with model_id included in the service_settings.", + "version": "8.16.0" + }, + "description": "Create an ELSER inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `elser` service.\nYou can also deploy ELSER by using the Elasticsearch inference integration.\n\n> info\n> Your Elasticsearch deployment contains a preconfigured ELSER inference endpoint, you only need to create the enpoint using the API if you want to customize the settings.\n\nThe API request will automatically download and deploy the ELSER model if it isn't already downloaded.\n\n> info\n> You might see a 502 bad gateway error in the response when using the Kibana Console. This error usually just reflects a timeout, while the model downloads in the background. You can check the download progress in the Machine Learning UI. 
If using the Python client, you can set the timeout parameter to a higher value.\n\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", + "examples": { + "PutElserRequestExample1": { + "description": "Run `PUT _inference/sparse_embedding/my-elser-model` to create an inference endpoint that performs a `sparse_embedding` task. The request will automatically download the ELSER model if it isn't already downloaded and then deploy the model.", + "summary": "A sparse embedding task", + "value": "{\n \"service\": \"elser\",\n \"service_settings\": {\n \"num_allocations\": 1,\n \"num_threads\": 1\n }\n}" + }, + "PutElserRequestExample2": { + "description": "Run `PUT _inference/sparse_embedding/my-elser-model` to create an inference endpoint that performs a `sparse_embedding` task with adaptive allocations. When adaptive allocations are enabled, the number of allocations of the model is set automatically based on the current load.", + "summary": "Adaptive allocations", + "value": "{\n \"service\": \"elser\",\n \"service_settings\": {\n \"adaptive_allocations\": {\n \"enabled\": true,\n \"min_number_of_allocations\": 3,\n \"max_number_of_allocations\": 10\n },\n \"num_threads\": 1\n }\n}" + } + }, + "inherits": { + "type": { + "name": "RequestBase", + "namespace": "_types" + } + }, + "name": { + "name": "Request", + "namespace": "inference.put_elser" + }, + "path": [ + { + "description": "The type of the inference task that the model will perform.", + "name": "task_type", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "ElserTaskType", + "namespace": "inference.put_elser" + } + } + }, + { + "description": "The unique identifier of the inference endpoint.", + "name": "elser_inference_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "Id", + "namespace": "_types" + } + } + } + ], + "query": [], + "specLocation": "inference/put_elser/PutElserRequest.ts#L25-L82" + }, + { + "kind": "response", + "body": { + "kind": "value", + "value": { + "kind": "instance_of", + "type": { + "name": "InferenceEndpointInfo", + "namespace": "inference._types" + } + } + }, + "examples": { + "PutElserResponseExample1": { + "description": "A successful response when creating an ELSER inference endpoint.", + "value": "{\n \"inference_id\": \"my-elser-model\",\n \"task_type\": \"sparse_embedding\",\n \"service\": \"elser\",\n \"service_settings\": {\n \"num_allocations\": 1,\n \"num_threads\": 1\n },\n \"task_settings\": {}\n}" + } + }, + "name": { + "name": "Response", + "namespace": "inference.put_elser" + }, + "specLocation": "inference/put_elser/PutElserResponse.ts#L22-L24" + }, + { + "kind": "enum", + "members": [ + { + "name": "elser" + } + ], + "name": { + "name": "ServiceType", + "namespace": "inference.put_elser" + }, + "specLocation": "inference/put_elser/PutElserRequest.ts#L88-L90" + }, { "kind": "interface", "name": { diff --git a/output/typescript/types.ts b/output/typescript/types.ts index c90515b544..2987e79790 100644 --- a/output/typescript/types.ts +++ b/output/typescript/types.ts @@ -13478,6 +13478,34 @@ export type InferencePutElasticsearchResponse = 
InferenceInferenceEndpointInfo export type InferencePutElasticsearchServiceType = 'elasticsearch' +export interface InferencePutElserAdaptiveAllocations { + enabled?: boolean + max_number_of_allocations?: integer + min_number_of_allocations?: integer +} + +export interface InferencePutElserElserServiceSettings { + adaptive_allocations?: InferencePutElserAdaptiveAllocations + num_allocations: integer + num_threads: integer +} + +export type InferencePutElserElserTaskType = 'sparse_embedding' + +export interface InferencePutElserRequest extends RequestBase { + task_type: InferencePutElserElserTaskType + elser_inference_id: Id + body?: { + chunking_settings?: InferenceInferenceChunkingSettings + service: InferencePutElserServiceType + service_settings: InferencePutElserElserServiceSettings + } +} + +export type InferencePutElserResponse = InferenceInferenceEndpointInfo + +export type InferencePutElserServiceType = 'elser' + export interface InferencePutHuggingFaceHuggingFaceServiceSettings { api_key: string rate_limit?: InferenceRateLimitSetting diff --git a/specification/_json_spec/inference.put_elser.json b/specification/_json_spec/inference.put_elser.json new file mode 100644 index 0000000000..b943b31a7d --- /dev/null +++ b/specification/_json_spec/inference.put_elser.json @@ -0,0 +1,39 @@ +{ + "inference.put_elser": { + "deprecated": { + "version": "8.16.0", + "description": "The elser service is deprecated. Use the Elasticsearch inference integration instead, with model_id included in the service_settings." + }, + "documentation": { + "url": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-elser.html", + "description": "Configure an ELSER inference endpoint" + }, + "stability": "stable", + "visibility": "public", + "headers": { + "accept": ["application/json"], + "content_type": ["application/json"] + }, + "url": { + "paths": [ + { + "path": "/_inference/{task_type}/{elser_inference_id}", + "methods": ["PUT"], + "parts": { + "task_type": { + "type": "string", + "description": "The task type" + }, + "elser_inference_id": { + "type": "string", + "description": "The inference Id" + } + } + } + ] + }, + "body": { + "description": "The inference endpoint's task and service settings" + } + } +} diff --git a/specification/inference/put_elser/PutElserRequest.ts b/specification/inference/put_elser/PutElserRequest.ts new file mode 100644 index 0000000000..3a21e01df3 --- /dev/null +++ b/specification/inference/put_elser/PutElserRequest.ts @@ -0,0 +1,137 @@ +/* + * Licensed to Elasticsearch B.V. under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch B.V. licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+import { InferenceChunkingSettings } from '@inference/_types/Services'
+import { RequestBase } from '@_types/Base'
+import { Id } from '@_types/common'
+import { integer } from '@_types/Numeric'
+
+/**
+ * Create an ELSER inference endpoint.
+ *
+ * Create an inference endpoint to perform an inference task with the `elser` service.
+ * You can also deploy ELSER by using the Elasticsearch inference integration.
+ *
+ * > info
+ * > Your Elasticsearch deployment contains a preconfigured ELSER inference endpoint; you only need to create the endpoint using the API if you want to customize the settings.
+ *
+ * The API request will automatically download and deploy the ELSER model if it isn't already downloaded.
+ *
+ * > info
+ * > You might see a 502 bad gateway error in the response when using the Kibana Console. This error usually just reflects a timeout, while the model downloads in the background. You can check the download progress in the Machine Learning UI. If using the Python client, you can set the timeout parameter to a higher value.
+ *
+ * After creating the endpoint, wait for the model deployment to complete before using it.
+ * To verify the deployment status, use the get trained model statistics API.
+ * Look for `"state": "fully_allocated"` in the response and ensure that the `"allocation_count"` matches the `"target_allocation_count"`.
+ * Avoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.
+ * @rest_spec_name inference.put_elser
+ * @availability stack since=8.11.0 stability=stable visibility=public
+ * @availability serverless stability=stable visibility=public
+ * @deprecated 8.16.0 The elser service is deprecated and will be removed in a future release. Use the Elasticsearch inference integration instead, with model_id included in the service_settings.
+ * @cluster_privileges manage_inference
+ * @doc_id inference-api-put-elser
+ */
+export interface Request extends RequestBase {
+  urls: [
+    {
+      path: '/_inference/{task_type}/{elser_inference_id}'
+      methods: ['PUT']
+    }
+  ]
+  path_parts: {
+    /**
+     * The type of the inference task that the model will perform.
+     */
+    task_type: ElserTaskType
+    /**
+     * The unique identifier of the inference endpoint.
+     */
+    elser_inference_id: Id
+  }
+  body: {
+    /**
+     * The chunking configuration object.
+     * @ext_doc_id inference-chunking
+     */
+    chunking_settings?: InferenceChunkingSettings
+    /**
+     * The type of service supported for the specified task type. In this case, `elser`.
+     */
+    service: ServiceType
+    /**
+     * Settings used to install the inference model. These settings are specific to the `elser` service.
+     */
+    service_settings: ElserServiceSettings
+  }
+}
+
+export enum ElserTaskType {
+  sparse_embedding
+}
+
+export enum ServiceType {
+  elser
+}
+
+export class AdaptiveAllocations {
+  /**
+   * Turn on `adaptive_allocations`.
+   * @server_default false
+   */
+  enabled?: boolean
+  /**
+   * The maximum number of allocations to scale to.
+   * If set, it must be greater than or equal to `min_number_of_allocations`.
+   */
+  max_number_of_allocations?: integer
+  /**
+   * The minimum number of allocations to scale to.
+   * If set, it must be greater than or equal to 0.
+   * If not defined, the deployment scales to 0.
+   */
+  min_number_of_allocations?: integer
+}
+
+export class ElserServiceSettings {
+  /**
+   * Adaptive allocations configuration details.
+   * If `enabled` is true, the number of allocations of the model is set based on the current load the process gets.
+   * When the load is high, a new model allocation is automatically created, respecting the value of `max_number_of_allocations` if it's set.
+   * When the load is low, a model allocation is automatically removed, respecting the value of `min_number_of_allocations` if it's set.
+   * If `enabled` is true, do not set the number of allocations manually.
+   */
+  adaptive_allocations?: AdaptiveAllocations
+  /**
+   * The total number of allocations this model is assigned across machine learning nodes.
+   * Increasing this value generally increases the throughput.
+   * If adaptive allocations is enabled, do not set this value because it's automatically set.
+   */
+  num_allocations: integer
+  /**
+   * The number of threads used by each model allocation during inference.
+   * Increasing this value generally increases the speed per inference request.
+   * The inference process is compute bound; `num_threads` must not exceed the number of available allocated processors per node.
+   * The value must be a power of 2.
+   * The maximum value is 32.
+   *
+   * > info
+   * > If you want to optimize your ELSER endpoint for ingest, set the number of threads to 1. If you want to optimize your ELSER endpoint for search, set the number of threads to greater than 1.
+   */
+  num_threads: integer
+}
diff --git a/specification/inference/put_elser/PutElserResponse.ts b/specification/inference/put_elser/PutElserResponse.ts
new file mode 100644
index 0000000000..d40639b031
--- /dev/null
+++ b/specification/inference/put_elser/PutElserResponse.ts
@@ -0,0 +1,24 @@
+/*
+ * Licensed to Elasticsearch B.V. under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch B.V. licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import { InferenceEndpointInfo } from '@inference/_types/Services'
+
+export class Response {
+  body: InferenceEndpointInfo
+}
diff --git a/specification/inference/put_elser/examples/request/PutElserRequestExample1.yaml b/specification/inference/put_elser/examples/request/PutElserRequestExample1.yaml
new file mode 100644
index 0000000000..85fd58f986
--- /dev/null
+++ b/specification/inference/put_elser/examples/request/PutElserRequestExample1.yaml
@@ -0,0 +1,12 @@
+summary: A sparse embedding task
+description: Run `PUT _inference/sparse_embedding/my-elser-model` to create an inference endpoint that performs a `sparse_embedding` task. The request will automatically download the ELSER model if it isn't already downloaded and then deploy the model.
+# method_request: "PUT _inference/sparse_embedding/my-elser-model" +# type: "request" +value: |- + { + "service": "elser", + "service_settings": { + "num_allocations": 1, + "num_threads": 1 + } + } diff --git a/specification/inference/put_elser/examples/request/PutElserRequestExample2.yaml b/specification/inference/put_elser/examples/request/PutElserRequestExample2.yaml new file mode 100644 index 0000000000..831115834c --- /dev/null +++ b/specification/inference/put_elser/examples/request/PutElserRequestExample2.yaml @@ -0,0 +1,16 @@ +summary: Adaptive allocations +description: Run `PUT _inference/sparse_embedding/my-elser-model` to create an inference endpoint that performs a `sparse_embedding` task with adaptive allocations. When adaptive allocations are enabled, the number of allocations of the model is set automatically based on the current load. +# method_request: "PUT _inference/sparse_embedding/my-elser-model" +# type: "request" +value: |- + { + "service": "elser", + "service_settings": { + "adaptive_allocations": { + "enabled": true, + "min_number_of_allocations": 3, + "max_number_of_allocations": 10 + }, + "num_threads": 1 + } + } diff --git a/specification/inference/put_elser/examples/response/PutElserResponseExample1.yaml b/specification/inference/put_elser/examples/response/PutElserResponseExample1.yaml new file mode 100644 index 0000000000..9d0746cbd1 --- /dev/null +++ b/specification/inference/put_elser/examples/response/PutElserResponseExample1.yaml @@ -0,0 +1,15 @@ +# summary: +description: A successful response when creating an ELSER inference endpoint. +# type: response +# response_code: +value: |- + { + "inference_id": "my-elser-model", + "task_type": "sparse_embedding", + "service": "elser", + "service_settings": { + "num_allocations": 1, + "num_threads": 1 + }, + "task_settings": {} + }
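
Editor's usage sketch (not part of the generated specification output): the TypeScript snippet below illustrates how the `PUT /_inference/{task_type}/{elser_inference_id}` endpoint added in this change might be called, and how the deployment could then be verified with the get trained model statistics API, as the request documentation recommends. The cluster URL, the API-key environment variable, the built-in ELSER model id (`.elser_model_2`), the stats response shape, and the plain `fetch` transport are illustrative assumptions, not definitions from this patch.

// Hedged sketch: create an ELSER sparse_embedding endpoint via the REST path added
// in this patch, then poll the trained model statistics API until the deployment
// reports "fully_allocated". Cluster URL, API key, and model id are assumptions.

const ES_URL = 'https://localhost:9200' // assumed cluster address
const headers = {
  'Content-Type': 'application/json',
  Authorization: `ApiKey ${process.env.ES_API_KEY}`, // assumed auth scheme
}

// Mirrors the ElserServiceSettings shape defined in PutElserRequest.ts.
// num_allocations is marked required in the spec, but PutElserRequestExample2
// omits it when adaptive allocations are enabled, so it is optional here.
interface ElserServiceSettings {
  adaptive_allocations?: {
    enabled?: boolean
    min_number_of_allocations?: number
    max_number_of_allocations?: number
  }
  num_allocations?: number
  num_threads: number
}

async function putElserEndpoint(inferenceId: string, settings: ElserServiceSettings): Promise<void> {
  // sparse_embedding is the only value of ElserTaskType in this patch.
  const res = await fetch(`${ES_URL}/_inference/sparse_embedding/${inferenceId}`, {
    method: 'PUT',
    headers,
    body: JSON.stringify({ service: 'elser', service_settings: settings }),
  })
  // A 502 from the Kibana Console usually reflects a timeout while the model
  // downloads; a direct HTTP client surfaces the real status instead.
  if (!res.ok) throw new Error(`PUT _inference failed: ${res.status} ${await res.text()}`)
}

// Poll the get trained model statistics API until the allocation status reports
// "fully_allocated" and allocation_count matches target_allocation_count. The
// response path used here is an assumption about the current stats API output.
async function waitForFullAllocation(modelId = '.elser_model_2', attempts = 60): Promise<void> {
  for (let i = 0; i < attempts; i++) {
    const res = await fetch(`${ES_URL}/_ml/trained_models/${modelId}/_stats`, { headers })
    const stats = await res.json()
    const allocation = stats.trained_model_stats?.[0]?.deployment_stats?.allocation_status
    if (
      allocation?.state === 'fully_allocated' &&
      allocation.allocation_count === allocation.target_allocation_count
    ) {
      return
    }
    await new Promise((resolve) => setTimeout(resolve, 5_000)) // wait 5 seconds between checks
  }
  throw new Error(`Deployment of ${modelId} did not reach fully_allocated in time`)
}

// Example: mirror PutElserRequestExample1, then wait for the deployment to finish.
// await putElserEndpoint('my-elser-model', { num_allocations: 1, num_threads: 1 })
// await waitForFullAllocation()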