From db70f92ab6a4a20631fc024993e62406e184e295 Mon Sep 17 00:00:00 2001 From: Lisa Cawley Date: Tue, 25 Mar 2025 07:42:11 -0700 Subject: [PATCH] Add ELSER inference API details (#4026) (cherry picked from commit 33420c6e0fddea1544cfaeca89ec6a178b3d0c1f) --- output/openapi/elasticsearch-openapi.json | 139 ++++++++ output/schema/schema.json | 308 +++++++++++++++++- output/typescript/types.ts | 28 ++ specification/_doc_ids/table.csv | 13 +- .../_json_spec/inference.put_elser.json | 39 +++ .../inference/put_elser/PutElserRequest.ts | 137 ++++++++ .../inference/put_elser/PutElserResponse.ts | 24 ++ .../request/PutElserRequestExample1.yaml | 12 + .../request/PutElserRequestExample2.yaml | 16 + .../response/PutElserResponseExample1.yaml | 15 + 10 files changed, 719 insertions(+), 12 deletions(-) create mode 100644 specification/_json_spec/inference.put_elser.json create mode 100644 specification/inference/put_elser/PutElserRequest.ts create mode 100644 specification/inference/put_elser/PutElserResponse.ts create mode 100644 specification/inference/put_elser/examples/request/PutElserRequestExample1.yaml create mode 100644 specification/inference/put_elser/examples/request/PutElserRequestExample2.yaml create mode 100644 specification/inference/put_elser/examples/response/PutElserResponseExample1.yaml diff --git a/output/openapi/elasticsearch-openapi.json b/output/openapi/elasticsearch-openapi.json index a9ce86ff60..347d64ef68 100644 --- a/output/openapi/elasticsearch-openapi.json +++ b/output/openapi/elasticsearch-openapi.json @@ -17929,6 +17929,96 @@ "x-state": "Added in 8.12.0" } }, + "/_inference/{task_type}/{elser_inference_id}": { + "put": { + "tags": [ + "inference" + ], + "summary": "Create an ELSER inference endpoint", + "description": "Create an inference endpoint to perform an inference task with the `elser` service.\nYou can also deploy ELSER by using the Elasticsearch inference integration.\n\n> info\n> Your Elasticsearch deployment contains a preconfigured ELSER inference endpoint, you only need to create the enpoint using the API if you want to customize the settings.\n\nThe API request will automatically download and deploy the ELSER model if it isn't already downloaded.\n\n> info\n> You might see a 502 bad gateway error in the response when using the Kibana Console. This error usually just reflects a timeout, while the model downloads in the background. You can check the download progress in the Machine Learning UI. 
If using the Python client, you can set the timeout parameter to a higher value.\n\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", + "operationId": "inference-put-elser", + "parameters": [ + { + "in": "path", + "name": "task_type", + "description": "The type of the inference task that the model will perform.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/inference.put_elser:ElserTaskType" + }, + "style": "simple" + }, + { + "in": "path", + "name": "elser_inference_id", + "description": "The unique identifier of the inference endpoint.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/_types:Id" + }, + "style": "simple" + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "chunking_settings": { + "$ref": "#/components/schemas/inference._types:InferenceChunkingSettings" + }, + "service": { + "$ref": "#/components/schemas/inference.put_elser:ServiceType" + }, + "service_settings": { + "$ref": "#/components/schemas/inference.put_elser:ElserServiceSettings" + } + }, + "required": [ + "service", + "service_settings" + ] + }, + "examples": { + "PutElserRequestExample1": { + "summary": "A sparse embedding task", + "description": "Run `PUT _inference/sparse_embedding/my-elser-model` to create an inference endpoint that performs a `sparse_embedding` task. The request will automatically download the ELSER model if it isn't already downloaded and then deploy the model.", + "value": "{\n \"service\": \"elser\",\n \"service_settings\": {\n \"num_allocations\": 1,\n \"num_threads\": 1\n }\n}" + }, + "PutElserRequestExample2": { + "summary": "Adaptive allocations", + "description": "Run `PUT _inference/sparse_embedding/my-elser-model` to create an inference endpoint that performs a `sparse_embedding` task with adaptive allocations. 
When adaptive allocations are enabled, the number of allocations of the model is set automatically based on the current load.", + "value": "{\n \"service\": \"elser\",\n \"service_settings\": {\n \"adaptive_allocations\": {\n \"enabled\": true,\n \"min_number_of_allocations\": 3,\n \"max_number_of_allocations\": 10\n },\n \"num_threads\": 1\n }\n}" + } + } + } + } + }, + "responses": { + "200": { + "description": "", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/inference._types:InferenceEndpointInfo" + }, + "examples": { + "PutElserResponseExample1": { + "description": "A successful response when creating an ELSER inference endpoint.", + "value": "{\n \"inference_id\": \"my-elser-model\",\n \"task_type\": \"sparse_embedding\",\n \"service\": \"elser\",\n \"service_settings\": {\n \"num_allocations\": 1,\n \"num_threads\": 1\n },\n \"task_settings\": {}\n}" + } + } + } + } + } + }, + "deprecated": true, + "x-state": "Added in 8.11.0" + } + }, "/_inference/{task_type}/{googleaistudio_inference_id}": { "put": { "tags": [ @@ -77363,6 +77453,55 @@ } } }, + "inference.put_elser:ElserTaskType": { + "type": "string", + "enum": [ + "sparse_embedding" + ] + }, + "inference.put_elser:ServiceType": { + "type": "string", + "enum": [ + "elser" + ] + }, + "inference.put_elser:ElserServiceSettings": { + "type": "object", + "properties": { + "adaptive_allocations": { + "$ref": "#/components/schemas/inference.put_elser:AdaptiveAllocations" + }, + "num_allocations": { + "description": "The total number of allocations this model is assigned across machine learning nodes.\nIncreasing this value generally increases the throughput.\nIf adaptive allocations is enabled, do not set this value because it's automatically set.", + "type": "number" + }, + "num_threads": { + "description": "The number of threads used by each model allocation during inference.\nIncreasing this value generally increases the speed per inference request.\nThe inference process is a compute-bound process; `threads_per_allocations` must not exceed the number of available allocated processors per node.\nThe value must be a power of 2.\nThe maximum value is 32.\n\n> info\n> If you want to optimize your ELSER endpoint for ingest, set the number of threads to 1. 
If you want to optimize your ELSER endpoint for search, set the number of threads to greater than 1.", + "type": "number" + } + }, + "required": [ + "num_allocations", + "num_threads" + ] + }, + "inference.put_elser:AdaptiveAllocations": { + "type": "object", + "properties": { + "enabled": { + "description": "Turn on `adaptive_allocations`.", + "type": "boolean" + }, + "max_number_of_allocations": { + "description": "The maximum number of allocations to scale to.\nIf set, it must be greater than or equal to `min_number_of_allocations`.", + "type": "number" + }, + "min_number_of_allocations": { + "description": "The minimum number of allocations to scale to.\nIf set, it must be greater than or equal to 0.\nIf not defined, the deployment scales to 0.", + "type": "number" + } + } + }, "inference.put_googleaistudio:GoogleAiStudioTaskType": { "type": "string", "enum": [ diff --git a/output/schema/schema.json b/output/schema/schema.json index 5a93f9e22b..bed096e8c3 100644 --- a/output/schema/schema.json +++ b/output/schema/schema.json @@ -9366,7 +9366,7 @@ }, "description": "Create an Elastic Inference Service (EIS) inference endpoint.\n\nCreate an inference endpoint to perform an inference task through the Elastic Inference Service (EIS).", "docId": "inference-api-put-eis", - "docUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-elastic.html", + "docUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/infer-service-elastic.html", "name": "inference.put_eis", "privileges": { "cluster": [ @@ -9397,6 +9397,55 @@ } ] }, + { + "availability": { + "serverless": { + "stability": "stable", + "visibility": "public" + }, + "stack": { + "since": "8.11.0", + "stability": "stable", + "visibility": "public" + } + }, + "deprecation": { + "description": "The elser service is deprecated and will be removed in a future release. Use the Elasticsearch inference integration instead, with model_id included in the service_settings.", + "version": "8.16.0" + }, + "description": "Create an ELSER inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `elser` service.\nYou can also deploy ELSER by using the Elasticsearch inference integration.\n\n> info\n> Your Elasticsearch deployment contains a preconfigured ELSER inference endpoint, you only need to create the enpoint using the API if you want to customize the settings.\n\nThe API request will automatically download and deploy the ELSER model if it isn't already downloaded.\n\n> info\n> You might see a 502 bad gateway error in the response when using the Kibana Console. This error usually just reflects a timeout, while the model downloads in the background. You can check the download progress in the Machine Learning UI. 
If using the Python client, you can set the timeout parameter to a higher value.\n\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", + "docId": "inference-api-put-elser", + "docUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/infer-service-elser.html", + "name": "inference.put_elser", + "privileges": { + "cluster": [ + "manage_inference" + ] + }, + "request": { + "name": "Request", + "namespace": "inference.put_elser" + }, + "requestBodyRequired": false, + "requestMediaType": [ + "application/json" + ], + "response": { + "name": "Response", + "namespace": "inference.put_elser" + }, + "responseMediaType": [ + "application/json" + ], + "urls": [ + { + "methods": [ + "PUT" + ], + "path": "/_inference/{task_type}/{elser_inference_id}" + } + ] + }, { "availability": { "serverless": { @@ -9411,7 +9460,7 @@ }, "description": "Create an Google AI Studio inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `googleaistudio` service.\n\nWhen you create an inference endpoint, the associated machine learning model is automatically deployed if it is not already running.\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", "docId": "inference-api-put-googleaistudio", - "docUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-google-ai-studio.html", + "docUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/infer-service-google-ai-studio.html", "name": "inference.put_googleaistudio", "privileges": { "cluster": [ @@ -9456,7 +9505,7 @@ }, "description": "Create a Google Vertex AI inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `googlevertexai` service.\n\nWhen you create an inference endpoint, the associated machine learning model is automatically deployed if it is not already running.\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", "docId": "inference-api-put-googlevertexai", - "docUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-google-vertex-ai.html", + "docUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/infer-service-google-vertex-ai.html", "name": "inference.put_googlevertexai", "privileges": { "cluster": [ @@ -9501,7 +9550,7 @@ }, "description": "Create a Hugging Face inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the 
`hugging_face` service.\n\nYou must first create an inference endpoint on the Hugging Face endpoint page to get an endpoint URL.\nSelect the model you want to use on the new endpoint creation page (for example `intfloat/e5-small-v2`), then select the sentence embeddings task under the advanced configuration section.\nCreate the endpoint and copy the URL after the endpoint initialization has been finished.\n\nThe following models are recommended for the Hugging Face service:\n\n* `all-MiniLM-L6-v2`\n* `all-MiniLM-L12-v2`\n* `all-mpnet-base-v2`\n* `e5-base-v2`\n* `e5-small-v2`\n* `multilingual-e5-base`\n* `multilingual-e5-small`\n\nWhen you create an inference endpoint, the associated machine learning model is automatically deployed if it is not already running.\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", "docId": "inference-api-put-huggingface", - "docUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-hugging-face.html", + "docUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/infer-service-hugging-face.html", "name": "inference.put_hugging_face", "privileges": { "cluster": [ @@ -9546,7 +9595,7 @@ }, "description": "Create an JinaAI inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `jinaai` service.\n\nTo review the available `rerank` models, refer to .\nTo review the available `text_embedding` models, refer to the .\n\nWhen you create an inference endpoint, the associated machine learning model is automatically deployed if it is not already running.\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", "docId": "inference-api-put-jinaai", - "docUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-jinaai.html", + "docUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/infer-service-jinaai.html", "name": "inference.put_jinaai", "privileges": { "cluster": [ @@ -9619,7 +9668,7 @@ }, "description": "Create an OpenAI inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `openai` service.\n\nWhen you create an inference endpoint, the associated machine learning model is automatically deployed if it is not already running.\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", "docId": "inference-api-put-openai", - "docUrl": 
"https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-openai.html", + "docUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/infer-service-openai.html", "name": "inference.put_openai", "privileges": { "cluster": [ @@ -150811,6 +150860,253 @@ }, "specLocation": "inference/put_eis/PutEisRequest.ts#L68-L70" }, + { + "kind": "interface", + "name": { + "name": "AdaptiveAllocations", + "namespace": "inference.put_elser" + }, + "properties": [ + { + "description": "Turn on `adaptive_allocations`.", + "name": "enabled", + "required": false, + "serverDefault": false, + "type": { + "kind": "instance_of", + "type": { + "name": "boolean", + "namespace": "_builtins" + } + } + }, + { + "description": "The maximum number of allocations to scale to.\nIf set, it must be greater than or equal to `min_number_of_allocations`.", + "name": "max_number_of_allocations", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + }, + { + "description": "The minimum number of allocations to scale to.\nIf set, it must be greater than or equal to 0.\nIf not defined, the deployment scales to 0.", + "name": "min_number_of_allocations", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + } + ], + "specLocation": "inference/put_elser/PutElserRequest.ts#L92-L109" + }, + { + "kind": "interface", + "name": { + "name": "ElserServiceSettings", + "namespace": "inference.put_elser" + }, + "properties": [ + { + "description": "Adaptive allocations configuration details.\nIf `enabled` is true, the number of allocations of the model is set based on the current load the process gets.\nWhen the load is high, a new model allocation is automatically created, respecting the value of `max_number_of_allocations` if it's set.\nWhen the load is low, a model allocation is automatically removed, respecting the value of `min_number_of_allocations` if it's set.\nIf `enabled` is true, do not set the number of allocations manually.", + "name": "adaptive_allocations", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "AdaptiveAllocations", + "namespace": "inference.put_elser" + } + } + }, + { + "description": "The total number of allocations this model is assigned across machine learning nodes.\nIncreasing this value generally increases the throughput.\nIf adaptive allocations is enabled, do not set this value because it's automatically set.", + "name": "num_allocations", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + }, + { + "description": "The number of threads used by each model allocation during inference.\nIncreasing this value generally increases the speed per inference request.\nThe inference process is a compute-bound process; `threads_per_allocations` must not exceed the number of available allocated processors per node.\nThe value must be a power of 2.\nThe maximum value is 32.\n\n> info\n> If you want to optimize your ELSER endpoint for ingest, set the number of threads to 1. 
If you want to optimize your ELSER endpoint for search, set the number of threads to greater than 1.", + "name": "num_threads", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + } + ], + "specLocation": "inference/put_elser/PutElserRequest.ts#L111-L137" + }, + { + "kind": "enum", + "members": [ + { + "name": "sparse_embedding" + } + ], + "name": { + "name": "ElserTaskType", + "namespace": "inference.put_elser" + }, + "specLocation": "inference/put_elser/PutElserRequest.ts#L84-L86" + }, + { + "kind": "request", + "attachedBehaviors": [ + "CommonQueryParameters" + ], + "body": { + "kind": "properties", + "properties": [ + { + "description": "The chunking configuration object.", + "extDocId": "inference-chunking", + "extDocUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/inference-apis.html#infer-chunking-config", + "name": "chunking_settings", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "InferenceChunkingSettings", + "namespace": "inference._types" + } + } + }, + { + "description": "The type of service supported for the specified task type. In this case, `elser`.", + "name": "service", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "ServiceType", + "namespace": "inference.put_elser" + } + } + }, + { + "description": "Settings used to install the inference model. These settings are specific to the `elser` service.", + "name": "service_settings", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "ElserServiceSettings", + "namespace": "inference.put_elser" + } + } + } + ] + }, + "deprecation": { + "description": "The elser service is deprecated and will be removed in a future release. Use the Elasticsearch inference integration instead, with model_id included in the service_settings.", + "version": "8.16.0" + }, + "description": "Create an ELSER inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `elser` service.\nYou can also deploy ELSER by using the Elasticsearch inference integration.\n\n> info\n> Your Elasticsearch deployment contains a preconfigured ELSER inference endpoint, you only need to create the enpoint using the API if you want to customize the settings.\n\nThe API request will automatically download and deploy the ELSER model if it isn't already downloaded.\n\n> info\n> You might see a 502 bad gateway error in the response when using the Kibana Console. This error usually just reflects a timeout, while the model downloads in the background. You can check the download progress in the Machine Learning UI. If using the Python client, you can set the timeout parameter to a higher value.\n\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", + "examples": { + "PutElserRequestExample1": { + "description": "Run `PUT _inference/sparse_embedding/my-elser-model` to create an inference endpoint that performs a `sparse_embedding` task. 
The request will automatically download the ELSER model if it isn't already downloaded and then deploy the model.", + "summary": "A sparse embedding task", + "value": "{\n \"service\": \"elser\",\n \"service_settings\": {\n \"num_allocations\": 1,\n \"num_threads\": 1\n }\n}" + }, + "PutElserRequestExample2": { + "description": "Run `PUT _inference/sparse_embedding/my-elser-model` to create an inference endpoint that performs a `sparse_embedding` task with adaptive allocations. When adaptive allocations are enabled, the number of allocations of the model is set automatically based on the current load.", + "summary": "Adaptive allocations", + "value": "{\n \"service\": \"elser\",\n \"service_settings\": {\n \"adaptive_allocations\": {\n \"enabled\": true,\n \"min_number_of_allocations\": 3,\n \"max_number_of_allocations\": 10\n },\n \"num_threads\": 1\n }\n}" + } + }, + "inherits": { + "type": { + "name": "RequestBase", + "namespace": "_types" + } + }, + "name": { + "name": "Request", + "namespace": "inference.put_elser" + }, + "path": [ + { + "description": "The type of the inference task that the model will perform.", + "name": "task_type", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "ElserTaskType", + "namespace": "inference.put_elser" + } + } + }, + { + "description": "The unique identifier of the inference endpoint.", + "name": "elser_inference_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "Id", + "namespace": "_types" + } + } + } + ], + "query": [], + "specLocation": "inference/put_elser/PutElserRequest.ts#L25-L82" + }, + { + "kind": "response", + "body": { + "kind": "value", + "value": { + "kind": "instance_of", + "type": { + "name": "InferenceEndpointInfo", + "namespace": "inference._types" + } + } + }, + "examples": { + "PutElserResponseExample1": { + "description": "A successful response when creating an ELSER inference endpoint.", + "value": "{\n \"inference_id\": \"my-elser-model\",\n \"task_type\": \"sparse_embedding\",\n \"service\": \"elser\",\n \"service_settings\": {\n \"num_allocations\": 1,\n \"num_threads\": 1\n },\n \"task_settings\": {}\n}" + } + }, + "name": { + "name": "Response", + "namespace": "inference.put_elser" + }, + "specLocation": "inference/put_elser/PutElserResponse.ts#L22-L24" + }, + { + "kind": "enum", + "members": [ + { + "name": "elser" + } + ], + "name": { + "name": "ServiceType", + "namespace": "inference.put_elser" + }, + "specLocation": "inference/put_elser/PutElserRequest.ts#L88-L90" + }, { "kind": "interface", "name": { diff --git a/output/typescript/types.ts b/output/typescript/types.ts index 1a4e632ca4..ce9788f29f 100644 --- a/output/typescript/types.ts +++ b/output/typescript/types.ts @@ -13272,6 +13272,34 @@ export type InferencePutEisResponse = InferenceInferenceEndpointInfo export type InferencePutEisServiceType = 'elastic' +export interface InferencePutElserAdaptiveAllocations { + enabled?: boolean + max_number_of_allocations?: integer + min_number_of_allocations?: integer +} + +export interface InferencePutElserElserServiceSettings { + adaptive_allocations?: InferencePutElserAdaptiveAllocations + num_allocations: integer + num_threads: integer +} + +export type InferencePutElserElserTaskType = 'sparse_embedding' + +export interface InferencePutElserRequest extends RequestBase { + task_type: InferencePutElserElserTaskType + elser_inference_id: Id + body?: { + chunking_settings?: InferenceInferenceChunkingSettings + service: InferencePutElserServiceType + 
service_settings: InferencePutElserElserServiceSettings + } +} + +export type InferencePutElserResponse = InferenceInferenceEndpointInfo + +export type InferencePutElserServiceType = 'elser' + export interface InferencePutGoogleaistudioGoogleAiStudioServiceSettings { api_key: string model_id: string diff --git a/specification/_doc_ids/table.csv b/specification/_doc_ids/table.csv index 110e277e3d..a61bbb9a21 100644 --- a/specification/_doc_ids/table.csv +++ b/specification/_doc_ids/table.csv @@ -323,12 +323,13 @@ inference-api-get,https://www.elastic.co/guide/en/elasticsearch/reference/{branc inference-api-post,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/post-inference-api.html inference-api-post-eis-chat-completion,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/post-inference-api.html inference-api-put,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/put-inference-api.html -inference-api-put-eis,https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-elastic.html -inference-api-put-huggingface,https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-hugging-face.html -inference-api-put-jinaai,https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-jinaai.html -inference-api-put-googlevertexai,https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-google-vertex-ai.html -inference-api-put-googleaistudio,https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-google-ai-studio.html -inference-api-put-openai,https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-openai.html +inference-api-put-eis,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/infer-service-elastic.html +inference-api-put-elser,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/infer-service-elser.html +inference-api-put-huggingface,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/infer-service-hugging-face.html +inference-api-put-jinaai,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/infer-service-jinaai.html +inference-api-put-googlevertexai,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/infer-service-google-vertex-ai.html +inference-api-put-googleaistudio,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/infer-service-google-ai-studio.html +inference-api-put-openai,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/infer-service-openai.html inference-api-put-voyageai,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/infer-service-voyageai.html inference-api-put-watsonx,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/infer-service-watsonx-ai.html inference-api-stream,https://www.elastic.co/guide/en/elasticsearch/reference/{branch}/stream-inference-api.html diff --git a/specification/_json_spec/inference.put_elser.json b/specification/_json_spec/inference.put_elser.json new file mode 100644 index 0000000000..b943b31a7d --- /dev/null +++ b/specification/_json_spec/inference.put_elser.json @@ -0,0 +1,39 @@ +{ + "inference.put_elser": { + "deprecated": { + "version": "8.16.0", + "description": "The elser service is deprecated. Use the Elasticsearch inference integration instead, with model_id included in the service_settings." 
+ }, + "documentation": { + "url": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-elser.html", + "description": "Configure an ELSER inference endpoint" + }, + "stability": "stable", + "visibility": "public", + "headers": { + "accept": ["application/json"], + "content_type": ["application/json"] + }, + "url": { + "paths": [ + { + "path": "/_inference/{task_type}/{elser_inference_id}", + "methods": ["PUT"], + "parts": { + "task_type": { + "type": "string", + "description": "The task type" + }, + "elser_inference_id": { + "type": "string", + "description": "The inference Id" + } + } + } + ] + }, + "body": { + "description": "The inference endpoint's task and service settings" + } + } +} diff --git a/specification/inference/put_elser/PutElserRequest.ts b/specification/inference/put_elser/PutElserRequest.ts new file mode 100644 index 0000000000..3a21e01df3 --- /dev/null +++ b/specification/inference/put_elser/PutElserRequest.ts @@ -0,0 +1,137 @@ +/* + * Licensed to Elasticsearch B.V. under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch B.V. licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { InferenceChunkingSettings } from '@inference/_types/Services' +import { RequestBase } from '@_types/Base' +import { Id } from '@_types/common' +import { integer } from '@_types/Numeric' + +/** + * Create an ELSER inference endpoint. + * + * Create an inference endpoint to perform an inference task with the `elser` service. + * You can also deploy ELSER by using the Elasticsearch inference integration. + * + * > info + * > Your Elasticsearch deployment contains a preconfigured ELSER inference endpoint, you only need to create the enpoint using the API if you want to customize the settings. + * + * The API request will automatically download and deploy the ELSER model if it isn't already downloaded. + * + * > info + * > You might see a 502 bad gateway error in the response when using the Kibana Console. This error usually just reflects a timeout, while the model downloads in the background. You can check the download progress in the Machine Learning UI. If using the Python client, you can set the timeout parameter to a higher value. + * + * After creating the endpoint, wait for the model deployment to complete before using it. + * To verify the deployment status, use the get trained model statistics API. + * Look for `"state": "fully_allocated"` in the response and ensure that the `"allocation_count"` matches the `"target_allocation_count"`. + * Avoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources. 
+ * @rest_spec_name inference.put_elser + * @availability stack since=8.11.0 stability=stable visibility=public + * @availability serverless stability=stable visibility=public + * @deprecated 8.16.0 The elser service is deprecated and will be removed in a future release. Use the Elasticsearch inference integration instead, with model_id included in the service_settings. + * @cluster_privileges manage_inference + * @doc_id inference-api-put-elser + */ +export interface Request extends RequestBase { + urls: [ + { + path: '/_inference/{task_type}/{elser_inference_id}' + methods: ['PUT'] + } + ] + path_parts: { + /** + * The type of the inference task that the model will perform. + */ + task_type: ElserTaskType + /** + * The unique identifier of the inference endpoint. + */ + elser_inference_id: Id + } + body: { + /** + * The chunking configuration object. + * @ext_doc_id inference-chunking + */ + chunking_settings?: InferenceChunkingSettings + /** + * The type of service supported for the specified task type. In this case, `elser`. + */ + service: ServiceType + /** + * Settings used to install the inference model. These settings are specific to the `elser` service. + */ + service_settings: ElserServiceSettings + } +} + +export enum ElserTaskType { + sparse_embedding +} + +export enum ServiceType { + elser +} + +export class AdaptiveAllocations { + /** + * Turn on `adaptive_allocations`. + * @server_default false + */ + enabled?: boolean + /** + * The maximum number of allocations to scale to. + * If set, it must be greater than or equal to `min_number_of_allocations`. + */ + max_number_of_allocations?: integer + /** + * The minimum number of allocations to scale to. + * If set, it must be greater than or equal to 0. + * If not defined, the deployment scales to 0. + */ + min_number_of_allocations?: integer +} + +export class ElserServiceSettings { + /** + * Adaptive allocations configuration details. + * If `enabled` is true, the number of allocations of the model is set based on the current load the process gets. + * When the load is high, a new model allocation is automatically created, respecting the value of `max_number_of_allocations` if it's set. + * When the load is low, a model allocation is automatically removed, respecting the value of `min_number_of_allocations` if it's set. + * If `enabled` is true, do not set the number of allocations manually. + */ + adaptive_allocations?: AdaptiveAllocations + /** + * The total number of allocations this model is assigned across machine learning nodes. + * Increasing this value generally increases the throughput. + * If adaptive allocations is enabled, do not set this value because it's automatically set. + */ + num_allocations: integer + /** + * The number of threads used by each model allocation during inference. + * Increasing this value generally increases the speed per inference request. + * The inference process is a compute-bound process; `threads_per_allocations` must not exceed the number of available allocated processors per node. + * The value must be a power of 2. + * The maximum value is 32. + * + * > info + * > If you want to optimize your ELSER endpoint for ingest, set the number of threads to 1. If you want to optimize your ELSER endpoint for search, set the number of threads to greater than 1. 
+ */ + num_threads: integer +} diff --git a/specification/inference/put_elser/PutElserResponse.ts b/specification/inference/put_elser/PutElserResponse.ts new file mode 100644 index 0000000000..d40639b031 --- /dev/null +++ b/specification/inference/put_elser/PutElserResponse.ts @@ -0,0 +1,24 @@ +/* + * Licensed to Elasticsearch B.V. under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch B.V. licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { InferenceEndpointInfo } from '@inference/_types/Services' + +export class Response { + body: InferenceEndpointInfo +} diff --git a/specification/inference/put_elser/examples/request/PutElserRequestExample1.yaml b/specification/inference/put_elser/examples/request/PutElserRequestExample1.yaml new file mode 100644 index 0000000000..85fd58f986 --- /dev/null +++ b/specification/inference/put_elser/examples/request/PutElserRequestExample1.yaml @@ -0,0 +1,12 @@ +summary: A sparse embedding task +description: Run `PUT _inference/sparse_embedding/my-elser-model` to create an inference endpoint that performs a `sparse_embedding` task. The request will automatically download the ELSER model if it isn't already downloaded and then deploy the model. +# method_request: "PUT _inference/sparse_embedding/my-elser-model" +# type: "request" +value: |- + { + "service": "elser", + "service_settings": { + "num_allocations": 1, + "num_threads": 1 + } + } diff --git a/specification/inference/put_elser/examples/request/PutElserRequestExample2.yaml b/specification/inference/put_elser/examples/request/PutElserRequestExample2.yaml new file mode 100644 index 0000000000..831115834c --- /dev/null +++ b/specification/inference/put_elser/examples/request/PutElserRequestExample2.yaml @@ -0,0 +1,16 @@ +summary: Adaptive allocations +description: Run `PUT _inference/sparse_embedding/my-elser-model` to create an inference endpoint that performs a `sparse_embedding` task with adaptive allocations. When adaptive allocations are enabled, the number of allocations of the model is set automatically based on the current load. +# method_request: "PUT _inference/sparse_embedding/my-elser-model" +# type: "request" +value: |- + { + "service": "elser", + "service_settings": { + "adaptive_allocations": { + "enabled": true, + "min_number_of_allocations": 3, + "max_number_of_allocations": 10 + }, + "num_threads": 1 + } + } diff --git a/specification/inference/put_elser/examples/response/PutElserResponseExample1.yaml b/specification/inference/put_elser/examples/response/PutElserResponseExample1.yaml new file mode 100644 index 0000000000..9d0746cbd1 --- /dev/null +++ b/specification/inference/put_elser/examples/response/PutElserResponseExample1.yaml @@ -0,0 +1,15 @@ +# summary: +description: A successful response when creating an ELSER inference endpoint. 
+# type: response +# response_code: +value: |- + { + "inference_id": "my-elser-model", + "task_type": "sparse_embedding", + "service": "elser", + "service_settings": { + "num_allocations": 1, + "num_threads": 1 + }, + "task_settings": {} + }
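
---

For reviewers, a minimal sketch of what a caller of the new `PUT /_inference/{task_type}/{elser_inference_id}` route looks like over plain HTTP, mirroring `PutElserRequestExample1` above. The host, API key, and the inference ID `my-elser-model` are placeholder assumptions; the path and body shape come from this patch.

```typescript
// Sketch only (not part of the spec): create the ELSER endpoint from
// PutElserRequestExample1. Host and credentials are placeholders.
const ES_URL = 'https://localhost:9200'; // assumed local cluster
const API_KEY = '<api-key>';             // assumed credential

async function createElserEndpoint(): Promise<void> {
  const response = await fetch(`${ES_URL}/_inference/sparse_embedding/my-elser-model`, {
    method: 'PUT',
    headers: {
      'Content-Type': 'application/json',
      Authorization: `ApiKey ${API_KEY}`,
    },
    body: JSON.stringify({
      service: 'elser',
      service_settings: {
        num_allocations: 1,
        num_threads: 1,
      },
    }),
  });
  // As the endpoint description notes, the first call may be slow while the model
  // downloads in the background (Kibana Console can surface this as a 502), so a
  // generous client timeout is advisable.
  console.log(response.status, await response.json());
}

createElserEndpoint().catch(console.error);
```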
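The endpoint description says to wait for `"state": "fully_allocated"` in the get trained model statistics API before using the endpoint. A sketch of that polling step, reusing `ES_URL` and `API_KEY` from the previous snippet; the model ID `.elser_model_2` and the exact field paths in the stats response are assumptions, not defined by this patch.

```typescript
// Poll the trained model statistics API until the ELSER deployment is fully
// allocated and allocation_count matches target_allocation_count.
async function waitForElserDeployment(modelId = '.elser_model_2'): Promise<void> {
  for (;;) {
    const res = await fetch(`${ES_URL}/_ml/trained_models/${modelId}/_stats`, {
      headers: { Authorization: `ApiKey ${API_KEY}` },
    });
    const stats = await res.json();
    const allocation = stats.trained_model_stats?.[0]?.deployment_stats?.allocation_status;
    if (
      allocation?.state === 'fully_allocated' &&
      allocation?.allocation_count === allocation?.target_allocation_count
    ) {
      return; // deployment is ready to serve inference requests
    }
    await new Promise((resolve) => setTimeout(resolve, 5_000)); // poll every 5s
  }
}
```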
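Once deployed, the endpoint serves `sparse_embedding` requests through the generic inference API (`inference-api-post` in the doc IDs table). The `{"input": ...}` body shape below is an assumption about that API rather than something defined in this patch.

```typescript
// Sketch: run a sparse embedding through the newly created endpoint.
async function embed(text: string): Promise<unknown> {
  const res = await fetch(`${ES_URL}/_inference/sparse_embedding/my-elser-model`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      Authorization: `ApiKey ${API_KEY}`,
    },
    body: JSON.stringify({ input: text }),
  });
  return res.json(); // sparse token/weight pairs produced by ELSER
}
```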
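The deprecation notice in this patch points to the Elasticsearch inference integration, with `model_id` included in `service_settings`, as the replacement for the `elser` service. A sketch of what that equivalent request might look like; the `elasticsearch` service name, the `.elser_model_2` model ID, and adaptive-allocation support under that service are assumptions drawn from the deprecation text, not specified here.

```typescript
// Sketch of the non-deprecated equivalent, roughly mirroring PutElserRequestExample2.
async function createElserViaElasticsearchService(): Promise<void> {
  const response = await fetch(`${ES_URL}/_inference/sparse_embedding/my-elser-model`, {
    method: 'PUT',
    headers: {
      'Content-Type': 'application/json',
      Authorization: `ApiKey ${API_KEY}`,
    },
    body: JSON.stringify({
      service: 'elasticsearch',          // assumed replacement service
      service_settings: {
        model_id: '.elser_model_2',      // assumed ELSER v2 model ID
        adaptive_allocations: {
          enabled: true,
          min_number_of_allocations: 3,
          max_number_of_allocations: 10,
        },
        num_threads: 1,
      },
    }),
  });
  console.log(response.status, await response.json());
}
```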