diff --git a/docs/docs/ops/functions.md b/docs/docs/ops/functions.md index d9314fa66..631b6e588 100644 --- a/docs/docs/ops/functions.md +++ b/docs/docs/ops/functions.md @@ -189,10 +189,11 @@ The spec takes the following fields: * `api_type` ([`cocoindex.LlmApiType`](/docs/ai/llm#llm-api-types)): The type of LLM API to use for embedding. * `model` (`str`): The name of the embedding model to use. * `address` (`str`, optional): The address of the LLM API. If not specified, uses the default address for the API type. -* `output_dimension` (`int`, optional): The expected dimension of the output embedding vector. If not specified, use the default dimension of the model. +* `output_dimension` (`int`, optional): The dimension to request from the embedding API. Some APIs support specifying the output dimension (e.g., OpenAI's models support dimension reduction). If not specified, the API will use its default dimension. +* `expected_output_dimension` (`int`, optional): The expected dimension of the output embedding vector for validation and type schema. If not specified, falls back to `output_dimension`, then to the default dimension of the model. - For most API types, the function internally keeps a registry for the default output dimension of known model. - You need to explicitly specify the `output_dimension` if you want to use a new model that is not in the registry yet. + For most API types, the function internally keeps a registry for the default output dimension of known models. + You need to explicitly specify `expected_output_dimension` (or `output_dimension`) if you want to use a new model that is not in the registry yet. * `task_type` (`str`, optional): The task type for embedding, used by some embedding models to optimize the embedding for specific use cases. diff --git a/python/cocoindex/functions/_engine_builtin_specs.py b/python/cocoindex/functions/_engine_builtin_specs.py index ee52c0d1c..346738557 100644 --- a/python/cocoindex/functions/_engine_builtin_specs.py +++ b/python/cocoindex/functions/_engine_builtin_specs.py @@ -55,6 +55,7 @@ class EmbedText(op.FunctionSpec): model: str address: str | None = None output_dimension: int | None = None + expected_output_dimension: int | None = None task_type: str | None = None api_config: llm.VertexAiConfig | None = None api_key: TransientAuthEntryReference[str] | None = None diff --git a/rust/cocoindex/src/ops/functions/embed_text.rs b/rust/cocoindex/src/ops/functions/embed_text.rs index ac119ee43..2d83783ac 100644 --- a/rust/cocoindex/src/ops/functions/embed_text.rs +++ b/rust/cocoindex/src/ops/functions/embed_text.rs @@ -12,6 +12,7 @@ struct Spec { address: Option, api_config: Option, output_dimension: Option, + expected_output_dimension: Option, task_type: Option, api_key: Option>, } @@ -129,15 +130,25 @@ impl SimpleFunctionFactoryBase for Factory { spec.api_config.clone(), ) .await?; - let output_dimension = match spec.output_dimension { - Some(output_dimension) => output_dimension, - None => { - client.get_default_embedding_dimension(spec.model.as_str()) - .ok_or_else(|| api_error!("model \"{}\" is unknown for {:?}, needs to specify `output_dimension` explicitly", spec.model, spec.api_type))? + + // Warn if both parameters are specified but have different values + if let (Some(expected), Some(output)) = + (spec.expected_output_dimension, spec.output_dimension) + { + if expected != output { + warn!( + "Both `expected_output_dimension` ({expected}) and `output_dimension` ({output}) are specified but have different values. \ + `expected_output_dimension` will be used for output schema and validation, while `output_dimension` will be sent to the embedding API." + ); } - }; + } + + let expected_output_dimension = spec.expected_output_dimension + .or(spec.output_dimension) + .or_else(|| client.get_default_embedding_dimension(spec.model.as_str())) + .ok_or_else(|| api_error!("model \"{}\" is unknown for {:?}, needs to specify `expected_output_dimension` (or `output_dimension`) explicitly", spec.model, spec.api_type))? as usize; let output_schema = make_output_type(BasicValueType::Vector(VectorTypeSchema { - dimension: Some(output_dimension as usize), + dimension: Some(expected_output_dimension), element_type: Box::new(BasicValueType::Float32), })); Ok(SimpleFunctionAnalysisOutput { @@ -145,7 +156,7 @@ impl SimpleFunctionFactoryBase for Factory { resolved_args: Args { client, text, - expected_output_dimension: output_dimension as usize, + expected_output_dimension, }, output_schema, }) @@ -179,6 +190,7 @@ mod tests { address: None, api_config: None, output_dimension: None, + expected_output_dimension: None, task_type: None, api_key: None, };