diff --git a/README.md b/README.md index 3c3497bf..12bed099 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,10 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind "doc_embeddings", cocoindex.storages.Postgres(), primary_key_fields=["filename", "location"], - vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)]) + vector_indexes=[ + cocoindex.VectorIndexDef( + field_name="embedding", + metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)]) ``` It defines an index flow like this: diff --git a/docs/docs/core/flow_def.mdx b/docs/docs/core/flow_def.mdx index b5b96132..dc6870db 100644 --- a/docs/docs/core/flow_def.mdx +++ b/docs/docs/core/flow_def.mdx @@ -259,8 +259,10 @@ Export must happen at the top level of a flow, i.e. not within any child scopes * `name`: the name to identify the export target. * `target_spec`: the storage spec as the export target. -* `primary_key_fields` (optional): the fields to be used as primary key. Types of the fields must be supported as key fields. See [Key Types](data_types#key-types) for more details. -* `vector_index` (optional): the fields to create vector index. Each item is a tuple of a field name and a similarity metric. See [Vector Type](data_types#vector-type) for more details about supported similarity metrics. +* `primary_key_fields` (`Sequence[str]`): the fields to be used as primary key. Types of the fields must be supported as key fields. See [Key Types](data_types#key-types) for more details. +* `vector_indexes` (`Sequence[VectorIndexDef]`, optional): the fields to create vector index. `VectorIndexDef` has the following fields: + * `field_name`: the field to create vector index. + * `metric`: the similarity metric to use. See [Vector Type](data_types#vector-type) for more details about supported similarity metrics. * `setup_by_user` (optional): whether the export target is setup by user. By default, CocoIndex is managing the target setup (surfaced by the `cocoindex setup` CLI subcommand), e.g. create related tables/collections/etc. with compatible schema, and update them upon change. diff --git a/docs/docs/getting_started/quickstart.md b/docs/docs/getting_started/quickstart.md index 353ca668..be635adb 100644 --- a/docs/docs/getting_started/quickstart.md +++ b/docs/docs/getting_started/quickstart.md @@ -97,7 +97,10 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind "doc_embeddings", cocoindex.storages.Postgres(), primary_key_fields=["filename", "location"], - vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)]) + vector_indexes=[ + cocoindex.VectorIndexDef( + field_name="embedding", + metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)]) ``` Notes: diff --git a/examples/code_embedding/main.py b/examples/code_embedding/main.py index 80741073..abd6d7b0 100644 --- a/examples/code_embedding/main.py +++ b/examples/code_embedding/main.py @@ -41,7 +41,10 @@ def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind "code_embeddings", cocoindex.storages.Postgres(), primary_key_fields=["filename", "location"], - vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)]) + vector_indexes=[ + cocoindex.VectorIndexDef( + field_name="embedding", + metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)]) query_handler = cocoindex.query.SimpleSemanticsQueryHandler( diff --git a/examples/docs_to_kg/main.py b/examples/docs_to_kg/main.py index 29974632..4952a8f1 100644 --- a/examples/docs_to_kg/main.py +++ b/examples/docs_to_kg/main.py @@ -93,15 +93,13 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D ), nodes={ "Entity": cocoindex.storages.Neo4jRelationshipNodeSpec( - index_options=cocoindex.IndexOptions( - primary_key_fields=["value"], - vector_index_defs=[ - cocoindex.VectorIndexDef( - field_name="embedding", - metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY, - ), - ], - ), + primary_key_fields=["value"], + vector_indexes=[ + cocoindex.VectorIndexDef( + field_name="embedding", + metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY, + ), + ], ), }, ), diff --git a/examples/gdrive_text_embedding/main.py b/examples/gdrive_text_embedding/main.py index ce0353be..d86249a3 100644 --- a/examples/gdrive_text_embedding/main.py +++ b/examples/gdrive_text_embedding/main.py @@ -37,7 +37,10 @@ def gdrive_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: "doc_embeddings", cocoindex.storages.Postgres(), primary_key_fields=["filename", "location"], - vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)]) + vector_indexes=[ + cocoindex.VectorIndexDef( + field_name="embedding", + metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)]) query_handler = cocoindex.query.SimpleSemanticsQueryHandler( name="SemanticsSearch", diff --git a/examples/pdf_embedding/main.py b/examples/pdf_embedding/main.py index a87ea9ab..00b1ae51 100644 --- a/examples/pdf_embedding/main.py +++ b/examples/pdf_embedding/main.py @@ -63,7 +63,10 @@ def pdf_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoinde "doc_embeddings", cocoindex.storages.Postgres(), primary_key_fields=["id"], - vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)]) + vector_indexes=[ + cocoindex.VectorIndexDef( + field_name="embedding", + metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)]) query_handler = cocoindex.query.SimpleSemanticsQueryHandler( name="SemanticsSearch", diff --git a/examples/text_embedding/main.py b/examples/text_embedding/main.py index 70b3807c..5713648f 100644 --- a/examples/text_embedding/main.py +++ b/examples/text_embedding/main.py @@ -35,7 +35,10 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind "doc_embeddings", cocoindex.storages.Postgres(), primary_key_fields=["filename", "location"], - vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)]) + vector_indexes=[ + cocoindex.VectorIndexDef( + field_name="embedding", + metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)]) query_handler = cocoindex.query.SimpleSemanticsQueryHandler( name="SemanticsSearch", diff --git a/python/cocoindex/flow.py b/python/cocoindex/flow.py index eecb6a80..4feae4dc 100644 --- a/python/cocoindex/flow.py +++ b/python/cocoindex/flow.py @@ -267,21 +267,27 @@ def collect(self, **kwargs): self._engine_data_collector, regular_kwargs, auto_uuid_field) def export(self, name: str, target_spec: op.StorageSpec, /, *, - primary_key_fields: Sequence[str] | None = None, + primary_key_fields: Sequence[str], + vector_indexes: Sequence[index.VectorIndexDef] = (), vector_index: Sequence[tuple[str, index.VectorSimilarityMetric]] = (), setup_by_user: bool = False): """ Export the collected data to the specified target. + + `vector_index` is for backward compatibility only. Please use `vector_indexes` instead. """ - index_options: dict[str, Any] = {} - if primary_key_fields is not None: - index_options["primary_key_fields"] = primary_key_fields - index_options["vector_index_defs"] = [ - {"field_name": field_name, "metric": metric.value} - for field_name, metric in vector_index] + # For backward compatibility only. + if len(vector_indexes) == 0 and len(vector_index) > 0: + vector_indexes = [index.VectorIndexDef(field_name=field_name, metric=metric) + for field_name, metric in vector_index] + + index_options = index.IndexOptions( + primary_key_fields=primary_key_fields, + vector_indexes=vector_indexes, + ) self._flow_builder_state.engine_flow_builder.export( name, _spec_kind(target_spec), dump_engine_object(target_spec), - index_options, self._engine_data_collector, setup_by_user) + dump_engine_object(index_options), self._engine_data_collector, setup_by_user) _flow_name_builder = _NameBuilder() diff --git a/python/cocoindex/index.py b/python/cocoindex/index.py index 90e12aa0..6379425a 100644 --- a/python/cocoindex/index.py +++ b/python/cocoindex/index.py @@ -1,6 +1,6 @@ from enum import Enum from dataclasses import dataclass - +from typing import Sequence class VectorSimilarityMetric(Enum): COSINE_SIMILARITY = "CosineSimilarity" L2_DISTANCE = "L2Distance" @@ -19,5 +19,5 @@ class IndexOptions: """ Options for an index. """ - primary_key_fields: list[str] | None = None - vector_index_defs: list[VectorIndexDef] | None = None + primary_key_fields: Sequence[str] + vector_indexes: Sequence[VectorIndexDef] = () diff --git a/python/cocoindex/storages.py b/python/cocoindex/storages.py index ec5d7326..44204396 100644 --- a/python/cocoindex/storages.py +++ b/python/cocoindex/storages.py @@ -35,8 +35,8 @@ class Neo4jRelationshipEndSpec: @dataclass class Neo4jRelationshipNodeSpec: """Spec for a Neo4j node type.""" - key_field_name: str | None = None - index_options: index.IndexOptions | None = None + primary_key_fields: list[str] + vector_indexes: list[index.VectorIndexDef] | None = None class Neo4jRelationship(op.StorageSpec): """Graph storage powered by Neo4j.""" diff --git a/src/base/spec.rs b/src/base/spec.rs index 66c5027d..570544be 100644 --- a/src/base/spec.rs +++ b/src/base/spec.rs @@ -232,7 +232,7 @@ pub struct IndexOptions { #[serde(default, skip_serializing_if = "Option::is_none")] pub primary_key_fields: Option>, #[serde(default, skip_serializing_if = "Vec::is_empty")] - pub vector_index_defs: Vec, + pub vector_indexes: Vec, } /// Store data to a given sink. diff --git a/src/execution/query.rs b/src/execution/query.rs index ed5e8fc6..d06ca2d7 100644 --- a/src/execution/query.rs +++ b/src/execution/query.rs @@ -40,7 +40,7 @@ impl SimpleSemanticsQueryHandler { .position(|export_op| export_op.name == target_name) .unwrap(); let export_op = &flow.flow_instance.export_ops[export_op_idx]; - let vector_index_defs = &export_op.spec.index_options.vector_index_defs; + let vector_indexes = &export_op.spec.index_options.vector_indexes; let execution_plan = flow.get_execution_plan().await?; let analyzed_export_op = &execution_plan.export_ops[export_op_idx]; Ok(Self { @@ -55,8 +55,8 @@ impl SimpleSemanticsQueryHandler { }, query_transform_flow, default_similarity_metric, - default_vector_field_name: if vector_index_defs.len() == 1 { - Some(vector_index_defs[0].field_name.clone()) + default_vector_field_name: if vector_indexes.len() == 1 { + Some(vector_indexes[0].field_name.clone()) } else { None }, diff --git a/src/ops/storages/neo4j.rs b/src/ops/storages/neo4j.rs index 25896acd..04d617bb 100644 --- a/src/ops/storages/neo4j.rs +++ b/src/ops/storages/neo4j.rs @@ -39,6 +39,7 @@ pub struct RelationshipEndSpec { #[derive(Debug, Deserialize)] pub struct RelationshipNodeSpec { + #[serde(flatten)] index_options: spec::IndexOptions, } @@ -586,7 +587,7 @@ impl NodeLabelSetupState { key_constraint_name, vector_indexes: spec .index_options - .vector_index_defs + .vector_indexes .iter() .map(|v| -> Result<_> { Ok(( @@ -639,7 +640,7 @@ impl RelationshipSetupState { key_field_names, key_constraint_name: format!("r__{}__key", spec.rel_type), vector_indexes: index_options - .vector_index_defs + .vector_indexes .iter() .map(|v| -> Result<_> { Ok(( diff --git a/src/ops/storages/postgres.rs b/src/ops/storages/postgres.rs index 8835fe40..1d1ec446 100644 --- a/src/ops/storages/postgres.rs +++ b/src/ops/storages/postgres.rs @@ -508,7 +508,7 @@ impl SetupState { .map(|f| (f.name.clone(), f.value_type.typ.without_attrs())) .collect(), vector_indexes: index_options - .vector_index_defs + .vector_indexes .iter() .map(|v| (to_vector_index_name(&table_id.table_name, v), v.clone())) .collect(), @@ -611,11 +611,11 @@ impl SetupStatusCheck { .value_fields_schema .iter() .filter(|(field_name, schema)| { - existing.possible_versions().any(|v| { + !existing.current.as_ref().map_or(false, |v| { v.value_fields_schema .get(*field_name) .map(to_column_type_sql) - != Some(to_column_type_sql(schema)) + == Some(to_column_type_sql(schema)) }) }) .map(|(k, v)| (k.clone(), v.clone())) @@ -639,9 +639,10 @@ impl SetupStatusCheck { .vector_indexes .iter() .filter(|(name, def)| { - existing - .possible_versions() - .any(|v| v.vector_indexes.get(*name) != Some(def)) + !existing + .current + .as_ref() + .map_or(false, |v| v.vector_indexes.get(*name) != Some(def)) }) .map(|(k, v)| (k.clone(), v.clone())) .collect(), @@ -879,7 +880,7 @@ impl setup::ResourceSetupStatusCheck for SetupStatusCheck { let sql = format!( "CREATE INDEX IF NOT EXISTS {} ON {} {}", index_name, - index_spec.field_name, + self.table_id.table_name, to_index_spec_sql(index_spec) ); sqlx::query(&sql).execute(&db_pool).await?;