diff --git a/examples/text_embedding_lancedb/.env b/examples/text_embedding_lancedb/.env index 335feb69..a5d9cd8f 100644 --- a/examples/text_embedding_lancedb/.env +++ b/examples/text_embedding_lancedb/.env @@ -4,3 +4,10 @@ COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex # Fallback to CPU for operations not supported by MPS on Mac. # It's no-op for other platforms. PYTORCH_ENABLE_MPS_FALLBACK=1 + +# By default, the vector index is not enabled, because LanceDB requires at least +# 256 rows to be there before it can build the index (see +# https://github.com/lance-format/lance/issues/4034 for more details). +# +# After your table has enough data, you can change the following value to `true` to enable the index: +ENABLE_LANCEDB_VECTOR_INDEX=false diff --git a/examples/text_embedding_lancedb/README.md b/examples/text_embedding_lancedb/README.md index 44313636..dd3ea1fe 100644 --- a/examples/text_embedding_lancedb/README.md +++ b/examples/text_embedding_lancedb/README.md @@ -46,6 +46,13 @@ You can also run the command with `-L`, which will watch for file changes and up cocoindex update -L main ``` +By default, the vector index is not enabled, because LanceDB requires at least 256 rows to be there before it can build the index (see [this issue](https://github.com/lance-format/lance/issues/4034) for more details). +After your LanceDB target table has enough data, you can update the `.env` file with the following environment variable to enable the vector index from then on: + +```sh +ENABLE_LANCEDB_VECTOR_INDEX=true +``` + ## CocoInsight I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline. 
diff --git a/examples/text_embedding_lancedb/main.py b/examples/text_embedding_lancedb/main.py index d7beb3d8..2c89e244 100644 --- a/examples/text_embedding_lancedb/main.py +++ b/examples/text_embedding_lancedb/main.py @@ -1,4 +1,4 @@ -from dotenv import load_dotenv +import os import datetime import cocoindex import math @@ -31,6 +31,10 @@ def text_embedding_flow( """ Define an example flow that embeds text into a vector database. """ + ENABLE_LANCEDB_VECTOR_INDEX = os.environ.get( + "ENABLE_LANCEDB_VECTOR_INDEX", "0" + ).lower() in ("true", "1") + data_scope["documents"] = flow_builder.add_source( cocoindex.sources.LocalFile(path="markdown_files"), refresh_interval=datetime.timedelta(seconds=5), @@ -57,18 +61,21 @@ def text_embedding_flow( text_embedding=chunk["embedding"], ) + # We cannot enable the index when the table has no data yet, as LanceDB requires data to train the index. + # See: https://github.com/lance-format/lance/issues/4034 + # Guard it with the ENABLE_LANCEDB_VECTOR_INDEX environment variable. + vector_indexes = [] + if ENABLE_LANCEDB_VECTOR_INDEX: + vector_indexes.append( + cocoindex.VectorIndexDef( + "text_embedding", cocoindex.VectorSimilarityMetric.L2_DISTANCE + ) + ) doc_embeddings.export( "doc_embeddings", coco_lancedb.LanceDB(db_uri=LANCEDB_URI, table_name=LANCEDB_TABLE), primary_key_fields=["id"], - # We cannot enable it when the table has no data yet, as LanceDB requires data to train the index. - # See: https://github.com/lancedb/lance/issues/4034 - # - # vector_indexes=[ - # cocoindex.VectorIndexDef( - # "text_embedding", cocoindex.VectorSimilarityMetric.L2_DISTANCE - # ), - # ], + vector_indexes=vector_indexes, )