diff --git a/examples/text_embedding/README.md b/examples/text_embedding/README.md index 68cd4ec2..1e999882 100644 --- a/examples/text_embedding/README.md +++ b/examples/text_embedding/README.md @@ -1,6 +1,24 @@ -Simple example for cocoindex: build embedding index based on local files. - +# Build text embedding and semantic search 🔍 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cocoindex-io/cocoindex/blob/main/examples/text_embedding/Text_Embedding.ipynb) +[![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) + +In this example, we will build an index flow that creates text embeddings from local markdown files, and query the index. + +We appreciate a star ⭐ at [CocoIndex GitHub](https://github.com/cocoindex-io/cocoindex) if this is helpful. + +## Steps: 🌱 A detailed step by step tutorial can be found here: [Get Started Documentation](https://cocoindex.io/docs/getting_started/quickstart) + +### Indexing Flow: Screenshot 2025-05-19 at 5 48 28 PM + +1. We will ingest from a list of local files. +2. For each file, perform chunking (Recursive Split) and then embeddings. +3. We will save the embeddings and the metadata in Postgres with PGVector. + +### Query: +We will match against user-provided text by a SQL query, reusing the embedding operation in the indexing flow. + ## Prerequisite @@ -34,9 +52,8 @@ python main.py ## CocoInsight -CocoInsight is in Early Access now (Free) 😊 You found us! A quick 3 minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9). - -Run CocoInsight to understand your RAG data pipeline: +I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline. +It just connects to your local CocoIndex server, with zero pipeline data retention. 
Run following command to start CocoInsight: ``` python main.py cocoindex server -ci diff --git a/examples/text_embedding/main.py b/examples/text_embedding/main.py index 8177cdfc..e69e1e7c 100644 --- a/examples/text_embedding/main.py +++ b/examples/text_embedding/main.py @@ -1,8 +1,7 @@ -import os from dotenv import load_dotenv from psycopg_pool import ConnectionPool - import cocoindex +import os @cocoindex.transform_flow() def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]: @@ -20,7 +19,7 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind Define an example flow that embeds text into a vector database. """ data_scope["documents"] = flow_builder.add_source( - cocoindex.sources.LocalFile(path="markdown_files", included_patterns=["*.md"])) + cocoindex.sources.LocalFile(path="markdown_files")) doc_embeddings = data_scope.add_collector() @@ -43,33 +42,27 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind field_name="embedding", metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)]) -# Keep for now to allow CocoInsight to query. -# Will be removed later after we expose `search()` below as a query function (https://github.com/cocoindex-io/cocoindex/issues/502). -cocoindex.query.SimpleSemanticsQueryHandler( - name="SemanticsSearch", - flow=text_embedding_flow, - target_name="doc_embeddings", - query_transform_flow=text_to_embedding, - default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY) def search(pool: ConnectionPool, query: str, top_k: int = 5): + # Get the table name, for the export target in the text_embedding_flow above. table_name = cocoindex.utils.get_target_storage_default_name(text_embedding_flow, "doc_embeddings") + # Evaluate the transform flow defined above with the input query, to get the embedding. query_vector = text_to_embedding.eval(query) + # Run the query and get the results. 
with pool.connection() as conn: with conn.cursor() as cur: cur.execute(f""" - SELECT filename, location, text, embedding <=> %s::vector AS distance - FROM {table_name} - ORDER BY distance - LIMIT %s + SELECT filename, text, embedding <=> %s::vector AS distance + FROM {table_name} ORDER BY distance LIMIT %s """, (query_vector, top_k)) return [ - {"filename": row[0], "location": row[1], "text": row[2], "score": 1.0 - row[3]} + {"filename": row[0], "text": row[1], "score": 1.0 - row[2]} for row in cur.fetchall() ] @cocoindex.main_fn() def _run(): + # Initialize the database connection pool. pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL")) # Run queries in a loop to demonstrate the query capabilities. while True: @@ -77,10 +70,11 @@ def _run(): query = input("Enter search query (or Enter to quit): ") if query == '': break + # Run the query function with the database connection pool and the query. results = search(pool, query) print("\nSearch results:") for result in results: - print(f"[{result['score']:.3f}] {result['filename']} location:{result['location']}") + print(f"[{result['score']:.3f}] {result['filename']}") print(f" {result['text']}") print("---") print() diff --git a/examples/text_embedding/pyproject.toml b/examples/text_embedding/pyproject.toml index 3add7d0b..08129111 100644 --- a/examples/text_embedding/pyproject.toml +++ b/examples/text_embedding/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "Simple example for cocoindex: build embedding index based on local text files." requires-python = ">=3.10" dependencies = [ - "cocoindex>=0.1.39", + "cocoindex>=0.1.40", "python-dotenv>=1.0.1", "psycopg[binary,pool]", ]