cocoindex-io · badmonster0 · May 20, 2025 · May 18, 2025 · May 18, 2025 · May 18, 2025
diff --git a/examples/text_embedding/README.md b/examples/text_embedding/README.md
@@ -1,6 +1,24 @@
-Simple example for cocoindex: build embedding index based on local files.
-
+# Build text embedding and semantic search 🔍
 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cocoindex-io/cocoindex/blob/main/examples/text_embedding/Text_Embedding.ipynb)
+[![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex)
+
+In this example, we will build an index flow that creates text embeddings from local markdown files, and then query the index.
+
+We appreciate a star ⭐ at [CocoIndex GitHub](https://github.com/cocoindex-io/cocoindex) if this is helpful.
+
+## Steps:
+🌱 A detailed step-by-step tutorial can be found here: [Get Started Documentation](https://cocoindex.io/docs/getting_started/quickstart)
+
+### Indexing Flow:
+<img width="461" alt="Screenshot 2025-05-19 at 5 48 28 PM" src="https://github.com/user-attachments/assets/b6825302-a0c7-4b86-9a2d-52da8286b4bd" />
+
+1. We will ingest from a list of local files.
+2. For each file, perform chunking (Recursive Split) and then embeddings. 
+3. We will save the embeddings and the metadata in Postgres with PGVector.
+
+### Query:
+We will match against user-provided text by a SQL query, reusing the embedding operation in the indexing flow.
+
 
 ## Prerequisite
 
@@ -34,9 +52,8 @@ python main.py
 
 ## CocoInsight
 
-CocoInsight is in Early Access now (Free) 😊 You found us! A quick 3 minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9).
-
-Run CocoInsight to understand your RAG data pipeline:
+I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline. 
+It just connects to your local CocoIndex server, with zero pipeline data retention. Run the following command to start CocoInsight:
 
 ```
 python main.py cocoindex server -ci

diff --git a/examples/text_embedding/main.py b/examples/text_embedding/main.py
@@ -1,8 +1,7 @@
-import os
 from dotenv import load_dotenv
 from psycopg_pool import ConnectionPool
-
 import cocoindex
+import os
 
 @cocoindex.transform_flow()
 def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
@@ -20,7 +19,7 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
     Define an example flow that embeds text into a vector database.
     """
     data_scope["documents"] = flow_builder.add_source(
-        cocoindex.sources.LocalFile(path="markdown_files", included_patterns=["*.md"]))
+        cocoindex.sources.LocalFile(path="markdown_files"))
 
     doc_embeddings = data_scope.add_collector()
 
@@ -43,44 +42,39 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
                 field_name="embedding",
                 metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
 
-# Keep for now to allow CocoInsight to query.
-# Will be removed later after we expose `search()` below as a query function (https://github.com/cocoindex-io/cocoindex/issues/502).
-cocoindex.query.SimpleSemanticsQueryHandler(
-    name="SemanticsSearch",
-    flow=text_embedding_flow,
-    target_name="doc_embeddings",
-    query_transform_flow=text_to_embedding,
-    default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)
 
 def search(pool: ConnectionPool, query: str, top_k: int = 5):
+    # Get the table name, for the export target in the text_embedding_flow above.
     table_name = cocoindex.utils.get_target_storage_default_name(text_embedding_flow, "doc_embeddings")
+    # Evaluate the transform flow defined above with the input query, to get the embedding.
     query_vector = text_to_embedding.eval(query)
+    # Run the query and get the results.
     with pool.connection() as conn:
         with conn.cursor() as cur:
             cur.execute(f"""
-                SELECT filename, location, text, embedding <=> %s::vector AS distance
-                FROM {table_name}
-                ORDER BY distance
-                LIMIT %s
+                SELECT filename, text, embedding <=> %s::vector AS distance
+                FROM {table_name} ORDER BY distance LIMIT %s
             """, (query_vector, top_k))
             return [
-                {"filename": row[0], "location": row[1], "text": row[2], "score": 1.0 - row[3]}
+                {"filename": row[0], "text": row[1], "score": 1.0 - row[2]}
                 for row in cur.fetchall()
             ]
 
 @cocoindex.main_fn()
 def _run():
+    # Initialize the database connection pool.
     pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
     # Run queries in a loop to demonstrate the query capabilities.
     while True:
         try:
             query = input("Enter search query (or Enter to quit): ")
             if query == '':
                 break
+            # Run the query function with the database connection pool and the query.
             results = search(pool, query)
             print("\nSearch results:")
             for result in results:
-                print(f"[{result['score']:.3f}] {result['filename']} location:{result['location']}")
+                print(f"[{result['score']:.3f}] {result['filename']}")
                 print(f"    {result['text']}")
                 print("---")
             print()

diff --git a/examples/text_embedding/pyproject.toml b/examples/text_embedding/pyproject.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "Simple example for cocoindex: build embedding index based on local text files."
 requires-python = ">=3.10"
 dependencies = [
-    "cocoindex>=0.1.39",
+    "cocoindex>=0.1.40",
     "python-dotenv>=1.0.1",
     "psycopg[binary,pool]",
 ]