diff --git a/examples/code_embedding/README.md b/examples/code_embedding/README.md
index 5d716fa6..dd7b9ee0 100644
--- a/examples/code_embedding/README.md
+++ b/examples/code_embedding/README.md
@@ -1,15 +1,32 @@
-# Build embedding index for codebase
+# Build real-time index for codebase
+[](https://github.com/cocoindex-io/cocoindex)
-
+CocoIndex provides built-in support for codebase chunking, using Tree-sitter to respect syntax boundaries. In this example, we will build a real-time index for a codebase using CocoIndex.
-In this example, we will build an embedding index for a codebase using CocoIndex. CocoIndex provides built-in support for code base chunking, with native Tree-sitter support. [Tree-sitter](https://en.wikipedia.org/wiki/Tree-sitter_%28parser_generator%29) is a parser generator tool and an incremental parsing library, it is available in Rust 🦀 - [GitHub](https://github.com/tree-sitter/tree-sitter). CocoIndex has built-in Rust integration with Tree-sitter to efficiently parse code and extract syntax trees for various programming languages.
+We appreciate a star ⭐ at [CocoIndex GitHub](https://github.com/cocoindex-io/cocoindex) if this is helpful.
+
+
+[Tree-sitter](https://en.wikipedia.org/wiki/Tree-sitter_%28parser_generator%29) is a parser generator tool and an incremental parsing library. It is available in Rust 🦀 - [GitHub](https://github.com/tree-sitter/tree-sitter). CocoIndex has built-in Rust integration with Tree-sitter to efficiently parse code and extract syntax trees for various programming languages. Check out the list of supported languages [here](https://cocoindex.io/docs/ops/functions#splitrecursively) - in the `language` section.
-Please give [Cocoindex on Github](https://github.com/cocoindex-io/cocoindex) a star to support us if you like our work. Thank you so much with a warm coconut hug 🥥🤗. [](https://github.com/cocoindex-io/cocoindex)
## Tutorials
-- Blog with step by step tutorial [here](https://cocoindex.io/blogs/index-code-base-for-rag).
-- Video walkthrough [here](https://youtu.be/G3WstvhHO24?si=Bnxu67Ax5Lv8b-J2)
+- Step-by-step tutorial - Check out the [blog](https://cocoindex.io/blogs/index-code-base-for-rag).
+- Video tutorial - [YouTube](https://youtu.be/G3WstvhHO24?si=Bnxu67Ax5Lv8b-J2).
+
+## Steps
+
+### Indexing Flow
+
+
+
+
+1. We will ingest the CocoIndex codebase.
+2. For each file, perform chunking (Tree-sitter) and then embedding.
+3. We will save the embeddings and the metadata in Postgres with PGVector.
+
+### Query:
+We will match against user-provided text by a SQL query, reusing the embedding operation in the indexing flow.
## Prerequisite
@@ -17,36 +34,38 @@ Please give [Cocoindex on Github](https://github.com/cocoindex-io/cocoindex) a s
## Run
-Install dependencies:
-```bash
-pip install -e .
-```
-
-Setup:
+- Install dependencies:
+ ```bash
+ pip install -e .
+ ```
-```bash
-python main.py cocoindex setup
-```
+- Setup:
-Update index:
+ ```bash
+ python main.py cocoindex setup
+ ```
-```bash
-python main.py cocoindex update
-```
+- Update index:
+
+ ```bash
+ python main.py cocoindex update
+ ```
-Run:
+- Run:
-```bash
-python main.py
-```
+ ```bash
+ python main.py
+ ```
## CocoInsight
-CocoInsight is in Early Access now (Free) 😊 You found us! A quick 3 minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9).
-
-Run CocoInsight to understand your RAG data pipeline:
+We use CocoInsight (free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline.
+It connects to your local CocoIndex server, with zero pipeline data retention. Run the following command to start CocoInsight:
```
python main.py cocoindex server -ci
```
Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
+
+
+
diff --git a/examples/code_embedding/main.py b/examples/code_embedding/main.py
index abd6d7b0..98e551a5 100644
--- a/examples/code_embedding/main.py
+++ b/examples/code_embedding/main.py
@@ -1,5 +1,5 @@
from dotenv import load_dotenv
-
+from psycopg_pool import ConnectionPool
import cocoindex
import os
@@ -8,7 +8,8 @@ def extract_extension(filename: str) -> str:
"""Extract the extension of a filename."""
return os.path.splitext(filename)[1]
-def code_to_embedding(text: cocoindex.DataSlice) -> cocoindex.DataSlice:
+@cocoindex.transform_flow()
+def code_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:
"""
Embed the text using a SentenceTransformer model.
"""
@@ -24,7 +25,7 @@ def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
data_scope["files"] = flow_builder.add_source(
cocoindex.sources.LocalFile(path="../..",
included_patterns=["*.py", "*.rs", "*.toml", "*.md", "*.mdx"],
- excluded_patterns=[".*", "target", "**/node_modules"]))
+ excluded_patterns=["**/.*", "target", "**/node_modules"]))
code_embeddings = data_scope.add_collector()
with data_scope["files"].row() as file:
@@ -47,26 +48,40 @@ def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
-query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
- name="SemanticsSearch",
- flow=code_embedding_flow,
- target_name="code_embeddings",
- query_transform_flow=code_to_embedding,
- default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)
+
+def search(pool: ConnectionPool, query: str, top_k: int = 5):
+ # Search the exported code embeddings for the rows closest to `query`.
+ #
+ # Args:
+ #   pool: psycopg_pool ConnectionPool used to obtain a database connection.
+ #   query: text to search for; it is embedded before being compared.
+ #   top_k: maximum number of rows to return (defaults to 5).
+ #
+ # Returns a list of dicts with keys "filename", "code" and "score", in the
+ # order produced by the SQL below (ascending distance, so best match first).
+ #
+ # Resolve the table name of the "code_embeddings" export target in the code_embedding_flow above.
+ table_name = cocoindex.utils.get_target_storage_default_name(code_embedding_flow, "code_embeddings")
+ # Evaluate the code_to_embedding transform flow defined above on the input query, to get its embedding.
+ query_vector = code_to_embedding.eval(query)
+ # Run the query and get the results. The connection and cursor are context-managed.
+ with pool.connection() as conn:
+ with conn.cursor() as cur:
+ # `<=>` is pgvector's cosine-distance operator. The table name is interpolated
+ # with an f-string (it comes from the cocoindex helper above, not from `query`),
+ # while the query vector and the row limit are bound as SQL parameters.
+ cur.execute(f"""
+ SELECT filename, code, embedding <=> %s::vector AS distance
+ FROM {table_name} ORDER BY distance LIMIT %s
+ """, (query_vector, top_k))
+ # Report 1.0 - distance as the score, so a higher score means a closer match.
+ return [
+ {"filename": row[0], "code": row[1], "score": 1.0 - row[2]}
+ for row in cur.fetchall()
+ ]
@cocoindex.main_fn()
def _run():
+ # Initialize the database connection pool.
+ pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))
# Run queries in a loop to demonstrate the query capabilities.
while True:
try:
query = input("Enter search query (or Enter to quit): ")
if query == '':
break
- results, _ = query_handler.search(query, 10)
+ # Run the query function with the database connection pool and the query.
+ results = search(pool, query)
print("\nSearch results:")
for result in results:
- print(f"[{result.score:.3f}] {result.data['filename']}")
- print(f" {result.data['code']}")
+ print(f"[{result['score']:.3f}] {result['filename']}")
+ print(f" {result['code']}")
print("---")
print()
except KeyboardInterrupt: