diff --git a/examples/code_embedding/README.md b/examples/code_embedding/README.md index 5d716fa6..dd7b9ee0 100644 --- a/examples/code_embedding/README.md +++ b/examples/code_embedding/README.md @@ -1,15 +1,32 @@ -# Build embedding index for codebase +# Build real-time index for codebase +[![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) -![Build embedding index for codebase](https://cocoindex.io/blogs/assets/images/cover-9bf0a7cff69b66a40918ab2fc1cea0c7.png) +CocoIndex provides built-in support for codebase chunking, using Tree-sitter to respect syntax boundaries. In this example, we will build a real-time index for a codebase using CocoIndex. -In this example, we will build an embedding index for a codebase using CocoIndex. CocoIndex provides built-in support for code base chunking, with native Tree-sitter support. [Tree-sitter](https://en.wikipedia.org/wiki/Tree-sitter_%28parser_generator%29) is a parser generator tool and an incremental parsing library, it is available in Rust 🦀 - [GitHub](https://github.com/tree-sitter/tree-sitter). CocoIndex has built-in Rust integration with Tree-sitter to efficiently parse code and extract syntax trees for various programming languages. +We would appreciate a star ⭐ on [CocoIndex GitHub](https://github.com/cocoindex-io/cocoindex) if this is helpful. +![Build embedding index for codebase](https://github.com/user-attachments/assets/6dc5ce89-c949-41d4-852f-ad95af163dbd) + +[Tree-sitter](https://en.wikipedia.org/wiki/Tree-sitter_%28parser_generator%29) is a parser generator tool and an incremental parsing library. It is available in Rust 🦀 - [GitHub](https://github.com/tree-sitter/tree-sitter). CocoIndex has built-in Rust integration with Tree-sitter to efficiently parse code and extract syntax trees for various programming languages. 
Check out the list of supported languages [here](https://cocoindex.io/docs/ops/functions#splitrecursively) - in the `language` section. -Please give [Cocoindex on Github](https://github.com/cocoindex-io/cocoindex) a star to support us if you like our work. Thank you so much with a warm coconut hug 🥥🤗. [![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) ## Tutorials -- Blog with step by step tutorial [here](https://cocoindex.io/blogs/index-code-base-for-rag). -- Video walkthrough [here](https://youtu.be/G3WstvhHO24?si=Bnxu67Ax5Lv8b-J2) +- Step-by-step tutorial - Check out the [blog](https://cocoindex.io/blogs/index-code-base-for-rag). +- Video tutorial - [YouTube](https://youtu.be/G3WstvhHO24?si=Bnxu67Ax5Lv8b-J2). + +## Steps + +### Indexing Flow +

+ Screenshot 2025-05-19 at 10 14 36 PM +

+ +1. We will ingest the CocoIndex codebase. +2. For each file, perform chunking (Tree-sitter) and then embedding. +3. We will save the embeddings and the metadata in Postgres with pgvector. + +### Query: +We will match against user-provided text with a SQL query, reusing the embedding operation in the indexing flow. ## Prerequisite @@ -17,36 +34,38 @@ Please give [Cocoindex on Github](https://github.com/cocoindex-io/cocoindex) a s ## Run -Install dependencies: -```bash -pip install -e . -``` - -Setup: +- Install dependencies: + ```bash + pip install -e . + ``` -```bash -python main.py cocoindex setup -``` +- Setup: -Update index: + ```bash + python main.py cocoindex setup + ``` -```bash -python main.py cocoindex update -``` +- Update index: + + ```bash + python main.py cocoindex update + ``` -Run: +- Run: -```bash -python main.py -``` + ```bash + python main.py + ``` ## CocoInsight -CocoInsight is in Early Access now (Free) 😊 You found us! A quick 3 minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9). - -Run CocoInsight to understand your RAG data pipeline: +I used CocoInsight (currently in free beta) to troubleshoot the index generation and understand the data lineage of the pipeline. +It just connects to your local CocoIndex server, with zero pipeline data retention. Run the following command to start CocoInsight: ``` python main.py cocoindex server -ci ``` Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight). 
+ +Chunking Visualization + diff --git a/examples/code_embedding/main.py b/examples/code_embedding/main.py index abd6d7b0..98e551a5 100644 --- a/examples/code_embedding/main.py +++ b/examples/code_embedding/main.py @@ -1,5 +1,5 @@ from dotenv import load_dotenv - +from psycopg_pool import ConnectionPool import cocoindex import os @@ -8,7 +8,8 @@ def extract_extension(filename: str) -> str: """Extract the extension of a filename.""" return os.path.splitext(filename)[1] -def code_to_embedding(text: cocoindex.DataSlice) -> cocoindex.DataSlice: +@cocoindex.transform_flow() +def code_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]: """ Embed the text using a SentenceTransformer model. """ @@ -24,7 +25,7 @@ def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind data_scope["files"] = flow_builder.add_source( cocoindex.sources.LocalFile(path="../..", included_patterns=["*.py", "*.rs", "*.toml", "*.md", "*.mdx"], - excluded_patterns=[".*", "target", "**/node_modules"])) + excluded_patterns=["**/.*", "target", "**/node_modules"])) code_embeddings = data_scope.add_collector() with data_scope["files"].row() as file: @@ -47,26 +48,40 @@ def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)]) -query_handler = cocoindex.query.SimpleSemanticsQueryHandler( - name="SemanticsSearch", - flow=code_embedding_flow, - target_name="code_embeddings", - query_transform_flow=code_to_embedding, - default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY) + +def search(pool: ConnectionPool, query: str, top_k: int = 5): + # Get the table name, for the export target in the code_embedding_flow above. + table_name = cocoindex.utils.get_target_storage_default_name(code_embedding_flow, "code_embeddings") + # Evaluate the transform flow defined above with the input query, to get the embedding. 
+ query_vector = code_to_embedding.eval(query) + # Run the query and get the results. + with pool.connection() as conn: + with conn.cursor() as cur: + cur.execute(f""" + SELECT filename, code, embedding <=> %s::vector AS distance + FROM {table_name} ORDER BY distance LIMIT %s + """, (query_vector, top_k)) + return [ + {"filename": row[0], "code": row[1], "score": 1.0 - row[2]} + for row in cur.fetchall() + ] @cocoindex.main_fn() def _run(): + # Initialize the database connection pool. + pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL")) # Run queries in a loop to demonstrate the query capabilities. while True: try: query = input("Enter search query (or Enter to quit): ") if query == '': break - results, _ = query_handler.search(query, 10) + # Run the query function with the database connection pool and the query. + results = search(pool, query) print("\nSearch results:") for result in results: - print(f"[{result.score:.3f}] {result.data['filename']}") - print(f" {result.data['code']}") + print(f"[{result['score']:.3f}] {result['filename']}") + print(f" {result['code']}") print("---") print() except KeyboardInterrupt: