diff --git a/examples/gdrive_text_embedding/README.md b/examples/gdrive_text_embedding/README.md index 73167ab8..3d79dcb9 100644 --- a/examples/gdrive_text_embedding/README.md +++ b/examples/gdrive_text_embedding/README.md @@ -1,6 +1,23 @@ -This example builds embedding index based on Google Drive files. -It continuously updates the index as files are added / updated / deleted in the source folders: -it keeps the index in sync with the source folders effortlessly. +# Build Google Drive text embedding and semantic search 🔍 +[![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) + +In this example, we will build an embedding index based on Google Drive files and perform semantic search. + +It continuously updates the index as files are added / updated / deleted in the source folders. It keeps the index in sync with the source folders in real time. + +We appreciate a star ⭐ at [CocoIndex GitHub](https://github.com/cocoindex-io/cocoindex) if this is helpful. + +## Steps + +### Indexing Flow +Google Drive File Ingestion + +1. We will ingest files from Google Drive folders. +2. For each file, perform chunking (recursively split) and then embedding. +3. We will save the embeddings and the metadata in Postgres with PGVector. + +### Query +We will match against user-provided text using a SQL query, reusing the embedding operation from the indexing flow. ## Prerequisite @@ -25,32 +42,31 @@ Before running the example, you need to: ## Run -Install dependencies: - -```sh -pip install -e . -``` +- Install dependencies: -Setup: + ```sh + pip install -e . + ``` -```sh -cocoindex setup main.py -``` +- Setup: -Run: + ```sh + cocoindex setup main.py + ``` -```sh -python main.py -``` +- Run: + + ```sh + python main.py + ``` During running, it will keep observing changes in the source folders and update the index automatically. 
At the same time, it accepts queries from the terminal, and performs search on top of the up-to-date index. ## CocoInsight -CocoInsight is in Early Access now (Free) 😊 You found us! A quick 3 minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9). - -Run CocoInsight to understand your RAG data pipeline: +I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline. +It just connects to your local CocoIndex server, with zero pipeline data retention. Run the following command to start CocoInsight: ```sh cocoindex server -ci main.py @@ -62,4 +78,6 @@ You can also add a `-L` flag to make the server keep updating the index to refle cocoindex server -ci -L main.py ``` -Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight). \ No newline at end of file +Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight). + +Screenshot 2025-05-20 at 5 06 31 PM diff --git a/examples/gdrive_text_embedding/main.py b/examples/gdrive_text_embedding/main.py index 7e37ca7e..b612e2d5 100644 --- a/examples/gdrive_text_embedding/main.py +++ b/examples/gdrive_text_embedding/main.py @@ -1,9 +1,19 @@ from dotenv import load_dotenv - +from psycopg_pool import ConnectionPool import cocoindex import datetime import os +@cocoindex.transform_flow() +def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]: + """ + Embed the text using a SentenceTransformer model. + This logic is shared between indexing and querying, so it is extracted into a function. 
+ """ + return text.transform( + cocoindex.functions.SentenceTransformerEmbed( + model="sentence-transformers/all-MiniLM-L6-v2")) + @cocoindex.flow_def(name="GoogleDriveTextEmbedding") def gdrive_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope): """ @@ -27,9 +37,7 @@ def gdrive_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: language="markdown", chunk_size=2000, chunk_overlap=500) with doc["chunks"].row() as chunk: - chunk["embedding"] = chunk["text"].transform( - cocoindex.functions.SentenceTransformerEmbed( - model="sentence-transformers/all-MiniLM-L6-v2")) + chunk["embedding"] = text_to_embedding(chunk["text"]) doc_embeddings.collect(filename=doc["filename"], location=chunk["location"], text=chunk["text"], embedding=chunk["embedding"]) @@ -42,33 +50,42 @@ def gdrive_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: field_name="embedding", metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)]) -query_handler = cocoindex.query.SimpleSemanticsQueryHandler( - name="SemanticsSearch", - flow=gdrive_text_embedding_flow, - target_name="doc_embeddings", - query_transform_flow=lambda text: text.transform( - cocoindex.functions.SentenceTransformerEmbed( - model="sentence-transformers/all-MiniLM-L6-v2")), - default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY) +def search(pool: ConnectionPool, query: str, top_k: int = 5): + # Get the table name, for the export target in the gdrive_text_embedding_flow above. + table_name = cocoindex.utils.get_target_storage_default_name(gdrive_text_embedding_flow, "doc_embeddings") + # Evaluate the transform flow defined above with the input query, to get the embedding. + query_vector = text_to_embedding.eval(query) + # Run the query and get the results. 
+ with pool.connection() as conn: + with conn.cursor() as cur: + cur.execute(f""" + SELECT filename, text, embedding <=> %s::vector AS distance + FROM {table_name} ORDER BY distance LIMIT %s + """, (query_vector, top_k)) + return [ + {"filename": row[0], "text": row[1], "score": 1.0 - row[2]} + for row in cur.fetchall() + ] def _main(): - # Use a `FlowLiveUpdater` to keep the flow data updated. - with cocoindex.FlowLiveUpdater(gdrive_text_embedding_flow): - # Run queries in a loop to demonstrate the query capabilities. - while True: - try: - query = input("Enter search query (or Enter to quit): ") - if query == '': - break - results, _ = query_handler.search(query, 10) - print("\nSearch results:") - for result in results: - print(f"[{result.score:.3f}] {result.data['filename']}") - print(f" {result.data['text']}") - print("---") - print() - except KeyboardInterrupt: + # Initialize the database connection pool. + pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL")) + # Run queries in a loop to demonstrate the query capabilities. + while True: + try: + query = input("Enter search query (or Enter to quit): ") + if query == '': break + # Run the query function with the database connection pool and the query. + results = search(pool, query) + print("\nSearch results:") + for result in results: + print(f"[{result['score']:.3f}] {result['filename']}") + print(f" {result['text']}") + print("---") + print() + except KeyboardInterrupt: + break if __name__ == "__main__": load_dotenv()