From 3cd5106e8fe2faf0f21d372d96bed1674f2812a4 Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Sun, 18 May 2025 16:12:57 -0700 Subject: [PATCH 01/27] update product recommendation example description --- README.md | 2 +- examples/product_recommendation/README.md | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 8b3cb3a2..3c7a9a93 100644 --- a/README.md +++ b/README.md @@ -137,7 +137,7 @@ It defines an index flow like this: | [Docs to Knowledge Graph](examples/docs_to_knowledge_graph) | Extract relationships from Markdown documents and build a knowledge graph | | [Embeddings to Qdrant](examples/text_embedding_qdrant) | Index documents in a Qdrant collection for semantic search | | [FastAPI Server with Docker](examples/fastapi_server_docker) | Run the semantic search server in a Dockerized FastAPI setup | -| [Product Recommendation](examples/product_recommendation) | Build real-time product recommendations with LLM and graph database| +| [Recommendation Engine with Knowledge Graph](examples/product_taxonomy_knowledge_graph) | Build real-time product recommendations with LLM and knowledge graph | | [Image Search with Vision API](examples/image_search_example) | Generates detailed captions for images using a vision model, embeds them, enables live-updating semantic search via FastAPI and served on a React frontend| More coming and stay tuned 👀! diff --git a/examples/product_recommendation/README.md b/examples/product_recommendation/README.md index 96565782..24da1f06 100644 --- a/examples/product_recommendation/README.md +++ b/examples/product_recommendation/README.md @@ -1,8 +1,6 @@ -# Build Real-Time Recommendation Engine with LLM and Graph Database +# Build Real-Time Recommendation Engine with LLM and Knowledge Graph -We will build a real-time product recommendation engine with LLM and graph database. In particular, we will use LLM to understand the category (taxonomy) of a product. 
In addition, we will use LLM to enumerate the complementary products - users are likely to buy together with the current product (pencil and notebook). - -We will use Graph to explore the relationships between products that can be further used for product recommendations or labeling. +We will process a list of products and use LLM to extract the taxonomy and complementary taxonomy for each product and find connections between products. Please drop [CocoIndex on Github](https://github.com/cocoindex-io/cocoindex) a star to support us and stay tuned for more updates. Thank you so much 🥥🤗. [![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) From e32cfea69db9d46698dbcb663198f6040292a862 Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Sun, 18 May 2025 16:54:12 -0700 Subject: [PATCH 02/27] rename folder --- examples/product_recommendation/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/product_recommendation/README.md b/examples/product_recommendation/README.md index 24da1f06..c554655b 100644 --- a/examples/product_recommendation/README.md +++ b/examples/product_recommendation/README.md @@ -1,6 +1,8 @@ # Build Real-Time Recommendation Engine with LLM and Knowledge Graph -We will process a list of products and use LLM to extract the taxonomy and complementary taxonomy for each product and find connections between products. +We will build a real-time product recommendation engine with LLM and knowledge graph. In particular, we will use LLM to understand the category (taxonomy) of a product. In addition, we will use LLM to enumerate the complementary products - users are likely to buy together with the current product (pencil and notebook). + +We will use Knowledge Graph to explore the relationships between products that can be further used for product recommendations or labeling. 
Please drop [CocoIndex on Github](https://github.com/cocoindex-io/cocoindex) a star to support us and stay tuned for more updates. Thank you so much 🥥🤗. [![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) From dddee93334a7ad8121f84d35e6f31adc01e68adb Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Sun, 18 May 2025 16:54:40 -0700 Subject: [PATCH 03/27] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3c7a9a93..b09e7621 100644 --- a/README.md +++ b/README.md @@ -137,7 +137,7 @@ It defines an index flow like this: | [Docs to Knowledge Graph](examples/docs_to_knowledge_graph) | Extract relationships from Markdown documents and build a knowledge graph | | [Embeddings to Qdrant](examples/text_embedding_qdrant) | Index documents in a Qdrant collection for semantic search | | [FastAPI Server with Docker](examples/fastapi_server_docker) | Run the semantic search server in a Dockerized FastAPI setup | -| [Recommendation Engine with Knowledge Graph](examples/product_taxonomy_knowledge_graph) | Build real-time product recommendations with LLM and knowledge graph | +| [Product Recommendation with Knowledge Graph](examples/product_recommendation) | Build real-time product recommendations with LLM and knowledge graph | | [Image Search with Vision API](examples/image_search_example) | Generates detailed captions for images using a vision model, embeds them, enables live-updating semantic search via FastAPI and served on a React frontend| More coming and stay tuned 👀! 
From e43a28ebe676ee1584819513d576b4e1b8e6f341 Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Sun, 18 May 2025 16:56:18 -0700 Subject: [PATCH 04/27] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b09e7621..8b3cb3a2 100644 --- a/README.md +++ b/README.md @@ -137,7 +137,7 @@ It defines an index flow like this: | [Docs to Knowledge Graph](examples/docs_to_knowledge_graph) | Extract relationships from Markdown documents and build a knowledge graph | | [Embeddings to Qdrant](examples/text_embedding_qdrant) | Index documents in a Qdrant collection for semantic search | | [FastAPI Server with Docker](examples/fastapi_server_docker) | Run the semantic search server in a Dockerized FastAPI setup | -| [Product Recommendation with Knowledge Graph](examples/product_recommendation) | Build real-time product recommendations with LLM and knowledge graph | +| [Product Recommendation](examples/product_recommendation) | Build real-time product recommendations with LLM and graph database| | [Image Search with Vision API](examples/image_search_example) | Generates detailed captions for images using a vision model, embeds them, enables live-updating semantic search via FastAPI and served on a React frontend| More coming and stay tuned 👀! 
From ade9afb527f9d7dc8afc14c07419764865ff4031 Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Sun, 18 May 2025 17:07:15 -0700 Subject: [PATCH 05/27] Update README.md --- examples/product_recommendation/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/product_recommendation/README.md b/examples/product_recommendation/README.md index c554655b..51552a6b 100644 --- a/examples/product_recommendation/README.md +++ b/examples/product_recommendation/README.md @@ -1,6 +1,6 @@ -# Build Real-Time Recommendation Engine with LLM and Knowledge Graph +# Build Real-Time Recommendation Engine with LLM and Graph Database -We will build a real-time product recommendation engine with LLM and knowledge graph. In particular, we will use LLM to understand the category (taxonomy) of a product. In addition, we will use LLM to enumerate the complementary products - users are likely to buy together with the current product (pencil and notebook). +We will build a real-time product recommendation engine with LLM and graph database. In particular, we will use LLM to understand the category (taxonomy) of a product. In addition, we will use LLM to enumerate the complementary products - users are likely to buy together with the current product (pencil and notebook). We will use Knowledge Graph to explore the relationships between products that can be further used for product recommendations or labeling. 
From 2bead8794e35eb0eebe2628927f86b6b94a1d0a3 Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Sun, 18 May 2025 17:07:46 -0700 Subject: [PATCH 06/27] Update README.md --- examples/product_recommendation/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/product_recommendation/README.md b/examples/product_recommendation/README.md index 51552a6b..96565782 100644 --- a/examples/product_recommendation/README.md +++ b/examples/product_recommendation/README.md @@ -2,7 +2,7 @@ We will build a real-time product recommendation engine with LLM and graph database. In particular, we will use LLM to understand the category (taxonomy) of a product. In addition, we will use LLM to enumerate the complementary products - users are likely to buy together with the current product (pencil and notebook). -We will use Knowledge Graph to explore the relationships between products that can be further used for product recommendations or labeling. +We will use Graph to explore the relationships between products that can be further used for product recommendations or labeling. Please drop [CocoIndex on Github](https://github.com/cocoindex-io/cocoindex) a star to support us and stay tuned for more updates. Thank you so much 🥥🤗. 
[![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) From a37465081314654d65508856fd1378dcdfc5bb1b Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Mon, 19 May 2025 17:49:08 -0700 Subject: [PATCH 07/27] update text_embedding with new query handler --- examples/text_embedding/README.md | 5 ++++- examples/text_embedding/main.py | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/text_embedding/README.md b/examples/text_embedding/README.md index 1e999882..fc821c6c 100644 --- a/examples/text_embedding/README.md +++ b/examples/text_embedding/README.md @@ -1,4 +1,7 @@ -# Build text embedding and semantic search 🔍 +Build text embedding and semantic search based on local files. + +In this example, we will build a text embedding index and a semantic search flow based on local markdown files. + [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cocoindex-io/cocoindex/blob/main/examples/text_embedding/Text_Embedding.ipynb) [![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) diff --git a/examples/text_embedding/main.py b/examples/text_embedding/main.py index e69e1e7c..581f62c6 100644 --- a/examples/text_embedding/main.py +++ b/examples/text_embedding/main.py @@ -42,7 +42,6 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind field_name="embedding", metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)]) - def search(pool: ConnectionPool, query: str, top_k: int = 5): # Get the table name, for the export target in the text_embedding_flow above. 
table_name = cocoindex.utils.get_target_storage_default_name(text_embedding_flow, "doc_embeddings") From 3df050d01251413fe920c22cfa04a59bd90bac31 Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Mon, 19 May 2025 18:14:44 -0700 Subject: [PATCH 08/27] Update main.py --- examples/text_embedding/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/text_embedding/main.py b/examples/text_embedding/main.py index 581f62c6..e69e1e7c 100644 --- a/examples/text_embedding/main.py +++ b/examples/text_embedding/main.py @@ -42,6 +42,7 @@ def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind field_name="embedding", metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)]) + def search(pool: ConnectionPool, query: str, top_k: int = 5): # Get the table name, for the export target in the text_embedding_flow above. table_name = cocoindex.utils.get_target_storage_default_name(text_embedding_flow, "doc_embeddings") From 0471abec519d46af2902862d75fc0dd402d99e32 Mon Sep 17 00:00:00 2001 From: LJ Date: Mon, 19 May 2025 18:01:29 -0700 Subject: [PATCH 09/27] Update README.md --- examples/text_embedding/README.md | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/examples/text_embedding/README.md b/examples/text_embedding/README.md index fc821c6c..9ddc9d7b 100644 --- a/examples/text_embedding/README.md +++ b/examples/text_embedding/README.md @@ -1,27 +1,15 @@ -Build text embedding and semantic search based on local files. - -In this example, we will build a text embedding index and a semantic search flow based on local markdown files. 
- +# Build text embedding and semantic search [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cocoindex-io/cocoindex/blob/main/examples/text_embedding/Text_Embedding.ipynb) -[![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) -In this example, we will build index flow from text embedding from local markdown files, and query the index. -We appreicate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/cocoindex) if this is helpful. +In this example, we will build a text embedding index and a semantic search flow based on local markdown files. -## Steps: -🌱 A detailed step by step tutorial can be found here: [Get Started Documentation](https://cocoindex.io/docs/getting_started/quickstart) -### Indexing Flow: Screenshot 2025-05-19 at 5 48 28 PM -1. We will ingest from a list of local files. -2. For each file, perform chunking (Recursive Split) and then embeddings. -3. We will save the embeddings and the metadata in Postgres with PGVector. - -### Query: -We will match against user-provided text by a SQL query, reusing the embedding operation in the indexing flow. - +We will ingest from a list of local files. For each file, perform chunking (Recursive Split) and then embeddings. +We will save the embeddings and the metadata in Postgres with PGVector. +And then add a simpler query handler for semantic search. 
## Prerequisite From c3c09b0d1cab00c4971365e4b2d1b99ac53d4b81 Mon Sep 17 00:00:00 2001 From: LJ Date: Mon, 19 May 2025 18:02:01 -0700 Subject: [PATCH 10/27] Update README.md --- examples/text_embedding/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/text_embedding/README.md b/examples/text_embedding/README.md index 9ddc9d7b..01db5953 100644 --- a/examples/text_embedding/README.md +++ b/examples/text_embedding/README.md @@ -7,9 +7,10 @@ In this example, we will build a text embedding index and a semantic search flow Screenshot 2025-05-19 at 5 48 28 PM -We will ingest from a list of local files. For each file, perform chunking (Recursive Split) and then embeddings. -We will save the embeddings and the metadata in Postgres with PGVector. -And then add a simpler query handler for semantic search. +- We will ingest from a list of local files. +- For each file, perform chunking (Recursive Split) and then embeddings. +- We will save the embeddings and the metadata in Postgres with PGVector. +- And then add a simpler query handler for semantic search. 
## Prerequisite From bab25a4db17b2d79bbe0d39b422eeab312a79d71 Mon Sep 17 00:00:00 2001 From: LJ Date: Mon, 19 May 2025 18:11:22 -0700 Subject: [PATCH 11/27] Update README.md --- examples/text_embedding/README.md | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/examples/text_embedding/README.md b/examples/text_embedding/README.md index 01db5953..618f854c 100644 --- a/examples/text_embedding/README.md +++ b/examples/text_embedding/README.md @@ -1,16 +1,21 @@ -# Build text embedding and semantic search +# Build text embedding and semantic search 🔍 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cocoindex-io/cocoindex/blob/main/examples/text_embedding/Text_Embedding.ipynb) +[![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) +In this example, we will build index flow from text embedding from local markdown files. And build semantic search with simple query handler. -In this example, we will build a text embedding index and a semantic search flow based on local markdown files. - +We appreicate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/cocoindex) if this is helpful. Screenshot 2025-05-19 at 5 48 28 PM -- We will ingest from a list of local files. -- For each file, perform chunking (Recursive Split) and then embeddings. -- We will save the embeddings and the metadata in Postgres with PGVector. -- And then add a simpler query handler for semantic search. +Steps: +1. We will ingest from a list of local files. +2. For each file, perform chunking (Recursive Split) and then embeddings. +3. We will save the embeddings and the metadata in Postgres with PGVector. +4. And then add a simpler query handler for semantic search. 
+ +🌱 A detailed step by step tutorial can be found here: [Get Started Documentation](https://cocoindex.io/docs/getting_started/quickstart) + ## Prerequisite From 4f0b607ffb95a8eb805757f8c2fcfe7868927228 Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Mon, 19 May 2025 18:20:44 -0700 Subject: [PATCH 12/27] Update README.md --- examples/text_embedding/README.md | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/examples/text_embedding/README.md b/examples/text_embedding/README.md index 618f854c..aba3e16d 100644 --- a/examples/text_embedding/README.md +++ b/examples/text_embedding/README.md @@ -2,17 +2,20 @@ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cocoindex-io/cocoindex/blob/main/examples/text_embedding/Text_Embedding.ipynb) [![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) -In this example, we will build index flow from text embedding from local markdown files. And build semantic search with simple query handler. +In this example, we will build index flow from text embedding from local markdown files. And provide an simple example to query the index. We appreicate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/cocoindex) if this is helpful. Screenshot 2025-05-19 at 5 48 28 PM Steps: -1. We will ingest from a list of local files. -2. For each file, perform chunking (Recursive Split) and then embeddings. -3. We will save the embeddings and the metadata in Postgres with PGVector. -4. And then add a simpler query handler for semantic search. +- Indexing Flow: + 1. We will ingest from a list of local files. + 2. For each file, perform chunking (Recursive Split) and then embeddings. + 3. We will save the embeddings and the metadata in Postgres with PGVector. + +- Query: +1. 
We will match against user-provided text by a SQL query, reusing the embedding operation in the indexing flow. 🌱 A detailed step by step tutorial can be found here: [Get Started Documentation](https://cocoindex.io/docs/getting_started/quickstart) From 9ffe7ed4a73363abfdebaa5bd349249c94c83c1c Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Mon, 19 May 2025 18:21:39 -0700 Subject: [PATCH 13/27] Update README.md --- examples/text_embedding/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/text_embedding/README.md b/examples/text_embedding/README.md index aba3e16d..809b2de7 100644 --- a/examples/text_embedding/README.md +++ b/examples/text_embedding/README.md @@ -2,7 +2,7 @@ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cocoindex-io/cocoindex/blob/main/examples/text_embedding/Text_Embedding.ipynb) [![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) -In this example, we will build index flow from text embedding from local markdown files. And provide an simple example to query the index. +In this example, we will build index flow from text embedding from local markdown files, and query the index. We appreicate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/cocoindex) if this is helpful. From 631cad75c4625b8fc6f68736d6dbb0235575e18f Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Mon, 19 May 2025 18:29:22 -0700 Subject: [PATCH 14/27] Update README.md --- examples/text_embedding/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/text_embedding/README.md b/examples/text_embedding/README.md index 809b2de7..32c91d1b 100644 --- a/examples/text_embedding/README.md +++ b/examples/text_embedding/README.md @@ -15,7 +15,7 @@ Steps: 3. We will save the embeddings and the metadata in Postgres with PGVector. - Query: -1. 
We will match against user-provided text by a SQL query, reusing the embedding operation in the indexing flow. +We will match against user-provided text by a SQL query, reusing the embedding operation in the indexing flow. 🌱 A detailed step by step tutorial can be found here: [Get Started Documentation](https://cocoindex.io/docs/getting_started/quickstart) From 8008d85d91b6c02c5691fe6ecc01dfebe3f1d3d3 Mon Sep 17 00:00:00 2001 From: LJ Date: Mon, 19 May 2025 18:23:39 -0700 Subject: [PATCH 15/27] Update README.md --- examples/text_embedding/README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/text_embedding/README.md b/examples/text_embedding/README.md index 32c91d1b..f5b39f97 100644 --- a/examples/text_embedding/README.md +++ b/examples/text_embedding/README.md @@ -6,18 +6,18 @@ In this example, we will build index flow from text embedding from local markdow We appreicate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/cocoindex) if this is helpful. -Screenshot 2025-05-19 at 5 48 28 PM - -Steps: -- Indexing Flow: - 1. We will ingest from a list of local files. - 2. For each file, perform chunking (Recursive Split) and then embeddings. - 3. We will save the embeddings and the metadata in Postgres with PGVector. +## Steps: +🌱 A detailed step by step tutorial can be found here: [Get Started Documentation](https://cocoindex.io/docs/getting_started/quickstart) -- Query: -We will match against user-provided text by a SQL query, reusing the embedding operation in the indexing flow. +### Indexing Flow: +Screenshot 2025-05-19 at 5 48 28 PM -🌱 A detailed step by step tutorial can be found here: [Get Started Documentation](https://cocoindex.io/docs/getting_started/quickstart) +1. We will ingest from a list of local files. +2. For each file, perform chunking (Recursive Split) and then embeddings. +3. We will save the embeddings and the metadata in Postgres with PGVector. + +### Query: +1. 
We will match against user-provided text by a SQL query, reusing the embedding operation in the indexing flow. ## Prerequisite From 2aa1bc8c3048d4ae1e9367aeb67cea5ba3e80c41 Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Mon, 19 May 2025 19:13:27 -0700 Subject: [PATCH 16/27] qdrant --- examples/text_embedding_qdrant/README.md | 16 ++++--- examples/text_embedding_qdrant/main.py | 43 +++++++++++-------- examples/text_embedding_qdrant/pyproject.toml | 2 +- 3 files changed, 36 insertions(+), 25 deletions(-) diff --git a/examples/text_embedding_qdrant/README.md b/examples/text_embedding_qdrant/README.md index 5e2ea059..55091faf 100644 --- a/examples/text_embedding_qdrant/README.md +++ b/examples/text_embedding_qdrant/README.md @@ -1,6 +1,10 @@ ## Description +# Build text embedding and semantic search 🔍 with Qdrant -Example to build a vector index in Qdrant based on local files. +[![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) + +In this example, we will build index flow from text embedding from local markdown files, and query the index. +We will use **Qdrant** as the vector database. ## Pre-requisites @@ -57,13 +61,13 @@ python main.py ``` ## CocoInsight - -CocoInsight is in Early Access now (Free) 😊 You found us! A quick 3 minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9). - -Run CocoInsight to understand your RAG data pipeline: +I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline. +It just connects to your local CocoIndex server, with Zero pipeline data retention. Run following command to start CocoInsight: ```bash python main.py cocoindex server -ci ``` -Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight). +Open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight). 
+ + diff --git a/examples/text_embedding_qdrant/main.py b/examples/text_embedding_qdrant/main.py index 57f27a45..fd15ba73 100644 --- a/examples/text_embedding_qdrant/main.py +++ b/examples/text_embedding_qdrant/main.py @@ -1,21 +1,22 @@ from dotenv import load_dotenv +from qdrant_client import QdrantClient +from qdrant_client.http.models import Filter, FieldCondition, MatchValue import cocoindex -def text_to_embedding(text: cocoindex.DataSlice) -> cocoindex.DataSlice: +@cocoindex.transform_flow() +def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]: """ Embed the text using a SentenceTransformer model. This is a shared logic between indexing and querying, so extract it as a function. """ return text.transform( cocoindex.functions.SentenceTransformerEmbed( - model="sentence-transformers/all-MiniLM-L6-v2" - ) - ) + model="sentence-transformers/all-MiniLM-L6-v2")) -@cocoindex.flow_def(name="TextEmbedding") +@cocoindex.flow_def(name="TextEmbeddingWithQdrant") def text_embedding_flow( flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope ): @@ -57,28 +58,34 @@ def text_embedding_flow( ) -query_handler = cocoindex.query.SimpleSemanticsQueryHandler( - name="SemanticsSearch", - flow=text_embedding_flow, - target_name="doc_embeddings", - query_transform_flow=text_to_embedding, - default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY, -) - - @cocoindex.main_fn() def _run(): + # Initialize Qdrant client + client = QdrantClient(host="localhost", port=6333) + # Run queries in a loop to demonstrate the query capabilities. 
while True: try: query = input("Enter search query (or Enter to quit): ") if query == "": break - results, _ = query_handler.search(query, 10, "text_embedding") + + # Get the embedding for the query + query_embedding = text_to_embedding.eval(query) + + # Search in Qdrant + search_results = client.search( + collection_name="cocoindex", + query_vector=("text_embedding", query_embedding), + limit=10 + ) + print("\nSearch results:") - for result in results: - print(f"[{result.score:.3f}] {result.data['filename']}") - print(f" {result.data['text']}") + for result in search_results: + score = result.score + payload = result.payload + print(f"[{score:.3f}] {payload['filename']}") + print(f" {payload['text']}") print("---") print() except KeyboardInterrupt: diff --git a/examples/text_embedding_qdrant/pyproject.toml b/examples/text_embedding_qdrant/pyproject.toml index 25b2663c..70454200 100644 --- a/examples/text_embedding_qdrant/pyproject.toml +++ b/examples/text_embedding_qdrant/pyproject.toml @@ -3,7 +3,7 @@ name = "text-embedding-qdrant" version = "0.1.0" description = "Simple example for cocoindex: build embedding index based on local text files." requires-python = ">=3.10" -dependencies = ["cocoindex>=0.1.39", "python-dotenv>=1.0.1"] +dependencies = ["cocoindex>=0.1.39", "python-dotenv>=1.0.1", "qdrant-client>=1.6.0"] [tool.setuptools] packages = [] From c76db2dd9f4ed5d39f35d52801a3497abecb2c6c Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Mon, 19 May 2025 19:15:44 -0700 Subject: [PATCH 17/27] Update README.md --- examples/text_embedding/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/text_embedding/README.md b/examples/text_embedding/README.md index f5b39f97..1e999882 100644 --- a/examples/text_embedding/README.md +++ b/examples/text_embedding/README.md @@ -17,7 +17,7 @@ We appreicate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/c 3. We will save the embeddings and the metadata in Postgres with PGVector. 
### Query: -1. We will match against user-provided text by a SQL query, reusing the embedding operation in the indexing flow. +We will match against user-provided text by a SQL query, reusing the embedding operation in the indexing flow. ## Prerequisite From f5e965df731d3ee0f667d388d56a0cfaceeda167 Mon Sep 17 00:00:00 2001 From: LJ Date: Mon, 19 May 2025 19:22:36 -0700 Subject: [PATCH 18/27] Update README.md --- examples/text_embedding_qdrant/README.md | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/examples/text_embedding_qdrant/README.md b/examples/text_embedding_qdrant/README.md index 55091faf..17fb118b 100644 --- a/examples/text_embedding_qdrant/README.md +++ b/examples/text_embedding_qdrant/README.md @@ -1,14 +1,28 @@ -## Description # Build text embedding and semantic search 🔍 with Qdrant [![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) -In this example, we will build index flow from text embedding from local markdown files, and query the index. -We will use **Qdrant** as the vector database. +CocoIndex supports Qdrant natively - [documentation](https://cocoindex.io/docs/ops/storages#qdrant). In this example, we will build index flow from text embedding from local markdown files, and query the index. We will use **Qdrant** as the vector database. + +We appreicate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/cocoindex) if this is helpful. + +coco feat qdrant + +## Steps: +### Indexing Flow: +Screenshot 2025-05-19 at 7 19 50 PM + +1. We will ingest from a list of local files. +2. For each file, perform chunking (Recursive Split) and then embeddings. +3. We will save the embeddings and the metadata in Postgres with PGVector. + +### Query: +We will be use Qdrant client to query the index, reusing the embedding operation in the indexing flow. 
+ ## Pre-requisites -- [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one. +- [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one. Even the target store is Qdrant, CocoIndex uses Postgress to track the data lineage for incremental processing. - Run Qdrant. From eed7a258a510ba7b5be28ca46eed51b92d96214b Mon Sep 17 00:00:00 2001 From: LJ Date: Mon, 19 May 2025 19:25:08 -0700 Subject: [PATCH 19/27] Update README.md --- examples/text_embedding_qdrant/README.md | 70 ++++++++++++------------ 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/examples/text_embedding_qdrant/README.md b/examples/text_embedding_qdrant/README.md index 17fb118b..bb497f6a 100644 --- a/examples/text_embedding_qdrant/README.md +++ b/examples/text_embedding_qdrant/README.md @@ -6,7 +6,7 @@ CocoIndex supports Qdrant natively - [documentation](https://cocoindex.io/docs/o We appreicate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/cocoindex) if this is helpful. -coco feat qdrant +Screenshot 2025-05-19 at 7 24 13 PM ## Steps: ### Indexing Flow: @@ -26,53 +26,53 @@ We will be use Qdrant client to query the index, reusing the embedding operation - Run Qdrant. -```bash -docker run -d -p 6334:6334 -p 6333:6333 qdrant/qdrant -``` + ```bash + docker run -d -p 6334:6334 -p 6333:6333 qdrant/qdrant + ``` - [Create a collection](https://qdrant.tech/documentation/concepts/vectors/#named-vectors) to export the embeddings to. -```bash -curl -X PUT \ - 'http://localhost:6333/collections/cocoindex' \ - --header 'Content-Type: application/json' \ - --data-raw '{ - "vectors": { - "text_embedding": { - "size": 384, - "distance": "Cosine" - } - } -}' -``` - -You can view the collections and data with the Qdrant dashboard at . 
+ ```bash + curl -X PUT \ + 'http://localhost:6333/collections/cocoindex' \ + --header 'Content-Type: application/json' \ + --data-raw '{ + "vectors": { + "text_embedding": { + "size": 384, + "distance": "Cosine" + } + } + }' + ``` + + You can view the collections and data with the Qdrant dashboard at <http://localhost:6333/dashboard>. ## Run -Install dependencies: +- Install dependencies: -```bash -pip install -e . -``` + ```bash + pip install -e . + ``` -Setup: +- Setup: -```bash -python main.py cocoindex setup -``` + ```bash + python main.py cocoindex setup + ``` -Update index: +- Update index: -```bash -python main.py cocoindex update -``` + ```bash + python main.py cocoindex update + ``` -Run: +- Run: -```bash -python main.py -``` + ```bash + python main.py + ``` ## CocoInsight I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline. From 12fc9be73573cc85d3298d078e6e13e2c8ceef04 Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Mon, 19 May 2025 20:57:29 -0700 Subject: [PATCH 20/27] Update README.md --- examples/text_embedding_qdrant/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/text_embedding_qdrant/README.md b/examples/text_embedding_qdrant/README.md index bb497f6a..55a7c20f 100644 --- a/examples/text_embedding_qdrant/README.md +++ b/examples/text_embedding_qdrant/README.md @@ -4,7 +4,7 @@ CocoIndex supports Qdrant natively - [documentation](https://cocoindex.io/docs/ops/storages#qdrant). In this example, we will build index flow from text embedding from local markdown files, and query the index. We will use **Qdrant** as the vector database. -We appreicate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/cocoindex) if this is helpful. +We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/cocoindex) if this is helpful.
Screenshot 2025-05-19 at 7 24 13 PM From 407eda24d51664cf871170ef605934bb92a7bab7 Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Mon, 19 May 2025 21:00:34 -0700 Subject: [PATCH 21/27] Update README.md --- examples/text_embedding/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/text_embedding/README.md b/examples/text_embedding/README.md index 1e999882..63291a15 100644 --- a/examples/text_embedding/README.md +++ b/examples/text_embedding/README.md @@ -4,7 +4,7 @@ In this example, we will build index flow from text embedding from local markdown files, and query the index. -We appreicate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/cocoindex) if this is helpful. +We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/cocoindex) if this is helpful. ## Steps: 🌱 A detailed step by step tutorial can be found here: [Get Started Documentation](https://cocoindex.io/docs/getting_started/quickstart) From 44d93dde6aa9e5ac8ba6bf2cf017a1f28c74e8bc Mon Sep 17 00:00:00 2001 From: LJ Date: Mon, 19 May 2025 21:06:21 -0700 Subject: [PATCH 22/27] Update README.md --- examples/text_embedding_qdrant/README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/text_embedding_qdrant/README.md b/examples/text_embedding_qdrant/README.md index 55a7c20f..3f91dc95 100644 --- a/examples/text_embedding_qdrant/README.md +++ b/examples/text_embedding_qdrant/README.md @@ -6,23 +6,23 @@ CocoIndex supports Qdrant natively - [documentation](https://cocoindex.io/docs/o We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/cocoindex) if this is helpful. -Screenshot 2025-05-19 at 7 24 13 PM +CocoIndex supports Qdrant -## Steps: -### Indexing Flow: -Screenshot 2025-05-19 at 7 19 50 PM +## Steps +### Indexing Flow +Index flow for text embedding -1. We will ingest from a list of local files. -2. For each file, perform chunking (Recursive Split) and then embeddings. 
+1. We will ingest a list of local files. +2. For each file, perform chunking (recursively split) and then embedding. 3. We will save the embeddings and the metadata in Postgres with PGVector. -### Query: -We will be use Qdrant client to query the index, reusing the embedding operation in the indexing flow. +### Query +We use Qdrant client to query the index, and reuse the embedding operation in the indexing flow. ## Pre-requisites -- [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one. Even the target store is Qdrant, CocoIndex uses Postgress to track the data lineage for incremental processing. +- [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one. Although the target store is Qdrant, CocoIndex uses Postgres to track the data lineage for incremental processing. - Run Qdrant. From 689d359d495eb72bad88ae686e750699e8936824 Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Mon, 19 May 2025 21:07:00 -0700 Subject: [PATCH 23/27] Update README.md --- examples/text_embedding/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/text_embedding/README.md b/examples/text_embedding/README.md index 63291a15..2dd1dbb8 100644 --- a/examples/text_embedding/README.md +++ b/examples/text_embedding/README.md @@ -6,18 +6,18 @@ In this example, we will build index flow from text embedding from local markdow We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/cocoindex) if this is helpful. -## Steps: +## Steps 🌱 A detailed step by step tutorial can be found here: [Get Started Documentation](https://cocoindex.io/docs/getting_started/quickstart) -### Indexing Flow: +### Indexing Flow Screenshot 2025-05-19 at 5 48 28 PM -1. We will ingest from a list of local files. -2. For each file, perform chunking (Recursive Split) and then embeddings. +1. We will ingest a list of local files. +2.
For each file, perform chunking (recursively split) and then embedding. 3. We will save the embeddings and the metadata in Postgres with PGVector. -### Query: -We will match against user-provided text by a SQL query, reusing the embedding operation in the indexing flow. +### Query +We will match against user-provided text by a SQL query, and reuse the embedding operation in the indexing flow. ## Prerequisite From eb56fe2e12880c3e48e250c175508c219abfb138 Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Mon, 19 May 2025 21:21:25 -0700 Subject: [PATCH 24/27] Update main.py --- examples/text_embedding_qdrant/main.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/text_embedding_qdrant/main.py b/examples/text_embedding_qdrant/main.py index fd15ba73..7bc81edb 100644 --- a/examples/text_embedding_qdrant/main.py +++ b/examples/text_embedding_qdrant/main.py @@ -4,6 +4,10 @@ import cocoindex +# Define Qdrant connection constants +QDRANT_URL = "http://localhost:6333" +QDRANT_COLLECTION = "cocoindex" + @cocoindex.transform_flow() def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]: @@ -51,7 +55,7 @@ def text_embedding_flow( doc_embeddings.export( "doc_embeddings", cocoindex.storages.Qdrant( - collection_name="cocoindex", grpc_url="http://localhost:6334/" + collection_name=QDRANT_COLLECTION, grpc_url=QDRANT_URL ), primary_key_fields=["id"], setup_by_user=True, @@ -61,7 +65,7 @@ def text_embedding_flow( @cocoindex.main_fn() def _run(): # Initialize Qdrant client - client = QdrantClient(host="localhost", port=6333) + client = QdrantClient(url=QDRANT_URL) # Run queries in a loop to demonstrate the query capabilities. 
while True: @@ -75,7 +79,7 @@ def _run(): # Search in Qdrant search_results = client.search( - collection_name="cocoindex", + collection_name=QDRANT_COLLECTION, query_vector=("text_embedding", query_embedding), limit=10 ) From e69c212426719be5cc73d6f6f4c5fe15d948b25e Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Mon, 19 May 2025 21:44:49 -0700 Subject: [PATCH 25/27] Update main.py --- examples/text_embedding_qdrant/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/text_embedding_qdrant/main.py b/examples/text_embedding_qdrant/main.py index 7bc81edb..20341e25 100644 --- a/examples/text_embedding_qdrant/main.py +++ b/examples/text_embedding_qdrant/main.py @@ -5,7 +5,7 @@ import cocoindex # Define Qdrant connection constants -QDRANT_URL = "http://localhost:6333" +QDRANT_URL = "http://localhost:6334" QDRANT_COLLECTION = "cocoindex" From 9272a8a1438dd4edc2700614f348733eb4cb5b96 Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Mon, 19 May 2025 22:59:28 -0700 Subject: [PATCH 26/27] Update main.py --- examples/text_embedding_qdrant/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/text_embedding_qdrant/main.py b/examples/text_embedding_qdrant/main.py index 20341e25..7bc81edb 100644 --- a/examples/text_embedding_qdrant/main.py +++ b/examples/text_embedding_qdrant/main.py @@ -5,7 +5,7 @@ import cocoindex # Define Qdrant connection constants -QDRANT_URL = "http://localhost:6334" +QDRANT_URL = "http://localhost:6333" QDRANT_COLLECTION = "cocoindex" From 79679140ad8a8929111bdbb732b6602aa65362e4 Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Mon, 19 May 2025 23:29:59 -0700 Subject: [PATCH 27/27] Update main.py --- examples/text_embedding_qdrant/main.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/text_embedding_qdrant/main.py b/examples/text_embedding_qdrant/main.py index 7bc81edb..b2892c43 100644 --- a/examples/text_embedding_qdrant/main.py +++ 
b/examples/text_embedding_qdrant/main.py @@ -5,7 +5,7 @@ import cocoindex # Define Qdrant connection constants -QDRANT_URL = "http://localhost:6333" +QDRANT_GRPC_URL = "http://localhost:6334" QDRANT_COLLECTION = "cocoindex" @@ -55,7 +55,7 @@ def text_embedding_flow( doc_embeddings.export( "doc_embeddings", cocoindex.storages.Qdrant( - collection_name=QDRANT_COLLECTION, grpc_url=QDRANT_URL + collection_name=QDRANT_COLLECTION, grpc_url=QDRANT_GRPC_URL ), primary_key_fields=["id"], setup_by_user=True, @@ -65,7 +65,7 @@ def text_embedding_flow( @cocoindex.main_fn() def _run(): # Initialize Qdrant client - client = QdrantClient(url=QDRANT_URL) + client = QdrantClient(url=QDRANT_GRPC_URL, prefer_grpc=True) # Run queries in a loop to demonstrate the query capabilities. while True: @@ -77,13 +77,11 @@ def _run(): # Get the embedding for the query query_embedding = text_to_embedding.eval(query) - # Search in Qdrant search_results = client.search( collection_name=QDRANT_COLLECTION, query_vector=("text_embedding", query_embedding), limit=10 ) - print("\nSearch results:") for result in search_results: score = result.score