From 803a3ef8e301ad24942413de4d1b45178663133d Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Thu, 28 Aug 2025 18:45:41 -0700 Subject: [PATCH 1/3] bump cocoindex version in examples --- examples/amazon_s3_embedding/pyproject.toml | 2 +- examples/azure_blob_embedding/pyproject.toml | 2 +- examples/code_embedding/pyproject.toml | 2 +- examples/custom_output_files/pyproject.toml | 2 +- examples/docs_to_knowledge_graph/pyproject.toml | 2 +- examples/face_recognition/pyproject.toml | 2 +- examples/fastapi_server_docker/requirements.txt | 2 +- examples/gdrive_text_embedding/pyproject.toml | 2 +- examples/image_search/pyproject.toml | 2 +- examples/live_updates/pyproject.toml | 2 +- examples/manuals_llm_extraction/pyproject.toml | 2 +- examples/multi_format_indexing/pyproject.toml | 2 +- examples/paper_metadata/pyproject.toml | 2 +- examples/patient_intake_extraction/pyproject.toml | 2 +- examples/pdf_embedding/pyproject.toml | 2 +- examples/postgres_source/pyproject.toml | 2 +- examples/product_recommendation/pyproject.toml | 2 +- examples/text_embedding/pyproject.toml | 2 +- examples/text_embedding_qdrant/pyproject.toml | 2 +- 19 files changed, 19 insertions(+), 19 deletions(-) diff --git a/examples/amazon_s3_embedding/pyproject.toml b/examples/amazon_s3_embedding/pyproject.toml index 035912b7..f7029f13 100644 --- a/examples/amazon_s3_embedding/pyproject.toml +++ b/examples/amazon_s3_embedding/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "Simple example for cocoindex: build embedding index based on Amazon S3 files." 
requires-python = ">=3.11" dependencies = [ - "cocoindex[embeddings]>=0.1.79", + "cocoindex[embeddings]>=0.2.1", "python-dotenv>=1.0.1", "psycopg[binary,pool]", ] diff --git a/examples/azure_blob_embedding/pyproject.toml b/examples/azure_blob_embedding/pyproject.toml index f5373c52..2c569708 100644 --- a/examples/azure_blob_embedding/pyproject.toml +++ b/examples/azure_blob_embedding/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "Simple example for cocoindex: build embedding index based on Azure Blob Storage files." requires-python = ">=3.11" dependencies = [ - "cocoindex[embeddings]>=0.1.79", + "cocoindex[embeddings]>=0.2.1", "python-dotenv>=1.0.1", "psycopg[binary,pool]", ] diff --git a/examples/code_embedding/pyproject.toml b/examples/code_embedding/pyproject.toml index 3c42858a..5bcbf6bb 100644 --- a/examples/code_embedding/pyproject.toml +++ b/examples/code_embedding/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "Simple example for cocoindex: build embedding index based on source code." requires-python = ">=3.11" dependencies = [ - "cocoindex[embeddings]>=0.1.79", + "cocoindex[embeddings]>=0.2.1", "python-dotenv>=1.0.1", "pgvector>=0.4.1", "psycopg[binary,pool]", diff --git a/examples/custom_output_files/pyproject.toml b/examples/custom_output_files/pyproject.toml index 8f87073b..6e180c46 100644 --- a/examples/custom_output_files/pyproject.toml +++ b/examples/custom_output_files/pyproject.toml @@ -3,7 +3,7 @@ name = "custom-output-files" version = "0.1.0" description = "Simple example for cocoindex: convert markdown files to HTML files and save them to a local directory." 
requires-python = ">=3.11" -dependencies = ["cocoindex>=0.1.79", "markdown-it-py[linkify,plugins]"] +dependencies = ["cocoindex>=0.2.1", "markdown-it-py[linkify,plugins]"] [tool.setuptools] packages = [] diff --git a/examples/docs_to_knowledge_graph/pyproject.toml b/examples/docs_to_knowledge_graph/pyproject.toml index 16908acf..80e9dba0 100644 --- a/examples/docs_to_knowledge_graph/pyproject.toml +++ b/examples/docs_to_knowledge_graph/pyproject.toml @@ -3,7 +3,7 @@ name = "manuals-to-kg" version = "0.1.0" description = "Simple example for cocoindex: extract triples from files and build knowledge graph." requires-python = ">=3.11" -dependencies = ["cocoindex>=0.1.79"] +dependencies = ["cocoindex>=0.2.1"] [tool.setuptools] packages = [] diff --git a/examples/face_recognition/pyproject.toml b/examples/face_recognition/pyproject.toml index 45e24737..52020292 100644 --- a/examples/face_recognition/pyproject.toml +++ b/examples/face_recognition/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "Build index for papers with both metadata and content embeddings" requires-python = ">=3.11" dependencies = [ - "cocoindex>=0.1.79", + "cocoindex>=0.2.1", "face-recognition>=1.3.0", "pillow>=10.0.0", "numpy>=1.26.0", diff --git a/examples/fastapi_server_docker/requirements.txt b/examples/fastapi_server_docker/requirements.txt index 8718448c..8c4129a6 100644 --- a/examples/fastapi_server_docker/requirements.txt +++ b/examples/fastapi_server_docker/requirements.txt @@ -1,4 +1,4 @@ -cocoindex[embeddings]>=0.1.79 +cocoindex[embeddings]>=0.2.1 python-dotenv>=1.0.1 fastapi==0.115.12 fastapi-cli==0.0.7 diff --git a/examples/gdrive_text_embedding/pyproject.toml b/examples/gdrive_text_embedding/pyproject.toml index 9deb4ad7..1203c1f1 100644 --- a/examples/gdrive_text_embedding/pyproject.toml +++ b/examples/gdrive_text_embedding/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "Simple example for cocoindex: build embedding index based on Google Drive files." 
requires-python = ">=3.11" dependencies = [ - "cocoindex[embeddings]>=0.1.79", + "cocoindex[embeddings]>=0.2.1", "python-dotenv>=1.0.1", "psycopg[binary,pool]", ] diff --git a/examples/image_search/pyproject.toml b/examples/image_search/pyproject.toml index f6800c02..707c7e89 100644 --- a/examples/image_search/pyproject.toml +++ b/examples/image_search/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "Image search examples for cocoindex: CLIP and ColPali-based embedding." requires-python = ">=3.11" dependencies = [ - "cocoindex[colpali]>=0.1.79", + "cocoindex[colpali]>=0.2.1", "python-dotenv>=1.0.1", "fastapi>=0.100.0", "torch>=2.0.0", diff --git a/examples/live_updates/pyproject.toml b/examples/live_updates/pyproject.toml index 41cb4ae6..a2effa3a 100644 --- a/examples/live_updates/pyproject.toml +++ b/examples/live_updates/pyproject.toml @@ -3,7 +3,7 @@ name = "live-updates-example" version = "0.1.0" description = "Simple example for cocoindex: perform live updates based on local markdown files." requires-python = ">=3.11" -dependencies = ["cocoindex>=0.1.79", "python-dotenv>=1.1.0"] +dependencies = ["cocoindex>=0.2.1", "python-dotenv>=1.1.0"] [tools.setuptools] packages = [] diff --git a/examples/manuals_llm_extraction/pyproject.toml b/examples/manuals_llm_extraction/pyproject.toml index e01d6291..2a715fb2 100644 --- a/examples/manuals_llm_extraction/pyproject.toml +++ b/examples/manuals_llm_extraction/pyproject.toml @@ -3,7 +3,7 @@ name = "manuals-llm-extraction" version = "0.1.0" description = "Simple example for cocoindex: extract structured information from a Markdown file using LLM." 
requires-python = ">=3.11" -dependencies = ["cocoindex>=0.1.79", "marker-pdf>=1.8.5"] +dependencies = ["cocoindex>=0.2.1", "marker-pdf>=1.8.5"] [tool.setuptools] packages = [] diff --git a/examples/multi_format_indexing/pyproject.toml b/examples/multi_format_indexing/pyproject.toml index f726c003..cc53082c 100644 --- a/examples/multi_format_indexing/pyproject.toml +++ b/examples/multi_format_indexing/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "Simple example for cocoindex: build embedding index based on local PDF files." requires-python = ">=3.11" dependencies = [ - "cocoindex[colpali]>=0.1.79", + "cocoindex[colpali]>=0.2.1", "python-dotenv>=1.0.1", "pdf2image>=1.17.0", "qdrant-client>=1.15.0", diff --git a/examples/paper_metadata/pyproject.toml b/examples/paper_metadata/pyproject.toml index de95aaf2..88cead1e 100644 --- a/examples/paper_metadata/pyproject.toml +++ b/examples/paper_metadata/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "Build index for papers with both metadata and content embeddings" requires-python = ">=3.11" dependencies = [ - "cocoindex[embeddings]>=0.1.83", + "cocoindex[embeddings]>=0.2.1", "pypdf>=5.7.0", "marker-pdf>=1.8.5", ] diff --git a/examples/patient_intake_extraction/pyproject.toml b/examples/patient_intake_extraction/pyproject.toml index bb049260..72f96836 100644 --- a/examples/patient_intake_extraction/pyproject.toml +++ b/examples/patient_intake_extraction/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "Extract structured information from patient intake forms using LLM." 
requires-python = ">=3.10" dependencies = [ - "cocoindex>=0.1.79", + "cocoindex>=0.2.1", "python-dotenv>=1.0.1", "markitdown>=0.1.2", "openai>=1.68.2", diff --git a/examples/pdf_embedding/pyproject.toml b/examples/pdf_embedding/pyproject.toml index b532ba82..dfb5124b 100644 --- a/examples/pdf_embedding/pyproject.toml +++ b/examples/pdf_embedding/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "Simple example for cocoindex: build embedding index based on local PDF files." requires-python = ">=3.11" dependencies = [ - "cocoindex[embeddings]>=0.1.79", + "cocoindex[embeddings]>=0.2.1", "python-dotenv>=1.0.1", "marker-pdf>=1.8.5", "psycopg[binary,pool]", diff --git a/examples/postgres_source/pyproject.toml b/examples/postgres_source/pyproject.toml index b0391b5a..83876f07 100644 --- a/examples/postgres_source/pyproject.toml +++ b/examples/postgres_source/pyproject.toml @@ -3,7 +3,7 @@ name = "postgres-source" version = "0.1.0" description = "Demonstrate how to use Postgres tables as the source for CocoIndex." requires-python = ">=3.11" -dependencies = ["cocoindex[embeddings]>=0.1.83"] +dependencies = ["cocoindex[embeddings]>=0.2.1"] [tool.setuptools] packages = [] diff --git a/examples/product_recommendation/pyproject.toml b/examples/product_recommendation/pyproject.toml index 63e09059..14867847 100644 --- a/examples/product_recommendation/pyproject.toml +++ b/examples/product_recommendation/pyproject.toml @@ -3,7 +3,7 @@ name = "cocoindex-ecommerce-taxonomy" version = "0.1.0" description = "Simple example for CocoIndex: extract taxonomy from e-commerce products and build knowledge graph." 
requires-python = ">=3.11" -dependencies = ["cocoindex>=0.1.79", "jinja2>=3.1.6"] +dependencies = ["cocoindex>=0.2.1", "jinja2>=3.1.6"] [tool.setuptools] packages = [] diff --git a/examples/text_embedding/pyproject.toml b/examples/text_embedding/pyproject.toml index c092c89e..2bce1b92 100644 --- a/examples/text_embedding/pyproject.toml +++ b/examples/text_embedding/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "Simple example for cocoindex: build embedding index based on local text files." requires-python = ">=3.11" dependencies = [ - "cocoindex[embeddings]>=0.1.79", + "cocoindex[embeddings]>=0.2.1", "python-dotenv>=1.0.1", "pgvector>=0.4.1", "psycopg[binary,pool]", diff --git a/examples/text_embedding_qdrant/pyproject.toml b/examples/text_embedding_qdrant/pyproject.toml index 42595025..d8047c39 100644 --- a/examples/text_embedding_qdrant/pyproject.toml +++ b/examples/text_embedding_qdrant/pyproject.toml @@ -4,7 +4,7 @@ version = "0.1.0" description = "Simple example for cocoindex: build embedding index based on local text files." 
requires-python = ">=3.11" dependencies = [ - "cocoindex[embeddings]>=0.1.79", + "cocoindex[embeddings]>=0.2.1", "python-dotenv>=1.0.1", "qdrant-client>=1.6.0", ] From 0831eb06428c7c3b996d46775099f5610e52f11a Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Thu, 28 Aug 2025 18:53:18 -0700 Subject: [PATCH 2/3] simplify postgres source example to be a product data mapping/embedding --- examples/postgres_source/README.md | 9 ++-- examples/postgres_source/main.py | 54 +------------------ .../postgres_source/prepare_source_data.sql | 33 +----------- 3 files changed, 6 insertions(+), 90 deletions(-) diff --git a/examples/postgres_source/README.md b/examples/postgres_source/README.md index f5adb742..93075486 100644 --- a/examples/postgres_source/README.md +++ b/examples/postgres_source/README.md @@ -3,14 +3,13 @@ [![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) This example demonstrates how to use Postgres tables as the source for CocoIndex. -It reads structured data from existing PostgreSQL tables, performs calculations, generates embeddings, and stores them in a separate CocoIndex table. +It reads structured product data from existing PostgreSQL tables, performs calculations, generates embeddings, and stores them in a separate CocoIndex table. We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/cocoindex) if this is helpful. -This example contains two flows: +This example contains one flow: -1. `postgres_message_indexing_flow`: Read from a simpler table `source_messages` (single primary key), and generate embeddings for the `message` column. -2. `postgres_product_indexing_flow`: Read from a more complex table `source_products` (composite primary key), compute additional fields and generates embeddings. 
+`postgres_product_indexing_flow`: Read from a table `source_products` (composite primary key), compute additional fields like total value and full description, then generate embeddings for semantic search. ## Prerequisites @@ -25,7 +24,7 @@ Before running the example, you need to: 2. Follow the [CocoIndex PostgreSQL setup guide](https://cocoindex.io/docs/getting_started/quickstart) to install and configure PostgreSQL with pgvector extension. -3. Create source tables `source_messages` and `source_products` with sample data: +3. Create source table `source_products` with sample data: ```bash $ psql "postgres://cocoindex:cocoindex@localhost/cocoindex" -f ./prepare_source_data.sql diff --git a/examples/postgres_source/main.py b/examples/postgres_source/main.py index ecb087a6..d43a6082 100644 --- a/examples/postgres_source/main.py +++ b/examples/postgres_source/main.py @@ -2,58 +2,6 @@ import os -@cocoindex.flow_def(name="PostgresMessageIndexing") -def postgres_message_indexing_flow( - flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope -) -> None: - """ - Define a flow that reads data from a PostgreSQL table, generates embeddings, - and stores them in another PostgreSQL table with pgvector. - """ - - data_scope["messages"] = flow_builder.add_source( - cocoindex.sources.Postgres( - table_name="source_messages", - # Optional. Use the default CocoIndex database if not specified. - database=cocoindex.add_transient_auth_entry( - cocoindex.sources.DatabaseConnectionSpec( - url=os.getenv("SOURCE_DATABASE_URL"), - ) - ), - # Optional. 
- ordinal_column="created_at", - ) - ) - - indexed_messages = data_scope.add_collector() - with data_scope["messages"].row() as message_row: - # Use the indexing column for embedding generation - message_row["embedding"] = message_row["message"].transform( - cocoindex.functions.SentenceTransformerEmbed( - model="sentence-transformers/all-MiniLM-L6-v2" - ) - ) - # Collect the data - include key columns and content - indexed_messages.collect( - id=message_row["id"], - author=message_row["author"], - message=message_row["message"], - embedding=message_row["embedding"], - ) - - indexed_messages.export( - "output", - cocoindex.targets.Postgres(), - primary_key_fields=["id"], - vector_indexes=[ - cocoindex.VectorIndexDef( - field_name="embedding", - metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY, - ) - ], - ) - - @cocoindex.op.function() def calculate_total_value( price: float, @@ -76,7 +24,7 @@ def postgres_product_indexing_flow( flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope ) -> None: """ - Define a flow that reads data from a PostgreSQL table, generates embeddings, + Define a flow that reads product data from a PostgreSQL table, generates embeddings, and stores them in another PostgreSQL table with pgvector. 
""" data_scope["products"] = flow_builder.add_source( diff --git a/examples/postgres_source/prepare_source_data.sql b/examples/postgres_source/prepare_source_data.sql index a2060cbe..e01a6bee 100644 --- a/examples/postgres_source/prepare_source_data.sql +++ b/examples/postgres_source/prepare_source_data.sql @@ -1,38 +1,7 @@ -- Usage: run with psql from your shell, for example: -- $ psql "postgres://cocoindex:cocoindex@localhost/cocoindex" -f ./prepare_source_data.sql -- ======================================== --- Simple schema: source_messages (single primary key) --- ======================================== -DROP TABLE IF EXISTS source_messages CASCADE; -CREATE TABLE source_messages ( - id uuid NOT NULL PRIMARY KEY DEFAULT gen_random_uuid(), - author text NOT NULL, - message text NOT NULL, - created_at timestamp DEFAULT CURRENT_TIMESTAMP -); -INSERT INTO source_messages (author, message) -VALUES ( - 'Jane Smith', - 'Hello world! This is a test message.' - ), - ( - 'John Doe', - 'PostgreSQL source integration is working great!' - ), - ( - 'Jane Smith', - 'CocoIndex makes database processing so much easier.' - ), - ( - 'John Doe', - 'Embeddings and vector search are powerful tools.' - ), - ( - 'John Doe', - 'Natural language processing meets database technology.' 
- ) ON CONFLICT DO NOTHING; --- ======================================== --- Multiple schema: source_products (composite primary key) +-- Product schema: source_products (composite primary key) -- ======================================== DROP TABLE IF EXISTS source_products CASCADE; CREATE TABLE source_products ( From bcde5b0fc056f9edcb8d594f84cece4de4d12dce Mon Sep 17 00:00:00 2001 From: Linghua Jin Date: Fri, 29 Aug 2025 13:51:28 -0700 Subject: [PATCH 3/3] Delete .env.example --- examples/postgres_source/.env.example | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 examples/postgres_source/.env.example diff --git a/examples/postgres_source/.env.example b/examples/postgres_source/.env.example deleted file mode 100644 index 0736871c..00000000 --- a/examples/postgres_source/.env.example +++ /dev/null @@ -1,22 +0,0 @@ -# Database Configuration -# CocoIndex Database (for storing embeddings) -COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex - -# Database URLs -SOURCE_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/source_data - -# ======================================== -# Configuration for test_simple table -# ======================================== -TABLE_NAME=test_simple -KEY_COLUMN_FOR_SINGLE_KEY=id -INDEXING_COLUMN=message -ORDINAL_COLUMN=created_at - -# ======================================== -# Configuration for test_multiple table -# ======================================== -TABLE_NAME=test_multiple -KEY_COLUMNS_FOR_MULTIPLE_KEYS=product_category,product_name -INDEXING_COLUMN=description -ORDINAL_COLUMN=modified_time