diff --git a/examples/postgres_source/.env.example b/examples/postgres_source/.env.example deleted file mode 100644 index 0736871c..00000000 --- a/examples/postgres_source/.env.example +++ /dev/null @@ -1,22 +0,0 @@ -# Database Configuration -# CocoIndex Database (for storing embeddings) -COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex - -# Database URLs -SOURCE_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/source_data - -# ======================================== -# Configuration for test_simple table -# ======================================== -TABLE_NAME=test_simple -KEY_COLUMN_FOR_SINGLE_KEY=id -INDEXING_COLUMN=message -ORDINAL_COLUMN=created_at - -# ======================================== -# Configuration for test_multiple table -# ======================================== -TABLE_NAME=test_multiple -KEY_COLUMNS_FOR_MULTIPLE_KEYS=product_category,product_name -INDEXING_COLUMN=description -ORDINAL_COLUMN=modified_time diff --git a/examples/postgres_source/README.md b/examples/postgres_source/README.md index f5adb742..93075486 100644 --- a/examples/postgres_source/README.md +++ b/examples/postgres_source/README.md @@ -3,14 +3,13 @@ [![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) This example demonstrates how to use Postgres tables as the source for CocoIndex. -It reads structured data from existing PostgreSQL tables, performs calculations, generates embeddings, and stores them in a separate CocoIndex table. +It reads structured product data from existing PostgreSQL tables, performs calculations, generates embeddings, and stores them in a separate CocoIndex table. We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/cocoindex) if this is helpful. -This example contains two flows: +This example contains one flow: -1. `postgres_message_indexing_flow`: Read from a simpler table `source_messages` (single primary key), and generate embeddings for the `message` column. -2. `postgres_product_indexing_flow`: Read from a more complex table `source_products` (composite primary key), compute additional fields and generates embeddings. +`postgres_product_indexing_flow`: Read from a table `source_products` (composite primary key), compute additional fields like total value and full description, then generate embeddings for semantic search. ## Prerequisites @@ -25,7 +24,7 @@ Before running the example, you need to: 2. Follow the [CocoIndex PostgreSQL setup guide](https://cocoindex.io/docs/getting_started/quickstart) to install and configure PostgreSQL with pgvector extension. -3. Create source tables `source_messages` and `source_products` with sample data: +3. Create source table `source_products` with sample data: ```bash $ psql "postgres://cocoindex:cocoindex@localhost/cocoindex" -f ./prepare_source_data.sql diff --git a/examples/postgres_source/main.py b/examples/postgres_source/main.py index ecb087a6..d43a6082 100644 --- a/examples/postgres_source/main.py +++ b/examples/postgres_source/main.py @@ -2,58 +2,6 @@ import os -@cocoindex.flow_def(name="PostgresMessageIndexing") -def postgres_message_indexing_flow( - flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope -) -> None: - """ - Define a flow that reads data from a PostgreSQL table, generates embeddings, - and stores them in another PostgreSQL table with pgvector. - """ - - data_scope["messages"] = flow_builder.add_source( - cocoindex.sources.Postgres( - table_name="source_messages", - # Optional. Use the default CocoIndex database if not specified. - database=cocoindex.add_transient_auth_entry( - cocoindex.sources.DatabaseConnectionSpec( - url=os.getenv("SOURCE_DATABASE_URL"), - ) - ), - # Optional. - ordinal_column="created_at", - ) - ) - - indexed_messages = data_scope.add_collector() - with data_scope["messages"].row() as message_row: - # Use the indexing column for embedding generation - message_row["embedding"] = message_row["message"].transform( - cocoindex.functions.SentenceTransformerEmbed( - model="sentence-transformers/all-MiniLM-L6-v2" - ) - ) - # Collect the data - include key columns and content - indexed_messages.collect( - id=message_row["id"], - author=message_row["author"], - message=message_row["message"], - embedding=message_row["embedding"], - ) - - indexed_messages.export( - "output", - cocoindex.targets.Postgres(), - primary_key_fields=["id"], - vector_indexes=[ - cocoindex.VectorIndexDef( - field_name="embedding", - metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY, - ) - ], - ) - - @cocoindex.op.function() def calculate_total_value( price: float, @@ -76,7 +24,7 @@ def postgres_product_indexing_flow( flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope ) -> None: """ - Define a flow that reads data from a PostgreSQL table, generates embeddings, + Define a flow that reads product data from a PostgreSQL table, generates embeddings, and stores them in another PostgreSQL table with pgvector. """ data_scope["products"] = flow_builder.add_source( diff --git a/examples/postgres_source/prepare_source_data.sql b/examples/postgres_source/prepare_source_data.sql index a2060cbe..e01a6bee 100644 --- a/examples/postgres_source/prepare_source_data.sql +++ b/examples/postgres_source/prepare_source_data.sql @@ -1,38 +1,7 @@ -- Usage: run with psql from your shell, for example: -- $ psql "postgres://cocoindex:cocoindex@localhost/cocoindex" -f ./prepare_source_data.sql -- ======================================== --- Simple schema: source_messages (single primary key) --- ======================================== -DROP TABLE IF EXISTS source_messages CASCADE; -CREATE TABLE source_messages ( - id uuid NOT NULL PRIMARY KEY DEFAULT gen_random_uuid(), - author text NOT NULL, - message text NOT NULL, - created_at timestamp DEFAULT CURRENT_TIMESTAMP -); -INSERT INTO source_messages (author, message) -VALUES ( - 'Jane Smith', - 'Hello world! This is a test message.' - ), - ( - 'John Doe', - 'PostgreSQL source integration is working great!' - ), - ( - 'Jane Smith', - 'CocoIndex makes database processing so much easier.' - ), - ( - 'John Doe', - 'Embeddings and vector search are powerful tools.' - ), - ( - 'John Doe', - 'Natural language processing meets database technology.' - ) ON CONFLICT DO NOTHING; --- ======================================== --- Multiple schema: source_products (composite primary key) +-- Product schema: source_products (composite primary key) -- ======================================== DROP TABLE IF EXISTS source_products CASCADE; CREATE TABLE source_products (