From 1a809aee4c13aaf2e0f1cedcb6fd15818e4805e9 Mon Sep 17 00:00:00 2001 From: Jiangzhou He Date: Sun, 21 Sep 2025 09:11:38 -0700 Subject: [PATCH] cleanup: run pre-commit hooks to cleanup files --- docs/docs/contributing/guide.md | 2 +- .../contributing/setup_dev_environment.md | 2 +- .../examples/academic_papers_index.md | 22 ++++++------- docs/docs/examples/examples/codebase_index.md | 16 ++++----- docs/docs/examples/examples/custom_targets.md | 20 +++++------ .../examples/docs_to_knowledge_graph.md | 21 ++++++------ docs/docs/examples/examples/document_ai.md | 2 +- docs/docs/examples/examples/image_search.md | 2 +- .../examples/examples/manual_extraction.md | 3 +- .../examples/examples/multi_format_index.md | 16 ++++----- .../examples/patient_form_extraction.md | 12 +++---- docs/docs/examples/examples/photo_search.md | 10 +++--- .../docs/examples/examples/postgres_source.md | 32 +++++++++--------- .../examples/product_recommendation.md | 33 +++++++++---------- .../examples/examples/simple_vector_index.md | 12 +++---- docs/docs/examples/index.md | 4 +-- docs/docs/getting_started/quickstart.md | 8 ++--- docs/docusaurus.config.ts | 2 +- docs/src/theme/DocCard/index.tsx | 6 ++-- docs/src/theme/DocCard/styles.module.css | 8 ++--- docs/src/theme/DocCardList/index.tsx | 2 +- docs/src/theme/DocCardList/styles.module.css | 14 ++++---- examples/postgres_source/.env | 2 +- 23 files changed, 122 insertions(+), 129 deletions(-) diff --git a/docs/docs/contributing/guide.md b/docs/docs/contributing/guide.md index d53a9d99..d176b989 100644 --- a/docs/docs/contributing/guide.md +++ b/docs/docs/contributing/guide.md @@ -5,7 +5,7 @@ description: How to contribute to CocoIndex [CocoIndex](https://github.com/cocoindex-io/cocoindex) is an open source project. We are respectful, open and friendly. This guide explains how to get involved and contribute to [CocoIndex](https://github.com/cocoindex-io/cocoindex). -Our [Discord server](https://discord.com/invite/zpA9S2DR7s) is constantly open. +Our [Discord server](https://discord.com/invite/zpA9S2DR7s) is constantly open. If you are unsure about anything, it is a good place to discuss! We'd love to collaborate and will always be friendly. ## Good First Issues diff --git a/docs/docs/contributing/setup_dev_environment.md b/docs/docs/contributing/setup_dev_environment.md index f560b756..40a97708 100644 --- a/docs/docs/contributing/setup_dev_environment.md +++ b/docs/docs/contributing/setup_dev_environment.md @@ -44,4 +44,4 @@ Follow the steps below to get CocoIndex built on the latest codebase locally - i - Before running a specific example, set extra environment variables, for exposing extra traces, allowing dev UI, etc. ```sh . ./.env.lib_debug - ``` \ No newline at end of file + ``` diff --git a/docs/docs/examples/examples/academic_papers_index.md b/docs/docs/examples/examples/academic_papers_index.md index 89899674..2f2f6519 100644 --- a/docs/docs/examples/examples/academic_papers_index.md +++ b/docs/docs/examples/examples/academic_papers_index.md @@ -21,10 +21,10 @@ import { GitHubButton, YouTubeButton, DocumentationButton } from '../../../src/c 1. Extract the paper metadata, including file name, title, author information, abstract, and number of pages. -2. Build vector embeddings for the metadata, such as the title and abstract, for semantic search. +2. Build vector embeddings for the metadata, such as the title and abstract, for semantic search. This enables better metadata-driven semantic search results. 
For example, you can match text queries against titles and abstracts.

-3. Build an index of authors and all the file names associated with each author
+3. Build an index of authors and all the file names associated with each author
 to answer questions like "Give me all the papers by Jeff Dean."

 4. If you want to perform full PDF embedding for the paper, you can extend the flow.

@@ -108,7 +108,7 @@ After this step, we should have the basic info of each paper.

 We will convert the first page to Markdown using Marker. Alternatively, you can easily plug in any PDF parser, such as Docling, using CocoIndex's [custom function](https://cocoindex.io/docs/custom_ops/custom_functions).

-Define a marker converter function and cache it, since its initialization is resource-intensive.
+Define a marker converter function and cache it, since its initialization is resource-intensive.
 This ensures that the same converter instance is reused for different input files.

 ```python
@@ -137,7 +137,7 @@ def pdf_to_markdown(content: bytes) -> str:

 Pass it to your transform:

 ```python
-with data_scope["documents"].row() as doc:
+with data_scope["documents"].row() as doc:
     # ... process
     doc["first_page_md"] = doc["basic_info"]["first_page"].transform(
         pdf_to_markdown
     )
 ```

@@ -200,7 +200,7 @@ paper_metadata.collect(

 Just collect anything you need :)

 ### Collect `author` to `filename` information
-We’ve already extracted the author list. Here we want to collect Author → Papers in a separate table to build a lookup feature.
+We’ve already extracted the author list. Here we want to collect Author → Papers in a separate table to build a lookup feature.
 Simply collect by author.

 ```python
@@ -229,8 +229,8 @@ doc["title_embedding"] = doc["metadata"]["title"].transform(

 ### Abstract

-Split the abstract into chunks, embed each chunk, and collect the embeddings.
-Sometimes the abstract could be very long.
+Split the abstract into chunks, embed each chunk, and collect the embeddings.
+Sometimes the abstract could be very long.

 ```python
 doc["abstract_chunks"] = doc["metadata"]["abstract"].transform(
@@ -308,7 +308,7 @@ author_papers.export(
     "author_papers",
     cocoindex.targets.Postgres(),
     primary_key_fields=["author_name", "filename"],
-)
+)
 metadata_embeddings.export(
     "metadata_embeddings",
     cocoindex.targets.Postgres(),
@@ -328,9 +328,9 @@ In this example we use PGVector as embedding store. With CocoIndex, you can do o

 ## Query the index

-You can refer to this section of [Text Embeddings](https://cocoindex.io/blogs/text-embeddings-101#3-query-the-index) about
-how to build queries against embeddings.
-For now, CocoIndex doesn't provide an additional query interface. We can write SQL or rely on the query engine provided by the target storage.
+You can refer to this section of [Text Embeddings](https://cocoindex.io/blogs/text-embeddings-101#3-query-the-index) about
+how to build queries against embeddings.
+For now, CocoIndex doesn't provide an additional query interface. We can write SQL or rely on the query engine provided by the target storage.
 - Many databases already have optimized query implementations with their own best practices
 - The query space has excellent solutions for querying, reranking, and other search-related functionality.
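As a concrete illustration, below is a minimal query sketch. It assumes the `metadata_embeddings` table exported above exposes `filename`, `text`, and `embedding` columns, that indexing used the `all-MiniLM-L6-v2` model, and that pgvector's `<=>` cosine-distance operator is available; these are assumptions about the setup, not part of the original example.

```python
from pgvector.psycopg import register_vector
from sentence_transformers import SentenceTransformer
import psycopg

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def search_papers(query: str, top_k: int = 5) -> list[tuple]:
    # Embed the query with the same model used at indexing time.
    query_vec = model.encode(query)
    with psycopg.connect("postgres://cocoindex:cocoindex@localhost/cocoindex") as conn:
        register_vector(conn)
        # <=> is pgvector's cosine-distance operator; smaller means closer.
        return conn.execute(
            "SELECT filename, text, embedding <=> %s AS distance "
            "FROM metadata_embeddings ORDER BY distance LIMIT %s",
            (query_vec, top_k),
        ).fetchall()
```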
diff --git a/docs/docs/examples/examples/codebase_index.md b/docs/docs/examples/examples/codebase_index.md
index 6db5d708..981bf81e 100644
--- a/docs/docs/examples/examples/codebase_index.md
+++ b/docs/docs/examples/examples/codebase_index.md
@@ -19,7 +19,7 @@ import { GitHubButton, YouTubeButton, DocumentationButton } from '../../../src/c
 ![Codebase Index](/img/examples/codebase_index/cover.png)

 ## Overview
-In this tutorial, we will build a codebase index. [CocoIndex](https://github.com/cocoindex-io/cocoindex) provides built-in support for codebase chunking, with native Tree-sitter support. It works with large codebases, and can be updated in near real-time with incremental processing - only reprocessing what's changed.
+In this tutorial, we will build a codebase index. [CocoIndex](https://github.com/cocoindex-io/cocoindex) provides built-in support for codebase chunking, with native Tree-sitter support. It works with large codebases, and can be updated in near real-time with incremental processing - only reprocessing what's changed.

 ## Use Cases
 A wide range of applications can be built with an effective codebase index that is always up-to-date.
@@ -44,14 +44,14 @@ The flow is composed of the following steps:
 - Generate embeddings for each chunk
 - Store in a vector database for retrieval

-## Setup
+## Setup
 - Install Postgres, following the [installation guide](https://cocoindex.io/docs/getting_started/installation#-install-postgres).
 - Install CocoIndex
 ```bash
 pip install -U cocoindex
 ```

-## Add the codebase as a source
+## Add the codebase as a source
 We will index the CocoIndex codebase. Here we use the `LocalFile` source to ingest files from the CocoIndex codebase root directory.

 ```python
@@ -67,7 +67,7 @@ def code_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoind
 - Include files with the extensions `.py`, `.rs`, `.toml`, `.md`, `.mdx`
 - Exclude files and directories starting with `.`, `target` in the root, and `node_modules` under any directory.

-`flow_builder.add_source` will create a table with subfields (`filename`, `content`).
+`flow_builder.add_source` will create a table with subfields (`filename`, `content`).

@@ -96,14 +96,14 @@ with data_scope["files"].row() as file:
     file["extension"] = file["filename"].transform(extract_extension)
     file["chunks"] = file["content"].transform(
         cocoindex.functions.SplitRecursively(),
-        language=file["extension"], chunk_size=1000, chunk_overlap=300)
+        language=file["extension"], chunk_size=1000, chunk_overlap=300)
 ```

 ![SplitRecursively](/img/examples/codebase_index/chunk.png)

 ### Embed the chunks
-We use `SentenceTransformerEmbed` to embed the chunks.
+We use `SentenceTransformerEmbed` to embed the chunks.

 ```python
 @cocoindex.transform_flow()
@@ -141,7 +141,7 @@ code_embeddings.export(
     vector_indexes=[cocoindex.VectorIndex("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
 ```

-We use [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) to measure the similarity between the query and the indexed data.
+We use [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) to measure the similarity between the query and the indexed data.

 ## Query the index
 We match against user-provided text by a SQL query, reusing the embedding operation in the indexing flow.
@@ -230,4 +230,4 @@ Follow the url from the terminal - `https://cocoindex.io/cocoinsight` to access

 SplitRecursively has native support for all major programming languages.
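For reference, the `extract_extension` helper used in the chunking step above can be defined as a CocoIndex custom function. A minimal sketch, which may differ slightly from the definition in the full example:

```python
import os
import cocoindex

@cocoindex.op.function()
def extract_extension(filename: str) -> str:
    """Extract a file's extension (e.g. '.py'), used as the language hint for chunking."""
    return os.path.splitext(filename)[1]
```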
- \ No newline at end of file + diff --git a/docs/docs/examples/examples/custom_targets.md b/docs/docs/examples/examples/custom_targets.md index 4785c45a..f67c75ae 100644 --- a/docs/docs/examples/examples/custom_targets.md +++ b/docs/docs/examples/examples/custom_targets.md @@ -35,7 +35,7 @@ flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope refresh_interval=timedelta(seconds=5), ) ``` -This ingestion creates a table with `filename` and `content` fields. +This ingestion creates a table with `filename` and `content` fields. ## Process each file and collect @@ -92,7 +92,7 @@ class LocalFileTargetConnector: ``` -The `describe()` method returns a human-readable string that describes the target, which is displayed in the CLI logs. +The `describe()` method returns a human-readable string that describes the target, which is displayed in the CLI logs. For example, it prints: `Target: Local directory ./data/output` @@ -104,10 +104,10 @@ def describe(key: str) -> str: return f"Local directory {key}" ``` -`apply_setup_change()` applies setup changes to the backend. The previous and current specs are passed as arguments, +`apply_setup_change()` applies setup changes to the backend. The previous and current specs are passed as arguments, and the method is expected to update the backend setup to match the current state. -A `None` spec indicates non-existence, so when `previous` is `None`, we need to create it, +A `None` spec indicates non-existence, so when `previous` is `None`, we need to create it, and when `current` is `None`, we need to delete it. @@ -135,8 +135,8 @@ def apply_setup_change( os.rmdir(previous.directory) ``` -The `mutate()` method is called by CocoIndex to apply data changes to the target, -batching mutations to potentially multiple targets of the same type. +The `mutate()` method is called by CocoIndex to apply data changes to the target, +batching mutations to potentially multiple targets of the same type. This allows the target connector flexibility in implementation (e.g., atomic commits, or processing items with dependencies in a specific order). Each element in the batch corresponds to a specific target and is represented by a tuple containing: @@ -151,8 +151,8 @@ class LocalFileTargetValues: html: str ``` -The value type of the `dict` is `LocalFileTargetValues | None`, -where a non-`None` value means an upsert and `None` value means a delete. Similar to `apply_setup_changes()`, +The value type of the `dict` is `LocalFileTargetValues | None`, +where a non-`None` value means an upsert and `None` value means a delete. Similar to `apply_setup_changes()`, idempotency is expected here. ```python @@ -217,7 +217,5 @@ This keeps your knowledge graph continuously synchronized with your document sou Sometimes there may be an internal/homegrown tool or API (e.g. within a company) that's not publicly available. These can only be connected through custom targets. -### Faster adoption of new export logic +### Faster adoption of new export logic When a new tool, database, or API joins your stack, simply define a Target Spec and Target Connector — start exporting right away, with no pipeline refactoring required. - - diff --git a/docs/docs/examples/examples/docs_to_knowledge_graph.md b/docs/docs/examples/examples/docs_to_knowledge_graph.md index 9c21ca30..0ea9ab31 100644 --- a/docs/docs/examples/examples/docs_to_knowledge_graph.md +++ b/docs/docs/examples/examples/docs_to_knowledge_graph.md @@ -36,7 +36,7 @@ and then build a knowledge graph. 
- CocoIndex can directly map the collected data to Neo4j nodes and relationships.

 ## Setup
-* [Install PostgreSQL](https://cocoindex.io/docs/getting_started/installation#-install-postgres). CocoIndex uses PostgreSQL internally for incremental processing.
+* [Install PostgreSQL](https://cocoindex.io/docs/getting_started/installation#-install-postgres). CocoIndex uses PostgreSQL internally for incremental processing.
 * [Install Neo4j](https://cocoindex.io/docs/ops/targets#neo4j-dev-instance), a graph database.
 * [Configure your OpenAI API key](https://cocoindex.io/docs/ai/llm#openai). Alternatively, we have native support for Gemini, Ollama, and LiteLLM. You can choose your favorite LLM provider and work completely on-premises.

@@ -51,7 +51,7 @@ and then build a knowledge graph.

 ### Add documents as source

-We will process CocoIndex documentation markdown files (`.md`, `.mdx`) from the `docs/core` directory ([markdown files](https://github.com/cocoindex-io/cocoindex/tree/main/docs/docs/core), [deployed docs](https://cocoindex.io/docs/core/basics)).
+We will process CocoIndex documentation markdown files (`.md`, `.mdx`) from the `docs/core` directory ([markdown files](https://github.com/cocoindex-io/cocoindex/tree/main/docs/docs/core), [deployed docs](https://cocoindex.io/docs/core/basics)).

 ```python
 @cocoindex.flow_def(name="DocsToKG")
@@ -141,7 +141,7 @@ Next, we will use `cocoindex.functions.ExtractByLlm` to extract the relationship
 doc["relationships"] = doc["content"].transform(
     cocoindex.functions.ExtractByLlm(
         llm_spec=cocoindex.LlmSpec(
-            api_type=cocoindex.LlmApiType.OPENAI,
+            api_type=cocoindex.LlmApiType.OPENAI,
             model="gpt-4o"
         ),
         output_type=list[Relationship],
@@ -187,7 +187,7 @@ with doc["relationships"].row() as relationship:

 ### Build knowledge graph

-
+
 #### Basic concepts
 All nodes for Neo4j need two things:
 1. Label: The type of the node. E.g., `Document`, `Entity`.
@@ -236,10 +236,10 @@ This exports Neo4j nodes with label `Document` from the `document_node` collecto

 #### Export `RELATIONSHIP` and `Entity` nodes to Neo4j

-We don't have an explicit collector for `Entity` nodes.
+We don't have an explicit collector for `Entity` nodes.
 They are part of the `entity_relationship` collector, and their fields are collected during the relationship extraction.

-To export them as Neo4j nodes, we need to first declare `Entity` nodes.
+To export them as Neo4j nodes, we need to first declare `Entity` nodes.

 ```python
 flow_builder.declare(
@@ -289,7 +289,7 @@ In a relationship, there's:
 2. A relationship connecting the source and target.

 Note that different relationships may share the same source and target nodes.

-`NodeFromFields` takes the fields from the `entity_relationship` collector and creates `Entity` nodes.
+`NodeFromFields` takes the fields from the `entity_relationship` collector and creates `Entity` nodes.

 #### Export the `entity_mention` to Neo4j.

@@ -334,7 +334,7 @@ It creates relationships by:

 ```sh
 cocoindex update --setup main.py
 ```
-
+
 You'll see the index update status in the terminal. For example,

 ```

 ## CocoInsight

-I used CocoInsight to troubleshoot the index generation and understand the data lineage of the pipeline. It is in free beta now; you can give it a try.
+I used CocoInsight to troubleshoot the index generation and understand the data lineage of the pipeline. It is in free beta now; you can give it a try.
```sh
cocoindex server -ci main
```

@@ -369,7 +369,7 @@ MATCH p=()-->() RETURN p

 ## Kuzu

 CocoIndex natively supports Kuzu - a high-performance, embedded, open-source graph database.
-
+
 The GraphDB interface in CocoIndex is standardized; you just need to **switch the configuration**, without any additional code changes.

 CocoIndex supports exporting to Kuzu through its API server. You can bring up a Kuzu API server locally by running:
@@ -391,4 +391,3 @@ kuzu_conn_spec = cocoindex.add_auth_entry(
 ```
-
diff --git a/docs/docs/examples/examples/document_ai.md b/docs/docs/examples/examples/document_ai.md
index b9b6f9a1..6ef86de8 100644
--- a/docs/docs/examples/examples/document_ai.md
+++ b/docs/docs/examples/examples/document_ai.md
@@ -21,7 +21,7 @@ CocoIndex is a flexible ETL framework with incremental processing. We don’t b

 ## Set up
 - [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.
-- Configure the Project and Processor ID for the Document AI API
+- Configure the Project and Processor ID for the Document AI API
   - [Official Google Document AI API](https://cloud.google.com/document-ai/docs/try-docai) with a free live demo.
   - Sign in to [Google Cloud Console](https://console.cloud.google.com/), create or open a project, and enable the Document AI API.
   - ![image.png](/img/examples/document_ai/document_ai.png)
diff --git a/docs/docs/examples/examples/image_search.md b/docs/docs/examples/examples/image_search.md
index 3c682bba..783108c0 100644
--- a/docs/docs/examples/examples/image_search.md
+++ b/docs/docs/examples/examples/image_search.md
@@ -21,7 +21,7 @@ import { GitHubButton, YouTubeButton, DocumentationButton } from '../../../src/c

 CocoIndex supports native integration with ColPali - with just a few lines of code, you can embed and index images with ColPali’s late-interaction architecture. We also build a lightweight image search application with FastAPI.

-## ColPali
+## ColPali

 **ColPali (Contextual Late-interaction over Patches)** is a powerful model for multimodal retrieval.

diff --git a/docs/docs/examples/examples/manual_extraction.md b/docs/docs/examples/examples/manual_extraction.md
index f8bb2d71..21c0367d 100644
--- a/docs/docs/examples/examples/manual_extraction.md
+++ b/docs/docs/examples/examples/manual_extraction.md
@@ -188,7 +188,7 @@ def summarize_module(module_info: ModuleInfo) -> ModuleSummary:
         num_classes=len(module_info.classes),
         num_methods=len(module_info.methods),
     )
-```
+```

 ### Plug in the function into the flow
 ```python
@@ -249,4 +249,3 @@ SELECT filename, module_info->'title' AS title, module_summary FROM modules_info
 cocoindex server -ci main
 ```
 The CocoInsight dashboard is at `https://cocoindex.io/cocoinsight`. It connects to your local CocoIndex server with zero data retention.
-
diff --git a/docs/docs/examples/examples/multi_format_index.md b/docs/docs/examples/examples/multi_format_index.md
index 27c82976..b062fa29 100644
--- a/docs/docs/examples/examples/multi_format_index.md
+++ b/docs/docs/examples/examples/multi_format_index.md
@@ -1,5 +1,5 @@
 ---
-title: Index PDFs, Images, Slides without OCR
+title: Index PDFs, Images, Slides without OCR
 description: Build a visual document indexing pipeline using ColPali to index scanned documents, PDFs, academic papers, presentation slides, and standalone images — all mixed together with charts, tables, and figures — into the same vector space.
sidebar_class_name: hidden slug: /examples/multi_format_index @@ -20,7 +20,7 @@ import { GitHubButton, YouTubeButton, DocumentationButton } from '../../../src/c ## Overview Do you have a messy collection of scanned documents, PDFs, academic papers, presentation slides, and standalone images — all mixed together with charts, tables, and figures — that you want to process into the same vector space for semantic search or to power an AI agent? -In this example, we’ll walk through how to build a visual document indexing pipeline using ColPali for embedding both PDFs and images — and then query the index using natural language. +In this example, we’ll walk through how to build a visual document indexing pipeline using ColPali for embedding both PDFs and images — and then query the index using natural language. We’ll skip OCR entirely — ColPali can directly understand document layouts, tables, and figures from images, making it perfect for semantic search across visual-heavy content. @@ -57,7 +57,7 @@ data_scope["documents"] = flow_builder.add_source( ## Convert Files to Pages -We classify files by MIME type and process accordingly. +We classify files by MIME type and process accordingly. Define a dataclass: @@ -112,7 +112,7 @@ In the flow we convert all the files to pages. this makes each pages and all ima ## Generate Visual Embeddings -We use ColPali to generate embeddings for images on each page. +We use ColPali to generate embeddings for images on each page. ```python with doc["pages"].row() as page: @@ -132,7 +132,7 @@ with doc["pages"].row() as page: ![Embedding](/img/examples/multi_format_index/embed.png) -ColPali Architecture fundamentally rethinks how documents, especially visually complex or image-rich ones, are represented and searched. Instead of reducing each image or page to a single dense vector (as in traditional bi-encoders), ColPali breaks an image into many smaller patches, preserving local spatial and semantic structure. +ColPali Architecture fundamentally rethinks how documents, especially visually complex or image-rich ones, are represented and searched. Instead of reducing each image or page to a single dense vector (as in traditional bi-encoders), ColPali breaks an image into many smaller patches, preserving local spatial and semantic structure. Each patch receives its own embedding, which together form a multi-vector representation of the complete document. @@ -143,7 +143,7 @@ Each patch receives its own embedding, which together form a multi-vector repres ## Export to Qdrant -Note the way to embed image and query are different, as they’re two different types of data. +Note the way to embed image and query are different, as they’re two different types of data. Create a function to embed query: @@ -200,9 +200,7 @@ cocoindex server -ci main Follow the url `https://cocoindex.io/cocoinsight`. It connects to your local CocoIndex server, with zero pipeline data retention. You can use it to view extracted pages, see embedding vectors and metadata. -## Connect to other sources +## Connect to other sources CocoIndex natively supports Google Drive, Amazon S3, Azure Blob Storage, and more. 
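For example, swapping the ingestion step from local files to S3 is a one-line change at the source. A sketch, assuming `cocoindex.sources.AmazonS3` with a hypothetical bucket name and parameters; see the sources documentation for the authoritative spec:

```python
data_scope["documents"] = flow_builder.add_source(
    cocoindex.sources.AmazonS3(
        bucket_name="my-visual-docs",  # hypothetical bucket name
        binary=True,  # PDFs and images are binary, so read raw bytes
    )
)
```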
-
-
diff --git a/docs/docs/examples/examples/patient_form_extraction.md b/docs/docs/examples/examples/patient_form_extraction.md
index 93bb317d..9681b34a 100644
--- a/docs/docs/examples/examples/patient_form_extraction.md
+++ b/docs/docs/examples/examples/patient_form_extraction.md
@@ -22,7 +22,7 @@ import { GitHubButton, YouTubeButton, DocumentationButton } from '../../../src/c

 With CocoIndex, you can easily define nested schemas in Python dataclasses and use an LLM to extract structured data from unstructured data. This example shows how to extract structured data from patient intake forms.

 :::info
-The extraction quality is highly dependent on the OCR quality. For better results, you can use CocoIndex with any commercial or open-source parser that is tailored to your domain. For example, Document AI from Google Cloud and more.
+The extraction quality is highly dependent on the OCR quality. For better results, you can use CocoIndex with any commercial or open-source parser that is tailored to your domain. For example, Document AI from Google Cloud and more.
 :::

 ## Flow Overview
@@ -69,7 +69,7 @@ def patient_intake_extraction_flow(

 ## Parse documents with different formats to Markdown

-Define a custom function to parse documents in any format to Markdown. Here we use [MarkItDown](https://github.com/microsoft/markitdown) to convert the file to Markdown. It also provides options to parse with an LLM, such as `gpt-4o`. At present, MarkItDown supports: PDF, Word, Excel, Images (EXIF metadata and OCR), etc.
+Define a custom function to parse documents in any format to Markdown. Here we use [MarkItDown](https://github.com/microsoft/markitdown) to convert the file to Markdown. It also provides options to parse with an LLM, such as `gpt-4o`. At present, MarkItDown supports: PDF, Word, Excel, Images (EXIF metadata and OCR), etc.

 ```python
 class ToMarkdown(cocoindex.op.FunctionSpec):
@@ -104,7 +104,7 @@ with data_scope["documents"].row() as doc:

 ![Markdown](/img/examples/patient_form_extraction/tomarkdown.png)

-## Define output schema
+## Define output schema

 We are going to define the patient info schema for structured extraction.

 One of the best examples to define a patient info schema is probably following the [FHIR standard - Patient Resource](https://build.fhir.org/patient.html#resource).
@@ -275,7 +275,7 @@ For mission-critical use cases, it is important to evaluate the quality of the e

 ## Troubleshooting

-If extraction is not ideal, this is how I troubleshoot. My original golden file for this record is [this one](https://github.com/cocoindex-io/patient-intake-extraction/blob/main/data/example_forms/Patient_Intake_Form_Joe_Artificial.pdf).
+If extraction is not ideal, this is how I troubleshoot. My original golden file for this record is [this one](https://github.com/cocoindex-io/patient-intake-extraction/blob/main/data/example_forms/Patient_Intake_Form_Joe_Artificial.pdf).
 We could troubleshoot in two steps:

 1. Convert to Markdown
@@ -293,7 +293,7 @@ Go to `https://cocoindex.io/cocoinsight`. You could see an interactive UI to exp

 Click on the `markdown` column for `Patient_Intake_Form_Joe.pdf`, and you'll see the Markdown content. We could try a few different models with the Markdown converter/LLM to iterate and see if we can get better results, or whether manual correction is needed.

-## Connect to other sources
+## Connect to other sources
 CocoIndex natively supports Google Drive, Amazon S3, Azure Blob Storage, and more.
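To recap the core extraction step of this example in one self-contained snippet, here is a sketch of `ExtractByLlm` with a nested dataclass schema. The field names below are illustrative placeholders, not the example's full FHIR-style schema, and `doc["markdown"]` assumes the Markdown field produced by the `ToMarkdown` step above:

```python
import dataclasses
import cocoindex

@dataclasses.dataclass
class Contact:
    phone: str
    email: str

@dataclasses.dataclass
class PatientInfo:
    name: str
    dob: str
    contact: Contact  # nested dataclasses map to nested extracted structures

# Inside the flow: extract structured data from the converted Markdown.
doc["patient_info"] = doc["markdown"].transform(
    cocoindex.functions.ExtractByLlm(
        llm_spec=cocoindex.LlmSpec(
            api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"
        ),
        output_type=PatientInfo,
        instruction="Extract patient information from the intake form.",
    )
)
```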
- \ No newline at end of file + diff --git a/docs/docs/examples/examples/photo_search.md b/docs/docs/examples/examples/photo_search.md index b87eaf14..b267a3fb 100644 --- a/docs/docs/examples/examples/photo_search.md +++ b/docs/docs/examples/examples/photo_search.md @@ -18,10 +18,10 @@ import { GitHubButton, YouTubeButton, DocumentationButton } from '../../../src/c ![Photo Search](/img/examples/photo_search/cover.png) ## Overview -We’ll walk through a comprehensive example of building a scalable face recognition pipeline. We’ll +We’ll walk through a comprehensive example of building a scalable face recognition pipeline. We’ll - Detect all faces in the image and extract their bounding boxes - Crop and encode each face image into a 128-dimensional face embedding -- Store metadata and vectors in a structured index to support queries like: +- Store metadata and vectors in a structured index to support queries like: “Find all similar faces to this one” or “Search images that include this person” With this, you can build your own photo search app with face detection and search. @@ -65,8 +65,8 @@ def face_recognition_flow(flow_builder, data_scope): This creates a table with `filename` and `content` fields. 📂 -You can connect it to your [S3 Buckets](https://cocoindex.io/docs/ops/sources#amazons3) (with SQS integration, [example](https://cocoindex.io/blogs/s3-incremental-etl)) -or [Azure Blob store](https://cocoindex.io/docs/ops/sources#azureblob). +You can connect it to your [S3 Buckets](https://cocoindex.io/docs/ops/sources#amazons3) (with SQS integration, [example](https://cocoindex.io/blogs/s3-incremental-etl)) +or [Azure Blob store](https://cocoindex.io/docs/ops/sources#azureblob). ## Detect and Extract Faces @@ -198,7 +198,7 @@ You can now build facial search apps or dashboards. For example: For querying embeddings, check out [Image Search project](https://cocoindex.io/blogs/live-image-search). -If you’d like to see a full example on the query path with image match, give it a shout at +If you’d like to see a full example on the query path with image match, give it a shout at [our group](https://discord.com/invite/zpA9S2DR7s). ## CocoInsight diff --git a/docs/docs/examples/examples/postgres_source.md b/docs/docs/examples/examples/postgres_source.md index 29e0f0b1..1c253c63 100644 --- a/docs/docs/examples/examples/postgres_source.md +++ b/docs/docs/examples/examples/postgres_source.md @@ -34,7 +34,7 @@ import { GitHubButton, YouTubeButton, DocumentationButton } from '../../../src/c ```python @cocoindex.flow_def(name="PostgresProductIndexing") def postgres_product_indexing_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope) -> None: - + data_scope["products"] = flow_builder.add_source( cocoindex.sources.Postgres( table_name="source_products", @@ -57,16 +57,16 @@ This step adds source data from PostgreSQL table `source_products` to the flow a CocoIndex incrementally sync data from Postgres. When new or updated rows are found, only those rows run through the pipeline, so downstream indexes and search results reflect the latest data while unchanged rows are untouched. The following two arguments (both are optional) make this more efficient: - `notification` enables change capture based on Postgres LISTEN/NOTIFY. Each change triggers an incremental processing on the specific row immediately. -- Regardless if `notification` is provided or not, CocoIndex still needs to scan the full table to detect changes in some scenarios (e.g. 
between two `update` invocations), and the `ordinal_column` provides a field that CocoIndex can use to quickly detect which row has changed without reading the value columns.
+Regardless of whether `notification` is provided, CocoIndex still needs to scan the full table to detect changes in some scenarios (e.g. between two `update` invocations), and the `ordinal_column` provides a field that CocoIndex can use to quickly detect which row has changed without reading the value columns.

-Check [Postgres source](https://cocoindex.io/docs/ops/sources#postgres) for more details.
+Check [Postgres source](https://cocoindex.io/docs/ops/sources#postgres) for more details.

 If you use the Postgres database hosted by Supabase, please click Connect on your project dashboard and find the URL there. Check [DatabaseConnectionSpec](https://cocoindex.io/docs/core/settings#databaseconnectionspec) for more details.

 ## Simple Data Mapping / Transformation

-Create a simple transformation to calculate the total price.
+Create a simple transformation to calculate the total price.

 ```python
 @cocoindex.op.function()
@@ -116,15 +116,15 @@ with data_scope["products"].row() as product:
         product["product_name"],
         product["description"],
     )
-
+
     # Generate embeddings
     product["embedding"] = product["full_description"].transform(
         cocoindex.functions.SentenceTransformerEmbed(
             model="sentence-transformers/all-MiniLM-L6-v2"
         )
    )
-
-    # Collect data
+
+    # Collect data
     indexed_product.collect(
         product_category=product["product_category"],
         product_name=product["product_name"],
@@ -138,7 +138,7 @@ with data_scope["products"].row() as product:

 This takes each product row, and does the following:

-1. builds a rich description.
+1. builds a rich description.

 ![Make Full Description](/img/examples/postgres_source/description.png)

@@ -182,25 +182,25 @@ For example, the following image shows the lineage of the `embedding` field, you

 ## Running the Pipeline

 1. Set up dependencies:
-
+
    ```bash
    pip install -e .
    ```
-
+
 2. Create the source table with sample data:
-
+
    ```bash
    psql "postgres://cocoindex:cocoindex@localhost/cocoindex" -f ./prepare_source_data.sql
    ```
-
+
 3. Set up tables and update the index:
-
+
    ```bash
    cocoindex update --setup main.py
    ```
-
+
 4. Run CocoInsight:
-
+
    ```bash
    cocoindex server -ci main
    ```
@@ -299,4 +299,4 @@ This [example](https://cocoindex.io/docs/examples/image_search#fast-api-applicat

 - Reliable consistency: Embeddings and derived data always reflect the accurate, transformed state of each row, ensuring results are dependable and current in a single flow.

-- Streamlined operations: A single deployment manages everything, providing clear data lineage and reducing the complexity of the data stack.
\ No newline at end of file
+- Streamlined operations: A single deployment manages everything, providing clear data lineage and reducing the complexity of the data stack.
diff --git a/docs/docs/examples/examples/product_recommendation.md b/docs/docs/examples/examples/product_recommendation.md
index 032b64cf..b7284ff7 100644
--- a/docs/docs/examples/examples/product_recommendation.md
+++ b/docs/docs/examples/examples/product_recommendation.md
@@ -20,7 +20,7 @@ import { GitHubButton, YouTubeButton, DocumentationButton } from '../../../src/c

 ## Overview
 We will build a real-time product recommendation engine with an LLM and a graph database. In particular, we will:
 - Use LLM to understand the category (taxonomy) of a product.
- Use LLM to enumerate the complementary products that users are likely to buy together with the current product (e.g., a pencil and a notebook).
 - Use Graph to explore the relationships between products that can be further used for product recommendations or labeling.

@@ -29,7 +29,7 @@ Product taxonomy is a way to organize product catalogs in a logical and hierarch

 ## Prerequisites

-* [Install PostgreSQL](https://cocoindex.io/docs/getting_started/installation#-install-postgres). CocoIndex uses PostgreSQL internally for incremental processing.
+* [Install PostgreSQL](https://cocoindex.io/docs/getting_started/installation#-install-postgres). CocoIndex uses PostgreSQL internally for incremental processing.
 * [Install Neo4j](https://cocoindex.io/docs/ops/storages#Neo4j), a graph database.
 * [Configure your OpenAI API key](https://cocoindex.io/docs/ai/llm#openai). Create a `.env` file from `.env.example`, and fill in `OPENAI_API_KEY`.

@@ -47,11 +47,11 @@ The core flow is about [~100 lines of python code](https://github.com/cocoindex-

 We are going to declare a data flow
 1. ingest products (in JSON)
-2. for each product,
+2. for each product,
    - parse JSON
-   - map & clean up data
+   - map & clean up data
    - extract taxonomy from the mapped data
-3. collect data
+3. collect data
 4. export data to neo4j

@@ -83,7 +83,7 @@ product_complementary_taxonomy = data_scope.add_collector()

 ## Process each product

-We will parse the JSON file for each product, and transform the data to the format that we need for downstream processing.
+We will parse the JSON file for each product, and transform the data to the format that we need for downstream processing.

 ### Data mapping

 ```python
@@ -99,7 +99,7 @@ def extract_product_info(product: cocoindex.typing.Json, filename: str) -> Produ
     )
 ```

-Here we define a function for data mapping, e.g.,
+Here we define a function for data mapping, e.g.,
 - clean up the `id` field
 - map `title` -> `title`
 - clean up the `price` field

@@ -131,7 +131,7 @@ It performs the following transformations:

-## Extract taxonomy and complementary taxonomy
+## Extract taxonomy and complementary taxonomy

 ![Product Taxonomy Info](/img/examples/product_recommendation/taxonomy.png)

 Since we are using LLM to extract product taxonomy, we need to provide a detaile

 ```python
 class ProductTaxonomy:
     """
     Taxonomy for the product.
-
+
     A taxonomy is a concise noun (or short noun phrase), based on its core functionality, without specific details such as branding, style, etc.
     Always use the most common words in US English.

@@ -175,7 +175,7 @@ class ProductTaxonomyInfo:
 ```

-For each product, we want some insight about its taxonomy and complementary taxonomy, and we could use that as a bridge to find related products using the knowledge graph.
+For each product, we want some insight about its taxonomy and complementary taxonomy, and we could use that as a bridge to find related products using the knowledge graph.

@@ -210,7 +210,7 @@ with taxonomy['complementary_taxonomies'].row() as t:

 ## Build knowledge graph

-
+
 ### Basic concepts
 All nodes for Neo4j need two things:
 1. Label: The type of the node. E.g., `Product`, `Taxonomy`.

@@ -222,7 +222,7 @@ CocoIndex uses the primary key field to match the nodes and deduplicate them. If

 There are two ways to map nodes:
 1. When you have a collector just for the node, you can directly export it to Neo4j. For example `Product`. We've collected each product explicitly.
 2.
When you have a collector for relationships connecting to the node, you can map nodes from selected fields in the relationship collector. You must declare a node label and primary key field.

 For example,

@@ -267,10 +267,10 @@ This exports Neo4j nodes with label `Product` from the `product_node` collector.

 ### Export `Taxonomy` nodes to Neo4j

-We don't have an explicit collector for `Taxonomy` nodes.
+We don't have an explicit collector for `Taxonomy` nodes.
 They are part of the `product_taxonomy` and `product_complementary_taxonomy` collectors, and their fields are collected during the taxonomy extraction.

-To export them as Neo4j nodes, we need to first declare `Taxonomy` nodes.
+To export them as Neo4j nodes, we need to first declare `Taxonomy` nodes.

 ```python
 flow_builder.declare(
@@ -352,7 +352,7 @@ In a relationship, there's:
 2. A relationship connecting the source and target.

 Note that different relationships may share the same source and target nodes.

-`NodeFromFields` takes the fields from the `entity_relationship` collector and creates `Taxonomy` nodes.
+`NodeFromFields` takes the fields from the `entity_relationship` collector and creates `Taxonomy` nodes.

 ## Run the flow

@@ -366,7 +366,7 @@

    ```sh
    cocoindex update --setup main.py
    ```
-
+
    You'll see the index update status in the terminal. For example, you'll see the following output:

    ```
@@ -398,4 +398,3 @@
 cocoindex server -ci main
 ```

 And then open the url `https://cocoindex.io/cocoinsight`. It just connects to your local CocoIndex server, with zero pipeline data retention.
-
diff --git a/docs/docs/examples/examples/simple_vector_index.md b/docs/docs/examples/examples/simple_vector_index.md
index 1b467bed..dc55d2d2 100644
--- a/docs/docs/examples/examples/simple_vector_index.md
+++ b/docs/docs/examples/examples/simple_vector_index.md
@@ -18,7 +18,7 @@ import { GitHubButton, YouTubeButton, DocumentationButton } from '../../../src/c
 ![Simple Vector Index](/img/examples/simple_vector_index/cover.png)

 ## Overview
-In this tutorial, we will build an index with text embeddings and query it with natural language.
+In this tutorial, we will build an index with text embeddings and query it with natural language.
 We try to keep it minimalistic and focus on the gist of the indexing flow.

@@ -33,7 +33,7 @@ We try to keep it minimalistic and focus on the gist of the indexing flow.

 ## Prerequisites
 - [Install Postgres](https://cocoindex.io/docs/getting_started/installation).
-CocoIndex uses Postgres to keep track of data lineage for incremental processing.
+CocoIndex uses Postgres to keep track of data lineage for incremental processing.
## Add Source @@ -69,7 +69,7 @@ with data_scope["documents"].row() as doc: -### Embed each chunk +### Embed each chunk ```python with doc["chunks"].row() as chunk: @@ -77,7 +77,7 @@ with doc["chunks"].row() as chunk: cocoindex.functions.SentenceTransformerEmbed( model="sentence-transformers/all-MiniLM-L6-v2" ) - ) + ) doc_embeddings.collect(filename=doc["filename"], location=chunk["location"], text=chunk["text"], embedding=chunk["embedding"]) ``` @@ -124,7 +124,7 @@ def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[lis This code defines a transformation function that converts text into vector embeddings using the SentenceTransformer model. `@cocoindex.transform_flow()` is needed to share the transformation across indexing and query. -This decorator marks this as a reusable transformation flow that can be called on specific input data from user code using `eval()`, as shown in the search function below. +This decorator marks this as a reusable transformation flow that can be called on specific input data from user code using `eval()`, as shown in the search function below. ### Write query @@ -212,4 +212,4 @@ You can walk through the project step by step in [CocoInsight](https://www.youtu cocoindex server -ci main ``` -Follow the url `https://cocoindex.io/cocoinsight`. It connects to your local CocoIndex server, with zero pipeline data retention. \ No newline at end of file +Follow the url `https://cocoindex.io/cocoinsight`. It connects to your local CocoIndex server, with zero pipeline data retention. diff --git a/docs/docs/examples/index.md b/docs/docs/examples/index.md index a0cdadd7..cb676725 100644 --- a/docs/docs/examples/index.md +++ b/docs/docs/examples/index.md @@ -1,11 +1,11 @@ --- description: Learn to implement real-world solutions with CocoIndex through practical examples -title: Featured Examples +title: Featured Examples canonicalUrl: '/examples' slug: '/examples' --- import DocCardList from '@theme/DocCardList'; - \ No newline at end of file + diff --git a/docs/docs/getting_started/quickstart.md b/docs/docs/getting_started/quickstart.md index 6d0f1e49..959764ca 100644 --- a/docs/docs/getting_started/quickstart.md +++ b/docs/docs/getting_started/quickstart.md @@ -68,7 +68,7 @@ doc_embeddings = data_scope.add_collector() -### Process each document +### Process each document With CocoIndex, it is easy to process nested data structures. @@ -98,7 +98,7 @@ We extend a new field `chunks` to each row by *transforming* the `content` field ```python title="main.py" with doc["chunks"].row() as chunk: - # embed + # embed chunk["embedding"] = chunk["text"].transform( cocoindex.functions.SentenceTransformerEmbed( model="sentence-transformers/all-MiniLM-L6-v2" @@ -119,7 +119,7 @@ This code embeds each chunk using the SentenceTransformer library and collects t ![Embedding](/img/examples/simple_vector_index/embed.png) - + ### Export the embeddings to Postgres ```python title="main.py" @@ -174,4 +174,4 @@ If you want to build a end to end query flow that also searches the index, you c Next, you may want to: * Learn about [CocoIndex Basics](../core/basics.md). -* Explore more of what you can build with CocoIndex in the [examples](https://cocoindex.io/docs/examples) directory. +* Explore more of what you can build with CocoIndex in the [examples](https://cocoindex.io/docs/examples) directory. 
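If you prefer driving the same steps from Python rather than the CLI, a minimal sketch is below. It assumes the quickstart flow object is named `text_embedding_flow` and that the flow exposes `setup()` and `update()` mirroring `cocoindex update --setup main.py`; both are assumptions to verify against the API reference.

```python
import cocoindex
from main import text_embedding_flow  # assumed flow name from the quickstart

# Load settings (e.g. COCOINDEX_DATABASE_URL) from the environment.
cocoindex.init()

text_embedding_flow.setup()            # create/migrate backend tables
stats = text_embedding_flow.update()   # one-pass incremental update
print(stats)
```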
diff --git a/docs/docusaurus.config.ts b/docs/docusaurus.config.ts index 0e93fcc8..01ff5cc7 100644 --- a/docs/docusaurus.config.ts +++ b/docs/docusaurus.config.ts @@ -123,7 +123,7 @@ const config: Config = { type: 'doc', docId: 'examples/index', position: 'left', - }, + }, { to: 'https://cocoindex.io/blogs/', label: 'Blog', position: 'left', target: '_self' }, { type: 'html', diff --git a/docs/src/theme/DocCard/index.tsx b/docs/src/theme/DocCard/index.tsx index 6fa5aa5c..504683a8 100644 --- a/docs/src/theme/DocCard/index.tsx +++ b/docs/src/theme/DocCard/index.tsx @@ -65,8 +65,8 @@ export default function DocCard({ item }: Props): ReactNode { const doc = useDocById(item.docId ?? undefined); // Extract tags from customProps or doc metadata - const tags: string[] | undefined = - (item?.customProps?.tags as string[]) || + const tags: string[] | undefined = + (item?.customProps?.tags as string[]) || undefined; return ( @@ -78,4 +78,4 @@ export default function DocCard({ item }: Props): ReactNode { tags={tags} /> ); -} \ No newline at end of file +} diff --git a/docs/src/theme/DocCard/styles.module.css b/docs/src/theme/DocCard/styles.module.css index 1022a7cb..94c056b0 100644 --- a/docs/src/theme/DocCard/styles.module.css +++ b/docs/src/theme/DocCard/styles.module.css @@ -10,11 +10,11 @@ /* transform: translateY(-2px); */ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08); } - + .cardTitleContainer { display: flex; } - + .cardTitle { font-size: 1rem; margin-top: 16px; @@ -53,8 +53,8 @@ border: 1px solid var(--ifm-color-emphasis-300); background: var(--ifm-background-color); color: var(--ifm-color-emphasis-700); - transition: + transition: background 0.15s, border-color 0.15s, color 0.15s; -} \ No newline at end of file +} diff --git a/docs/src/theme/DocCardList/index.tsx b/docs/src/theme/DocCardList/index.tsx index 4b8b234c..440289f4 100644 --- a/docs/src/theme/DocCardList/index.tsx +++ b/docs/src/theme/DocCardList/index.tsx @@ -83,4 +83,4 @@ export default function DocCardList(props: Props): ReactNode { ); -} \ No newline at end of file +} diff --git a/docs/src/theme/DocCardList/styles.module.css b/docs/src/theme/DocCardList/styles.module.css index cda42d17..a033455d 100644 --- a/docs/src/theme/DocCardList/styles.module.css +++ b/docs/src/theme/DocCardList/styles.module.css @@ -39,7 +39,7 @@ white-space: nowrap; min-height: 28px; text-align: center; - transition: + transition: background 0.15s, border-color 0.15s, color 0.15s, @@ -85,7 +85,7 @@ user-select: none; min-height: 28px; text-align: center; - transition: + transition: background 0.15s, border-color 0.15s, color 0.15s, @@ -122,18 +122,18 @@ padding: 0.25rem 0; margin-bottom: 1rem; } - + .tagSelectorGrid { gap: 0.4rem; } - + .tagLabel, .allTagsLabel { padding: 0.15rem 0.6rem; font-size: 0.75rem; min-height: 24px; } - + .tagSelectorTitle { font-size: 1rem; margin-bottom: 0.75rem; @@ -144,11 +144,11 @@ .tagSelectorContainer { padding: 0.1rem 0; } - + .tagSelectorGrid { gap: 0.25rem; } - + .tagLabel, .allTagsLabel { padding: 0.12rem 0.5rem; diff --git a/examples/postgres_source/.env b/examples/postgres_source/.env index 03199437..54ea8e87 100644 --- a/examples/postgres_source/.env +++ b/examples/postgres_source/.env @@ -4,4 +4,4 @@ COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex # Source Database, for data source -SOURCE_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex \ No newline at end of file +SOURCE_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex