diff --git a/README.md b/README.md index 84bfda1f..206c6070 100644 --- a/README.md +++ b/README.md @@ -202,6 +202,7 @@ It defines an index flow like this: | [Custom Output Files](examples/custom_output_files) | Convert markdown files to HTML files and save them to a local directory, using *CocoIndex Custom Targets* | | [Patient intake form extraction](examples/patient_intake_extraction) | Use LLM to extract structured data from patient intake forms with different formats | | [HackerNews Trending Topics](examples/hn_trending_topics) | Extract trending topics from HackerNews threads and comments, using *CocoIndex Custom Source* and LLM | +| [Patient Intake Form Extraction with BAML](examples/patient_intake_extraction_baml) | Extract structured data from patient intake forms using BAML | More coming and stay tuned 👀! diff --git a/examples/patient_intake_extraction_baml/.env.example b/examples/patient_intake_extraction_baml/.env.example new file mode 100644 index 00000000..5ce3a5a3 --- /dev/null +++ b/examples/patient_intake_extraction_baml/.env.example @@ -0,0 +1,4 @@ +# Postgres database address for cocoindex +COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex + +GEMINI_API_KEY= diff --git a/examples/patient_intake_extraction_baml/.gitignore b/examples/patient_intake_extraction_baml/.gitignore new file mode 100644 index 00000000..7bc98a23 --- /dev/null +++ b/examples/patient_intake_extraction_baml/.gitignore @@ -0,0 +1,20 @@ +# BAML generated files +baml_client/ +.baml/ + +# Environment files +.env + +# Python +__pycache__/ +*.pyc +*.pyo +*.pyd +.Python +*.so +*.egg-info/ +dist/ +build/ + +# CocoIndex +.cocoindex/ diff --git a/examples/patient_intake_extraction_baml/README.md b/examples/patient_intake_extraction_baml/README.md new file mode 100644 index 00000000..a3aaed19 --- /dev/null +++ b/examples/patient_intake_extraction_baml/README.md @@ -0,0 +1,53 @@ +# Extract structured data from patient intake forms with BAML + 
+[![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) +We appreciate a star ⭐ at [CocoIndex GitHub](https://github.com/cocoindex-io/cocoindex) if this is helpful. + +This example shows how to use [BAML](https://boundaryml.com/) to extract structured data from patient intake PDFs. BAML provides type-safe structured data extraction with native PDF support. + +- **BAML Schema** (`baml_src/patient.baml`) - Defines the data structure and extraction function +- **CocoIndex Flow** (`main.py`) - Wraps BAML in a custom function and defines the flow that processes files incrementally. + +## Prerequisites + +1. [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one. + +2. Install dependencies + + ```sh + pip install -U cocoindex baml-py + ``` + +3. **Generate BAML client code** (required step!) + + ```sh + baml-cli generate + ``` + + This generates the `baml_client/` directory with Python code to call your BAML functions. + +4. Create a `.env` file. You can copy it from `.env.example` first: + + ```sh + cp .env.example .env + ``` + + Then edit the file to fill in your `GEMINI_API_KEY`. + +## Run + +Update index: + +```sh +cocoindex update main +``` + +## CocoInsight + +I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline. It just connects to your local CocoIndex server, with zero pipeline data retention. Run the following command to start CocoInsight: + +```sh +cocoindex server -ci main +``` + +Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight). 
diff --git a/examples/patient_intake_extraction_baml/baml_src/generators.baml b/examples/patient_intake_extraction_baml/baml_src/generators.baml new file mode 100644 index 00000000..e3d55f98 --- /dev/null +++ b/examples/patient_intake_extraction_baml/baml_src/generators.baml @@ -0,0 +1,7 @@ +// BAML Generator Configuration + +generator python_client { + output_type python/pydantic + output_dir "../" + version "0.212.0" +} diff --git a/examples/patient_intake_extraction_baml/baml_src/patient.baml b/examples/patient_intake_extraction_baml/baml_src/patient.baml new file mode 100644 index 00000000..e47d1385 --- /dev/null +++ b/examples/patient_intake_extraction_baml/baml_src/patient.baml @@ -0,0 +1,91 @@ +// BAML Schema for Patient Intake Form Extraction + +class Contact { + name string + phone string + relationship string +} + +class Address { + street string + city string + state string + zip_code string +} + +class Pharmacy { + name string + phone string + address Address +} + +class Insurance { + provider string + policy_number string + group_number string? + policyholder_name string + relationship_to_patient string +} + +class Condition { + name string + diagnosed bool +} + +class Medication { + name string + dosage string +} + +class Allergy { + name string +} + +class Surgery { + name string + date string +} + +class Patient { + name string + dob string + gender string + address Address + phone string + email string + preferred_contact_method string + emergency_contact Contact + insurance Insurance? + reason_for_visit string + symptoms_duration string + past_conditions Condition[] + current_medications Medication[] + allergies Allergy[] + surgeries Surgery[] + occupation string? + pharmacy Pharmacy? + consent_given bool + consent_date string? +} + +function ExtractPatientInfo(intake_form: pdf) -> Patient { + client Gemini + prompt #" + Extract all patient information from the following intake form document. 
+ Please be thorough and extract all available information accurately. + + {{ intake_form }} + + Fill in with "N/A" for required fields if the information is not available. + + {{ ctx.output_format }} + "# +} + +client Gemini { + provider google-ai + options { + model gemini-2.5-flash + api_key env.GEMINI_API_KEY + } +} diff --git a/examples/patient_intake_extraction_baml/data/README.md b/examples/patient_intake_extraction_baml/data/README.md new file mode 100644 index 00000000..43f941f6 --- /dev/null +++ b/examples/patient_intake_extraction_baml/data/README.md @@ -0,0 +1,4 @@ +## Note: +Example files here are purely artificial and not real, for testing purposes only. +Please do not use these examples for any other purpose. + diff --git a/examples/patient_intake_extraction_baml/data/patient_forms/Patient_Intake_Form_David_Artificial.pdf b/examples/patient_intake_extraction_baml/data/patient_forms/Patient_Intake_Form_David_Artificial.pdf new file mode 100644 index 00000000..5fd43832 Binary files /dev/null and b/examples/patient_intake_extraction_baml/data/patient_forms/Patient_Intake_Form_David_Artificial.pdf differ diff --git a/examples/patient_intake_extraction_baml/data/patient_forms/Patient_Intake_Form_Emily_Artificial.pdf b/examples/patient_intake_extraction_baml/data/patient_forms/Patient_Intake_Form_Emily_Artificial.pdf new file mode 100644 index 00000000..09cff13c Binary files /dev/null and b/examples/patient_intake_extraction_baml/data/patient_forms/Patient_Intake_Form_Emily_Artificial.pdf differ diff --git a/examples/patient_intake_extraction_baml/data/patient_forms/Patient_Intake_Form_Joe_Artificial.pdf b/examples/patient_intake_extraction_baml/data/patient_forms/Patient_Intake_Form_Joe_Artificial.pdf new file mode 100644 index 00000000..cc15c5f0 Binary files /dev/null and b/examples/patient_intake_extraction_baml/data/patient_forms/Patient_Intake_Form_Joe_Artificial.pdf differ diff --git 
a/examples/patient_intake_extraction_baml/data/patient_forms/Patient_Intake_From_Jane_Artificial.pdf b/examples/patient_intake_extraction_baml/data/patient_forms/Patient_Intake_From_Jane_Artificial.pdf new file mode 100644 index 00000000..02e26bf0 Binary files /dev/null and b/examples/patient_intake_extraction_baml/data/patient_forms/Patient_Intake_From_Jane_Artificial.pdf differ diff --git a/examples/patient_intake_extraction_baml/main.py b/examples/patient_intake_extraction_baml/main.py new file mode 100644 index 00000000..95c91ce8 --- /dev/null +++ b/examples/patient_intake_extraction_baml/main.py @@ -0,0 +1,54 @@ +import os +import base64 +from baml_client import b +from baml_client.types import Patient +import baml_py +import cocoindex + + +@cocoindex.op.function(cache=True, behavior_version=1) +async def extract_patient_info(content: bytes) -> Patient: + pdf = baml_py.Pdf.from_base64(base64.b64encode(content).decode("utf-8")) + return await b.ExtractPatientInfo(pdf) + + +@cocoindex.flow_def(name="PatientIntakeExtractionBaml") +def patient_intake_extraction_flow( + flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope +) -> None: + """ + Define a flow that extracts patient information from intake forms using BAML. + + This flow: + 1. Reads patient intake PDF files from data/patient_forms as binary + 2. Directly extracts structured patient information by passing each file to BAML as a PDF + 3. 
Stores the results in a Postgres database + """ + # Load documents from local file source (binary mode) + data_scope["documents"] = flow_builder.add_source( + cocoindex.sources.LocalFile( + path=os.path.join("data", "patient_forms"), binary=True + ) + ) + + # Create collector for patient data + patients_index = data_scope.add_collector() + + # Process each document + with data_scope["documents"].row() as doc: + # Extract patient information using BAML directly from file bytes + # (extract_patient_info wraps the raw bytes as a BAML Pdf input) + doc["patient_info"] = doc["content"].transform(extract_patient_info) + + # Collect the extracted patient information + patients_index.collect( + filename=doc["filename"], + patient_info=doc["patient_info"], + ) + + # Export to Postgres + patients_index.export( + "patients", + cocoindex.storages.Postgres(), + primary_key_fields=["filename"], + ) diff --git a/examples/patient_intake_extraction_baml/pyproject.toml b/examples/patient_intake_extraction_baml/pyproject.toml new file mode 100644 index 00000000..8ba0e6de --- /dev/null +++ b/examples/patient_intake_extraction_baml/pyproject.toml @@ -0,0 +1,9 @@ +[project] +name = "patient-intake-extraction-baml" +version = "0.1.0" +description = "Extract structured information from patient intake forms using BAML." +requires-python = ">=3.10" +dependencies = ["cocoindex>=0.3.2", "python-dotenv>=1.0.1", "baml-py>=0.212.0"] + +[tool.setuptools] +packages = []