In [52]:
import asyncio
import json
import logging
import os
from datetime import datetime
from typing import List, Sequence, Any
from typing import List, Optional, Dict
from zoneinfo import ZoneInfo

import aiohttp
import requests
from dotenv import load_dotenv
from fastembed import TextEmbedding
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_pymupdf4llm import PyMuPDF4LLMLoader
from ollama import AsyncClient
from openai import OpenAI
from pydantic import BaseModel, Field
from sqlalchemy import text
from tqdm import tqdm

from db.db_connection_pool import get_engine, get_conn
from db.db_connection_pool_using_pycopg2 import get_connection, release_connection, close_pool
from db.schema import Document_Chunk

In [53]:
# Load variables from a .env file into the process environment.
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)

True

In [3]:
# Location of the source PDF that is converted to markdown.
# PyMuPDF4LLM API reference:
# https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/api.html#pymupdf4llm-api
#
# The folder can be overridden with the POLICY_WORDING_DIR environment variable
# (for example from the .env file) so the notebook is not tied to one machine.
# The original local path is kept as the default.
FOLDER_PATH = os.getenv("POLICY_WORDING_DIR", r"C:\Users\aibag\git_repo\policy_wording")

FILE_NAME = "state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF"


In [4]:
# Root logging configuration: INFO and above, each record prefixed with
# timestamp, level and logger name.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=logging.INFO,
)

# Logger used by the cells of this notebook.
logger = logging.getLogger(__name__)

# Timezone-aware "now" in New Zealand, captured once when this cell runs and
# reused later as the created/updated timestamp of the chunks.
current_nz_datetime = datetime.now(ZoneInfo("Pacific/Auckland"))


#### Ingest

In [5]:
# function to load pdf file and convert a pdf file to a markdown file
async def load_pdf_file(filepath:str, filename:str, mode:str ="single") -> List[Document]:
    """Load a PDF with PyMuPDF4LLMLoader and return it as markdown Documents.

    Args:
        filepath: Folder containing the PDF.
        filename: File name of the PDF inside ``filepath``.
        mode: Loader mode. "single" loads the entire PDF as one Document;
            "page" loads each page as its own Document.

    Returns:
        The Documents produced by the loader (an empty list if it yields none).

    Raises:
        FileNotFoundError: ``filepath/filename`` does not exist.
        TypeError: ``filename`` does not end in ".pdf" (case-insensitive).
    """
    full_path = os.path.join(filepath, filename)
    logger.info(f"Processing file: {full_path}")

    if not os.path.exists(full_path):
        raise FileNotFoundError(f"File not found: {full_path}")

    if not filename.lower().strip().endswith(".pdf"):
        raise TypeError ("Invalid File Type; only PDFs are allowed.")

    # The custom pages_delimiter marks where each page ends when the whole PDF
    # is loaded as a single Document.
    doc_loader = PyMuPDF4LLMLoader(
        full_path,
        mode=mode,
        pages_delimiter="<<-- PAGE BREAK -->>\n\n",
        table_strategy="lines_strict",  # other strategies: "lines", "text"
    )

    # Lazy loading: documents are pulled from the loader one at a time.
    docs = [doc async for doc in doc_loader.alazy_load()]

    # Guard the summary log line: indexing docs[0] on an empty result would
    # raise IndexError after the load itself had succeeded.
    if not docs:
        logger.warning(f"No content was loaded from file: {filename}")
        return docs

    # Looked up outside the f-string so the code does not depend on the
    # Python 3.12+ rule that allows reusing the same quote inside an f-string.
    total_pages = docs[0].metadata.get("total_pages", "unknown")
    logger.info(f"Successfully processed file: {filename}; Total Pages: {total_pages}")

    return docs

# Load the configured PDF as one markdown Document (mode="single").
# Top-level await relies on the notebook's already-running event loop.
doc_obj =  await load_pdf_file(FOLDER_PATH, FILE_NAME, mode="single")


2025-10-11 22:03:11,707 [INFO] __main__: Processing file: C:\Users\aibag\git_repo\policy_wording\state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF
2025-10-11 22:03:20,855 [INFO] __main__: Successfully processed file: state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF; Total Pages: 61


#### Chunking

In [6]:
# functions to chunk/split a markdown file into chunks
def chunk_header_splitter(doc_contents)->list[Document]:
    """Split markdown text into Documents at heading boundaries.

    Headings made of one to seven "#" characters are recognised and recorded
    in each chunk's metadata under "Header 1" ... "Header 7". The heading
    line itself is kept in the chunk text (strip_headers=False).

    Args:
        doc_contents: Markdown text to split.

    Returns:
        The Documents returned by MarkdownHeaderTextSplitter.split_text.
    """
    # ("#", "Header 1"), ("##", "Header 2"), ... ("#######", "Header 7")
    header_levels = [("#" * depth, f"Header {depth}") for depth in range(1, 8)]

    splitter = MarkdownHeaderTextSplitter(header_levels, strip_headers=False)
    return splitter.split_text(doc_contents)

def chunk_header_recursivesplitter(doc_contents)->list:
    """Split text into overlapping character-based chunks.

    Uses RecursiveCharacterTextSplitter with a chunk size of 2000, an overlap
    of 200 and separators tried in the listed order (sentence endings first,
    then paragraph, line, space and finally any character).

    Args:
        doc_contents: Text to split.

    Returns:
        The list of chunk strings produced by split_text.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=200,
        separators=[". ", "! ", "? ", "\n\n", "\n", " ", ""],
    ).split_text(doc_contents)

#print(chunk_header_splitter(doc_obj[0].page_content))
#print(chunk_header_recursivesplitter(doc[0].page_content))

In [7]:
# Model of one chunk record destined for the vector DB.
# Fields are documented with comments rather than a class docstring so the
# model's generated JSON schema does not gain a "description" entry.
class DocumentChunk(BaseModel):
    # Embedding vector of the (enriched) chunk; created empty and filled in later.
    embedding: List[float]
    # Raw markdown text of the chunk.
    chunk_text: str
    # String-to-string metadata about the source document and the chunk's header.
    metadata: Optional[Dict[str, str]] = Field(default_factory=dict)
    # Name of the file the chunk came from.
    file_name: str
    # Free-form labels attached to the chunk.
    tags: Optional[List[str]] = Field(default_factory=list)
    isActive: bool = Field(default=False)
    # Enriched text (LLM context + chunk text). Declared Optional because the
    # default is None; the previous plain `str` annotation contradicted it and
    # rejected an explicitly passed None.
    chunk_enrichment: Optional[str] = None
    version: Optional[str] = None
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None

# Structured output expected back from the enrichment LLM call.
# Documented with comments only: this model's JSON schema is sent to the LLM
# as the response format, and a class docstring would be copied into that
# schema as its "description".
class ChunkEnrichment(BaseModel):
    # Short context that situates the chunk within the whole document.
    chunk_summary: str
    # Questions the chunk can answer. The field name is misspelled but it is
    # part of the schema and is read by the enrichment code, so it is kept.
    hypotetical_questions: List[str]
    # Whether the chunk contains a table, and a summary of it when it does.
    has_table: bool = False
    table_summary: Optional[str] = None

In [8]:
def chunk_document(doc_obj)->"list[DocumentChunk]":
    """Split the first loaded Document into DocumentChunk records.

    The markdown in ``doc_obj[0]`` is split on headings; chunks shorter than
    30 characters are dropped; every remaining chunk becomes a DocumentChunk
    with an empty embedding and document-level plus header metadata.

    Args:
        doc_obj: List of loaded Documents; only the first one is used.

    Returns:
        The DocumentChunk records (an empty list when ``doc_obj`` is empty).

    Raises:
        Exception: whatever chunk_header_splitter raises is logged and
            re-raised. Previously it was only logged and the function then
            failed with a NameError on the unbound ``chunks`` variable.
    """
    if not doc_obj:
        logger.warning("chunk_document received no documents; nothing to chunk.")
        return []

    source_doc = doc_obj[0]

    # get metadata from the doc object
    source_path = source_doc.metadata.get("source", "")
    # Take the last path component, accepting both "\" and "/" separators so
    # the result does not depend on which OS produced the path.
    file_name = source_path.replace("\\", "/").rsplit("/", 1)[-1] if source_path else "unknown"

    doc_metadata = {
        "source"        : source_doc.metadata.get("source"),
        "file_name"     : file_name,
        "total_pages"   : str(source_doc.metadata.get("total_pages")),
        "creation_date" : source_doc.metadata.get("creationdate"),
    }

    try:
        chunks = chunk_header_splitter(source_doc.page_content)
    except Exception as e:
        logger.error(f"Call to chunk_header_splitter failed: {e}")
        raise
    logger.info(f"Chunk Size for document '{file_name}': {len(chunks)}")

    doc_chunks_list = []

    for chunk in chunks:

        # skip chunks with fewer than 30 characters
        if len(chunk.page_content) < 30:
            continue

        # additional metadata
        chunk_metadata = doc_metadata.copy()

        # First header entry of the chunk's metadata.
        # NOTE(review): the recorded output suggests this is the outermost
        # enclosing header, not the chunk's own heading - confirm that is the
        # intended value.
        header_key, header_value = next(iter(chunk.metadata.items()), (None, None))

        # A chunk without any header metadata yields (None, None); store empty
        # strings instead of failing on None.replace(...).
        chunk_metadata["header_key"] = header_key or ""
        chunk_metadata["header_value"] = (header_value or "").replace("*","").replace("#","")

        # create an instance of DocumentChunk
        doc_chunk = DocumentChunk(
            embedding = [],
            chunk_text = chunk.page_content,
            metadata = chunk_metadata,
            file_name = file_name,
            isActive = True,
            chunk_enrichment = "",
            version = "1",
            tags = ["Home", "State"],
            created_at = current_nz_datetime,
            updated_at= current_nz_datetime,
        )

        doc_chunks_list.append(doc_chunk)

    return doc_chunks_list

# Split the loaded document into DocumentChunk records; their embeddings are
# created empty here and filled in by a later cell.
doc_chunks = chunk_document(doc_obj)
#print(chunk_document(doc_obj))

2025-10-11 22:03:20,886 [INFO] __main__: Chunk Size for document 'state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF': 135


#### Enrich Chunks

In [None]:
# Prompts used to enrich each chunk with document-level context.
# Approach: https://www.anthropic.com/engineering/contextual-retrieval
#
# PromptTemplate is used instead of ChatPromptTemplate because both prompts
# are rendered with .format() into plain strings that are then placed in
# explicit system/user messages. ChatPromptTemplate.format() renders its
# messages with a role label, so each string would start with "Human: ".
enrichment_system_prompt = PromptTemplate.from_template("""
You are an expert general insurance underwriting product manager.
You will be given a document and a chunk of text from the document.

Please give a short context to situate this chunk within the overall document for
the purposes of improving search retrieval of the chunk.

Provide a summary of the table ONLY IF the chunk includes a table.

Also provide 3 to 5 hypothetical questions that the chunk will be able to answer.

Answer only with the succinct context, the list of hypothetical questions and,
where applicable, the table summary; nothing else.

Here's the document:
{document_text}
""")

enrichment_user_prompt = PromptTemplate.from_template("""
Here's the chunk:
{chunk_text}
""")


In [10]:
# invoke - local LM Studio model through its OpenAI-compatible endpoint
def invoke_llm_using_openai(system_prompt, query_prompt):
    """Ask the local model for a structured ChunkEnrichment answer.

    Args:
        system_prompt: Text sent as the system message.
        query_prompt: Text sent as the user message.

    Returns:
        The ``parsed`` attribute of the first choice's message; the call
        requests ChunkEnrichment as the response format.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query_prompt},
    ]

    lm_studio = OpenAI(base_url="http://localhost:1234/v1", api_key="lmstudio")
    completion = lm_studio.chat.completions.parse(
        model="google/gemma-3-4b",
        messages=conversation,
        response_format=ChunkEnrichment,
        temperature=0.0,
        top_p=0.9,
        max_tokens=500,
    )

    first_choice = completion.choices[0]
    return first_choice.message.parsed


In [17]:
# async invoke - local Ollama server (REST /api/generate endpoint)
async def ainvoke_llm_api_generate(prompt):
    """Send a single prompt to the local Ollama server and return its text.

    Args:
        prompt: Prompt text for the model.

    Returns:
        The "response" field of the JSON reply.

    Raises:
        aiohttp.ClientResponseError: the server answered with an HTTP error
            status (previously this surfaced later as a KeyError on "response").
    """
    payload = {
        "model": "gemma3:4b",
        "prompt": prompt,
        "stream": False,
        # Sampling parameters must be nested under "options"; sent as
        # top-level keys they are not applied by the Ollama API.
        "options": {
            "temperature": 0,
            "top_p": 0.90,
        },
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(
            "http://localhost:11434/api/generate",
            json=payload,
        ) as response:
            response.raise_for_status()
            resp_json = await response.json()
            return resp_json["response"]

In [57]:
# Using Ollama Model
# https://github.com/ollama/ollama-python
async def invokeOllamaModel(system_prompt, query_prompt, formatModel):
    """Chat with the local gemma3:4b model and validate its JSON reply.

    Args:
        system_prompt: Text sent as the system message.
        query_prompt: Text sent as the user message.
        formatModel: Pydantic model class; its JSON schema is passed as the
            requested response format and it validates the reply.

    Returns:
        An instance of ``formatModel`` built from the reply content.
    """
    chat_reply = await AsyncClient().chat(
        model='gemma3:4b',
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query_prompt},
        ],
        format=formatModel.model_json_schema(),
    )
    return formatModel.model_validate_json(chat_reply.message.content)


In [None]:
# Smoke test: enrich only the first chunk and print the structured result.
# The system prompt carries the whole document text; the user prompt carries the chunk.
system_prompt = enrichment_system_prompt.format(document_text=doc_obj[0].page_content)
query_prompt = enrichment_user_prompt.format(chunk_text=doc_chunks[0].chunk_text)

response = await invokeOllamaModel(system_prompt, query_prompt, ChunkEnrichment)
print(response)

2025-10-11 22:31:11,302 [INFO] httpx: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


chunk_summary='This is an introductory paragraph from State Insurance, thanking the customer and explaining the importance of reading the policy wording and schedule for claim information.' hypotetical_questions=["What does 'policy wording' refer to?", 'Why is it important to keep the policy schedule and wording handy?', 'What happens if I need to make a claim?'] has_table=False table_summary=None


#### Chunk Enrichment and Embedding

In [62]:
# Embedding model shared by the cells below.
embedding_model = TextEmbedding(model_name="BAAI/bge-base-en-v1.5")

def invoke_embedding(chunk_text):
    """Embed one text and return its vector.

    Args:
        chunk_text: Text to embed.

    Returns:
        The first item yielded by ``embedding_model.embed``.
    """
    # embed() hands back a generator, so the first (only) vector is pulled with next().
    return next(embedding_model.embed(chunk_text))

In [None]:
# Progress bar for chunk enrichment within the current file
system_prompt = enrichment_system_prompt.format(document_text = doc_obj[0].page_content)

with tqdm(total=len(doc_chunks), desc=f"Enriching Chunks", leave=False) as pbar_chunks:
    for idx, chunk in enumerate(doc_chunks):
        
        print("-"*50)
        print(chunk)

        # enrich chunk using LLM
        query_prompt =  enrichment_user_prompt.format(chunk_text = chunk.chunk_text)
        enriched_text = await invokeOllamaModel(system_prompt, query_prompt, formatModel=ChunkEnrichment)

        enriched_parts = []
        if enriched_text:
            # pre-pend to the chunk:
            enriched_parts = [
            f"Chunk Summary: {enriched_text.chunk_summary}\n",
            f"Questions this chunk may answer: {"\n".join(f"- {q}" for q in enriched_text.hypotetical_questions)}\n",
            ]
             
            if enriched_text.has_table and enriched_text.table_summary:
                enriched_parts.append(f"Table Summary: {enriched_text.table_summary}\n")
        else:
            logger.warning(f"Doc: {chunk.file_name} Chunk: {idx} failed enrichment.")

        enriched_parts.append(f"Chunk Text:\n{chunk.chunk_text}")

        enriched_chunk = "\n".join(enriched_parts)

        # embed enriched_chunk
        embedded_chunk = invoke_embedding(enriched_chunk)
        
        # write to the DocumentChunk object
        if embedded_chunk is not None and len(embedded_chunk) > 0:
            chunk.embedding = embedded_chunk

        if enriched_chunk:
            chunk.chunk_enrichment = enriched_chunk            

        # Update chunk-level progress bar
        pbar_chunks.update(1)

        #if idx > 3: break

Enriching Chunks:   0%|          | 0/134 [00:00<?, ?it/s]

--------------------------------------------------
embedding=[] chunk_text='###### **Thank you for choosing State Insurance**  \nThis policy wording, along with your policy schedule, contains all the information you need\nto know about your insurance cover. Please read these carefully and keep them on hand as\nyou will find them useful if you need to make a claim.' metadata={'source': 'C:\\Users\\aibag\\git_repo\\policy_wording\\state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF', 'file_name': 'state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF', 'total_pages': '61', 'creation_date': '2024-12-02T10:18:24+13:00', 'header_key': 'Header 6', 'header_value': 'Thank you for choosing State Insurance'} file_name='state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF' tags=['Home', 'State'] isActive=True chunk_enrichment='' version='1' created_at=datetime.datetime(2025, 10, 11, 2

2025-10-11 22:40:14,202 [INFO] httpx: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Enriching Chunks:   1%|          | 1/134 [00:05<12:05,  5.45s/it]

--------------------------------------------------
embedding=[] chunk_text='###### **How to contact us**  \nIn New Zealand, just call 0800 80 24 24. If you have any questions, need help, or want to\n[make a claim, our contact centres are available 7 days a week or visit state.co.nz](https://www.state.co.nz/)  \nCall us free from Australia 1 800 887 863  \nUnited States 1 800 593 9482  \nUnited Kingdom 0800 096 5308  \nCall us direct from Somewhere else overseas 64 9 969 1150  \nOur promise to customers includes communicating clearly.  \nThis document meets the WriteMark quality award, independent  \nproof we have achieved a high standard of plain language.  \nPolicy wording SI6995/2 12/24  \n<<-- PAGE BREAK -->>' metadata={'source': 'C:\\Users\\aibag\\git_repo\\policy_wording\\state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF', 'file_name': 'state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF', 'total_pages

2025-10-11 22:40:22,381 [INFO] httpx: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Enriching Chunks:   1%|▏         | 2/134 [00:13<15:34,  7.08s/it]

--------------------------------------------------
embedding=[] chunk_text='#### **Contents**  \nImportant information about your policy 3  \nSection 1 – Home Comprehensive Insurance 8  \n- Your cover at a glance 8  \n- Part one – cover for your home 10  \n- Part one – automatic benefits 18  \n- Part one – optional benefits 24  \n- Part two – your legal liability 25  \nSection 2 – Contents Comprehensive Insurance 26  \n- Your cover at a glance 26  \n- Part one – cover for contents 28  \n- Part one – automatic benefits 33  \n- Part one – optional benefits 39  \n- Part two – legal liability 40  \nExclusions – what we do not cover 41  \nClaims – what you need to do 49  \nConditions of your cover 50  \nDefinitions 54  \nHow we pay claims – some examples 56  \nPage 2 of 59  \n<<-- PAGE BREAK -->>' metadata={'source': 'C:\\Users\\aibag\\git_repo\\policy_wording\\state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF', 'file_name': 'state-home-comprehensive

2025-10-11 22:40:29,028 [INFO] httpx: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Enriching Chunks:   2%|▏         | 3/134 [00:20<15:01,  6.89s/it]

--------------------------------------------------
embedding=[] chunk_text='#### **Important information about your policy**  \n‘You’ and ‘your’ mean any person or entity shown as the Insured in your policy schedule. If you have\nContents Insurance, ‘you’ and ‘your’ include any partner of the Insured. ‘We’, ‘us’ and ‘our’ mean IAG  \nNew Zealand Limited.' metadata={'source': 'C:\\Users\\aibag\\git_repo\\policy_wording\\state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF', 'file_name': 'state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF', 'total_pages': '61', 'creation_date': '2024-12-02T10:18:24+13:00', 'header_key': 'Header 4', 'header_value': 'Important information about your policy'} file_name='state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF' tags=['Home', 'State'] isActive=True chunk_enrichment='' version='1' created_at=datetime.datetime(2025, 10, 11, 22, 3, 11,

2025-10-11 22:40:36,445 [INFO] httpx: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Enriching Chunks:   3%|▎         | 4/134 [00:27<15:20,  7.08s/it]

--------------------------------------------------
embedding=[] chunk_text='###### **What your policy is**  \nYour policy is a contract between you and us, and has three parts:  \n- This policy wording. It explains what we cover and do not cover, your responsibilities, how to contact us  \nand how to make a claim.  \n- Your policy schedule. This contains information specific to you, such as the type of insurance and cover  \nlevel you have, who and what is insured, your sums insured, the period of insurance and the premium  \nthat applies.  \n- Any addendum, endorsement or warranty that we apply. This may add special terms and conditions to  \nyour policy. It may be a separate document or printed in your policy schedule.' metadata={'source': 'C:\\Users\\aibag\\git_repo\\policy_wording\\state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF', 'file_name': 'state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF', 'to

2025-10-11 22:40:42,255 [INFO] httpx: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Enriching Chunks:   4%|▎         | 5/134 [00:33<14:14,  6.62s/it]

--------------------------------------------------
embedding=[] chunk_text='###### **Receiving your policy documents**  \nYou may choose to receive your policy documents by email or post:  \n- If we send your policy documents to you by email, we will send them to the person and email address you  \nnominated for receiving policy documents. Any policy documents we send to this email address will be  \nconsidered to have been received by you 24 hours after we send them.  \n- If we send your policy documents to you by post, we will send them to the person and mailing address you  \nnominated for receiving policy documents.  \nYou are responsible for making sure the person and email or mailing address we have for your policy  \ndocuments are correct. If any of these contact details change, you must let us know as soon as possible. We  \nwill consider you have received all policy documents we send to the person and email or mailing address you  \nnominated, even if those details are no long

2025-10-11 22:40:50,792 [INFO] httpx: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Enriching Chunks:   4%|▍         | 6/134 [00:42<15:32,  7.29s/it]

--------------------------------------------------
embedding=[] chunk_text='###### **How to read your policy**  \nWords in bold have a special meaning. These words and what they mean are listed in the section ‘Definitions’.  \nHeadings, examples and comments are a guide only. They do not change the meaning of your policy. The  \nheadings help you find your way around this policy wording. The examples and comments in boxes help you  \nunderstand how your policy works.' metadata={'source': 'C:\\Users\\aibag\\git_repo\\policy_wording\\state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF', 'file_name': 'state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF', 'total_pages': '61', 'creation_date': '2024-12-02T10:18:24+13:00', 'header_key': 'Header 4', 'header_value': 'Important information about your policy'} file_name='state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF' tags=['

2025-10-11 22:40:56,899 [INFO] httpx: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
                                                                 

In [None]:
# Inspect the enriched text stored on one chunk by the enrichment loop.
doc_chunks[5].chunk_enrichment

DocumentChunk(embedding=array([ 3.04781888e-02, -2.19033770e-02, -1.59611832e-02,  2.54837368e-02,
        3.70973423e-02,  2.73491349e-02, -3.24338488e-02,  3.17719318e-02,
        1.31254792e-02, -4.08883095e-02, -2.59651300e-02, -4.77782488e-02,
       -1.25312591e-02,  2.56642606e-02,  2.61306092e-02,  3.65858600e-02,
        4.28138822e-02,  1.90601517e-02, -3.33063714e-02,  2.18432043e-02,
       -2.67962855e-03,  3.24141030e-04,  8.82302970e-03,  7.00652367e-03,
        6.17988147e-02, -1.25312591e-02, -2.61757392e-02, -1.42311780e-02,
       -5.22913039e-02, -2.85375733e-02, -4.71238559e-03, -2.60102618e-02,
       -1.01242950e-02,  2.99065355e-02,  1.66325015e-03, -2.56191287e-02,
       -1.09366458e-02,  2.69429591e-02, -1.87385955e-03, -2.43141036e-03,
        1.34188272e-02,  6.47623930e-03, -4.89516407e-02, -2.70181783e-02,
       -3.99856977e-02,  2.24900758e-03, -2.42652111e-02,  1.19445622e-02,
       -7.69476499e-03,  4.11816593e-03, -4.54916321e-02,  3.67663838e-02,
 

In [69]:
# Using SQLAlchemy
# Create the engine via the project's db.db_connection_pool helper; the cells
# below open their connections from it.
db_engine = get_engine()

In [70]:
# Connectivity test: run a trivial query and print its scalar result (expected: 1).
async with get_conn(db_engine) as conn:
    stmt = text("SELECT 1")
    result = await conn.execute(stmt)
    print(result.scalar())


2025-10-11 22:45:50,167 INFO sqlalchemy.engine.Engine select pg_catalog.version()


2025-10-11 22:45:50,167 [INFO] sqlalchemy.engine.Engine: select pg_catalog.version()


2025-10-11 22:45:50,168 INFO sqlalchemy.engine.Engine [raw sql] ()


2025-10-11 22:45:50,168 [INFO] sqlalchemy.engine.Engine: [raw sql] ()


2025-10-11 22:45:50,172 INFO sqlalchemy.engine.Engine select current_schema()


2025-10-11 22:45:50,172 [INFO] sqlalchemy.engine.Engine: select current_schema()


2025-10-11 22:45:50,172 INFO sqlalchemy.engine.Engine [raw sql] ()


2025-10-11 22:45:50,172 [INFO] sqlalchemy.engine.Engine: [raw sql] ()


2025-10-11 22:45:50,184 INFO sqlalchemy.engine.Engine show standard_conforming_strings


2025-10-11 22:45:50,184 [INFO] sqlalchemy.engine.Engine: show standard_conforming_strings


2025-10-11 22:45:50,186 INFO sqlalchemy.engine.Engine [raw sql] ()


2025-10-11 22:45:50,186 [INFO] sqlalchemy.engine.Engine: [raw sql] ()
2025-10-11 22:45:50,200 [INFO] db.db_connection_pool: DB connection opened


2025-10-11 22:45:50,200 INFO sqlalchemy.engine.Engine BEGIN (implicit)


2025-10-11 22:45:50,200 [INFO] sqlalchemy.engine.Engine: BEGIN (implicit)


2025-10-11 22:45:50,200 INFO sqlalchemy.engine.Engine SELECT 1


2025-10-11 22:45:50,200 [INFO] sqlalchemy.engine.Engine: SELECT 1


2025-10-11 22:45:50,200 INFO sqlalchemy.engine.Engine [generated in 0.00237s] ()


2025-10-11 22:45:50,200 [INFO] sqlalchemy.engine.Engine: [generated in 0.00237s] ()


1
2025-10-11 22:45:50,207 INFO sqlalchemy.engine.Engine ROLLBACK


2025-10-11 22:45:50,207 [INFO] sqlalchemy.engine.Engine: ROLLBACK
2025-10-11 22:45:50,207 [INFO] db.db_connection_pool: DB connection closed


In [71]:
# create schema and tables using sqlalchemy
async with get_conn(db_engine) as conn:
    # You can wrap everything in an explicit transaction if you want
    # an atomic create/commit block.
    # the transaction is automatically committed when the block exits
    async with conn.begin():   

        await conn.execute(text("SET search_path TO public, document;"))

        # Create schema
        await conn.execute(
            text("CREATE SCHEMA IF NOT EXISTS document;")
        )

        # Create table
        create_table_sql = """
            CREATE TABLE IF NOT EXISTS document.document_chunk (
                id              UUID PRIMARY KEY DEFAULT gen_random_uuid(),
                embedding       VECTOR(1536),
                chunk_text      TEXT,
                doc_metadata    JSONB,
                file_name       TEXT,
                doc_tags        TEXT[],
                isActive        BOOLEAN,
                version         TEXT,
                created_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at      TIMESTAMP
            );
        """
        await conn.execute(text(create_table_sql))

        # Create table index
        create_index_sql = """
            CREATE INDEX IF NOT EXISTS documents_embedding_idx
            ON document.document_chunk
            USING ivfflat (embedding vector_l2_ops)
            WITH (lists = 100);
        """
        await conn.execute(text(create_index_sql))

    logger.info("Document schema initialisation finished")

2025-10-11 22:45:59,225 [INFO] db.db_connection_pool: DB connection opened


2025-10-11 22:45:59,226 INFO sqlalchemy.engine.Engine BEGIN (implicit)


2025-10-11 22:45:59,226 [INFO] sqlalchemy.engine.Engine: BEGIN (implicit)


2025-10-11 22:45:59,228 INFO sqlalchemy.engine.Engine SET search_path TO public, document;


2025-10-11 22:45:59,228 [INFO] sqlalchemy.engine.Engine: SET search_path TO public, document;


2025-10-11 22:45:59,228 INFO sqlalchemy.engine.Engine [generated in 0.00066s] ()


2025-10-11 22:45:59,228 [INFO] sqlalchemy.engine.Engine: [generated in 0.00066s] ()


2025-10-11 22:45:59,232 INFO sqlalchemy.engine.Engine CREATE SCHEMA IF NOT EXISTS document;


2025-10-11 22:45:59,232 [INFO] sqlalchemy.engine.Engine: CREATE SCHEMA IF NOT EXISTS document;


2025-10-11 22:45:59,233 INFO sqlalchemy.engine.Engine [generated in 0.00139s] ()


2025-10-11 22:45:59,233 [INFO] sqlalchemy.engine.Engine: [generated in 0.00139s] ()


2025-10-11 22:45:59,235 INFO sqlalchemy.engine.Engine 
            CREATE TABLE IF NOT EXISTS document.document_chunk (
                id              UUID PRIMARY KEY DEFAULT gen_random_uuid(),
                embedding       VECTOR(1536),
                chunk_text      TEXT,
                doc_metadata    JSONB,
                file_name       TEXT,
                doc_tags        TEXT[],
                isActive        BOOLEAN,
                version         TEXT,
                created_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at      TIMESTAMP
            );
        


2025-10-11 22:45:59,235 [INFO] sqlalchemy.engine.Engine: 
            CREATE TABLE IF NOT EXISTS document.document_chunk (
                id              UUID PRIMARY KEY DEFAULT gen_random_uuid(),
                embedding       VECTOR(1536),
                chunk_text      TEXT,
                doc_metadata    JSONB,
                file_name       TEXT,
                doc_tags        TEXT[],
                isActive        BOOLEAN,
                version         TEXT,
                created_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at      TIMESTAMP
            );
        


2025-10-11 22:45:59,236 INFO sqlalchemy.engine.Engine [generated in 0.00071s] ()


2025-10-11 22:45:59,236 [INFO] sqlalchemy.engine.Engine: [generated in 0.00071s] ()


2025-10-11 22:45:59,238 INFO sqlalchemy.engine.Engine 
            CREATE INDEX IF NOT EXISTS documents_embedding_idx
            ON document.document_chunk
            USING ivfflat (embedding vector_l2_ops)
            WITH (lists = 100);
        


2025-10-11 22:45:59,238 [INFO] sqlalchemy.engine.Engine: 
            CREATE INDEX IF NOT EXISTS documents_embedding_idx
            ON document.document_chunk
            USING ivfflat (embedding vector_l2_ops)
            WITH (lists = 100);
        


2025-10-11 22:45:59,239 INFO sqlalchemy.engine.Engine [generated in 0.00138s] ()


2025-10-11 22:45:59,239 [INFO] sqlalchemy.engine.Engine: [generated in 0.00138s] ()


2025-10-11 22:45:59,256 INFO sqlalchemy.engine.Engine COMMIT


2025-10-11 22:45:59,256 [INFO] sqlalchemy.engine.Engine: COMMIT
2025-10-11 22:45:59,259 [INFO] __main__: Document schema initialisation finished
2025-10-11 22:45:59,259 [INFO] db.db_connection_pool: DB connection closed


In [None]:
# Batch insert the doc chunks to the document_chunk table
async def insert_chunk_batch(conn, rows: Sequence[Dict[str, Any]]):
    """Insert a batch of prepared rows into document.document_chunk.

    Args:
        conn: Open connection to execute on.
        rows: One dict per row with the keys embedding, chunktext,
            docmetadata, filename, doctags, isActive, version, createdat and
            updatedat. ``embedding`` is a pgvector text literal ("[v1,v2,...]")
            and ``docmetadata`` a JSON string (or None).
    """
    # Nothing to do for an empty batch; executing with an empty parameter list
    # would run the statement without any bound values.
    if not rows:
        return

    # CAST(...) is used instead of the "::" shorthand because ":" introduces a
    # bind parameter in text(). The embedding and metadata are bound as text
    # and converted by the database, so no driver-specific type adapters are
    # needed.
    insert_sql = """
    INSERT INTO document.document_chunk
      (embedding, chunk_text, doc_metadata, file_name, doc_tags, isActive, version, created_at, updated_at)
    VALUES
      (CAST(CAST(:embedding AS text) AS vector), :chunktext, CAST(CAST(:docmetadata AS text) AS jsonb), :filename, :doctags, :isActive, :version, :createdat, :updatedat)
    """
    await conn.execute(text(insert_sql), rows)


def _chunk_to_row(d) -> Dict[str, Any]:
    """Map one DocumentChunk onto the bind parameters of the insert statement."""
    metadata = getattr(d, "metadata", None)
    tags = getattr(d, "tags", None)
    return {
        # pgvector text literal, e.g. "[0.1,0.2]"
        "embedding": "[" + ",".join(repr(float(value)) for value in d.embedding) + "]",
        "chunktext": d.chunk_text,
        "docmetadata": json.dumps(metadata or {}) if isinstance(metadata, dict) else metadata,
        "filename": getattr(d, "file_name", None),
        # doc_tags is a TEXT[] column, so the tags are passed as a list
        "doctags": list(tags) if tags else None,
        "isActive": getattr(d, "isActive", True),
        "version": getattr(d, "version", None),
        # NOTE(review): these datetimes are timezone-aware while the columns
        # are declared TIMESTAMP (no time zone) - confirm the driver accepts that.
        "createdat": getattr(d, "created_at", None) or current_nz_datetime,
        "updatedat": getattr(d, "updated_at", None),
    }


async def ingest_chunks(engine, docchunks: List[Any]):
    """Write enriched, embedded chunks to document.document_chunk in batches.

    The embedding already stored on each chunk is inserted; nothing is
    re-embedded here. Each batch of 10 rows is committed in its own
    transaction.

    Args:
        engine: Engine handed to ``get_conn`` to open the connection.
        docchunks: DocumentChunk objects whose ``embedding`` has been filled.

    Raises:
        ValueError: at least one chunk has no embedding. This is checked
            before anything is written so a bad input does not leave a
            partial load behind.
    """
    BATCH_SIZE = 10

    missing = [
        idx for idx, d in enumerate(docchunks)
        if d.embedding is None or len(d.embedding) == 0
    ]
    if missing:
        raise ValueError(
            f"{len(missing)} chunk(s) have no embedding (first index: {missing[0]}); "
            "run the enrichment/embedding step before ingesting."
        )

    async with get_conn(engine) as conn:

        for i in range(0, len(docchunks), BATCH_SIZE):
            batch = docchunks[i:i+BATCH_SIZE]

            rows = [_chunk_to_row(d) for d in batch]

            async with conn.begin():
                await insert_chunk_batch(conn, rows)