In [52]:
import asyncio
import json
import logging
import os
from datetime import datetime
from typing import Any, Dict, List, Optional, Sequence
from zoneinfo import ZoneInfo

import aiohttp
import requests
from dotenv import load_dotenv
from fastembed import TextEmbedding
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_pymupdf4llm import PyMuPDF4LLMLoader
from ollama import AsyncClient
from openai import OpenAI
from pydantic import BaseModel, Field
from sqlalchemy import text
from tqdm import tqdm

from db.db_connection_pool_using_pycopg2 import get_connection, release_connection, close_pool
from db.db_connection_pool import get_engine, get_conn
from db.schema import Document_Chunk

In [53]:
# Load variables from the local .env file; override=True lets .env values replace variables already set in the environment.
load_dotenv(override=True)

True

In [3]:
# Location of the policy-wording PDF that the ingest step converts to markdown.
# https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/api.html#pymupdf4llm-api
# The folder can be overridden with the POLICY_WORDING_DIR environment variable
# (e.g. from .env) so the notebook is not tied to one machine's absolute path;
# the original location is kept as the fallback.
FOLDER_PATH = os.getenv("POLICY_WORDING_DIR", r"C:\Users\aibag\git_repo\policy_wording")

FILE_NAME = "state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF"


In [4]:
# Root logging setup: INFO and above, timestamped, with level and logger name.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=logging.INFO,
)

# Logger used by every cell of this notebook.
logger = logging.getLogger(__name__)

# Timezone-aware "now" in New Zealand, captured once when this cell runs and
# reused as the created/updated timestamp of the chunks built further down.
current_nz_datetime = datetime.now(ZoneInfo("Pacific/Auckland"))


#### Ingest

In [5]:
# function to load pdf file and convert a pdf file to a markdown file
async def load_pdf_file(filepath:str, filename:str, mode:str ="single") -> List[Document]:
    """Load a PDF through PyMuPDF4LLMLoader and return the resulting documents.

    Args:
        filepath: Folder that contains the PDF.
        filename: Name of the PDF; must end in ".pdf" (case-insensitive).
        mode: Forwarded to the loader. "single" loads the entire PDF as one
            Document object, "page" loads each page as its own Document.

    Returns:
        The Document objects yielded by the loader (an empty list when the
        loader yields nothing).

    Raises:
        FileNotFoundError: If the joined path does not exist.
        TypeError: If the file name does not end in ".pdf".
    """
    full_path = os.path.join(filepath, filename)
    logger.info(f"Processing file: {full_path}")

    if not os.path.exists(full_path):
        raise FileNotFoundError(f"File not found: {full_path}")

    if not filename.lower().strip().endswith(".pdf"):
        raise TypeError ("Invalid File Type; only PDFs are allowed.")

    # The custom pages_delimiter marks where pages end when the PDF is loaded in
    # "single" mode, so page boundaries stay identifiable in the merged text.
    doc_loader = PyMuPDF4LLMLoader(full_path
                                   ,mode=mode
                                   ,pages_delimiter="<<-- PAGE BREAK -->>\n\n"
                                   ,table_strategy="lines_strict" # lines, text, lines_strict, lines_strict is default
                                  )

    # Lazy loading: collect documents as the loader produces them.
    docs = [doc async for doc in doc_loader.alazy_load()]

    if not docs:
        # Nothing to index into; report it instead of failing on docs[0].
        logger.warning(f"No documents were produced for file: {filename}")
        return docs

    # .get() keeps the log line from raising when the loader omits the key.
    total_pages = docs[0].metadata.get("total_pages", "unknown")
    logger.info(f"Successfully processed file: {filename}; Total Pages: {total_pages}")

    return docs

# Load the whole PDF as a single Document (mode="single"); the top-level await relies on the notebook's running event loop.
doc_obj =  await load_pdf_file(FOLDER_PATH, FILE_NAME, mode="single")


2025-10-11 22:03:11,707 [INFO] __main__: Processing file: C:\Users\aibag\git_repo\policy_wording\state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF
2025-10-11 22:03:20,855 [INFO] __main__: Successfully processed file: state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF; Total Pages: 61


#### Chunking

In [6]:
# functions to chunk/split a markdown file into chunks
def chunk_header_splitter(doc_contents)->list[Document]:
    """Split markdown text on its headings with MarkdownHeaderTextSplitter.

    Heading markers from "#" up to "#######" are registered under the metadata
    keys "Header 1" .. "Header 7". strip_headers=False is passed so the heading
    lines are kept in the chunk text.
    """
    # (marker, metadata key) pairs for heading depths 1 through 7
    heading_levels = [("#" * depth, f"Header {depth}") for depth in range(1, 8)]

    splitter = MarkdownHeaderTextSplitter(heading_levels, strip_headers=False)
    return splitter.split_text(doc_contents)

def chunk_header_recursivesplitter(doc_contents)->list:
    """Split text into chunks of up to 2000 characters with a 200 character overlap.

    Separators are tried in the listed order: sentence endings first, then blank
    lines, single newlines, spaces and finally individual characters.
    """
    return RecursiveCharacterTextSplitter(
        chunk_overlap=200,
        chunk_size=2000,
        separators=[". ", "! ", "? ", "\n\n", "\n", " ", ""],
    ).split_text(doc_contents)

#print(chunk_header_splitter(doc_obj[0].page_content))
#print(chunk_header_recursivesplitter(doc[0].page_content))

In [7]:
# model of the vector db
class DocumentChunk(BaseModel):
    """One chunk of a source document, in the shape prepared for the vector DB."""
    # embedding vector of the chunk (empty until the chunk has been embedded)
    embedding: List[float]
    # raw text of the chunk
    chunk_text: str
    # string-to-string metadata about the document and the chunk's header
    metadata: Optional[Dict[str, str]] = Field(default_factory=dict)
    file_name: str
    tags: Optional[List[str]] = Field(default_factory=list)
    isActive: bool = Field(default=False)
    # Optional[...] so the None default agrees with the declared type
    chunk_enrichment: Optional[str] = None
    version: Optional[str] = None
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None

# model for the llm enrichment output
class ChunkEnrichment(BaseModel):
    # Structured-output contract for the enrichment LLM call. Documented with
    # comments rather than a docstring on purpose: this model's JSON schema is
    # passed to the LLM as the response format, and a docstring would be added
    # to that schema. Field names (including their spelling) are part of the
    # schema and are referenced by the enrichment loop.

    # short context that situates the chunk within the document
    chunk_summary: str
    # questions the chunk is able to answer
    hypotetical_questions: List[str]
    # whether the chunk contains a table, and its summary when it does
    has_table: bool = False
    table_summary: Optional[str] = None

In [8]:
def chunk_document(doc_obj, tags: Optional[List[str]] = None, version: str = "1", min_chunk_chars: int = 30) -> "List[DocumentChunk]":
    """Split the first loaded document by markdown header into DocumentChunk objects.

    Args:
        doc_obj: Sequence of loaded documents; only the first element is used.
        tags: Tags stored on every chunk. Defaults to ["Home", "State"].
        version: Version label stored on every chunk.
        min_chunk_chars: Chunks whose text is shorter than this are skipped.

    Returns:
        One DocumentChunk per kept chunk, with an empty embedding and an empty
        enrichment (both are filled in by later steps). An empty list when
        doc_obj is empty.

    Raises:
        Exception: Whatever chunk_header_splitter raises is logged and re-raised.
    """
    def _as_text(value) -> str:
        # DocumentChunk.metadata is Dict[str, str]; None values are not valid strings.
        return "" if value is None else str(value)

    if not doc_obj:
        logger.warning("chunk_document received no documents; nothing to chunk.")
        return []

    document = doc_obj[0]

    # Derive the file name from the source path, accepting both "\" and "/" separators.
    source_path = document.metadata.get("source") or ""
    file_name = source_path.replace("\\", "/").rsplit("/", 1)[-1] if source_path else "unknown"

    doc_metadata = {
        "source"        : _as_text(document.metadata.get("source")),
        "file_name"     : file_name,
        "total_pages"   : _as_text(document.metadata.get("total_pages")),
        "creation_date" : _as_text(document.metadata.get("creationdate")),
    }

    try:
        chunks = chunk_header_splitter(document.page_content)
    except Exception as e:
        # Re-raise after logging: continuing would only fail later on an undefined `chunks`.
        logger.error(f"Call to chunk_header_splitter failed: {e}")
        raise

    logger.info(f"Chunk Size for document '{file_name}': {len(chunks)}")

    chunk_tags = ["Home", "State"] if tags is None else list(tags)

    doc_chunks_list = []

    for chunk in chunks:

        # skip chunks with fewer than min_chunk_chars characters
        if len(chunk.page_content) < min_chunk_chars:
            continue

        # First header entry of the chunk; (None, None) when the chunk sits under no header.
        header_key, header_value = next(iter(chunk.metadata.items()), (None, None))

        chunk_metadata = {
            **doc_metadata,
            "header_key": _as_text(header_key),
            # drop markdown emphasis / heading markers from the header text
            "header_value": _as_text(header_value).replace("*","").replace("#",""),
        }

        doc_chunks_list.append(
            DocumentChunk(
                embedding = [],
                chunk_text = chunk.page_content,
                metadata = chunk_metadata,
                file_name = file_name,
                isActive = True,
                chunk_enrichment = "",
                version = version,
                tags = list(chunk_tags),  # fresh list per chunk, never shared
                created_at = current_nz_datetime,
                updated_at= current_nz_datetime,
            )
        )

    return doc_chunks_list

# Build the DocumentChunk list for the loaded PDF; later cells enrich and embed these objects.
doc_chunks = chunk_document(doc_obj)
#print(chunk_document(doc_obj))

2025-10-11 22:03:20,886 [INFO] __main__: Chunk Size for document 'state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF': 135


#### Enrich Chunks

In [None]:
# Prompts used to enrich each chunk (contextual retrieval):
# the system prompt carries the whole document, the user prompt carries the chunk.
# https://www.anthropic.com/engineering/contextual-retrieval
# PromptTemplate is used instead of ChatPromptTemplate because these templates are
# rendered with .format() and passed as raw message content; ChatPromptTemplate.format()
# renders a chat transcript and therefore prefixes the text with "Human: ".
enrichment_system_prompt = PromptTemplate.from_template("""
You are an expert general insurance underwriting product manager.
You will be given a document and a chunk of text from the document.

Please give a short context to situate this chunk within the overall document for
the purposes of improving search retrieval of the chunk.

If, and only if, the chunk includes a table, also provide a summary of the table.

Also provide 3 to 5 hypothetical questions that the chunk will be able to answer.

Answer only with the succinct context, the table summary (when applicable) and the list of hypothetical questions, nothing else.

Here's the document:
{document_text}
""")

enrichment_user_prompt = PromptTemplate.from_template("""
Here's the chunk:
{chunk_text}
""")


In [10]:
# invoke - local lm studio llm model
def invoke_llm_using_openai(system_prompt, query_prompt):
    """Ask the OpenAI-compatible server at localhost:1234 for a ChunkEnrichment.

    Args:
        system_prompt: Content of the system message.
        query_prompt: Content of the user message.

    Returns:
        The `parsed` attribute of the first choice's message; ChunkEnrichment is
        supplied as the response format.
    """
    lm_studio = OpenAI(api_key="lmstudio", base_url="http://localhost:1234/v1")

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query_prompt},
    ]

    completion = lm_studio.chat.completions.parse(
        model="google/gemma-3-4b",
        messages=conversation,
        response_format=ChunkEnrichment,
        temperature=0.0,
        top_p=0.9,
        max_tokens=500,
    )

    first_choice = completion.choices[0]
    return first_choice.message.parsed


In [17]:
# async invoke - local Ollama server (/api/generate endpoint)
async def ainvoke_llm_api_generate(prompt):
    """Send a prompt to the local Ollama /api/generate endpoint and return the text.

    Args:
        prompt: Prompt text to complete.

    Returns:
        The "response" field of the JSON body returned by the server.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP error status.
    """
    payload = {
        "model": "gemma3:4b",
        "prompt": prompt,
        "stream": False,
        # Ollama reads sampling parameters from "options"; as top-level keys they are ignored.
        "options": {
            "temperature": 0,
            "top_p": 0.90,
        },
    }

    async with aiohttp.ClientSession() as session:
        async with session.post("http://localhost:11434/api/generate", json=payload) as response:
            # Fail on HTTP errors here rather than on a missing "response" key below.
            response.raise_for_status()
            resp_json = await response.json()
            return resp_json["response"]

In [57]:
# Using Ollama Model
# https://github.com/ollama/ollama-python
async def invokeOllamaModel(system_prompt, query_prompt, formatModel, options=None):
    """Chat with the local Ollama model and parse the reply into `formatModel`.

    Args:
        system_prompt: Content of the system message.
        query_prompt: Content of the user message.
        formatModel: Pydantic model class; its JSON schema is sent as the response
            format and the reply is validated against it.
        options: Optional Ollama model options. Defaults to temperature 0 and
            top_p 0.9, the same sampling settings the other invokers in this
            notebook use, so repeated runs give repeatable output.

    Returns:
        An instance of `formatModel` built from the reply content.
    """
    if options is None:
        options = {"temperature": 0, "top_p": 0.9}

    # NOTE(review): the system prompt built in this notebook embeds the whole
    # document; confirm it fits the model's context window (Ollama `num_ctx`
    # option, which can be passed through `options`).
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query_prompt},
    ]

    response = await AsyncClient().chat(
        model="gemma3:4b",
        messages=messages,
        format=formatModel.model_json_schema(),
        options=options,
    )
    return formatModel.model_validate_json(response.message.content)


In [None]:
# Smoke test: enrich only the first chunk and print the parsed ChunkEnrichment.
# The system prompt carries the full document text; the user prompt carries the chunk.
system_prompt = enrichment_system_prompt.format(document_text=doc_obj[0].page_content)
query_prompt = enrichment_user_prompt.format(chunk_text=doc_chunks[0].chunk_text)

response = await invokeOllamaModel(system_prompt, query_prompt, ChunkEnrichment)
print(response)

2025-10-11 22:31:11,302 [INFO] httpx: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


chunk_summary='This is an introductory paragraph from State Insurance, thanking the customer and explaining the importance of reading the policy wording and schedule for claim information.' hypotetical_questions=["What does 'policy wording' refer to?", 'Why is it important to keep the policy schedule and wording handy?', 'What happens if I need to make a claim?'] has_table=False table_summary=None


#### Chunk Enrichment and Embedding

In [None]:
# Embedding model shared by all embedding calls in this notebook.
embedding_model = TextEmbedding(model_name="BAAI/bge-base-en-v1.5")

def invoke_embedding(chunk_text):
    """Embed `chunk_text` with the shared model and return the first vector."""
    # embed() hands back a generator, so pull the first item from it
    return next(iter(embedding_model.embed(chunk_text)))

In [None]:
# Progress bar for chunk enrichment within the current file
system_prompt = enrichment_system_prompt.format(document_text = doc_obj[0].page_content)

with tqdm(total=len(doc_chunks), desc=f"Enriching Chunks", leave=False) as pbar_chunks:
    for idx, chunk in enumerate(doc_chunks):
        
        print("-"*50)
        print(chunk)

        # enrich chunk using LLM
        query_prompt =  enrichment_user_prompt.format(chunk_text = chunk.chunk_text)
        enriched_text = invokeOllamaModel(system_prompt, query_prompt)

        enriched_parts = []
        if enriched_text:
            # pre-pend to the chunk:
            enriched_parts = [
            f"Chunk Summary: {enriched_text.chunk_summary}\n",
            f"Questions this chunk may answer: {"\n".join(f"- {q}" for q in enriched_text.hypotetical_questions)}\n",
            ]
             
            if enriched_text.has_table and enriched_text.table_summary:
                enriched_parts.append(f"Table Summary: {enriched_text.table_summary}\n")
        else:
            logger.warning(f"Doc: {chunk.file_name} Chunk: {idx} failed enrichment.")

        enriched_parts.append(f"Chunk Text:\n{chunk.chunk_text}")

        enriched_chunk = "\n".join(enriched_parts)

        # embed enriched_chunk
        embedded_chunk = invoke_embedding(enriched_chunk)
        
        # write to the DocumentChunk object
        if embedded_chunk is not None and len(embedded_chunk) > 0:
            chunk.embedding = embedded_chunk

        if enriched_chunk:
            chunk.chunk_enrichment = enriched_chunk            

        # Update chunk-level progress bar
        pbar_chunks.update(1)

        if idx > 0: break

Enriching Chunks:   0%|          | 0/134 [00:00<?, ?it/s]

In [None]:
# Inspect the enriched text stored on the first chunk.
doc_chunks[0].chunk_enrichment

"Chunk Summary: This chunk discusses the concept of 'flow' or 'being in the zone,' a psychological state where individuals are fully immersed and engaged in an activity, experiencing heightened focus, enjoyment, and performance. It explains that flow is characterized by a balance between challenge and skill level; when challenges exceed skills, anxiety arises, while when skills exceed challenges, boredom occurs. The ideal scenario for flow is when the challenge slightly exceeds the individual's current skill level, pushing them to grow and improve.  The chunk also mentions several key elements of flow, including clear goals, immediate feedback, a sense of control, loss of self-consciousness, transformation of time perception (time seems to fly by), and intrinsic motivation.\n\nQuestions this chunk may answer: - Can you describe a personal experience where you felt 'in the zone' or experienced flow?\n- How does the balance between challenge and skill contribute to achieving a state of f

In [15]:
# Using SQLALCHEMY
# create the engine through the project helper imported from db.db_connection_pool
db_engine = get_engine()

In [16]:
# Connectivity check: run SELECT 1 on a connection from get_conn and print the scalar result.
async with get_conn(db_engine) as conn:
    stmt = text("SELECT 1")
    result = await conn.execute(stmt)
    print(result.scalar())


2025-09-25 20:57:32,639 INFO sqlalchemy.engine.Engine select pg_catalog.version()


2025-09-25 20:57:32,639 [INFO] sqlalchemy.engine.Engine: select pg_catalog.version()


2025-09-25 20:57:32,642 INFO sqlalchemy.engine.Engine [raw sql] ()


2025-09-25 20:57:32,642 [INFO] sqlalchemy.engine.Engine: [raw sql] ()


2025-09-25 20:57:32,644 INFO sqlalchemy.engine.Engine select current_schema()


2025-09-25 20:57:32,644 [INFO] sqlalchemy.engine.Engine: select current_schema()


2025-09-25 20:57:32,644 INFO sqlalchemy.engine.Engine [raw sql] ()


2025-09-25 20:57:32,644 [INFO] sqlalchemy.engine.Engine: [raw sql] ()


2025-09-25 20:57:32,651 INFO sqlalchemy.engine.Engine show standard_conforming_strings


2025-09-25 20:57:32,651 [INFO] sqlalchemy.engine.Engine: show standard_conforming_strings


2025-09-25 20:57:32,651 INFO sqlalchemy.engine.Engine [raw sql] ()


2025-09-25 20:57:32,651 [INFO] sqlalchemy.engine.Engine: [raw sql] ()
2025-09-25 20:57:32,656 [INFO] db.db_connection_pool: DB connection opened


2025-09-25 20:57:32,657 INFO sqlalchemy.engine.Engine BEGIN (implicit)


2025-09-25 20:57:32,657 [INFO] sqlalchemy.engine.Engine: BEGIN (implicit)


2025-09-25 20:57:32,658 INFO sqlalchemy.engine.Engine SELECT 1


2025-09-25 20:57:32,658 [INFO] sqlalchemy.engine.Engine: SELECT 1


2025-09-25 20:57:32,659 INFO sqlalchemy.engine.Engine [generated in 0.00263s] ()


2025-09-25 20:57:32,659 [INFO] sqlalchemy.engine.Engine: [generated in 0.00263s] ()


1
2025-09-25 20:57:32,663 INFO sqlalchemy.engine.Engine ROLLBACK


2025-09-25 20:57:32,663 [INFO] sqlalchemy.engine.Engine: ROLLBACK
2025-09-25 20:57:32,665 [INFO] db.db_connection_pool: DB connection closed


In [None]:
# create schema and tables using sqlalchemy

async with get_conn(db_engine) as conn:
    # You can wrap everything in an explicit transaction if you want
    # an atomic create/commit block.
    # the transaction is automatically committed when the block exits
    async with conn.begin():   

        await conn.execute(text("SET search_path TO public, document;"))

        # Create schema
        await conn.execute(
            text("CREATE SCHEMA IF NOT EXISTS document;")
        )

        # Create table
        create_table_sql = """
            CREATE TABLE IF NOT EXISTS document.document_chunk (
                id              UUID PRIMARY KEY DEFAULT gen_random_uuid(),
                embedding       VECTOR(1536),
                chunk_text      TEXT,
                doc_metadata    JSONB,
                file_name       TEXT,
                doc_tags        TEXT[],
                isActive        BOOLEAN,
                version         TEXT,
                created_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at      TIMESTAMP
            );
        """
        await conn.execute(text(create_table_sql))

        # Create table index
        create_index_sql = """
            CREATE INDEX IF NOT EXISTS documents_embedding_idx
            ON document.document_chunk
            USING ivfflat (embedding vector_l2_ops)
            WITH (lists = 100);
        """
        await conn.execute(text(create_index_sql))

    logger.info("Document schema initialisation finished")

2025-09-25 20:57:37,324 [INFO] db.db_connection_pool: DB connection opened


2025-09-25 20:57:37,325 INFO sqlalchemy.engine.Engine BEGIN (implicit)


2025-09-25 20:57:37,325 [INFO] sqlalchemy.engine.Engine: BEGIN (implicit)


2025-09-25 20:57:37,326 INFO sqlalchemy.engine.Engine SET search_path TO public, document;


2025-09-25 20:57:37,326 [INFO] sqlalchemy.engine.Engine: SET search_path TO public, document;


2025-09-25 20:57:37,327 INFO sqlalchemy.engine.Engine [generated in 0.00090s] ()


2025-09-25 20:57:37,327 [INFO] sqlalchemy.engine.Engine: [generated in 0.00090s] ()


2025-09-25 20:57:37,332 INFO sqlalchemy.engine.Engine CREATE SCHEMA IF NOT EXISTS document;


2025-09-25 20:57:37,332 [INFO] sqlalchemy.engine.Engine: CREATE SCHEMA IF NOT EXISTS document;


2025-09-25 20:57:37,332 INFO sqlalchemy.engine.Engine [generated in 0.00092s] ()


2025-09-25 20:57:37,332 [INFO] sqlalchemy.engine.Engine: [generated in 0.00092s] ()


2025-09-25 20:57:37,334 INFO sqlalchemy.engine.Engine 
            CREATE TABLE IF NOT EXISTS document.document_chunk (
                id              UUID PRIMARY KEY DEFAULT gen_random_uuid(),
                embedding       VECTOR(1536),
                chunk_text      TEXT,
                doc_metadata    JSONB,
                file_name       TEXT,
                doc_tags        TEXT[],
                isActive        BOOLEAN,
                version         TEXT,
                created_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at      TIMESTAMP
            );
        


2025-09-25 20:57:37,334 [INFO] sqlalchemy.engine.Engine: 
            CREATE TABLE IF NOT EXISTS document.document_chunk (
                id              UUID PRIMARY KEY DEFAULT gen_random_uuid(),
                embedding       VECTOR(1536),
                chunk_text      TEXT,
                doc_metadata    JSONB,
                file_name       TEXT,
                doc_tags        TEXT[],
                isActive        BOOLEAN,
                version         TEXT,
                created_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at      TIMESTAMP
            );
        


2025-09-25 20:57:37,334 INFO sqlalchemy.engine.Engine [generated in 0.00076s] ()


2025-09-25 20:57:37,334 [INFO] sqlalchemy.engine.Engine: [generated in 0.00076s] ()


2025-09-25 20:57:37,337 INFO sqlalchemy.engine.Engine 
            CREATE INDEX IF NOT EXISTS documents_embedding_idx
            ON document.document_chunk
            USING ivfflat (embedding vector_l2_ops)
            WITH (lists = 100);
        


2025-09-25 20:57:37,337 [INFO] sqlalchemy.engine.Engine: 
            CREATE INDEX IF NOT EXISTS documents_embedding_idx
            ON document.document_chunk
            USING ivfflat (embedding vector_l2_ops)
            WITH (lists = 100);
        


2025-09-25 20:57:37,338 INFO sqlalchemy.engine.Engine [generated in 0.00116s] ()


2025-09-25 20:57:37,338 [INFO] sqlalchemy.engine.Engine: [generated in 0.00116s] ()


2025-09-25 20:57:37,342 INFO sqlalchemy.engine.Engine COMMIT


2025-09-25 20:57:37,342 [INFO] sqlalchemy.engine.Engine: COMMIT
2025-09-25 20:57:37,344 [INFO] __main__: Document schema initialisation finished
2025-09-25 20:57:37,346 [INFO] db.db_connection_pool: DB connection closed


In [None]:
# Batch insert the doc chunks to the document_chunk table
async def insert_chunk_batch(conn, rows: Sequence[Dict[str, Any]]):
    """Insert a batch of prepared rows into document.document_chunk.

    Args:
        conn: Connection whose execute() accepts a text() statement plus a list
            of parameter dictionaries.
        rows: One dictionary per chunk with the keys embedding, chunktext,
            docmetadata, filename, doctags, isActive, version, createdat and
            updatedat. `embedding` is a pgvector text literal ("[0.1,0.2,...]")
            and `docmetadata` a JSON string; both are cast explicitly in SQL.
    """
    if not rows:
        # Executing with an empty parameter list would run the statement without binds.
        return

    insert_sql = """
    INSERT INTO document.document_chunk
      (embedding, chunk_text, doc_metadata, file_name, doc_tags, isActive, version, created_at, updated_at)
    VALUES
      (CAST(:embedding AS vector), :chunktext, CAST(:docmetadata AS jsonb), :filename, :doctags, :isActive, :version, :createdat, :updatedat)
    """
    await conn.execute(text(insert_sql), rows)


def _to_pgvector_literal(embedding) -> str:
    """Render a sequence of numbers as a pgvector text literal, e.g. "[0.1,0.2]"."""
    return "[" + ",".join(repr(float(value)) for value in embedding) + "]"


def _chunk_to_row(chunk) -> Dict[str, Any]:
    """Map a DocumentChunk onto the bind parameters used by insert_chunk_batch."""
    metadata = getattr(chunk, "metadata", None)
    tags = getattr(chunk, "tags", None)
    return {
        "embedding": _to_pgvector_literal(chunk.embedding),
        "chunktext": chunk.chunk_text,
        "docmetadata": json.dumps(metadata) if isinstance(metadata, dict) else metadata,
        "filename": getattr(chunk, "file_name", None),
        # doc_tags is a TEXT[] column, so a list is passed rather than a joined string
        "doctags": list(tags) if tags else None,
        "isActive": getattr(chunk, "isActive", True),
        "version": getattr(chunk, "version", None),
        "createdat": getattr(chunk, "created_at", None) or current_nz_datetime,
        "updatedat": getattr(chunk, "updated_at", None),
    }


async def ingest_chunks(engine, docchunks: List[Any], batch_size: int = 10):
    """Insert already-embedded chunks into document.document_chunk in batches.

    The embedding stored on each chunk is written as-is; chunks are expected to
    have been embedded beforehand.

    Args:
        engine: Engine handed to get_conn to obtain a connection.
        docchunks: DocumentChunk objects to store.
        batch_size: Number of rows inserted per transaction.

    Returns:
        The number of rows inserted.

    Raises:
        ValueError: If batch_size is below 1 or any chunk has no embedding.
            This is checked before anything is written.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Validate up front so a bad chunk cannot leave the table half-populated.
    not_embedded = [
        position for position, chunk in enumerate(docchunks)
        if getattr(chunk, "embedding", None) is None or len(chunk.embedding) == 0
    ]
    if not_embedded:
        raise ValueError(f"Chunks without an embedding at positions: {not_embedded}")

    inserted = 0

    async with get_conn(engine) as conn:

        for start in range(0, len(docchunks), batch_size):
            rows = [_chunk_to_row(chunk) for chunk in docchunks[start:start + batch_size]]

            # One transaction per batch: a failure rolls back only that batch.
            # NOTE(review): created_at/updated_at values are timezone-aware; confirm
            # the target columns and the DB driver accept aware datetimes.
            async with conn.begin():
                await insert_chunk_batch(conn, rows)

            inserted += len(rows)

    return inserted

<psycopg2.pool.ThreadedConnectionPool at 0x187e9afbc50>