In [1]:
import os
from langchain_pymupdf4llm import PyMuPDF4LLMLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from ollama import AsyncClient
from typing import List, Sequence, Any
from db.db_connection_pool_using_pycopg2 import get_connection, release_connection, close_pool
from db.db_connection_pool import get_engine, get_conn
from pydantic import BaseModel, Field
from typing import List, Optional, Dict
from db.schema import Document_Chunk
from openai import OpenAI
from datetime import datetime
from zoneinfo import ZoneInfo
import requests
import logging
import asyncio
import aiohttp
from sqlalchemy import text
from fastembed import TextEmbedding
from dotenv import load_dotenv
from tqdm import tqdm
import json 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv(override=True)

True

In [3]:
# convert pdf to markdown
# https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/api.html#pymupdf4llm-api
FOLDER_PATH = r"C:\Users\aibag\git_repo\policy_wording"

FILE_NAME = "state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF"


In [4]:
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
)

logger = logging.getLogger(__name__)

# Suppress httpx logging to prevent it from breaking the progress bar
logging.getLogger("httpx").setLevel(logging.WARNING)

current_nz_datetime = datetime.now()


#### Ingest

In [5]:
# function to load pdf file and convert a pdf file to a markdown file
async def load_pdf_file(filepath:str, filename:str, mode:str ="single") -> List[Document]:

    full_path = os.path.join(filepath, filename)
    logger.info(f"Processing file: {full_path}")

    if not os.path.exists(full_path):
        raise FileNotFoundError(f"File not found: {full_path}")

    if not filename.lower().strip().endswith(".pdf"):
        raise TypeError ("Invalid File Type; only PDFs are allowed.")

    # custom pages_delimiter to identify where are ends of pages in single mode 
    # page = load each page as a Document object; single = load entire PDF as a single Document object
    doc_loader = PyMuPDF4LLMLoader(full_path 
                                   ,mode=mode
                                   ,pages_delimiter="<<-- PAGE BREAK -->>\n\n"
                                   ,table_strategy="lines_strict" # lines, text, lines_strict, lines_strict is default
                                   #,page_separators=True
                                  )
                            
    # lazy loading
    docs = []
    async for doc in doc_loader.alazy_load():
        docs.append(doc)

    logger.info(f"Successfully processed file: {filename}; Total Pages: {docs[0].metadata["total_pages"]}")

    return docs

doc_obj =  await load_pdf_file(FOLDER_PATH, FILE_NAME, mode="single")


2025-10-14 20:54:53,831 [INFO] __main__: Processing file: C:\Users\aibag\git_repo\policy_wording\state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF
2025-10-14 20:55:02,675 [INFO] __main__: Successfully processed file: state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF; Total Pages: 61


#### Chunking

In [6]:
# functions to chunk/split a markdown file into chunks
def chunk_header_splitter(doc_contents)->list[Document]:

    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
        ('####', "Header 4"),
        ('#####', "Header 5"),
        ('######', "Header 6"),
        ('#######', "Header 7")
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)

    return markdown_splitter.split_text(doc_contents)

def chunk_header_recursivesplitter(doc_contents)->list:
    text_splitter = RecursiveCharacterTextSplitter(
        separators=[". ", "! ", "? ", "\n\n", "\n", " ", ""],
        chunk_size=2000,
        chunk_overlap=200,
    )
    chunks = text_splitter.split_text(doc_contents)
    return chunks

#print(chunk_header_splitter(doc_obj[0].page_content))
#print(chunk_header_recursivesplitter(doc[0].page_content))

In [7]:
# model of the vector db
class DocumentChunk(BaseModel):
    embedding: List[float]
    chunk_text: str
    metadata: Optional[Dict[str, str]] = Field(default_factory=dict)
    file_name: str
    tags: Optional[List[str]] = Field(default_factory=list)
    isActive: bool = Field(default=False)
    chunk_enrichment: str = None
    version: Optional[str] = None
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None

# model for the llm enrichment output
class ChunkEnrichment(BaseModel):
    chunk_summary: str
    hypotetical_questions: list[str]
    has_table: bool = Field(default=False)
    table_summary: Optional[str] = None

In [8]:
def chunk_document(doc_obj)->list[str]:

    # get metadata from the doc object 
    source_path = doc_obj[0].metadata.get("source", "")
    file_name = source_path.split("\\")[-1] if source_path else "unknown"

    doc_metadata = {
        "source"        : doc_obj[0].metadata.get("source"),
        "file_name"     : file_name,
        "total_pages"   : str(doc_obj[0].metadata.get("total_pages")),
        "creation_date" : doc_obj[0].metadata.get("creationdate"),
    }

    try:
        chunks = chunk_header_splitter(doc_obj[0].page_content)
        logger.info(f"Chunk Size for document '{file_name}': {len(chunks)}")
    except Exception as e:
        logger.error(f"Call to chunk_header_splitter failed: {e}")    

    doc_chunks_list = []

    for chunk in chunks:

        # skip pages that only have 30 char or less
        if len(chunk.page_content) < 30: 
            continue 

        # additional metadata 
        chunk_metadata = doc_metadata.copy()

        header_key, header_value = next(iter(chunk.metadata.items()), (None, None))

        chunk_metadata["header_key"] = header_key 
        chunk_metadata["header_value"] = header_value.replace("*","").replace("#","")

        # create an instance of DocumentChunk
        doc_chunk = DocumentChunk(
            embedding = [],
            chunk_text = chunk.page_content,
            metadata = chunk_metadata,
            file_name = file_name,
            isActive = True,
            chunk_enrichment = "",
            version = "1",
            tags = ["Home", "State"],
            created_at = current_nz_datetime,
            updated_at= current_nz_datetime,
        )

        doc_chunks_list.append(doc_chunk)

    return doc_chunks_list

doc_chunks = chunk_document(doc_obj)
#print(chunk_document(doc_obj))

2025-10-14 20:55:02,730 [INFO] __main__: Chunk Size for document 'state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1224.PDF': 135


#### Enrich Chunks

In [9]:
# enrich each chunks 
# https://www.anthropic.com/engineering/contextual-retrieval
enrichment_system_prompt = ChatPromptTemplate.from_template("""
You are an expert general insurance underwriting product manager.  
You will be given a document and a chunk of text from the document:

Please give a short context to situate this chunk within the overall document for
the purposes of improving search retrieval of the chunk.

ONLY IF the chunk includes a table where you will provide a summary of the table.

Also provide 3 to 5 hypotetical questions that the chunk will able to answer.

Answer only with the succinct context and a list of hypotetical questions, nothing else. 

Here's the document:
{document_text}
""")

enrichment_user_prompt = ChatPromptTemplate.from_template("""
Here's the chunk:
{chunk_text}
""")


In [10]:
# invoke - local lm studio llm model
def invoke_llm_using_openai(system_prompt, query_prompt):
    client = OpenAI(base_url="http://localhost:1234/v1", api_key="lmstudio")

    response = client.chat.completions.parse(
        model= "google/gemma-3-4b",  
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query_prompt}
        ],
        temperature=0.0,
        max_tokens=500,
        top_p=0.9,
        response_format = ChunkEnrichment,
    )


    #return response.choices[0].message.content
    return response.choices[0].message.parsed


In [11]:
# async invoke - local lm studio llm model
async def ainvoke_llm_api_generate(prompt):
    async with aiohttp.ClientSession() as session:
        async with session.post(
            "http://localhost:11434/api/generate",

            json={
                "model": "gemma3:4b",
                "prompt": prompt,
                "stream": False,
                "temperature": 0,
                "top_p": 0.90,
            }
        ) as response:
            resp_json = await response.json()
            return resp_json["response"]

In [12]:
# Using Ollama Model
# https://github.com/ollama/ollama-python
async def invokeOllamaModel(system_prompt, query_prompt, formatModel):
    model='gemma3:4b'
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query_prompt}
    ]
    response = await AsyncClient().chat(model=model, messages=messages, format=formatModel.model_json_schema())
    response_text = formatModel.model_validate_json(response.message.content)
    return response_text


In [13]:
system_prompt = enrichment_system_prompt.format(document_text=doc_obj[0].page_content)
query_prompt = enrichment_user_prompt.format(chunk_text=doc_chunks[0].chunk_text)

response = await invokeOllamaModel(system_prompt, query_prompt, ChunkEnrichment)
print(response)

chunk_summary='This is an introductory paragraph from State Insurance, thanking the customer and explaining the importance of reading the policy wording and schedule. It emphasizes their usefulness for future claims.' hypotetical_questions=["What does 'policy schedule' refer to?", 'Why is it important to keep these documents?', 'What happens if I need to make a claim?'] has_table=False table_summary=None


#### Chunk Enrichment and Embedding

In [14]:
embedding_model = TextEmbedding(model_name="BAAI/bge-base-en-v1.5")  

def invoke_embedding(chunk_text):
    embeddings = embedding_model.embed(chunk_text)
    # TextEmbedding.embed returns a generator so I used next() 
    return next(embeddings) 

In [15]:
# Progress bar for chunk enrichment within the current file
system_prompt = enrichment_system_prompt.format(document_text = doc_obj[0].page_content)

with tqdm(total=len(doc_chunks), desc=f"Enriching Chunks", leave=False) as pbar_chunks:
    for idx, chunk in enumerate(doc_chunks):
        
        # enrich chunk using LLM
        query_prompt =  enrichment_user_prompt.format(chunk_text = chunk.chunk_text)
        enriched_text = await invokeOllamaModel(system_prompt, query_prompt, formatModel=ChunkEnrichment)

        enriched_parts = []
        if enriched_text:
            # pre-pend to the chunk:
            enriched_parts.append(f"Chunk Summary: {enriched_text.chunk_summary}\n")
            enriched_parts.append(f"Questions this chunk may answer: {"\n".join(f"- {q}" for q in enriched_text.hypotetical_questions)}\n")
             
            if enriched_text.has_table and enriched_text.table_summary:
                enriched_parts.append(f"Table Summary: {enriched_text.table_summary}\n")
        else:
            logger.warning(f"Doc: {chunk.file_name} Chunk: {idx} failed enrichment.")

        enriched_parts.append(f"Chunk Text:\n{chunk.chunk_text}")

        enriched_chunk = "\n".join(enriched_parts)

        # check if enriched_chunk is not empty
        if not enriched_chunk or len(enriched_chunk.strip()) == 0:
            logger.warning(f"Chunk {idx} has no text to embed. Chunk: {chunk}")
            break

        # embed enriched_chunk
        embedded_chunk = invoke_embedding(enriched_chunk)
        if len(embedded_chunk) == 0:
            logger.warning(f"Call to embedding failed: {embedded_chunk}")
            break

        # write to the DocumentChunk object
        if embedded_chunk is not None and len(embedded_chunk) > 0:
            chunk.embedding = embedded_chunk

        if enriched_chunk:
            chunk.chunk_enrichment = enriched_chunk            
             
        # Update chunk-level progress bar
        pbar_chunks.update(1)

        if idx > 5: break

                                                                 

In [16]:
doc_chunks[5].chunk_enrichment

'Chunk Summary: This section explains how the policy documents will be delivered and outlines the recipient’s responsibility for maintaining accurate contact information.\n\nQuestions this chunk may answer: - What are the two delivery methods offered?\n- How long after sending an email does the company consider the document received?\n- What is the recipient’s responsibility regarding their contact details?\n- What should the recipient do if their contact details change?\n\nChunk Text:\n###### **Receiving your policy documents**  \nYou may choose to receive your policy documents by email or post:  \n- If we send your policy documents to you by email, we will send them to the person and email address you  \nnominated for receiving policy documents. Any policy documents we send to this email address will be  \nconsidered to have been received by you 24 hours after we send them.  \n- If we send your policy documents to you by post, we will send them to the person and mailing address you  

In [17]:
# Using SQLALCHEMY
# create the engine
db_engine = get_engine()

In [20]:
# test
async with get_conn(db_engine) as conn:
    stmt = text("SELECT 1")
    result = await conn.execute(stmt)
    print(result.scalar())


2025-10-14 20:56:56,919 INFO sqlalchemy.engine.Engine select pg_catalog.version()


2025-10-14 20:56:56,919 [INFO] sqlalchemy.engine.Engine: select pg_catalog.version()


2025-10-14 20:56:56,921 INFO sqlalchemy.engine.Engine [raw sql] ()


2025-10-14 20:56:56,921 [INFO] sqlalchemy.engine.Engine: [raw sql] ()


2025-10-14 20:56:56,925 INFO sqlalchemy.engine.Engine select current_schema()


2025-10-14 20:56:56,925 [INFO] sqlalchemy.engine.Engine: select current_schema()


2025-10-14 20:56:56,926 INFO sqlalchemy.engine.Engine [raw sql] ()


2025-10-14 20:56:56,926 [INFO] sqlalchemy.engine.Engine: [raw sql] ()


2025-10-14 20:56:56,932 INFO sqlalchemy.engine.Engine show standard_conforming_strings


2025-10-14 20:56:56,932 [INFO] sqlalchemy.engine.Engine: show standard_conforming_strings


2025-10-14 20:56:56,933 INFO sqlalchemy.engine.Engine [raw sql] ()


2025-10-14 20:56:56,933 [INFO] sqlalchemy.engine.Engine: [raw sql] ()
2025-10-14 20:56:56,937 [INFO] db.db_connection_pool: DB connection opened


2025-10-14 20:56:56,939 INFO sqlalchemy.engine.Engine BEGIN (implicit)


2025-10-14 20:56:56,939 [INFO] sqlalchemy.engine.Engine: BEGIN (implicit)


2025-10-14 20:56:56,940 INFO sqlalchemy.engine.Engine SELECT 1


2025-10-14 20:56:56,940 [INFO] sqlalchemy.engine.Engine: SELECT 1


2025-10-14 20:56:56,940 INFO sqlalchemy.engine.Engine [generated in 0.00306s] ()


2025-10-14 20:56:56,940 [INFO] sqlalchemy.engine.Engine: [generated in 0.00306s] ()


1
2025-10-14 20:56:56,946 INFO sqlalchemy.engine.Engine ROLLBACK


2025-10-14 20:56:56,946 [INFO] sqlalchemy.engine.Engine: ROLLBACK
2025-10-14 20:56:56,949 [INFO] db.db_connection_pool: DB connection closed


In [21]:
# create schema and tables using sqlalchemy
async with get_conn(db_engine) as conn:
    # You can wrap everything in an explicit transaction if you want
    # an atomic create/commit block.
    # the transaction is automatically committed when the block exits
    async with conn.begin():   

        await conn.execute(text("SET search_path TO public, document;"))

        # Create schema
        await conn.execute(
            text("CREATE SCHEMA IF NOT EXISTS document;")
        )

        # Create table
        create_table_sql = """
            CREATE TABLE IF NOT EXISTS document.document_chunk (
                id              UUID PRIMARY KEY DEFAULT gen_random_uuid(),
                embedding       VECTOR(768),
                chunk_text      TEXT,
                doc_metadata    JSONB,
                file_name       TEXT,
                tags            TEXT[],
                isActive        BOOLEAN,
                version         TEXT,
                created_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at      TIMESTAMP
            );
        """
        await conn.execute(text(create_table_sql))

        # Create table index
        create_index_sql = """
            CREATE INDEX IF NOT EXISTS documents_embedding_idx
            ON document.document_chunk
            USING ivfflat (embedding vector_l2_ops)
            WITH (lists = 100);
        """
        await conn.execute(text(create_index_sql))

    logger.info("Document schema initialisation finished")

2025-10-14 20:57:02,832 [INFO] db.db_connection_pool: DB connection opened


2025-10-14 20:57:02,832 INFO sqlalchemy.engine.Engine BEGIN (implicit)


2025-10-14 20:57:02,832 [INFO] sqlalchemy.engine.Engine: BEGIN (implicit)


2025-10-14 20:57:02,833 INFO sqlalchemy.engine.Engine SET search_path TO public, document;


2025-10-14 20:57:02,833 [INFO] sqlalchemy.engine.Engine: SET search_path TO public, document;


2025-10-14 20:57:02,833 INFO sqlalchemy.engine.Engine [generated in 0.00058s] ()


2025-10-14 20:57:02,833 [INFO] sqlalchemy.engine.Engine: [generated in 0.00058s] ()


2025-10-14 20:57:02,837 INFO sqlalchemy.engine.Engine CREATE SCHEMA IF NOT EXISTS document;


2025-10-14 20:57:02,837 [INFO] sqlalchemy.engine.Engine: CREATE SCHEMA IF NOT EXISTS document;


2025-10-14 20:57:02,838 INFO sqlalchemy.engine.Engine [generated in 0.00177s] ()


2025-10-14 20:57:02,838 [INFO] sqlalchemy.engine.Engine: [generated in 0.00177s] ()


2025-10-14 20:57:02,840 INFO sqlalchemy.engine.Engine 
            CREATE TABLE IF NOT EXISTS document.document_chunk (
                id              UUID PRIMARY KEY DEFAULT gen_random_uuid(),
                embedding       VECTOR(768),
                chunk_text      TEXT,
                doc_metadata    JSONB,
                file_name       TEXT,
                tags            TEXT[],
                isActive        BOOLEAN,
                version         TEXT,
                created_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at      TIMESTAMP
            );
        


2025-10-14 20:57:02,840 [INFO] sqlalchemy.engine.Engine: 
            CREATE TABLE IF NOT EXISTS document.document_chunk (
                id              UUID PRIMARY KEY DEFAULT gen_random_uuid(),
                embedding       VECTOR(768),
                chunk_text      TEXT,
                doc_metadata    JSONB,
                file_name       TEXT,
                tags            TEXT[],
                isActive        BOOLEAN,
                version         TEXT,
                created_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at      TIMESTAMP
            );
        


2025-10-14 20:57:02,842 INFO sqlalchemy.engine.Engine [generated in 0.00093s] ()


2025-10-14 20:57:02,842 [INFO] sqlalchemy.engine.Engine: [generated in 0.00093s] ()


2025-10-14 20:57:02,844 INFO sqlalchemy.engine.Engine 
            CREATE INDEX IF NOT EXISTS documents_embedding_idx
            ON document.document_chunk
            USING ivfflat (embedding vector_l2_ops)
            WITH (lists = 100);
        


2025-10-14 20:57:02,844 [INFO] sqlalchemy.engine.Engine: 
            CREATE INDEX IF NOT EXISTS documents_embedding_idx
            ON document.document_chunk
            USING ivfflat (embedding vector_l2_ops)
            WITH (lists = 100);
        


2025-10-14 20:57:02,845 INFO sqlalchemy.engine.Engine [generated in 0.00093s] ()


2025-10-14 20:57:02,845 [INFO] sqlalchemy.engine.Engine: [generated in 0.00093s] ()


2025-10-14 20:57:02,864 INFO sqlalchemy.engine.Engine COMMIT


2025-10-14 20:57:02,864 [INFO] sqlalchemy.engine.Engine: COMMIT
2025-10-14 20:57:02,867 [INFO] __main__: Document schema initialisation finished
2025-10-14 20:57:02,868 [INFO] db.db_connection_pool: DB connection closed


In [34]:

# Batch insert the doc chunks to the document_chunk table
async def insert_chunk_batch(conn, rows: Sequence[Dict[str, Any]]):

    insert_sql = """
    INSERT INTO document.document_chunk
      (embedding, chunk_text, doc_metadata, file_name, tags, isActive, version, created_at, updated_at)
    VALUES
      (:embedding, :chunktext, :docmetadata, :filename, :tags, :isActive, :version, :createdat, :updatedat)
    """
    await conn.execute(text(insert_sql), rows)


async def persist_chunks(engine, doc_chunks: List[Any]):

    async with get_conn(engine) as conn:

        BATCH_SIZE = 5

        for i in range(0, len(doc_chunks), BATCH_SIZE):

            # slicing the docchunk list
            insert_batch = doc_chunks[i:i+BATCH_SIZE]

            insert_rows = []

            for chunk in insert_batch:

                # Convert embedding to string format for pgvector: "[val1,val2,val3,...]"
                if hasattr(chunk.embedding, 'tolist'):
                    embedding_str = str(chunk.embedding.tolist())
                else:
                    embedding_str = str(chunk.embedding) if isinstance(chunk.embedding, list) else chunk.embedding
                
                # Align columns as per DocumentChunkBase schema
                insert_rows.append({
                    "embedding":    embedding_str,
                    "chunktext":    chunk.chunk_text,
                    "docmetadata":  json.dumps(chunk.metadata or {}) if isinstance(chunk.metadata, dict) else chunk.metadata,
                    "filename":     getattr(chunk, "filename", None),
                    "tags":         getattr(chunk, "tags", None),
                    "isActive":     getattr(chunk, "isActive", True),
                    "version":      getattr(chunk, "version", None),
                    "createdat":    getattr(chunk, "created_at", current_nz_datetime),
                    "updatedat":    getattr(chunk, "updated_at", None),
                })
    
                # wrap the sql statement to a transaction
                async with conn.begin():
                    await insert_chunk_batch(conn, insert_rows)

    
            break

            # async with conn.begin():
            #     await insert_chunk_batch(conn, insert_rows)

In [35]:
await persist_chunks(db_engine, doc_chunks)

2025-10-14 21:10:09,731 [INFO] db.db_connection_pool: DB connection opened


2025-10-14 21:10:09,732 INFO sqlalchemy.engine.Engine BEGIN (implicit)


2025-10-14 21:10:09,732 [INFO] sqlalchemy.engine.Engine: BEGIN (implicit)


2025-10-14 21:10:09,735 INFO sqlalchemy.engine.Engine 
    INSERT INTO document.document_chunk
      (embedding, chunk_text, doc_metadata, file_name, tags, isActive, version, created_at, updated_at)
    VALUES
      ($1, $2, $3, $4, $5, $6, $7, $8, $9)
    


2025-10-14 21:10:09,735 [INFO] sqlalchemy.engine.Engine: 
    INSERT INTO document.document_chunk
      (embedding, chunk_text, doc_metadata, file_name, tags, isActive, version, created_at, updated_at)
    VALUES
      ($1, $2, $3, $4, $5, $6, $7, $8, $9)
    


2025-10-14 21:10:09,735 INFO sqlalchemy.engine.Engine [cached since 779.4s ago] ('[-0.0034308889880776405, -0.007139021530747414, 0.0031478696037083864, -0.00457066809758544, 0.08033902198076248, 0.05846758559346199, 0.016311163082 ... (16662 characters truncated) ... 0789836123585701, 0.008140179328620434, -0.025290774181485176, -0.01674243062734604, 0.048178769648075104, -0.04824037849903107, -0.0414017029106617]', '###### **Thank you for choosing State Insurance**  \nThis policy wording, along with your policy schedule, contains all the information you need\nto know about your insurance cover. Please read these carefully and keep them on hand as\nyou will find them useful if you need to make a claim.', '{"source": "C:\\\\Users\\\\aibag\\\\git_repo\\\\policy_wording\\\\state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1 ... (118 characters truncated) ... total_pages": "61", "creation_date": "2024-12-02T10:18:24+13:00", "header_key": "Header 6", "header

2025-10-14 21:10:09,735 [INFO] sqlalchemy.engine.Engine: [cached since 779.4s ago] ('[-0.0034308889880776405, -0.007139021530747414, 0.0031478696037083864, -0.00457066809758544, 0.08033902198076248, 0.05846758559346199, 0.016311163082 ... (16662 characters truncated) ... 0789836123585701, 0.008140179328620434, -0.025290774181485176, -0.01674243062734604, 0.048178769648075104, -0.04824037849903107, -0.0414017029106617]', '###### **Thank you for choosing State Insurance**  \nThis policy wording, along with your policy schedule, contains all the information you need\nto know about your insurance cover. Please read these carefully and keep them on hand as\nyou will find them useful if you need to make a claim.', '{"source": "C:\\\\Users\\\\aibag\\\\git_repo\\\\policy_wording\\\\state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1 ... (118 characters truncated) ... total_pages": "61", "creation_date": "2024-12-02T10:18:24+13:00", "header_key": "Header 6", "hea

2025-10-14 21:10:09,783 INFO sqlalchemy.engine.Engine COMMIT


2025-10-14 21:10:09,783 [INFO] sqlalchemy.engine.Engine: COMMIT


2025-10-14 21:10:09,788 INFO sqlalchemy.engine.Engine BEGIN (implicit)


2025-10-14 21:10:09,788 [INFO] sqlalchemy.engine.Engine: BEGIN (implicit)


2025-10-14 21:10:09,789 INFO sqlalchemy.engine.Engine 
    INSERT INTO document.document_chunk
      (embedding, chunk_text, doc_metadata, file_name, tags, isActive, version, created_at, updated_at)
    VALUES
      ($1, $2, $3, $4, $5, $6, $7, $8, $9)
    


2025-10-14 21:10:09,789 [INFO] sqlalchemy.engine.Engine: 
    INSERT INTO document.document_chunk
      (embedding, chunk_text, doc_metadata, file_name, tags, isActive, version, created_at, updated_at)
    VALUES
      ($1, $2, $3, $4, $5, $6, $7, $8, $9)
    


2025-10-14 21:10:09,791 INFO sqlalchemy.engine.Engine [cached since 779.4s ago] [('[-0.0034308889880776405, -0.007139021530747414, 0.0031478696037083864, -0.00457066809758544, 0.08033902198076248, 0.05846758559346199, 0.016311163082 ... (16662 characters truncated) ... 0789836123585701, 0.008140179328620434, -0.025290774181485176, -0.01674243062734604, 0.048178769648075104, -0.04824037849903107, -0.0414017029106617]', '###### **Thank you for choosing State Insurance**  \nThis policy wording, along with your policy schedule, contains all the information you need\nto know about your insurance cover. Please read these carefully and keep them on hand as\nyou will find them useful if you need to make a claim.', '{"source": "C:\\\\Users\\\\aibag\\\\git_repo\\\\policy_wording\\\\state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1 ... (118 characters truncated) ... total_pages": "61", "creation_date": "2024-12-02T10:18:24+13:00", "header_key": "Header 6", "heade

2025-10-14 21:10:09,791 [INFO] sqlalchemy.engine.Engine: [cached since 779.4s ago] [('[-0.0034308889880776405, -0.007139021530747414, 0.0031478696037083864, -0.00457066809758544, 0.08033902198076248, 0.05846758559346199, 0.016311163082 ... (16662 characters truncated) ... 0789836123585701, 0.008140179328620434, -0.025290774181485176, -0.01674243062734604, 0.048178769648075104, -0.04824037849903107, -0.0414017029106617]', '###### **Thank you for choosing State Insurance**  \nThis policy wording, along with your policy schedule, contains all the information you need\nto know about your insurance cover. Please read these carefully and keep them on hand as\nyou will find them useful if you need to make a claim.', '{"source": "C:\\\\Users\\\\aibag\\\\git_repo\\\\policy_wording\\\\state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1 ... (118 characters truncated) ... total_pages": "61", "creation_date": "2024-12-02T10:18:24+13:00", "header_key": "Header 6", "he

2025-10-14 21:10:09,838 INFO sqlalchemy.engine.Engine COMMIT


2025-10-14 21:10:09,838 [INFO] sqlalchemy.engine.Engine: COMMIT


2025-10-14 21:10:09,844 INFO sqlalchemy.engine.Engine BEGIN (implicit)


2025-10-14 21:10:09,844 [INFO] sqlalchemy.engine.Engine: BEGIN (implicit)


2025-10-14 21:10:09,846 INFO sqlalchemy.engine.Engine 
    INSERT INTO document.document_chunk
      (embedding, chunk_text, doc_metadata, file_name, tags, isActive, version, created_at, updated_at)
    VALUES
      ($1, $2, $3, $4, $5, $6, $7, $8, $9)
    


2025-10-14 21:10:09,846 [INFO] sqlalchemy.engine.Engine: 
    INSERT INTO document.document_chunk
      (embedding, chunk_text, doc_metadata, file_name, tags, isActive, version, created_at, updated_at)
    VALUES
      ($1, $2, $3, $4, $5, $6, $7, $8, $9)
    


2025-10-14 21:10:09,846 INFO sqlalchemy.engine.Engine [cached since 779.5s ago] [('[-0.0034308889880776405, -0.007139021530747414, 0.0031478696037083864, -0.00457066809758544, 0.08033902198076248, 0.05846758559346199, 0.016311163082 ... (16662 characters truncated) ... 0789836123585701, 0.008140179328620434, -0.025290774181485176, -0.01674243062734604, 0.048178769648075104, -0.04824037849903107, -0.0414017029106617]', '###### **Thank you for choosing State Insurance**  \nThis policy wording, along with your policy schedule, contains all the information you need\nto know about your insurance cover. Please read these carefully and keep them on hand as\nyou will find them useful if you need to make a claim.', '{"source": "C:\\\\Users\\\\aibag\\\\git_repo\\\\policy_wording\\\\state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1 ... (118 characters truncated) ... total_pages": "61", "creation_date": "2024-12-02T10:18:24+13:00", "header_key": "Header 6", "heade

2025-10-14 21:10:09,846 [INFO] sqlalchemy.engine.Engine: [cached since 779.5s ago] [('[-0.0034308889880776405, -0.007139021530747414, 0.0031478696037083864, -0.00457066809758544, 0.08033902198076248, 0.05846758559346199, 0.016311163082 ... (16662 characters truncated) ... 0789836123585701, 0.008140179328620434, -0.025290774181485176, -0.01674243062734604, 0.048178769648075104, -0.04824037849903107, -0.0414017029106617]', '###### **Thank you for choosing State Insurance**  \nThis policy wording, along with your policy schedule, contains all the information you need\nto know about your insurance cover. Please read these carefully and keep them on hand as\nyou will find them useful if you need to make a claim.', '{"source": "C:\\\\Users\\\\aibag\\\\git_repo\\\\policy_wording\\\\state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1 ... (118 characters truncated) ... total_pages": "61", "creation_date": "2024-12-02T10:18:24+13:00", "header_key": "Header 6", "he

2025-10-14 21:10:09,895 INFO sqlalchemy.engine.Engine COMMIT


2025-10-14 21:10:09,895 [INFO] sqlalchemy.engine.Engine: COMMIT


2025-10-14 21:10:09,899 INFO sqlalchemy.engine.Engine BEGIN (implicit)


2025-10-14 21:10:09,899 [INFO] sqlalchemy.engine.Engine: BEGIN (implicit)


2025-10-14 21:10:09,900 INFO sqlalchemy.engine.Engine 
    INSERT INTO document.document_chunk
      (embedding, chunk_text, doc_metadata, file_name, tags, isActive, version, created_at, updated_at)
    VALUES
      ($1, $2, $3, $4, $5, $6, $7, $8, $9)
    


2025-10-14 21:10:09,900 [INFO] sqlalchemy.engine.Engine: 
    INSERT INTO document.document_chunk
      (embedding, chunk_text, doc_metadata, file_name, tags, isActive, version, created_at, updated_at)
    VALUES
      ($1, $2, $3, $4, $5, $6, $7, $8, $9)
    


2025-10-14 21:10:09,900 INFO sqlalchemy.engine.Engine [cached since 779.5s ago] [('[-0.0034308889880776405, -0.007139021530747414, 0.0031478696037083864, -0.00457066809758544, 0.08033902198076248, 0.05846758559346199, 0.016311163082 ... (16662 characters truncated) ... 0789836123585701, 0.008140179328620434, -0.025290774181485176, -0.01674243062734604, 0.048178769648075104, -0.04824037849903107, -0.0414017029106617]', '###### **Thank you for choosing State Insurance**  \nThis policy wording, along with your policy schedule, contains all the information you need\nto know about your insurance cover. Please read these carefully and keep them on hand as\nyou will find them useful if you need to make a claim.', '{"source": "C:\\\\Users\\\\aibag\\\\git_repo\\\\policy_wording\\\\state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1 ... (118 characters truncated) ... total_pages": "61", "creation_date": "2024-12-02T10:18:24+13:00", "header_key": "Header 6", "heade

2025-10-14 21:10:09,900 [INFO] sqlalchemy.engine.Engine: [cached since 779.5s ago] [('[-0.0034308889880776405, -0.007139021530747414, 0.0031478696037083864, -0.00457066809758544, 0.08033902198076248, 0.05846758559346199, 0.016311163082 ... (16662 characters truncated) ... 0789836123585701, 0.008140179328620434, -0.025290774181485176, -0.01674243062734604, 0.048178769648075104, -0.04824037849903107, -0.0414017029106617]', '###### **Thank you for choosing State Insurance**  \nThis policy wording, along with your policy schedule, contains all the information you need\nto know about your insurance cover. Please read these carefully and keep them on hand as\nyou will find them useful if you need to make a claim.', '{"source": "C:\\\\Users\\\\aibag\\\\git_repo\\\\policy_wording\\\\state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1 ... (118 characters truncated) ... total_pages": "61", "creation_date": "2024-12-02T10:18:24+13:00", "header_key": "Header 6", "he

2025-10-14 21:10:09,947 INFO sqlalchemy.engine.Engine COMMIT


2025-10-14 21:10:09,947 [INFO] sqlalchemy.engine.Engine: COMMIT


2025-10-14 21:10:09,951 INFO sqlalchemy.engine.Engine BEGIN (implicit)


2025-10-14 21:10:09,951 [INFO] sqlalchemy.engine.Engine: BEGIN (implicit)


2025-10-14 21:10:09,951 INFO sqlalchemy.engine.Engine 
    INSERT INTO document.document_chunk
      (embedding, chunk_text, doc_metadata, file_name, tags, isActive, version, created_at, updated_at)
    VALUES
      ($1, $2, $3, $4, $5, $6, $7, $8, $9)
    


2025-10-14 21:10:09,951 [INFO] sqlalchemy.engine.Engine: 
    INSERT INTO document.document_chunk
      (embedding, chunk_text, doc_metadata, file_name, tags, isActive, version, created_at, updated_at)
    VALUES
      ($1, $2, $3, $4, $5, $6, $7, $8, $9)
    


2025-10-14 21:10:09,952 INFO sqlalchemy.engine.Engine [cached since 779.6s ago] [('[-0.0034308889880776405, -0.007139021530747414, 0.0031478696037083864, -0.00457066809758544, 0.08033902198076248, 0.05846758559346199, 0.016311163082 ... (16662 characters truncated) ... 0789836123585701, 0.008140179328620434, -0.025290774181485176, -0.01674243062734604, 0.048178769648075104, -0.04824037849903107, -0.0414017029106617]', '###### **Thank you for choosing State Insurance**  \nThis policy wording, along with your policy schedule, contains all the information you need\nto know about your insurance cover. Please read these carefully and keep them on hand as\nyou will find them useful if you need to make a claim.', '{"source": "C:\\\\Users\\\\aibag\\\\git_repo\\\\policy_wording\\\\state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1 ... (118 characters truncated) ... total_pages": "61", "creation_date": "2024-12-02T10:18:24+13:00", "header_key": "Header 6", "heade

2025-10-14 21:10:09,952 [INFO] sqlalchemy.engine.Engine: [cached since 779.6s ago] [('[-0.0034308889880776405, -0.007139021530747414, 0.0031478696037083864, -0.00457066809758544, 0.08033902198076248, 0.05846758559346199, 0.016311163082 ... (16662 characters truncated) ... 0789836123585701, 0.008140179328620434, -0.025290774181485176, -0.01674243062734604, 0.048178769648075104, -0.04824037849903107, -0.0414017029106617]', '###### **Thank you for choosing State Insurance**  \nThis policy wording, along with your policy schedule, contains all the information you need\nto know about your insurance cover. Please read these carefully and keep them on hand as\nyou will find them useful if you need to make a claim.', '{"source": "C:\\\\Users\\\\aibag\\\\git_repo\\\\policy_wording\\\\state-home-comprehensive-contents-comprehensive-insurance-policy-wording-si6995-2-1 ... (118 characters truncated) ... total_pages": "61", "creation_date": "2024-12-02T10:18:24+13:00", "header_key": "Header 6", "he

2025-10-14 21:10:09,953 INFO sqlalchemy.engine.Engine COMMIT


2025-10-14 21:10:09,953 [INFO] sqlalchemy.engine.Engine: COMMIT
2025-10-14 21:10:09,953 [INFO] db.db_connection_pool: DB connection closed
