In [None]:
def section_aware_split(text: str, max_chunk_len: int = 1500) -> list:
    """
    Split a Markdown-style document into chunks, one per header-delimited section.

    A new section starts at every line made of 1-6 ``#`` characters followed by
    whitespace. Each chunk starts with its header line and runs up to (not
    including) the next header. Text that precedes the first header becomes a
    chunk with an empty section path.

    Args:
        text: Document text. Empty or ``None`` yields an empty list.
        max_chunk_len: Upper bound, in characters, for a chunk's ``content``.
            A section longer than this is split into several chunks that all
            carry the same ``section_path`` and ``level``. Splits fall on line
            boundaries; a single line longer than the limit is cut every
            ``max_chunk_len`` characters. ``None`` or a non-positive value
            disables the limit.

    Returns:
        A list of dicts with the keys:
            ``section_path``: header titles from the outermost header down to
                the chunk's own header.
            ``level``: ``len(section_path)``. This can be smaller than the
                Markdown header depth when the document skips header levels.
            ``content``: the chunk text with surrounding whitespace stripped.
    """
    import re

    if not text:
        return []

    header_pattern = re.compile(r"^(#{1,6})\s+(.*)")
    limit = max_chunk_len if max_chunk_len and max_chunk_len > 0 else None

    chunks = []
    current_chunk_lines = []
    current_path = []

    def split_to_limit(content):
        """Break ``content`` into pieces of at most ``limit`` characters."""
        if limit is None or len(content) <= limit:
            return [content]

        pieces = []
        buffer = []
        buffer_len = 0
        for raw_line in content.split("\n"):
            # A line longer than the limit can never fit; cut it into
            # limit-sized segments. An empty line still counts as one segment.
            segments = [
                raw_line[start:start + limit]
                for start in range(0, len(raw_line), limit)
            ] or [""]
            for segment in segments:
                # +1 accounts for the "\n" that joins it to the buffered lines.
                needed = len(segment) + (1 if buffer else 0)
                if buffer and buffer_len + needed > limit:
                    pieces.append("\n".join(buffer))
                    buffer = []
                    buffer_len = 0
                    needed = len(segment)
                buffer.append(segment)
                buffer_len += needed
        if buffer:
            pieces.append("\n".join(buffer))

        stripped = (piece.strip() for piece in pieces)
        return [piece for piece in stripped if piece]

    def flush_chunk():
        """Emit the lines gathered so far as one or more chunks."""
        if not current_chunk_lines:
            return
        content = "\n".join(current_chunk_lines).strip()
        if not content:
            return
        for piece in split_to_limit(content):
            chunks.append({
                "section_path": current_path.copy(),
                "level": len(current_path),
                "content": piece
            })

    for line in text.splitlines():
        header_match = header_pattern.match(line)
        if header_match:
            # A header closes the previous section and opens a new one.
            flush_chunk()
            level = len(header_match.group(1))
            title = header_match.group(2).strip()
            # Keep the ancestors above this level, then append this title.
            current_path = current_path[:level - 1] + [title]
            current_chunk_lines = [line]
        else:
            current_chunk_lines.append(line)

    flush_chunk()
    return chunks

In [None]:
import os
import nest_asyncio

from llama_index.core import Document, VectorStoreIndex, StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core.vector_stores import VectorStoreQueryResult
from qdrant_client import QdrantClient, AsyncQdrantClient
from llama_index.embeddings.cohere import CohereEmbedding
from llama_index.core import Settings
from typing import List
from dotenv import load_dotenv
import json
import os
# Apply nest_asyncio's patch before any async-capable clients are created
# (presumably to allow nested event loops inside Jupyter — confirm it is still needed).
nest_asyncio.apply()
# Load environment variables from the local ".env.dev" file; later cells read
# their settings through os.getenv(...).
load_dotenv(dotenv_path=".env.dev")

In [None]:
from docx import Document

def extract_tables_as_markdown(docx_path):
    """
    Render every table of a .docx file as a Markdown table string.

    The first row of each table is used as the Markdown header row and is
    followed by a ``---`` separator row. Tables without rows are skipped.
    Cell text is stripped; ``|`` is escaped and line breaks inside a cell are
    replaced with ``<br>`` so a cell cannot break the table layout.

    Args:
        docx_path: Path of the .docx file, passed to ``docx.Document``.

    Returns:
        list[str]: One Markdown table per non-empty table, in the order
        ``doc.tables`` yields them.
    """
    # Import locally under an alias: this notebook also imports llama_index's
    # ``Document`` at module level, so the bare global name ``Document`` refers
    # to whichever import cell ran last.
    from docx import Document as DocxDocument

    def to_markdown_cell(cell_text):
        # A raw "|" would open a new column and a raw newline would end the row.
        return cell_text.strip().replace("|", "\\|").replace("\n", "<br>")

    def to_markdown_row(cells):
        return "| " + " | ".join(cells) + " |"

    markdown_tables = []
    for table in DocxDocument(docx_path).tables:
        grid = [
            [to_markdown_cell(cell.text) for cell in row.cells]
            for row in table.rows
        ]
        if not grid:
            continue

        # Give every row, and the separator, the same number of columns.
        width = max(len(table.columns), max(len(cells) for cells in grid))
        padded = [cells + [""] * (width - len(cells)) for cells in grid]

        lines = [to_markdown_row(padded[0]), to_markdown_row(["---"] * width)]
        lines.extend(to_markdown_row(cells) for cells in padded[1:])
        markdown_tables.append("\n".join(lines))
    return markdown_tables

In [None]:
from llama_index.readers.file import DocxReader
from llama_index.core.schema import Document
import os
import glob
import re

# ——————————————
# CONFIGURATION
# ——————————————
# Folder that is scanned (non-recursively) for "*.docx" files to ingest.
DOCX_FOLDER = "documents/"
# Base URL joined with each file name to build the "attachment_link" chunk metadata.
SHAREPOINT_BASE_URL = "https://cpaxtra.sharepoint.com/sites/forms-library"

# ——————————————
# SECTION TITLE HELPER
# ——————————————
def extract_section_title(chunk: str) -> str:
    """
    Return the section title of a chunk.

    If the chunk contains a ``[SECTION] <title>`` marker, the text after the
    marker up to the end of that line is returned (stripped). Otherwise the
    first non-empty line of the chunk is returned.

    Args:
        chunk: Chunk text. ``None`` and empty strings are accepted.

    Returns:
        The title, or ``"unknown"`` when the chunk has no non-empty line.
    """
    if not chunk:
        return "unknown"

    # Accept end-of-string as well as a newline, so a chunk that consists only
    # of the marker line (no trailing "\n") still yields its title rather than
    # falling through and returning the raw "[SECTION] ..." line.
    match = re.search(r"\[SECTION\] (.*?)(?:\n|$)", chunk)
    if match:
        return match.group(1).strip()

    # Fallback: first line that is not blank.
    for line in chunk.splitlines():
        stripped = line.strip()
        if stripped:
            return stripped
    return "unknown"

# ——————————————
# STEP 1: Discover all .docx files
# ——————————————
# glob.glob() returns matches in arbitrary order; sort them so the document
# order (and everything derived from it downstream) is the same on every run.
all_paths = sorted(glob.glob(os.path.join(DOCX_FOLDER, "*.docx")))
print(f"Found {len(all_paths)} .docx file(s):")
for p in all_paths:
    print("  •", p)

# ——————————————
# STEP 2: Load each DOCX and wrap as Document
# ——————————————
# Every object returned by the reader is re-wrapped in a fresh Document that
# keeps only its text and records the originating file name under "source".
reader = DocxReader()
raw_documents = []
for file_path in all_paths:
    source_name = os.path.basename(file_path)
    raw_documents.extend(
        Document(text=page_obj.text, metadata={"source": source_name})
        for page_obj in reader.load_data(file_path)
    )

print(f"Loaded {len(raw_documents)} raw Document(s) from all .docx files.")

# ——————————————
# STEP 3: Chunk each Document semantically with metadata
# ——————————————
# One output Document per section chunk. Each inherits the parent's metadata
# and adds its position within the parent ("chunk_id"), its section path and
# level, and a link built from the SharePoint base URL and the file name.
nodes = []
for doc in raw_documents:
    file_name = doc.metadata.get("source", "")
    # NOTE(review): the file name is inserted verbatim (no URL-encoding) — confirm
    # that is acceptable for names containing spaces or non-ASCII characters.
    attachment_link = f"{SHAREPOINT_BASE_URL}/{file_name}"
    nodes.extend(
        Document(
            text=chunk["content"],
            metadata={
                **doc.metadata,
                "chunk_id": chunk_id,
                "section_path": chunk["section_path"],
                "level": chunk["level"],
                "attachment_link": attachment_link,
            },
        )
        for chunk_id, chunk in enumerate(section_aware_split(doc.text))
    )

print(f"After splitting, we have {len(nodes)} chunked Documents (nodes).")

# ——————————————
# FINAL: Assign to `documents` so rest of pipeline stays unchanged
# ——————————————
documents = nodes

In [None]:
from llama_index.core import StorageContext

## Set up the Cohere embedding service

In [None]:
# Credentials come from the environment (".env.dev" was already loaded by
# load_dotenv() in the imports cell), matching the os.getenv("COHERE_API_KEY")
# convention used by the retrieval cell. API keys must never be committed in
# the notebook source or echoed into saved cell output.
# NOTE(review): a literal key used to be hard-coded here — treat it as leaked
# and rotate it.
COHEAR_KEY      = os.getenv("COHERE_API_KEY")
COHEAR_MODEL_ID = os.getenv("COHERE_MODEL_ID") or "embed-multilingual-light-v3.0"

if not COHEAR_KEY:
    raise RuntimeError(
        "COHERE_API_KEY is not set; define it in .env.dev or the environment."
    )

# Confirm that a key is present without revealing it.
print("🔑 Using Cohere key:   ", "<hidden>")
print("🔢 Using Cohere model: ", COHEAR_MODEL_ID)

# Document-side embedder: input_type="search_document" is the setting for
# texts that are going to be indexed.
embed_model = CohereEmbedding(
    api_key=COHEAR_KEY,
    model_name=COHEAR_MODEL_ID,
    input_type="search_document",
    embedding_type="float",
)

Settings.chunk_size = 1024

In [None]:
from llama_index.embeddings.text_embeddings_inference import TextEmbeddingsInference

# Build the TextEmbeddingsInference embedding client. The model id, base URL
# and API key are all read from environment variables.
tei_auth_header = f"Bearer {os.getenv('API_KEY_CHATBOT')}"
embed_model = TextEmbeddingsInference(
    model_name=os.getenv("EMBED_MODEL_ID"),
    base_url=os.getenv("EMBED_BASE_URL"),
    auth_token=tei_auth_header,
    timeout=60,
    embed_batch_size=10,
)

# Register the client on llama_index's global Settings object.
Settings.embed_model = embed_model

## Initialize the vector store database (Qdrant)

In [None]:
from qdrant_client import QdrantClient
from llama_index.vector_stores.qdrant import QdrantVectorStore
import os

# Initialize Qdrant client with HTTP (not gRPC)
# NOTE(review): the indexing cell defines QDRANT_URL = "http://localhost:6334",
# but nothing reads it; this client (port 6433) is the one the vector store
# actually uses — confirm which port is intended.
client = QdrantClient(
    url="http://localhost:6433",  # Using HTTP endpoint exposed by Docker
    api_key=os.getenv("QDRANT_API_KEY"),
    prefer_grpc=False,            # Disable gRPC to avoid connection issues
    timeout=60,
    check_compatibility=False     # Suppress version mismatch warning
)

# Load collection name from environment. Fail fast when it is missing: the
# calls below are destructive and must never run with an undefined name.
collection_name = os.getenv("QDRANT_COLLECTION_NAME")
if not collection_name:
    raise RuntimeError(
        "QDRANT_COLLECTION_NAME is not set; define it in .env.dev or the environment."
    )

# Start from a clean slate: drop the collection if it already exists.
# WARNING: every run of this cell deletes all previously indexed vectors.
if client.collection_exists(collection_name):
    client.delete_collection(collection_name)

# Create Qdrant vector store with hybrid search enabled
vector_store = QdrantVectorStore(
    collection_name=collection_name,
    client=client,
    enable_hybrid=True,
    batch_size=20,
    prefer_grpc=False             # Match client setting
)

## Embed the documents into the vector database

In [None]:
# Cohere credentials and model id come from the environment (".env.dev" was
# loaded by load_dotenv() in the imports cell), matching the
# os.getenv("COHERE_API_KEY") convention used by the retrieval cell.
# NOTE(review): a literal key used to be hard-coded here — treat it as leaked
# and rotate it.
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
COHERE_MODEL_ID = os.getenv("COHERE_MODEL_ID") or "embed-multilingual-light-v3.0"

if not COHERE_API_KEY:
    raise RuntimeError(
        "COHERE_API_KEY is not set; define it in .env.dev or the environment."
    )

# NOTE(review): these three constants are not read anywhere in this cell. The
# index is written through `vector_store`, which was configured in the Qdrant
# cell with its own URL and collection name. Kept so existing references to
# the names keep working — confirm whether they can be removed.
QDRANT_URL = "http://localhost:6334"
QDRANT_API_KEY = None  # Set this to your Qdrant key if needed
COLLECTION_NAME = "my_collection"

print("✅ COHERE_API_KEY loaded.")

# ✅ Initialize Cohere embed model (document side: input_type="search_document")
embed_model = CohereEmbedding(
    cohere_api_key=COHERE_API_KEY,
    model_name=COHERE_MODEL_ID,
    input_type="search_document",
    embedding_type="float",
)

# ✅ Build index from documents: embeds every chunk in `documents` with the
# model above and stores the vectors in the Qdrant-backed `vector_store`.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents=documents,
    embed_model=embed_model,
    storage_context=storage_context,
)

## Retrieve relevant nodes for a question

In [None]:
# Query-side embedder: same Cohere model as the index, but with
# input_type="search_query" so the question is embedded as a query rather than
# as a document. COHERE_MODEL_ID is the value the indexing cell embedded with,
# which keeps the query vectors compatible with the stored ones.
embed_model = CohereEmbedding(
    api_key=os.getenv("COHERE_API_KEY"),
    model_name=COHERE_MODEL_ID,
    input_type="search_query",
    embedding_type="float",
)

# Pass the query embedder explicitly. Without it the retriever falls back to
# the embed model the index was built with (input_type="search_document"), and
# the embedder created above would never be used.
search_query_retriever = index.as_retriever(embed_model=embed_model)

search_query_retrieved_nodes = search_query_retriever.retrieve(
    "Do all Walmart locations offer scan & go?"
)

In [None]:
from llama_index.core.response.notebook_utils import display_source_node
# Render every retrieved node in the notebook, passing source_length=2000 to
# the display helper.
for retrieved_node in search_query_retrieved_nodes:
    display_source_node(retrieved_node, source_length=2000)