**Introduction**

A local LLM+RAG chatbot with structured PDF ingestion: Word converts each PDF to docx, and pandoc then converts the docx to Markdown, which enables the use of the LangChain ParentDocumentRetriever with MarkdownTextSplitter.
Runs fine on my 64GB RAM laptop under WSL Ubuntu, with 32GB of RAM available to WSL. 

Most PDF-to-text parsers do not provide layout information. Often, even sentences are split with arbitrary CR/LFs, making it very difficult to find paragraph boundaries. This poses various challenges for chunking and for adding long-running contextual information, such as section headers, to the passages while indexing/vectorizing PDFs for LLM applications such as retrieval-augmented generation (RAG).
Using Word+Pandoc then ParentDocumentRetriever calling MarkdownTextSplitter chained with RecursiveCharacterTextSplitter solves this problem by parsing PDFs along with layout information.  

Replace any path by your own path structure.
In addition to LangChain and Chroma, this code uses the following open-source software:
 * Ollama with Wizardlm2 as the LLM and all-minilm as the embedding model. [Click here for Ollama website](https://ollama.com/)
Wizardlm2 and all-minilm are downloaded locally.


**Split text using Markdown but remove documents that have only formatting characters or not enough words**  
Splitting may produce chunks that contain only table line separators or very short sentences.
These chunks are removed to make searches more relevant.

In [3]:
from langchain_core.documents import Document
from langchain_text_splitters import MarkdownTextSplitter
from statistics import mean
from typing import Any, List, Literal, Optional, Union
import re

# we can remove markdown from child chunks as we return the parent document
# thus vector store does not hold markdown tags 
def remove_punctuation_and_markdown(input_string):
    """Return ``input_string`` with markdown markers and punctuation removed.

    Two substitutions are applied in order: first the markdown formatting
    characters ``_ * # | +`` are dropped, then every remaining character that
    is neither a word character nor whitespace. Whitespace is left untouched.
    """
    cleaned = input_string
    # Order matters only for readability: the first pattern also catches "_",
    # which the second pattern would keep because "_" is a word character.
    for pattern in (r'[_*#|+]', r'[^\w\s]'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

class CleanMarkdownTextSplitter(MarkdownTextSplitter):
    """Split text along Markdown headings, keeping only chunks with meaningful content.

    A chunk is kept when it contains more than ``MIN_WORDS`` words whose mean
    length exceeds ``MIN_MEAN_WORD_LENGTH``. Words are counted after markdown
    markers and punctuation have been stripped, so tokens made only of
    formatting characters (for example table rules such as ``-------`` or
    ``|------|``) do not count as words. The chunk text itself is returned
    unmodified.
    """

    # A chunk must contain strictly more words than this to be kept.
    MIN_WORDS = 10
    # The mean word length of a chunk must be strictly greater than this.
    MIN_MEAN_WORD_LENGTH = 3

    def _split_text(self, text: str, separators: List[str]) -> List[str]:
        """Split incoming text and return only the chunks worth indexing.

        Args:
            text: The text to split.
            separators: Separators forwarded unchanged to the parent splitter.

        Returns:
            The parent splitter's chunks, in their original order, minus
            those rejected by ``_has_meaningful_content``.
        """
        # Call super() outside the comprehension: zero-argument super() is
        # not reliably available inside a comprehension's own scope.
        chunks = super()._split_text(text, separators)
        return [chunk for chunk in chunks if self._has_meaningful_content(chunk)]

    def _has_meaningful_content(self, chunk: str) -> bool:
        """Return True when ``chunk`` has enough real words of sufficient length."""
        # Measure on a cleaned copy so markup-only tokens neither inflate the
        # word count nor the mean word length.
        words = remove_punctuation_and_markdown(chunk).split()
        if len(words) <= self.MIN_WORDS:
            return False
        return mean(len(word) for word in words) > self.MIN_MEAN_WORD_LENGTH

In [4]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import MarkdownTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents.base import Document
from langchain.retrievers import ParentDocumentRetriever
from langchain_community.document_loaders import TextLoader
from langchain.storage import LocalFileStore
import tempfile,os
from langchain_core.vectorstores import VectorStore
from langchain_community.embeddings import OllamaEmbeddings

# Embedding function backed by the "all-minilm" model served by Ollama.
ollama_ef = OllamaEmbeddings(model="all-minilm")

from langchain.storage._lc_store import create_kv_docstore

# Locations on disk -- replace with your own path structure.
md_folder_path = "/mnt/d/data/md"
chroma_dir = "/mnt/d/data/HIE"
docstore_dir = "/mnt/d/data/documentstore"

# Markdown-aware splitters: large parent chunks, small filtered child chunks.
parent_splitter = MarkdownTextSplitter(chunk_size=5000, chunk_overlap=200)
child_splitter = CleanMarkdownTextSplitter(chunk_size=500, chunk_overlap=60)

print("Create Vector store")
vectorstore = Chroma(
    persist_directory=chroma_dir,
    embedding_function=ollama_ef,
)

# File-backed key/value store, wrapped so it can be used as a document store.
print("Create document store")
fs = LocalFileStore(docstore_dir)
store = create_kv_docstore(fs)

# The retriever ties together the vector store, the document store and
# both splitters.
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

print("Start splitting documents")
# Load every Markdown file in md_folder_path and add it to the retriever.
# The listing is sorted because os.listdir returns entries in arbitrary
# order; sorting makes the ingestion order reproducible between runs.
for filename in sorted(os.listdir(md_folder_path)):
    if not filename.endswith('.md'):
        continue
    print("load document", filename)
    md_path = os.path.join(md_folder_path, filename)
    # TextLoader.load() returns the documents read from the file; they are
    # handed to the retriever as-is.
    retriever.add_documents(TextLoader(md_path).load())
print("done")


Create Vector store
Create document store
Start splitting documents
done


**Chatbot to interact with documents loaded before**

In [3]:
from langchain import PromptTemplate
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
import os
from IPython.core.display import  Markdown 
from IPython.display import display
from langchain.text_splitter import MarkdownTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import LocalFileStore
import tempfile,os
from langchain_core.vectorstores import VectorStore
from langchain.storage._lc_store import create_kv_docstore
from langchain_community.embeddings import OllamaEmbeddings

# Embedding function backed by the "all-minilm" model served by Ollama.
ollama_ef = OllamaEmbeddings(model="all-minilm")

# Locations on disk -- replace with your own path structure.
chroma_dir = "/mnt/d/data/HIE"
docstore_dir = "/mnt/d/data/documentstore"

# Markdown splitters handed to the ParentDocumentRetriever.
parent_splitter = MarkdownTextSplitter(chunk_size=5000, chunk_overlap=200)
child_splitter = MarkdownTextSplitter(chunk_size=500, chunk_overlap=60)

vectorstore = Chroma(
    persist_directory=chroma_dir,
    embedding_function=ollama_ef,
)

# File-backed key/value store, wrapped so it can be used as a document store.
fs = LocalFileStore(docstore_dir)
store = create_kv_docstore(fs)

# Local LLM; the stdout streaming handler is attached through the callback
# manager.
streaming_callbacks = CallbackManager([StreamingStdOutCallbackHandler()])
llm = Ollama(
    model="wizardlm2",
    callback_manager=streaming_callbacks,
    num_ctx=4096,
    verbose=True,
)

# The prompt, the retriever and the QA chain do not depend on the user's
# query, so they are built once here rather than on every loop iteration.

# Alternative prompts tried during experimentation (not used):
#
#   Use the following pieces of context to answer the question at the end. Please follow the following rules:
#   1. If you don't know the answer, don't try to make up an answer..
#   2. If you find the answer, write the answer in a concise way
#   3. Do not give references
#   4. Use relevant table if available
#
#   {context}
#   Question: {question}
#   Helpful Answer:
#
#   Answer the question with details based only on the following context: {context}
#
#   Question: {question}

# Active prompt. The indentation inside the literal is part of the prompt
# text and is deliberately left unchanged.
template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
    1. If you don't know the answer, don't try to make up an answer.
    2. If you find the answer, write the answer in a detailed way without references.
    
    {context}
    Question: {question}
    Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
    search_type="similarity",
    search_kwargs={"k": 6},
)
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    return_source_documents=True,
)

# Interactive loop: read a query, answer it, then show the source documents
# that were retrieved. Typing "exit" ends the session.
while True:
    query = input("\n\nQuery: ")
    stripped_query = query.strip()
    # Compare the stripped text so that "exit " or " exit" also ends the
    # session instead of being sent to the LLM as a question.
    if stripped_query == "exit":
        break
    if stripped_query == "":
        continue

    print("====================================")
    result = qa_chain.invoke({"query": query})
    print("\n\n Data used")
    for i, doc in enumerate(result.get("source_documents", []), start=1):
        if "source" in doc.metadata:
            print(i, os.path.basename(doc.metadata["source"]))
        else:
            print("no source")

        display(Markdown(doc.page_content))



Query: exit
