**Introduction**

A local LLM+RAG chatbot with structured PDF ingestion: Word converts each PDF to DOCX, and Pandoc then converts the DOCX to Markdown, which enables the use of LangChain's ParentDocumentRetriever with MarkdownTextSplitter.
Runs fine on my 64GB RAM laptop under WSL Ubuntu, with 32GB of RAM available to WSL. 

Most PDF-to-text parsers do not provide layout information. Often, even sentences are split by arbitrary CR/LFs, making it very difficult to find paragraph boundaries. This poses various challenges when chunking and when adding long-running contextual information, such as section headers, to the passages while indexing/vectorizing PDFs for LLM applications such as retrieval-augmented generation (RAG).
Using Word+Pandoc then ParentDocumentRetriever calling MarkdownTextSplitter chained with RecursiveCharacterTextSplitter solves this problem by parsing PDFs along with layout information.  

Replace every path with one that matches your own directory structure.
In addition to LangChain and Chroma, this code uses the following open-source tools:
 * Ollama with Wizardlm2. [Click here for Ollama website](https://ollama.com/)



In [3]:
from langchain.vectorstores import Chroma
from langchain.embeddings import GPT4AllEmbeddings
from langchain.text_splitter import MarkdownTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents.base import Document
from langchain.retrievers import ParentDocumentRetriever
from langchain_community.document_loaders import TextLoader
from langchain.storage import LocalFileStore
import tempfile,os
from langchain_core.vectorstores import VectorStore

from langchain.storage._lc_store import create_kv_docstore

# MD splits
# Parent documents are split along Markdown structure; child chunks are capped
# at chunk_size=400 and are produced from the parents by the retriever below.
parent_splitter = MarkdownTextSplitter()
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# Char-level splits
from langchain_text_splitters import RecursiveCharacterTextSplitter

# NOTE(review): text_splitter is built here but is not passed to the retriever
# in this cell; it is kept in case another notebook cell relies on it.
chunk_size = 500
chunk_overlap = 60
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

md_folder_path = "/mnt/d/data/md"
# Persistent vector store for the embedded chunks.
vectorstore = Chroma(persist_directory="/mnt/d/data/MDHIE", embedding_function=GPT4AllEmbeddings())
# Instantiate the LocalFileStore with the root path
fs = LocalFileStore("/mnt/d/data/documentstore")
# Wrap the file store so the retriever can use it as its docstore.
store = create_kv_docstore(fs)
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

# Ingest every Markdown file found directly inside md_folder_path.
# NOTE(review): both stores are persisted on disk, so re-running this cell
# presumably adds the same files a second time - confirm before re-ingesting.
# sorted(): os.listdir() returns names in arbitrary order; sorting makes the
# ingestion order (and the log below) reproducible between runs.
for filename in sorted(os.listdir(md_folder_path)):
    if not filename.endswith('.md'):
        continue
    print("load document",filename)
    md_path = os.path.join(md_folder_path, filename)
    # The files are Pandoc output, which is always UTF-8. Passing the encoding
    # explicitly keeps accented text intact even where the platform default
    # encoding is not UTF-8.
    docs = TextLoader(md_path, encoding="utf-8").load()
    retriever.add_documents(docs)
print("done")


load document 1-s2.0-S0169260721006672-main.md
load document 1-s2.0-S0987705310000080-main.md
load document 1-s2.0-S098770532030109X-am.md
load document 1-s2.0-S0987705320301477-am.md
load document 1-s2.0-S1388245715006136-main.md
load document 1-s2.0-S1388245715006215-main.md
load document 1-s2.0-S2405844021015140-main.md
load document 10.2478_prilozi-2022-0013.md
load document 12519_2023_Article_698.md
load document 2106.00061.md
load document 217656905.md
load document 70581176.md
load document ACI-Hypoxic-ischaemic-encephalopathy-in-newborns-recognition-monitoring-and-early-management.md
load document Acta Paediatrica - July 1955 - ENHORNING - An Experimental Study of the Human Fetus with Special Reference to Asphyxia.md
load document aeeg.md
load document Analyse quantitative et automatisée des EEG néonataux post-anoxiques.md
load document app7-parent-info.md
load document battisti_pediaelectrophysiology.md
load document children-09-01194-v2.md
load document chp_NE.md
load documen

In [2]:
from langchain import PromptTemplate
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain.embeddings import GPT4AllEmbeddings
import os
from IPython.core.display import  Markdown 
from IPython.display import display
from langchain.text_splitter import MarkdownTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import LocalFileStore
import tempfile,os
from langchain_core.vectorstores import VectorStore
from langchain.storage._lc_store import create_kv_docstore
# MD splits
# The same splitter configuration as the ingestion cell is passed to the
# retriever below.
parent_splitter = MarkdownTextSplitter()
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
# Re-open the persisted vector store written by the ingestion cell.
vectorstore = Chroma(persist_directory="/mnt/d/data/MDHIE", embedding_function=GPT4AllEmbeddings())
# Instantiate the LocalFileStore with the root path
fs = LocalFileStore("/mnt/d/data/documentstore")
store = create_kv_docstore(fs)
# The streaming callback prints the answer to stdout while it is generated.
llm = Ollama(model="wizardlm2", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),num_ctx=4096,verbose=True)
#llm = Ollama(model="orca-mini:13b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),num_ctx=4096,verbose=True)

# Prompt
# None of the objects below depends on the user's query, so they are built
# once here rather than on every loop iteration.
# The prompt text is assembled line by line so that the four-space indentation
# embedded in the prompt is explicit and survives whitespace-trimming editors.
template = (
    "Use the following pieces of context to answer the question at the end. "
    "Please follow the following rules:\n"
    "    1. If you don't know the answer, don't try to make up an answer.\n"
    "    2. If you find the answer, write the answer in a detailed way without references.\n"
    "    \n"
    "    {context}\n"
    "    Question: {question}\n"
    "    Helpful Answer:"
)
QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)
# return_source_documents=True makes the retrieved documents available in the
# result so they can be listed after the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    return_source_documents=True,
)

# Interactive loop: read a question, answer it, then show the documents used.
# Typing "exit" leaves the loop; empty input re-prompts.
while True:
    # Strip once so that "exit " or " exit" also terminates instead of being
    # sent to the LLM as a question.
    query = input("\n\nQuery: ").strip()
    if query == "exit":
        break
    if not query:
        continue

    print("====================================")
    result = qa_chain.invoke({"query": query})
    print("\n\n Data used")
    for i, doc in enumerate(result.get("source_documents", [])):
        if "source" in doc.metadata:
            # Show only the file name, not the full ingestion path.
            print(i+1,os.path.basename(doc.metadata["source"]))
        else:
            print("no source")

        # Render the retrieved document as Markdown in the notebook.
        display(Markdown(doc.page_content))



Query: exit
