## Installation
Run this cell once to install the required packages; skip it (or comment it out) on later runs.

In [None]:
%pip install docling

## Importing Libraries

In [None]:
from docling.document_converter import DocumentConverter

## Analyzing the source

First we set the source and parse it with docling's DocumentConverter

In [None]:

# URL of the PDF to analyze (hosted on arXiv).
source = "https://arxiv.org/pdf/2408.09869"

# Run the conversion with a default-configured converter, then keep only the
# parsed document object; the following cells read it through `doc`.
converter = DocumentConverter()
conversion_result = converter.convert(source)
doc = conversion_result.document

Displaying the markdown representation of the PDF

In [None]:
from IPython.display import HTML, display

# Convert markdown to HTML and display in scrollable div
from markdown import markdown

# Python-Markdown only renders pipe tables when the "tables" extension is
# enabled; without it, any table in the exported markdown would be shown as raw
# "|"-delimited text instead of an HTML <table>.
html_content = markdown(doc.export_to_markdown(), extensions=["tables"])

# Wrap the rendered HTML in a fixed-height, scrollable box so a long document
# does not flood the notebook output.
display(HTML(f'''
<div style="max-height: 600px; overflow-y: auto; border: 1px solid #ddd; padding: 10px;">
    {html_content}
</div>
'''))

## Creating a RAG application
This section splits the text on its markdown headers, embeds the resulting chunks with an embedding model, and creates a FAISS vector store.

In [None]:
# Minimal RAG application: markdown -> header chunks -> embeddings -> FAISS
# -> retriever -> LLM -> question-answering chain.

# Everything this cell depends on, gathered in one place.
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import OllamaLLM
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Step 1 - export the converted document as markdown text.
markdown_content = doc.export_to_markdown()

# Step 2 - cut the markdown at level 1-3 headers; each pair is
# (header marker, label the splitter is given for that header level).
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
)
md_header_splits = markdown_splitter.split_text(markdown_content)
print(f"Split into {len(md_header_splits)} chunks")

# Step 3 - embedding model used to vectorize the chunks.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Step 4 - index the chunks in a FAISS vector store.
vectorstore = FAISS.from_documents(md_header_splits, embeddings)

# Step 5 - retriever configured with k=3 in its search kwargs.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# Step 6 - LLM served through Ollama.
# NOTE(review): presumably needs a reachable Ollama server with the
# "granite4:micro" model already pulled - confirm before running.
llm = OllamaLLM(model="granite4:micro")

# Step 7 - "stuff" retrieval-QA chain wired to the retriever and the LLM;
# source documents are requested alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)



## Asking questions
Query the RAG chain with a sample question and print its answer.

In [None]:
# 8. Send a sample question through the RAG chain and show the reply
question = "What is this document about?"
result = qa_chain.invoke({"query": question})

# The answer text is read from the "result" key of the chain output.
# A single print with a newline separator produces the same text as two
# consecutive prints (each entry keeps its own leading blank line).
print(
    f"\nQuestion: {question}",
    f"\nAnswer: {result['result']}",
    sep="\n",
)