### Library Installation & set up

In [2]:
# Install required libraries
!pip install -q llama-index llama-index-llms-gemini pymupdf
!pip install -q llama-index-embeddings-huggingface
!pip install nest_asyncio
!pip install -q llama-index-embeddings-openai
!pip install -q llama-index-vector-stores-chroma
!pip install -q sentence-transformers
!pip install -q chromadb
!pip install -q llama_index.retrievers.bm25

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.8/20.8 MB[0m [31m96.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m72.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m93.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.5/72.5 kB[0m [31m4.9 MB/s[0m eta [36m0

### Environment Configuration

In [38]:
import os
import fitz  # PyMuPDF
import time
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Markdown, display
import nest_asyncio
from google.colab import files
from llama_index.core import Document
from typing import List
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.gemini import Gemini
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.schema import NodeWithScore, QueryBundle
from llama_index.core.llms import ChatMessage
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.chat_engine import CondensePlusContextChatEngine




#Initialiaze Gemini API key
GEMINI_API_KEY = "API key" #Insert your API key here
os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY



### PDF Upload & Validation

In [6]:
#Upload pdf
def upload_pdf():
    """Upload a PDF file and return its path."""
    print("Please select a PDF file to upload:")
    uploaded = files.upload()

    for filename in uploaded.keys():
        if filename.endswith('.pdf'):
            # Save to the sample_docs directory
            pdf_path = os.path.join("sample_docs", filename)

            # Create directory if it doesn't exist
            os.makedirs("sample_docs", exist_ok=True)

            # Save the file
            with open(pdf_path, 'wb') as f:
                f.write(uploaded[filename])

            print(f"PDF saved to {pdf_path}")
            return pdf_path
        else:
            print(f"File {filename} is not a PDF. Please upload a PDF file.")

    return None

pdf_path = upload_pdf()

Please select a PDF file to upload:


Saving LenderFeesWorksheetNew.pdf to LenderFeesWorksheetNew.pdf
PDF saved to sample_docs/LenderFeesWorksheetNew.pdf


### Document Parsing with PyMuPDF

In [10]:
def load_pdf_with_pymupdf(pdf_path: str) -> List[Document]:
    """Load a PDF and convert it to LlamaIndex Document format using PyMuPDF."""
    # Open the PDF
    doc = fitz.open(pdf_path)

    # Extract text from each page
    documents = []

    for i, page in enumerate(doc):
        text = page.get_text()

        # Skip empty pages
        if not text.strip():
            continue

        # Create Document object with metadata
        documents.append(
            Document(
                text=text,
                metadata={
                    "file_name": os.path.basename(pdf_path),
                    "page_number": i + 1,
                    "total_pages": len(doc)
                }
            )
        )

    # Close the document
    doc.close()

    # Print stats
    print(f"Processed {pdf_path}:")

    return documents

pdf_docs = load_pdf_with_pymupdf(pdf_path)

Processed sample_docs/LenderFeesWorksheetNew.pdf:


### Semantic Text Chunking

In [14]:

# Initialize embedding model for semantic splitting
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create semantic chunker
semantic_splitter = SemanticSplitterNodeParser(embed_model=embed_model)
chunks_semantic = semantic_splitter.get_nodes_from_documents(pdf_docs)
print(f"Total Semantic Chunks Created: {len(chunks_semantic)}")

# Display a sample semantic chunk
print(f"\nSample semantic chunk:")
print(chunks_semantic[0].text)

Total Semantic Chunks Created: 2

Sample semantic chunk:
Closing Costs (+)
Loan Amount (-)
Principal & Interest
Other Financing (P & I)
Hazard Insurance
Real Estate Taxes
Mortgage Insurance
Homeowner Assn. Dues
Other
* PFC
F
POC
= Prepaid Finance Charge
= FHA Allowable Closing Cost
= Paid Outside of Closing
** B
S
Br
L
TP
C
= Borrower
= Seller
= Broker
= Lender
= Third Party
= Correspondent
Calyx Form - feews.frm (09/2015)
FEES WORKSHEET
John Q. Smith / Mary A. Smith
samplesmith
10/05/2015
30 YEAR FIXED -Purchase
XYZ Lender
$ 380,000
4.250 %
360 / 360 mths
475,000.00
1,121.53
4,520.00
380,000.00
Cash Deposit
5,000.00
needed to close
95,641.53
1,869.37
39.58
400.00
2,308.95
ORIGINATION CHARGES
Underwriting Fee
XYZ Lender
Borrower
$
550.00
Wire Transfer Fee
XYZ Lender
Borrower
$
75.00
Administration Fee
XYZ Lender
Borrower
$
445.00
OTHER CHARGES
Appraisal Fee
XYZ Lender
Borrower
$
525.00
Credit Report Fee
XYZ Lender
Borrower
$
25.00
Tax Service Fee
XYZ Lender
Borrower
$
80.00
Flood Certi

### Generate Text Embeddings

In [16]:
# Initialize a list to store results
embedding_results = []

# Test different HuggingFace embedding models
hf_models = [
    "sentence-transformers/all-MiniLM-L6-v2",  # Small, fast model (384 dimensions)
    "sentence-transformers/all-mpnet-base-v2",  # More powerful (768 dimensions)
]

for model_name in hf_models:
    start_time = time.time()

    embed_model = HuggingFaceEmbedding(model_name=model_name)

    # Apply embeddings
for chunk in chunks_semantic:  # Using Semantic Chunks for best retrieval
    chunk.embedding = embed_model.get_text_embedding(chunk.text)

    end_time = time.time()

    embedding_results.append({
        'Model': model_name,
        'Dimensions': len(chunk.embedding),
        'Generation Time (s)': end_time - start_time
    })


print("Embeddings Generated Successfully!")

# Display results
#embedding_df = pd.DataFrame(embedding_results)
#display(embedding_df)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embeddings Generated Successfully!


Unnamed: 0,Model,Dimensions,Generation Time (s)
0,sentence-transformers/all-mpnet-base-v2,768,14.810159
1,sentence-transformers/all-mpnet-base-v2,768,17.17895


### Vector Index Construction

In [20]:
# Initialize Gemini LLM
llm = Gemini(model="models/gemini-2.5-flash")
Settings.llm = llm


Settings.embed_model = embed_model

def process_and_index_pdf(pdf_path):
    """Process a PDF and create both vector and keyword indices."""

    # Create vector index
    vector_index = VectorStoreIndex.from_documents(pdf_docs)

    print(f"Indexed {len(pdf_docs)} document chunks")

    return vector_index

index = process_and_index_pdf(pdf_path)

  llm = Gemini(model="models/gemini-2.5-flash")


Indexed 1 document chunks


### Hybrid Retrieval & RAG Pipeline

In [43]:
def build_rag_pipeline(index):
    """Build a simple but effective RAG pipeline with hybrid retrieval and reranking."""

    # Get all nodes from the index's docstore
    nodes = list(index.docstore.docs.values())

    # Determine safe top_k value (number of nodes to retrieve)
    # Must be at least 1 and no more than the number of available nodes
    num_nodes = len(nodes)
    safe_top_k = min(2, max(1, num_nodes))

    print(f"Index contains {num_nodes} nodes, using top_k={safe_top_k}")

    # Create a hybrid retriever combining vector and keyword search
    # First, get the vector retriever (for semantic understanding)
    vector_retriever = index.as_retriever(
        similarity_top_k=safe_top_k  # Retrieve top 3 most similar chunks
    )

    # Next, create a BM25 retriever (for keyword matching)
    # Get all nodes from the index's docstore
    nodes = list(index.docstore.docs.values())
    bm25_retriever = BM25Retriever.from_defaults(
        nodes=nodes,
        similarity_top_k=safe_top_k  # Retrieve top 3 most similar chunks
    )

    # Create a proper hybrid retriever class
    class HybridRetriever(BaseRetriever):
        """Hybrid retriever that combines vector and keyword search results."""

        def __init__(self, vector_retriever, keyword_retriever, top_k=2):
            """Initialize with vector and keyword retrievers."""
            self.vector_retriever = vector_retriever
            self.keyword_retriever = keyword_retriever
            self.top_k = top_k
            super().__init__()

        def _retrieve(self, query_bundle, **kwargs):
            """Retrieve from both retrievers and combine results."""
            # Get results from both retrievers
            vector_nodes = self.vector_retriever.retrieve(query_bundle)
            keyword_nodes = self.keyword_retriever.retrieve(query_bundle)

            # Combine all nodes
            all_nodes = list(vector_nodes) + list(keyword_nodes)

            # Remove duplicates (by node_id)
            unique_nodes = {}
            for node in all_nodes:
                if node.node_id not in unique_nodes:
                    unique_nodes[node.node_id] = node

            # Sort by score (higher is better)
            sorted_nodes = sorted(
                unique_nodes.values(),
                key=lambda x: x.score if hasattr(x, 'score') else 0.0,
                reverse=True
            )

            return sorted_nodes[:self.top_k]  # Return top results

    # Create our hybrid retriever instance
    hybrid_retriever = HybridRetriever(
        vector_retriever=vector_retriever,
        keyword_retriever=bm25_retriever,
        top_k=safe_top_k
    )

    # Create a reranker to prioritize the most relevant chunks
    if num_nodes > 1:
        reranker = SentenceTransformerRerank(
            model="cross-encoder/ms-marco-MiniLM-L-6-v2",
            top_n=min(2, num_nodes)  # Keep only top results after reranking
        )
        node_postprocessors = [reranker]
    else:
        node_postprocessors = []


    #  Build the query engine
    query_engine = RetrieverQueryEngine.from_args(
        retriever=hybrid_retriever,
        llm=llm,
        node_postprocessors=node_postprocessors
    )

    return query_engine

index = process_and_index_pdf(pdf_path)
rag_engine = build_rag_pipeline(index)
response = rag_engine.query("How much does the borrower pay for lender's title insurance?")
print('\nResponse:\n ---------------------- \n')
print(response)


DEBUG:bm25s:Building index from IDs objects


Indexed 1 document chunks
Index contains 1 nodes, using top_k=1

Prompt one answer:
 ---------------------- 

The borrower pays $650.00 for lender's title insurance.
