<a href="https://colab.research.google.com/github/dishitasood/workflow/blob/master/build_rag_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Section 1: Loading PDF

In [1]:
# install necessary libraries
!pip install -q llama-index llama-index-llms-gemini pymupdf
!pip install -q llama-index-embeddings-huggingface
!pip install nest_asyncio
!pip install --upgrade transformers
!pip install -U sentence_transformers



In [3]:
import os
import fitz  # PyMuPDF
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Markdown, display
import nest_asyncio

In [47]:
import os
GOOGLE_API_KEY = "AIzaSyC8DoAne5KteQkeWFOMUGFvmFZTvwbyah4"
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [5]:
nest_asyncio.apply()

In [6]:
!mkdir -p sample_docs

In [7]:
from google.colab import files
import os

def upload_pdf():
    """Upload a PDF file and return its path."""
    print("Please select a PDF file to upload:")
    uploaded = files.upload()

    for filename in uploaded.keys():
        if filename.endswith('.pdf'):
            # Save to the sample_docs directory
            pdf_path = os.path.join("sample_docs", filename)

            # Create directory if it doesn't exist
            os.makedirs("sample_docs", exist_ok=True)

            # Save the file
            with open(pdf_path, 'wb') as f:
                f.write(uploaded[filename])

            print(f"PDF saved to {pdf_path}")
            return pdf_path
        else:
            print(f"File {filename} is not a PDF. Please upload a PDF file.")

    return None


In [28]:
pdf_path = upload_pdf()

Please select a PDF file to upload:


Saving LenderFeesWorksheetNew.pdf to LenderFeesWorksheetNew (1).pdf
PDF saved to sample_docs/LenderFeesWorksheetNew (1).pdf


In [29]:
def extract_text_from_pdf(pdf_path):

  doc = fitz.open(pdf_path)

  # extract texts from all pages
  text = "\n".join([page.get_text() for page in doc])

  print("PDF: ", {pdf_path})
  print("Number of pages: ", len(doc))
  print(f"Extracted {len(text.split())} from the pdf")

  doc.close()

  return text

In [30]:
if pdf_path:
  text = extract_text_from_pdf(pdf_path)
  print(text[:500])

PDF:  {'sample_docs/LenderFeesWorksheetNew (1).pdf'}
Number of pages:  1
Extracted 404 from the pdf
Your actual rate, payment, and cost could be higher. Get an official Loan Estimate before choosing a loan.
Fee Details and Summary
Applicants:
Application No:
Date Prepared:
Loan Program:
Prepared By:
THIS IS NOT A GOOD FAITH ESTIMATE (GFE). This "Fees Worksheet" is provided for informational purposes ONLY, to assist
you in determining an estimate of cash that may be required to close and an estimate of your proposed monthly mortgage 
payment. Actual charges may be more or less, and your transac


### Integrating PyMuPDF with LlamaIndex

In [31]:
from llama_index.core import Document
from typing import List

def load_pdf_with_pymupdf(pdf_path: str) -> List[Document]:

  # open the pdf
  doc = fitz.open(pdf_path)

  documents = []

  for i, page in enumerate(doc):
    text = page.get_text()

    if not text.strip():
      continue

    documents.append(
        Document(
            text=text,
            metadata={
                "file_name": os.path.basename(pdf_path),
                "page_number": i + 1,
                "total_pages": len(doc)
            }
        )
    )

  doc.close()

  print(f"Processed {pdf_path}:")
  print(f"Extracted {len(documents)} pages with content")

  return documents



In [32]:
# example usage
pdf_docs = load_pdf_with_pymupdf(pdf_path)

Processed sample_docs/LenderFeesWorksheetNew (1).pdf:
Extracted 1 pages with content


In [52]:
import os
from google.colab import userdata # Import userdata
GOOGLE_API_KEY = userdata.get('gemini_key')
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

### Section 2: Idexing and Processing PDFs

To add your API key to Colab secrets:

1. Click on the "🔑" icon in the left sidebar of your Colab notebook.
2. Click on "Add new secret".
3. In the "Name" field, enter a name for your secret (e.g., `GOOGLE_API_KEY`).
4. In the "Value" field, paste your API key.
5. Make sure the "Notebook access" toggle is turned on for the current notebook.
6. Click "Done".

Now you can access your API key in your code using `userdata.get('YOUR_SECRET_NAME')`, replacing `YOUR_SECRET_NAME` with the name you gave your secret.

In [54]:
from llama_index.llms.gemini import Gemini
from llama_index.core import Settings
from llama_index.core import VectorStoreIndex
from sentence_transformers import SentenceTransformer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding # Import HuggingFaceEmbedding


# #initalize gemini llm
# llm = Gemini("models/gemini-1.5-flash")
# Settings.llm = llm

#initialize embedding model, sentence transformer
embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = HuggingFaceEmbedding(model_name=embed_model_name) # Wrap SentenceTransformer model
Settings.embed_model = embed_model

def process_index_pdf(pdf_path):

  #load documents
  documents = load_pdf_with_pymupdf(pdf_path)

  #create vector index
  index = VectorStoreIndex.from_documents(documents)

  print(f"Indexed {len(documents)} document chunks")

  return index

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [24]:
# !pip install -q llama-index-llms-google-genai

In [55]:
if pdf_path:
  index = process_index_pdf(pdf_path)
else:
  print("No PDF file uploaded. Please upload a PDF file using the previous cell.")

Processed sample_docs/LenderFeesWorksheetNew (1).pdf:
Extracted 1 pages with content
Indexed 1 document chunks


### Section 3: Implementing Query Expansion and Rewriting

In [67]:
from llama_index.llms.google_genai import GoogleGenAI
from llama_index.core import Settings

# Initialize Gemini LLM
llm = GoogleGenAI(
    model="gemini-2.5-flash"
)
Settings.llm = llm