In [1]:
!pip install pdfplumber pymupdf langchain-openai langchain_community faiss-cpu openai langchain_groq

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.3.27-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting langchain_groq
  Downloading langchain_groq-0.3.5-py3-none-any.whl.metadata (2.6 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from 

In [2]:
# Mount Google Drive inside the Colab runtime (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')
# Make the project folder the working directory; later cells refer to the PDF
# and the output folders by relative path.
%cd /content/drive/MyDrive/Colab Notebooks/AdvanceRAG Problems

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/AdvanceRAG Problems


In [3]:
# Source PDF analysed in this notebook: "lm.php-2-8.pdf"

In [4]:
import os
import pdfplumber
import fitz  # PyMuPDF
import base64

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.chat_history import InMemoryChatMessageHistory # Corrected import path
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_groq import ChatGroq
from langchain.storage import InMemoryStore # Corrected import path
from openai import OpenAI

# Root folder for extracted assets; one sub-folder per asset kind is created
# up front (no error if it already exists).
OUTPUT_DIR = "pdf_assets"
for asset_kind in ("images", "tables"):
    os.makedirs(f"{OUTPUT_DIR}/{asset_kind}", exist_ok=True)

### Define the Document Layout Analysis function
This handles:
1. Detecting one vs two columns
2. Zone segmentation with bounding boxes
3. Reading order: left column first, right column second
4. Image extraction
5. Table extraction

### Document Layout Analysis (DLA)<br>

Document Layout Analysis is the general process of detecting and classifying the structural components of a document (e.g., text blocks, columns, tables, images, headers, footers). It is commonly applied to multi-column scientific papers, newspapers, patents, etc.


In [5]:
def _extract_text_chunks(pdf, two_column_min_width):
    """Return text chunks for every page of an open pdfplumber PDF, in reading order."""
    chunks = []
    for page_num, page in enumerate(pdf.pages, start=1):
        width, height = page.width, page.height

        if width > two_column_min_width:
            # NOTE(review): this is a width-only heuristic. Any page wider than the
            # threshold is cut at its vertical midline, so full-width content on such
            # a page (titles, single-column text) is split mid-line. Confirm the
            # threshold suits the input document.
            regions = [
                page.within_bbox((0, 0, width / 2, height)),      # left column first
                page.within_bbox((width / 2, 0, width, height)),  # then right column
            ]
        else:
            regions = [page]

        for region in regions:
            text = region.extract_text() or ""
            if text.strip():
                chunks.append({
                    "type": "text",
                    "content": text,
                    "page_number": page_num
                })
    return chunks


def _extract_image_chunks(file_path):
    """Save every embedded image under OUTPUT_DIR/images and return one chunk per image."""
    chunks = []
    doc = fitz.open(file_path)
    try:
        for page_num, page in enumerate(doc, start=1):
            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]
                base_image = doc.extract_image(xref)
                if not base_image:
                    # Nothing extractable for this xref; skip instead of failing on a key lookup.
                    continue
                image_filename = (
                    f"{OUTPUT_DIR}/images/page{page_num}_img{img_index}.{base_image['ext']}"
                )
                with open(image_filename, "wb") as img_file:
                    img_file.write(base_image["image"])
                chunks.append({
                    "type": "image",
                    "content": f"[[IMAGE: {image_filename}]]",
                    "page_number": page_num,
                    "file": image_filename
                })
    finally:
        # Release the PyMuPDF document handle even if extraction fails part-way.
        doc.close()
    return chunks


def _extract_table_chunks(pdf):
    """Save every detected table as pipe-separated text and return one chunk per table."""
    chunks = []
    for page_num, page in enumerate(pdf.pages, start=1):
        for table_index, table in enumerate(page.extract_tables(), start=1):
            # Empty/None cells become empty strings so the join never sees None.
            table_text = "\n".join(
                " | ".join(cell if cell else "" for cell in row)
                for row in table
            )
            table_filename = f"{OUTPUT_DIR}/tables/page{page_num}_table{table_index}.txt"
            with open(table_filename, "w") as f:
                f.write(table_text)
            chunks.append({
                "type": "table",
                "content": f"[[TABLE: {table_filename}]]",
                "page_number": page_num,
                "file": table_filename
            })
    return chunks


def process_pdf_layout(file_path, two_column_min_width=500):
    """
    Extract text, images and tables from a PDF as a flat list of chunk dicts.

    Args:
        file_path: Path of the PDF to process.
        two_column_min_width: Pages wider than this (in PDF units) are read as two
            columns, left half first and right half second. Defaults to 500.

    Returns:
        A list of dicts ordered as: all text chunks, then all image chunks, then
        all table chunks. Every chunk has "type", "content" and "page_number";
        image and table chunks also carry "file", the path the asset was saved to
        under OUTPUT_DIR.

    Side effects:
        Writes image files to OUTPUT_DIR/images and table text files to
        OUTPUT_DIR/tables (both folders are created if missing).
    """
    os.makedirs(f"{OUTPUT_DIR}/images", exist_ok=True)
    os.makedirs(f"{OUTPUT_DIR}/tables", exist_ok=True)

    # One pdfplumber pass serves both text and table extraction.
    with pdfplumber.open(file_path) as pdf:
        text_chunks = _extract_text_chunks(pdf, two_column_min_width)
        table_chunks = _extract_table_chunks(pdf)

    image_chunks = _extract_image_chunks(file_path)

    return text_chunks + image_chunks + table_chunks

In [11]:
# Run the layout analysis on the source PDF and peek at the result
pdf_text_chunks = process_pdf_layout("lm.php-2-8.pdf")

raw_chunk_count = len(pdf_text_chunks)
print("Number of raw chunks:", raw_chunk_count)

# Only the first 300 characters of the first chunk are shown
first_chunk_preview = pdf_text_chunks[0]['content'][:300]
print("Example chunk:\n", first_chunk_preview)

Number of raw chunks: 66
Example chunk:
 Published in Journal of Optics A: Pure an
which should be used for
Comparing glass and
microlenses fabricate
technologies
HOttevaere1,RCox2,HPHerzig3,TMiyashita4,
andHThienpont1
1DepartmentofAppliedPhysicsandPhotonics(TONA-IR),VrijeUni
2MicroFabTechnologies,Inc.,1104SummitAvenue,Suite110,Plano
3Inst


In [10]:
# Inspect which keys the first extracted chunk carries
pdf_text_chunks[0].keys()

dict_keys(['type', 'content', 'page_number'])

In [12]:
# Break long text chunks into ~1000-character pieces with a 100-character overlap
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

split_chunks = []

for raw_chunk in pdf_text_chunks:
    if raw_chunk["type"] != "text":
        # Image and table chunks are passed through unchanged
        split_chunks.append(raw_chunk)
        continue

    # Every piece inherits the page number of the chunk it was cut from
    split_chunks.extend(
        {
            "type": "text",
            "content": piece,
            "page_number": raw_chunk["page_number"]
        }
        for piece in splitter.split_text(raw_chunk["content"])
    )

print("Number of split chunks:", len(split_chunks))

Number of split chunks: 87


In [13]:
# Read the OpenAI key from the Colab secret named 'OPENAI_API_KEY_' and export it
# as an environment variable instead of hard-coding it in the notebook.
from google.colab import userdata
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY_')
# Presumably the client picks the key up from the environment variable set above.
client = OpenAI()

In [14]:
def describe_image(image_path):
    """
    Generate a text description for an image using OpenAI Vision.

    Args:
        image_path: Path of the image file to describe.

    Returns:
        The description text from the first choice of the chat completion.
    """
    # Local import keeps this cell self-contained.
    import mimetypes

    # The data URL must declare the real image format: extracted images are saved
    # with whatever extension the PDF provided (not always PNG). Fall back to PNG
    # when the extension is unknown or not an image type.
    mime_type = mimetypes.guess_type(image_path)[0]
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/png"

    with open(image_path, "rb") as img_file:
        img_b64 = base64.b64encode(img_file.read()).decode()

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this image for an NLP research context."},
                    {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{img_b64}"}}
                ]
            }
        ]
    )
    return response.choices[0].message.content

In [15]:
# Turn every chunk into a LangChain Document: text is used as-is, images are
# replaced by a generated description, tables by the saved table text.
documents = []

for chunk in split_chunks:
    kind = chunk["type"]
    page_no = chunk["page_number"]

    if kind == "image":
        text = f"Image (page {page_no}): {describe_image(chunk['file'])}"
    elif kind == "table":
        with open(chunk["file"], "r") as table_file:
            text = f"Table (page {page_no}):\n{table_file.read()}"
    else:
        text = chunk["content"]

    documents.append(
        Document(page_content=text, metadata={"page_number": page_no, "type": kind})
    )

print("Number of final documents with metadata:", len(documents))

Number of final documents with metadata: 87


In [16]:
# Embed every document with the all-MiniLM-L6-v2 model and index the vectors in FAISS
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(documents=documents, embedding=embeddings)

print("FAISS vector store created.")

  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

FAISS vector store created.


In [21]:
import polars as pl
import numpy as np
# Flatten each Document into a plain record (content + the two metadata fields)
data_for_polars = [
    {
        "page_content": doc.page_content,
        "page_number": doc.metadata.get("page_number"),
        "type": doc.metadata.get("type")
    }
    for doc in documents
]

# Build the Polars DataFrame from the records in one go
df = pl.DataFrame(data_for_polars)

print("\nPolars DataFrame created from document content:")
df.head()


Polars DataFrame created from document content:


page_content,page_number,type
str,i64,str
"""Published in Journal of Optics…",1,"""text"""
"""microlenses,therangeofmaterial…",1,"""text"""
"""Applied Optics 8, issue 7, 407…",1,"""text"""
"""Makingthesephotonicsystemsfast…",1,"""text"""
"""DDrriivveerr TThheerrmmaallrre…",2,"""text"""


In [27]:
# Count how many documents (chunks) came from each page
df['page_number'].value_counts()

page_number,count
i64,u32
6,21
5,15
2,10
1,4
3,7
4,7
7,23


In [28]:
# Retriever over the FAISS store using maximal-marginal-relevance search;
# lambda_mult is forwarded to the search call.
mmr_search_kwargs = {"lambda_mult": 0.5}
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs=mmr_search_kwargs)

print("Retriever with MMR ready.")

Retriever with MMR ready.


In [33]:
def render_context_with_metadata(docs):
    """
    Render retrieved documents as one context string for the prompt.

    Each document becomes a block headed by a 1-based chunk ID, its page number
    ("Unknown" if missing) and its type ("text" if missing), followed by the
    stripped page content. Blocks are joined with a newline.
    """
    def _render_one(chunk_id, doc):
        meta = doc.metadata
        page = meta.get("page_number", "Unknown")
        kind = meta.get("type", "text")
        body = doc.page_content.strip()
        return f"---\n[Chunk ID: {chunk_id}] (Page: {page}, Type: {kind})\n{body}\n"

    return "\n".join(
        _render_one(position, doc) for position, doc in enumerate(docs, start=1)
    )

In [39]:
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.runnables import RunnableLambda

# Prompt layout: system instructions, then prior conversation turns (if any),
# then the retrieved context and the current question.
prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(
        "You are a helpful research assistant. "
        "Use the provided context to answer the question accurately. "
        "Also mention the chunk IDs and page numbers of the sources you used."
    ),
    # The memory wrapper injects past messages under the "history" key; without
    # this placeholder they would be stored but never shown to the model.
    MessagesPlaceholder(variable_name="history", optional=True),
    HumanMessagePromptTemplate.from_template(
        "Context:\n{context}\n\nQuestion: {question}"
    )
])

# Retrieve documents for the question and render them with chunk IDs / page numbers.
wrapped_retriever = RunnableLambda(
    lambda x: render_context_with_metadata(
        retriever.invoke(x["question"])
    )
)

llm = ChatGroq(
    model="llama3-70b-8192",
    api_key=userdata.get('GROQ_API_KEY')
)

rag_chain = (
    {
        "context": wrapped_retriever,
        "question": lambda x: x["question"],
        # Forward the chat history to the prompt; defaults to an empty list so the
        # chain still works when invoked directly without the memory wrapper.
        "history": lambda x: x.get("history", []),
    }
    | prompt
    | llm
    | StrOutputParser()
)

print("RAG chain is ready.")

RAG chain is ready.


In [40]:
from langchain_core.runnables.history import RunnableWithMessageHistory
from uuid import uuid4

# One in-memory history object per session, keyed by session ID
session_histories = {}

# Random ID identifying this notebook's conversation
session_id = str(uuid4())

def get_message_history(session_id):
    """Return the chat history for `session_id`, creating an empty one on first use."""
    try:
        return session_histories[session_id]
    except KeyError:
        history = InMemoryChatMessageHistory()
        session_histories[session_id] = history
        return history

In [41]:
# Wrap the RAG chain with per-session chat memory: the "question" input is recorded
# as the human turn, and stored turns are passed to the chain input under "history".
rag_chain_with_memory = RunnableWithMessageHistory(
    rag_chain,
    get_message_history,
    input_messages_key="question",
    history_messages_key="history"
)
# The memory backend is the per-session InMemoryChatMessageHistory returned by
# get_message_history, not an InMemoryStore, so the message says so.
print("In-memory chat message history initialized for managing chat memory.")

InMemoryStore initialized for managing chat memory.


In [42]:
# First question, asked within the current session
first_query = {"question": "Explain Architecture of Planar GRIN microlenses?"}
first_config = {"configurable": {"session_id": session_id}}
response1 = rag_chain_with_memory.invoke(first_query, config=first_config)
print("\nAnswer 1:\n", response1)


Answer 1:
 Based on the provided context, the architecture of Planar GRIN microlenses can be explained as follows:

The fabrication process of refractive reflow photoresist microlenses is described in Figure 5 (Chunk ID: 4, Page: 6). The process involves three steps:

1. Patterning of the photoresist layer (Figure 5(a)) [Chunk ID: 4, Page: 6]
2. Development of the exposed regions (Figure 5(b)) [Chunk ID: 4, Page: 6]
3. Melting of the cylindrical islands (Figure 5(c)) [Chunk ID: 4, Page: 6]

This process results in the formation of planar microlenses with a diameter of 240 µm, as shown in Figure 4(a) (Chunk ID: 4, Page: 6).

The optical characteristics of Planar GRIN microlenses are summarized in Table 3 (Chunk ID: 4, Page: 6). These lenses have good imaging properties, making them suitable for various applications, such as photocopiers, facsimile systems, bar-code readers, and optical communication systems in combination with optical fibers for light coupling purposes.

Note that the 

In [44]:
# Second question, same session so it shares the stored history
second_query = {"question": "Flow chart of a microlens fabrication process."}
second_config = {"configurable": {"session_id": session_id}}
response2 = rag_chain_with_memory.invoke(second_query, config=second_config)
print("\nAnswer 2:\n", response2)


Answer 2:
 According to Chunk ID: 3 (Page: 3, Type: text), Figure 2 shows the generic flow chart of a microlens fabrication process. It displays the logical sequence of the different steps to be taken from the underlying physics.


In [45]:
# Third question, same session
third_query = {
    "question": "Explain plastics are polycarbonate (PC) and poly(methyl methacrylate)(PMMA), and what is LIGA processing"
}
third_config = {"configurable": {"session_id": session_id}}
response3 = rag_chain_with_memory.invoke(third_query, config=third_config)
print("\nAnswer 3:\n", response3)


Answer 3:
 Based on the provided context, I can answer the question as follows:

Plastics, particularly polycarbonate (PC) and poly(methyl methacrylate) (PMMA), are commonly used in optical components due to their advantages over glass materials (Chunk ID: 1, Page: 2). These advantages include lower weight, lower cost, and easier manufacturing processes. However, they also have some disadvantages, such as low mechanical strength, lower chemical and moisture resistance, larger thermal expansion coefficient α, and larger dispersion (smaller Abbe number).

Regarding LIGA processing, it is not explicitly mentioned in the provided context. However, based on the image (Chunk ID: 3, Page: 5), it appears to show components that may be used in engineering or manufacturing contexts, which could be related to LIGA processing. LIGA is a German acronym that stands for Lithographie, Galvanoformung, Abformung, which translates to Lithography, Electroplating, and Molding. It is a microfabrication tec

In [46]:
# Fourth question, same session
fourth_query = {
    "question": "Explain The different fabrication process steps for refractive reflow photoresist microlenses, and is it related to Thermal reflow or resist melting method"
}
fourth_config = {"configurable": {"session_id": session_id}}
response4 = rag_chain_with_memory.invoke(fourth_query, config=fourth_config)
print("\nAnswer 4:\n", response4)


Answer 4:
 According to Chunk ID: 3 (Page: 6), the fabrication process steps for refractive reflow photoresist microlenses involve the following steps:

1. Heating the photoresist above the glass transition temperature.
2. The shape of the photoresist cylinders changes due to surface tension, resulting in hemispherical microlenses (see figure 5).
3. This technology has also been extended to yield arrays of cylindrical lenses and lenses with a square footprint [33].

This process is related to the thermal reflow method, where the photoresist is heated above its glass transition temperature, causing it to flow and change shape due to surface tension. This results in the formation of hemispherical microlenses.

It's worth noting that the order of discussion in the paper is in accordance with their historical development, except for reactive ion etching of thermal reflow lenses, which is discussed in a separate section (Chunk ID: 1, Page: 3).


In [47]:
# Print every stored turn of the current session, prefixed by its role
print("\n--- Memory turns stored ---\n")
stored_messages = session_histories[session_id].messages
for message in stored_messages:
    print(f"{message.type.upper()}: {message.content}\n")


--- Memory turns stored ---

HUMAN: Explain Architecture of Planar GRIN microlenses?

AI: Based on the provided context, the architecture of Planar GRIN microlenses can be explained as follows:

The fabrication process of refractive reflow photoresist microlenses is described in Figure 5 (Chunk ID: 4, Page: 6). The process involves three steps:

1. Patterning of the photoresist layer (Figure 5(a)) [Chunk ID: 4, Page: 6]
2. Development of the exposed regions (Figure 5(b)) [Chunk ID: 4, Page: 6]
3. Melting of the cylindrical islands (Figure 5(c)) [Chunk ID: 4, Page: 6]

This process results in the formation of planar microlenses with a diameter of 240 µm, as shown in Figure 4(a) (Chunk ID: 4, Page: 6).

The optical characteristics of Planar GRIN microlenses are summarized in Table 3 (Chunk ID: 4, Page: 6). These lenses have good imaging properties, making them suitable for various applications, such as photocopiers, facsimile systems, bar-code readers, and optical communication systems 

In [48]:
# Empty the stored messages of the current session (the session entry itself stays)
has_session = session_id in session_histories
if not has_session:
    print(f"No memory found for session {session_id}.")
else:
    session_histories[session_id].messages.clear()
    print(f"Memory for session {session_id} has been cleared.")

# Verify that the memory is empty
print("\n--- Memory turns after clearing ---\n")
if not has_session:
    print(f"Session {session_id} no longer in history store after clearing.")
else:
    for message in session_histories[session_id].messages:
        print(f"{message.type.upper()}: {message.content}\n")

Memory for session e89f0cc8-22e9-412f-a694-a96fc0911e46 has been cleared.

--- Memory turns after clearing ---

