# Assignment - RAG with Semantic Chunking

In [3]:
import os
from langchain import hub
from dotenv import load_dotenv
from langchain_groq.chat_models import ChatGroq
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders.pdf import UnstructuredPDFLoader

In [4]:
load_dotenv()

# print(os.getenv("OPENAI_API_KEY"))
os.environ["OPENAI_API_KEY"]=os.getenv("OPENAI_API_KEY")
os.environ["GROQ_API_KEY"]=os.getenv("GROQ_API_KEY")
os.environ["LANGCHAIN_PROJECT"]=os.getenv("LANGCHAIN_PROJECT")
os.environ["LANGCHAIN_API_KEY"]=os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_TRACING_V2"]="true"

In [17]:
# file_name = r"C:\Users\deepak.a.dhiman\projects\agentic_ai\11-5-2025\data\Attention_Paper.pdf"
file_name = r"C:\Users\deepak.a.dhiman\projects\agentic_ai\11-5-2025\data\BERT_Paper.pdf"
# image_dir=r"C:\Users\deepak.a.dhiman\projects\agentic_ai\11-5-2025\3-vectoredb\Assignment\extracted_images"

## using Langchain

In [55]:
loader = UnstructuredPDFLoader(
    file_path=file_name,
    strategy="hi_res",
    mode="elements"
    # partition_via_api=True,
    # cordinates=True
    )

In [None]:
# getting docs from loader above

docs = []
for doc in loader.lazy_load():
    docs.append(doc)

In [None]:
print("Number of LangChain documents:", len(docs))
print("Length of text in the document:", len(docs[10].page_content))

Number of LangChain documents: 218
Length of text in the document: 5


In [None]:
for doc in docs[:20]:
    print(doc.metadata)
    print(doc.page_content)
    print("==================")

{'source': 'C:\\Users\\deepak.a.dhiman\\projects\\agentic_ai\\11-5-2025\\data\\Attention_Paper.pdf', 'coordinates': {'points': ((45.388888888888886, 594.2222222222224), (45.388888888888886, 622.0000000000002), (100.94444444444446, 622.0000000000002), (100.94444444444446, 594.2222222222224)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2025-06-10T10:11:36', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'file_directory': 'C:\\Users\\deepak.a.dhiman\\projects\\agentic_ai\\11-5-2025\\data', 'filename': 'Attention_Paper.pdf', 'category': 'UncategorizedText', 'element_id': '0359882270ebdc4bd04d5ad335d61ec2'}
3
{'source': 'C:\\Users\\deepak.a.dhiman\\projects\\agentic_ai\\11-5-2025\\data\\Attention_Paper.pdf', 'coordinates': {'points': ((51.0, 599.0), (51.0, 703.0), (88.0, 703.0), (88.0, 599.0)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2025-06-10T10:11:36', 'filetype': 'applica

## using Unstructured directly to parse documents

In [15]:
from unstructured.partition.pdf import partition_pdf

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
raw_pdf_elements = partition_pdf(
    filename=file_name,
    strategy="hi_res",
    extract_images_in_pdf=True,
    extract_image_block_types=["Image","Tabel"],
    extract_image_block_to_payload=False,
    extract_image_block_output_dir="extracted_data_2"
)

The requested type (Tabel) doesn't match any available type


In [19]:
print(len(raw_pdf_elements))

raw_pdf_elements[:15]

300


[<unstructured.documents.elements.Text at 0x2c469993690>,
 <unstructured.documents.elements.Text at 0x2c46a150550>,
 <unstructured.documents.elements.Text at 0x2c460997f90>,
 <unstructured.documents.elements.Text at 0x2c468164f50>,
 <unstructured.documents.elements.Text at 0x2c4699a6910>,
 <unstructured.documents.elements.Header at 0x2c40022e990>,
 <unstructured.documents.elements.Title at 0x2c469f53fd0>,
 <unstructured.documents.elements.Title at 0x2c468403490>,
 <unstructured.documents.elements.Title at 0x2c469f53610>,
 <unstructured.documents.elements.Title at 0x2c400247150>,
 <unstructured.documents.elements.Title at 0x2c469f535d0>,
 <unstructured.documents.elements.Title at 0x2c46a424450>,
 <unstructured.documents.elements.Title at 0x2c46827bc50>,
 <unstructured.documents.elements.NarrativeText at 0x2c40022f550>,
 <unstructured.documents.elements.Title at 0x2c46a37bfd0>]

In [20]:
# Saving the components

Header=[]
Footer=[]
Title=[]
NarrativeText=[]
Text=[]
ListItem=[]
Table = []


for element in raw_pdf_elements:
    if "unstructured.documents.elements.Header" in str(type(element)):
        Header.append(str(element))
    elif "unstructured.documents.elements.Footer" in str(type(element)):
        Footer.append(str(element))
    elif "unstructured.documents.elements.Title" in str(type(element)):
        Title.append(str(element))
    elif "unstructured.documents.elements.NarrativeText" in str(type(element)):
        NarrativeText.append(str(element))
    elif "unstructured.documents.elements.Text" in str(type(element)):
        Text.append(str(element))
    elif "unstructured.documents.elements.ListItem" in str(type(element)):
        ListItem.append(str(element))
    elif "unstructured.documents.elements.Table" in str(type(element)):
        Table.append(str(element))

    

In [34]:
print(len(Table))
print(Table[1])

9
MNLI-(m/mm) QQP QNLI SST-2 CoLA STS-B MRPC RTE Average 392k 363k 108k 67k 8.5k 5.7k 3.5k 2.5k - 80.6/80.1 66.1 82.3 93.2 35.0 81.0 86.0 61.7 74.0 76.4/76.1 64.8 79.8 90.4 36.0 73.3 84.9 56.8 71.0 82.1/81.4 70.3 87.4 91.3 45.4 80.0 82.3 56.0 75.1 84.6/83.4 71.2 90.5 93.5 52.1 85.8 88.9 66.4 79.6 86.7/85.9 72.1 92.7 94.9 60.5 86.5 89.3 70.1 82.1


In [22]:
#image text
img=[]
for element in raw_pdf_elements:
    if "unstructured.documents.elements.Image" in str(type(element)):
        img.append(str(element))

In [23]:
img[0]

'Masked Sentence A Masked Sentence B Unlabeled Sentence A and B Pair Pre-training Starv/End cer Question Paragraph t Question Answer Pair KAA / Fine-Tuning'

## Creating the summaries of table and image data

In [24]:
from pydantic import Field, BaseModel
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser

In [25]:
# llm = ChatOpenAI(name="gpt-4", temperature=0)
llm = ChatGroq(model="gemma2-9b-it", max_tokens=1000, temperature=0.1)

In [26]:
prompt_text = """You are an assistant tasked with summarizing tables for retrieval. 
These summaries will be embedded and used to retrieve the raw table elements. 
Give a concise summary of the table that is well optimized for retrieval. {format_instructions}. Table {element} """

In [27]:
# JSON output format

class Format(BaseModel):
    table_summary: str = Field(description="table summary generated by llm")

In [28]:
json_parser = JsonOutputParser(pydantic_object=Format)

In [29]:
prompt = ChatPromptTemplate.from_template(prompt_text, 
                                          partial_variables={"format_instructions":json_parser.get_format_instructions()})

prompt

ChatPromptTemplate(input_variables=['element'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['element'], input_types={}, partial_variables={'format_instructions': 'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"properties": {"table_summary": {"description": "table summary generated by llm", "title": "Table Summary", "type": "string"}}, "required": ["table_summary"]}\n```'}, template='You are an assistant tasked with summarizing tables for retrieval. \nThese summaries will be embedded and used to retrieve the raw 

In [30]:
# summarize_chain = {"element" : lambda x:x} | prompt|llm|StrOutputParser()
# summarize_chain =  prompt|llm|StrOutputParser()
summarize_chain =  prompt|llm|JsonOutputParser()

In [31]:
table_summaries = []

In [35]:
# table_summaries = summarize_chain.invoke(Table[0])
table_summaries = summarize_chain.invoke({"element":Table})

In [48]:
Table

['Input {cs} my dog is cute [SEP] he | likes play ##ing | [SEP] Token Embeddings Eris) En Ek0g ES cute E sep) Exe Elikes Enay Ex sing E sep) + + + + + + + + + + + Segment Embeddings E, Ey E. E, E, E, E, E. EB E, E. + + + + + + + + + + + Position Embeddings E, E E, E, E, E. E. E, E, E, Exo',
 'MNLI-(m/mm) QQP QNLI SST-2 CoLA STS-B MRPC RTE Average 392k 363k 108k 67k 8.5k 5.7k 3.5k 2.5k - 80.6/80.1 66.1 82.3 93.2 35.0 81.0 86.0 61.7 74.0 76.4/76.1 64.8 79.8 90.4 36.0 73.3 84.9 56.8 71.0 82.1/81.4 70.3 87.4 91.3 45.4 80.0 82.3 56.0 75.1 84.6/83.4 71.2 90.5 93.5 52.1 85.8 88.9 66.4 79.6 86.7/85.9 72.1 92.7 94.9 60.5 86.5 89.3 70.1 82.1',
 'System Dev Test EM F1 EM F1 Top Leaderboard Systems (Dec 10th, 2018) Human - - 82.3 91.2 #1 Ensemble - nlnet - - 86.0 91.7 #2 Ensemble - QANet - - 84.5 90.5 Published BiDAF+ELMo (Single) - 85.6 - 85.8 R.M. Reader (Ensemble) 81.2 87.9 82.3 88.5 Ours BERTBASE (Single) 80.8 88.5 - - BERTLARGE (Single) 84.1 90.9 - - BERTLARGE (Ensemble) 85.8 91.8 - - BERTLAR

In [37]:
# print(table_summaries)
table_summaries

{'table_summary': 'The table presents results from various natural language understanding (NLU) tasks and models. It includes performance metrics like EM (Exact Match) and F1 score on datasets like MNLI, QNLI, MRPC, SST-2, CoLA, STS-B, RTE, and SQuAD.  The table also compares different BERT variants (BERTBASE, BERTLARGE), fine-tuning approaches, masking rates, and feature-based methods.'}

In [50]:
Table[0]

'Input {cs} my dog is cute [SEP] he | likes play ##ing | [SEP] Token Embeddings Eris) En Ek0g ES cute E sep) Exe Elikes Enay Ex sing E sep) + + + + + + + + + + + Segment Embeddings E, Ey E. E, E, E, E, E. EB E, E. + + + + + + + + + + + Position Embeddings E, E E, E, E, E. E. E, E, E, Exo'

In [41]:
type(Table[0])

str

In [None]:
print("helloh")

: 