## RAG with PDF Data Extraction | Provide Context to LLM

### Load the ENV file

In [1]:
from dotenv import load_dotenv

# First load the default .env and let its values replace variables that are
# already set; then read the parent directory's .env without overriding, so it
# only contributes variables that are still missing.
load_dotenv(override=True)
load_dotenv(dotenv_path='./../.env')

True

### Load the LLM

In [2]:
from langchain_ollama import OllamaLLM

# Text-generation client for the local Ollama server.
# `num_predict` is the OllamaLLM option that caps the number of generated
# tokens. `max_tokens` is not one of its fields, so the intended 300-token limit
# did not reach Ollama under that name.
# NOTE(review): qwen3 emits its <think> section as part of the generated text,
# so that reasoning presumably counts toward this budget - raise it if answers
# get cut off.
llm = OllamaLLM(
    base_url="http://localhost:11434",
    model="qwen3:latest",
    temperature=0.5,
    num_predict=300,
)

In [3]:
#%pip install -U langchain-community pypdf

### 1. Extracting the PDF Files

In [4]:
from langchain_community.document_loaders import PyPDFLoader

# Letters to index.
pdf1 = "./Dhanesh_EP_Letter_Nityo.pdf"
pdf2 = "./Parveen_EP_Letter.pdf"
pdf3 = "./Dhanesh_EP_Letter_NTT.pdf"
pdfFiles = [pdf1, pdf2, pdf3]

# Flat list of loaded pages across all PDFs.
#   extend(): one entry per page, so documents[0] is the 1st page of the 1st PDF.
#   append(): would instead nest one list per PDF, so documents[0] would be the
#             whole 1st PDF, documents[1] the 2nd, and so on.
documents = []
for pdf_path in pdfFiles:
    pages = PyPDFLoader(pdf_path).load()
    documents.extend(pages)


### 2. Text Splitting

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Cut every loaded page into overlapping chunks. add_start_index=True stores a
# `start_index` entry in each chunk's metadata (visible in the search output
# further down).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=150,
    add_start_index=True,
)
all_splits = text_splitter.split_documents(documents)

# Report how many chunks were produced.
chunk_count = len(all_splits)
print(chunk_count)

120


### 3. Embedding the Splits into Vectors

In [6]:
from langchain_ollama import OllamaEmbeddings

# Embedding client backed by the same local Ollama server as the LLM.
# NOTE(review): this reuses the chat model for embeddings; a dedicated embedding
# model may retrieve more relevant chunks - confirm before changing, since a
# different model would require rebuilding ./chroma_db.
embeddings = OllamaEmbeddings(
    base_url="http://localhost:11434",
    model="qwen3:latest",
    temperature=0.5,
)

# Show the resolved configuration.
embeddings

OllamaEmbeddings(model='qwen3:latest', base_url='http://localhost:11434', client_kwargs={}, async_client_kwargs={}, sync_client_kwargs={}, mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, keep_alive=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=0.5, stop=None, tfs_z=None, top_k=None, top_p=None)

### 4. Vector Stores

In [7]:
#%pip install -qU langchain-chroma

In [8]:
from langchain_chroma import Chroma

import hashlib

# Give every chunk a deterministic id derived from its origin and content.
# Without explicit ids each run of this cell stores all chunks again under fresh
# random ids, so the persistent collection grows with duplicates on every
# re-run. With stable ids a re-run overwrites the existing entries instead.
# Keying a dict by id also drops exact duplicate chunks within this batch,
# which would otherwise be rejected as duplicate ids.
# Entries written by earlier runs (random ids) are not removed; delete
# ./chroma_db once to clear them.
splits_by_id = {}
for split in all_splits:
    fingerprint = "|".join(
        [
            str(split.metadata.get("source")),
            str(split.metadata.get("page")),
            str(split.metadata.get("start_index")),
            split.page_content,
        ]
    )
    splits_by_id[hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()] = split

vector_store = Chroma.from_documents(
    documents=list(splits_by_id.values()),
    ids=list(splits_by_id.keys()),
    persist_directory="./chroma_db",  # Where to save data locally
    embedding=embeddings,             # Embeddings created in step 3
)

### 5. Retrieve from the Persistent Vector DataStore

In [9]:
# Re-open the collection persisted under ./chroma_db rather than rebuilding it.
vector_store = Chroma(
    embedding_function=embeddings,
    persist_directory='./chroma_db',
)

# Fetch the five closest chunks and print the PDF each one came from.
result = vector_store.similarity_search(query="Full name of Parveen", k=5)
for match in result:
    print(match.metadata["source"])


./Dhanesh_EP_Letter_Nityo.pdf
./Parveen_EP_Letter.pdf
./Dhanesh_EP_Letter_NTT.pdf
./Parveen_EP_Letter.pdf
./Dhanesh_EP_Letter_Nityo.pdf


In [10]:
# Same kind of lookup, but each hit comes back as a (Document, score) pair.
result = vector_store.similarity_search_with_score(
    query="Full name of Parveen?",
    k=4,
)

result

[(Document(id='cdc1f0df-cdee-470d-8bf6-22f4274d45c7', metadata={'creationdate': '2021-06-25T06:00:00+00:00', 'source': './Parveen_EP_Letter.pdf', 'total_pages': 8, 'title': 'MS Employer IPA Letter', 'start_index': 1070, 'page': 1, 'moddate': '2023-12-06T23:11:54+08:00', 'page_label': '2', 'creator': 'Microsoft Office Word', 'producer': 'PDFsharp 1.50.5147 (www.pdfsharp.com) (Original: Aspose.Words for .NET 18.7)'}, page_content='fee will be charged. \n \n \n \n \n \n \n \n \n \n \nSend them a copy of their employment contract \nThe contract should state their salary and other terms of employment, such as how much home leave they \nare allowed and any monthly allowances and/or deductions. \n  \n \n \n \n \n \n \n \n \n \n \n \n \n \nTell us of any change in their address using myMOM Portal \n10 PARI DEDAP WALK \n#12-17 TANAMERA CREST \nSingapore 486062 \n \nLogin to myMOM Portal to ask for their Employment Pass to be issued. This will cost S$225.'),
  0.249220609664917),
 (Document(id='

### 6. Retriever Interface in LangChain

In [11]:
# Wrap the vector store as a retriever that returns the 2 most similar chunks.
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 2})

# Run several questions in one call; the result holds one list of documents
# per question.
sample_questions = [
    "What is full name of Parveen?",
    "What is full name of Dhanesh?",
    "Can Dhanesh enter Singapore with the IPA",
]
retriever.batch(sample_questions)

[[Document(id='cdc1f0df-cdee-470d-8bf6-22f4274d45c7', metadata={'source': './Parveen_EP_Letter.pdf', 'start_index': 1070, 'creationdate': '2021-06-25T06:00:00+00:00', 'creator': 'Microsoft Office Word', 'page': 1, 'moddate': '2023-12-06T23:11:54+08:00', 'total_pages': 8, 'producer': 'PDFsharp 1.50.5147 (www.pdfsharp.com) (Original: Aspose.Words for .NET 18.7)', 'page_label': '2', 'title': 'MS Employer IPA Letter'}, page_content='fee will be charged. \n \n \n \n \n \n \n \n \n \n \nSend them a copy of their employment contract \nThe contract should state their salary and other terms of employment, such as how much home leave they \nare allowed and any monthly allowances and/or deductions. \n  \n \n \n \n \n \n \n \n \n \n \n \n \n \nTell us of any change in their address using myMOM Portal \n10 PARI DEDAP WALK \n#12-17 TANAMERA CREST \nSingapore 486062 \n \nLogin to myMOM Portal to ask for their Employment Pass to be issued. This will cost S$225.'),
  Document(id='58cf914a-2367-4e56-b3b

### Manual Document Retrieval

In [12]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# Alternative questions to try - uncomment exactly one.
# query = "Which organisation is hiring Dhanesh"
# query = "Which pass is issued to Dhanesh"
query = "What is the full name of Parveen"
# query = "What is the full name of Dhanesh"
# query = "What do you know about PARVEEN?"
# query = "Tell me genders of Dhanesh & Parveen"
# query = "Who is the first Prime minister of India"

# Retrieve the chunks for this question and join them into one context string.
retrieved_doc = retriever.invoke(query)
context_text = "\n\n".join(doc.page_content for doc in retrieved_doc)

# The template is a single triple-quoted string, so it must not contain the
# quote characters and "\n" escapes of concatenated string literals: they would
# be sent to the model verbatim as stray quotes (one of them unbalanced).
prompt = ChatPromptTemplate.from_template(
    """You are an AI assistant. Make use of the following context fully to answer the question correctly.
If you do not know the answer, then tell that, I do not know.

context: {context}

question: {question}

AI Answer:"""
)

# prompt -> LLM -> plain string
chain = prompt | llm | StrOutputParser()

response = chain.invoke({"context": context_text, "question": query})

print(response)


<think>
Okay, let's see. The user is asking for the full name of Parveen. I need to check the provided context to find any mention of Parveen.

Looking through the context, there's a lot of information about an Employment Pass, including the foreign employee's name, which is Gujrathi Dhanesh Naresh. The rest of the document includes details like date of birth, nationality, passport number, etc. 

Wait, the question is about Parveen, but the context only mentions Gujrathi Dhanesh Naresh. There's no mention of Parveen anywhere in the provided text. The user might have a typo or be referring to someone else. Since the context doesn't have any information about Parveen, I should inform them that the answer isn't available here.
</think>

The context provided does not mention anyone named Parveen. The only full name listed in the document is **Gujrathi Dhanesh Naresh**. If you are looking for information about Parveen, there is no data available in the given text. 

**Answer:** The full nam

### Data Retrieval using LangChain Hub

In [13]:
from langchain import hub

# Alternative questions to try - uncomment exactly one.
# query = "Tell me genders of Dhanesh & Parveen"
# query = "Which pass is issued to SHAIKH PARVEEN"
# query = "What do you know about PARVEEN?"
# query = "Which 2 organisations offered job to Dhanesh"
query = "What is the full name of Parveen. It is mentioned in the letter."
# query = "Which country is Dhanesh allowed to travel"
# query = "Who is the first Prime minister of India"

# Retrieve context for THIS question. Reusing `context_text` left over from an
# earlier cell would hand the model chunks that were fetched for a different
# query (and would fail outright on a fresh kernel if that cell was skipped).
retrieved_doc = retriever.invoke(query)
context_text = "\n\n".join(doc.page_content for doc in retrieved_doc)

# https://smith.langchain.com/hub/rlm/rag-prompt?organizationId=80099dc1-c38a-4ffb-b825-09b2ecbb562f

# Community RAG prompt pulled from the LangChain Hub; it is invoked below with
# `context` and `question`.
prompt = hub.pull("rlm/rag-prompt")

# prompt -> LLM -> plain string
chain = prompt | llm | StrOutputParser()

response = chain.invoke({"context": context_text, "question": query})

print(response)

<think>
Okay, let's see. The user is asking for the full name of Parveen as mentioned in the letter. I need to check the provided context to find any reference to Parveen.

Looking through the context, there's a section with "FOREIGN EMPLOYEE'S NAME" followed by "GUJRATHI DHANESH NARESH". The name here is Gujrathi Dhanesh Naresh. There's also a mention of "Gujrathi Dhanesh Naresh" in the details below. The rest of the document talks about employment pass details, dates, and other forms, but I don't see the name Parveen anywhere. The user might have confused the name or there's a possibility that Parveen isn't mentioned here. Since the context doesn't include Parveen, the answer should state that the full name isn't found in the provided information.
</think>

The full name of Parveen is not mentioned in the provided context. The document references "Gujrathi Dhanesh Naresh" as the foreign employee's name. No other names, including Parveen, are listed in the retrieved information.


### Data Retrieval using RetrievalQA

In [14]:
from langchain.chains import RetrievalQA

# Question-answering chain that performs its own retrieval through `retriever`
# and is asked to return the source documents alongside the answer.
custom_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

query = "Full name of Dhanesh?"
response = custom_chain.invoke(query)

# The response is a dict (it includes 'query' and 'result' keys).
print(response)

{'query': 'Full name of Dhanesh?', 'result': '<think>\nOkay, let\'s see. The user is asking for the full name of Dhanesh. I need to check the provided context to find any mention of Dhanesh.\n\nLooking through the text, the main content is about an Employment Pass application for someone named Shaikh Parveen Daud. There\'s information about the IPA letter, fees, contract details, and address. The only names mentioned are Shaikh Parveen Daud, and possibly others like the employer\'s address. \n\nWait, the user wrote "Dhanesh" but the context has "Shaikh Parveen Daud". Maybe there\'s a typo? Or perhaps Dhanesh is another person mentioned? Let me check again. The context includes the foreigner\'s name as Shaikh Parveen Daud, and the rest is about the application process. There\'s no mention of Dhanesh anywhere. \n\nThe user might have made a mistake in the name, or maybe it\'s a different person not covered in the context. Since the context doesn\'t have any information about Dhanesh, the