## RAG with PDF Data Extraction | Provide Context to LLM

### Load the .env file

In [None]:
from dotenv import load_dotenv

# First pass: no path is given, so python-dotenv locates the .env file itself;
# override=True lets its values replace variables already set in the environment.
load_dotenv(override=True)
# Second pass: also read the .env one directory up (no override requested here).
load_dotenv(dotenv_path="./../.env")

### Load the LLM

In [None]:
from langchain_ollama import OllamaLLM

# Text-generation model served by the local Ollama instance.
# The output-length cap is passed as `num_predict`, the option name Ollama
# (and OllamaLLM) uses; `max_tokens` is not an OllamaLLM field, so the
# intended 300-token limit was never sent to the server.
llm = OllamaLLM(
    base_url="http://localhost:11434",
    model="qwen3:latest",
    temperature=0.5,
    num_predict=300,
)

In [None]:
#%pip install -U langchain-community pypdf

### 1. Extracting the PDF Files

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Source PDFs to ingest.
pdf1 = "./Dhanesh_EP_Letter_Nityo.pdf"
pdf2 = "./Parveen_EP_Letter.pdf"
pdf3 = "./Dhanesh_EP_Letter_NTT.pdf"

pdfFiles = [pdf1, pdf2, pdf3]

# Flat list of page-level documents across all PDFs.
#
# extend() is used on purpose: it stores every PDF page at its own index, so
# documents[0] is the 1st page of the 1st PDF, and so on.
# append() would instead store one list per PDF (documents[0] = whole 1st PDF,
# documents[1] = whole 2nd PDF, ...).
documents = []

for pdf_path in pdfFiles:
    loader = PyPDFLoader(pdf_path)
    documents.extend(loader.load())


### 2. Text Splitting

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Break the loaded pages into overlapping chunks; add_start_index=True asks the
# splitter to record where each chunk starts in its source document.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=150,
    add_start_index=True,
)

all_splits = text_splitter.split_documents(documents)

# Number of chunks produced.
print(len(all_splits))

### 3. Embedding the Splits into Vectors

In [None]:
from langchain_ollama import OllamaEmbeddings

# Embedding client backed by the same local Ollama server as the LLM.
# NOTE(review): qwen3:latest is also the generation model; confirm it is the
# intended embedding model rather than a dedicated embedding model.
embeddings = OllamaEmbeddings(
    model="qwen3:latest",
    base_url="http://localhost:11434",
    temperature=0.5,
)

# Display the configured embedding object as the cell output.
embeddings

### 4. Vector Stores

In [None]:
#%pip install -qU langchain-chroma

In [None]:
from langchain_chroma import Chroma

# Embed every chunk and store the vectors in a Chroma collection saved locally.
# NOTE(review): re-running this cell against an existing ./chroma_db presumably
# adds the same chunks again -- confirm, and clear the directory if so.
vector_store = Chroma.from_documents(
    embedding=embeddings,             # embeddings created in step 3
    persist_directory="./chroma_db",  # where the data is saved locally
    documents=all_splits,
)

### 5. Retrieve from the Persistent Vector DataStore

In [None]:
# Re-open the collection saved under ./chroma_db instead of rebuilding it.
vector_store = Chroma(
    embedding_function=embeddings,
    persist_directory="./chroma_db",
)

# Ask for the five nearest chunks to the query.
result = vector_store.similarity_search("Full name of Parveen", k=5)

# Show which PDF each returned chunk came from.
for hit in result:
    print(hit.metadata["source"])


In [None]:
# Same kind of lookup, but each returned chunk comes with its score.
result = vector_store.similarity_search_with_score(
    "Full name of Parveen?",
    k=4,
)

# Display the scored results as the cell output.
result

### 6. Retriever Interface in LangChain

In [None]:
# Wrap the vector store as a retriever that returns the 2 most similar chunks.
retriever = vector_store.as_retriever(
    search_kwargs={"k": 2},
    search_type="similarity",
)

# Several questions answered in one batched call.
sample_queries = [
    "What is full name of Parveen?",
    "What is full name of Dhanesh?",
    "Can Dhanesh enter Singapore with the IPA",
]

retriever.batch(sample_queries)

### Manual Document Retrieval

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# Question to ask. The commented-out lines are alternatives to try; the last
# one is unrelated to the PDFs and exercises the "I do not know" instruction.
# query = "Which organisation is hiring Dhanesh"
# query = "Which pass is issued to Dhanesh"
query = "What is the full name of Parveen"
# query = "What is the full name of Dhanesh"
# query = "What do you know about PARVEEN?"
# query = "Tell me genders of Dhanesh & Parveen"
# query = "Who is the first Prime minister of India"

# Fetch the chunks most similar to the query and merge them into a single
# context string, separated by blank lines.
retrieved_doc = retriever.invoke(query)
context_text = "\n\n".join(doc.page_content for doc in retrieved_doc)

# The template is built from adjacent string literals so that only the intended
# text reaches the model. The previous triple-quoted version embedded stray
# double-quote characters (one of them unbalanced) and per-line indentation,
# all of which were sent to the LLM as part of the prompt.
prompt = ChatPromptTemplate.from_template(
    "You are an AI assistant. Make use of the following context fully to answer the question correctly.\n"
    "If you do not know the answer, then tell that, I do not know.\n\n"
    "context: {context}\n\n"
    "question: {question}\n\n"
    "AI Answer:"
)

# prompt -> LLM -> plain string.
chain = prompt | llm | StrOutputParser()

response = chain.invoke({"context": context_text, "question": query})

print(response)


### Data Retrieval using LangChain Hub

In [None]:
from langchain import hub

# Question to ask. The commented-out lines are alternatives to try.
# query = "Tell me genders of Dhanesh & Parveen"
# query = "Which pass is issued to SHAIKH PARVEEN"
# query = "What do you know about PARVEEN?"
# query = "Which 2 organisations offered job to Dhanesh"
query = "What is the full name of Parveen. It is mentioned in the letter."
# query = "Which country is Dhanesh allowed to travel"
# query = "Who is the first Prime minister of India"

# Retrieve context for THIS query. Previously the cell reused `context_text`
# left over from the earlier cell, so switching the query here still sent the
# chunks retrieved for the old question (or failed if that cell had not run).
retrieved_doc = retriever.invoke(query)
context_text = "\n\n".join(doc.page_content for doc in retrieved_doc)

# https://smith.langchain.com/hub/rlm/rag-prompt?organizationId=80099dc1-c38a-4ffb-b825-09b2ecbb562f

# Community RAG prompt pulled from the hub; it is filled with the same
# "context" and "question" inputs passed to invoke() below.
prompt = hub.pull("rlm/rag-prompt")

# prompt -> LLM -> plain string.
chain = prompt | llm | StrOutputParser()

response = chain.invoke({"context": context_text, "question": query})

print(response)

### Data Retrieval using RetrievalQA

In [None]:
from langchain.chains import RetrievalQA

# Question-answering chain that does its own retrieval through `retriever`
# and is asked to return the source documents along with the answer.
custom_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=retriever,
    return_source_documents=True,
)

query = "Full name of Dhanesh?"

response = custom_chain.invoke(query)

# Prints the full response object returned by the chain.
print(response)