In [None]:
from typing import List
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_postgres.vectorstores import PGVector
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain.schema import Document
from dotenv import load_dotenv
import uuid
from datetime import datetime
import re 

In [162]:
load_dotenv()

True

In [163]:
CHUNK_SIZE = 700
CHUNK_OVERLAP = 150
EMBEDDING_MODEL = "text-embedding-3-small"
LLM_MODEL = "gpt-4o-mini"

In [164]:
loader = PyPDFLoader("AIF_Guidelines_English_12Jun24.pdf")
docs = loader.load()

In [165]:
docs

[Document(metadata={'producer': 'Microsoft® Office Word 2007', 'creator': 'Microsoft® Office Word 2007', 'creationdate': '2023-02-06T14:59:26+05:30', 'title': 'MergedFile', 'author': 'Acer', 'moddate': '2023-02-06T14:59:26+05:30', 'source': 'AIF_Guidelines_English_12Jun24.pdf', 'total_pages': 20, 'page': 0, 'page_label': '1'}, page_content='1  \n \n \nScheme Guidelines \nfor \nCENTRAL SECTOR SCHEME \nof \nFinancing facility under ‘Agriculture Infrastructure Fund’ \n \n \n \n \n \n \n \n \nRevised Scheme Guidelines \n \nJanuary 2023 \n \n \n \n \n \n \n \n \n \n \n \n \n \nDepartment of Agriculture & Farmers Welfare \nMinistry of Agriculture & Farmers Welfare \nGovernment of India'),
 Document(metadata={'producer': 'Microsoft® Office Word 2007', 'creator': 'Microsoft® Office Word 2007', 'creationdate': '2023-02-06T14:59:26+05:30', 'title': 'MergedFile', 'author': 'Acer', 'moddate': '2023-02-06T14:59:26+05:30', 'source': 'AIF_Guidelines_English_12Jun24.pdf', 'total_pages': 20, 'page': 1,

In [166]:
def load_and_chunk_pdf(file_path: str) -> List[Document]:
    print(f"Loading PDF: {file_path}")
    # CHUNK_SIZE = 300
    loader = PyPDFLoader(file_path)
    docs = loader.load()
    print("CHUNK_SIZE:", CHUNK_SIZE, type(CHUNK_SIZE))
    print("CHUNK_OVERLAP:", CHUNK_OVERLAP, type(CHUNK_OVERLAP))
    print(f"Loaded {len(docs)} raw document(s). Splitting into chunks...")
    # splitter = CharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separators=["\n\n", "\n", ". ", " "], 
        length_function=len,  
    )
    chunked_docs: List[Document] = []
    print(f"Using chunk size: {CHUNK_SIZE}, overlap: {CHUNK_OVERLAP}")
    doc_id = str(uuid.uuid4())
    for i, d in enumerate(docs):
        print("keys in d "," --> ",type(d))
        text = d.page_content
        print(f"Processing page {i + 1} with length {len(text)} characters")
        splits = splitter.split_text(text)
        print(f"After splitting, number of chunks: {len(splits)}")
        print(f"Doc {i}: split into {len(splits)} chunks")
        print(f"chunk size: {len(splits)}")

        for j, s in enumerate(splits):
            metadata = dict(d.metadata or {})
            metadata.update({
                "source": os.path.basename(file_path),
                "doc_id": doc_id,           
                "chunk_no": j + 1,          
                "chunk_id": f"{doc_id}_{j+1}", 
                "page_number": metadata.get("page", i + 1),
                "created_at": datetime.utcnow().isoformat(),
                "file_path": os.path.abspath(file_path),
                "file_size": os.path.getsize(file_path),
                "file_type": os.path.splitext(file_path)[-1].replace('.', ''),
            })

            chunked_docs.append(Document(page_content=s, metadata=metadata))

    print(f"Total chunks created: {len(chunked_docs)}")
    return chunked_docs

In [None]:
DATABASE_URL="postgresql+psycopg2://postgres:postgres@localhost:5432/postgres_vector_db"
collection_name = "agri_infra_yt"

vector_store = PGVector(
    embeddings=OpenAIEmbeddings(model="text-embedding-3-large"),
    collection_name=collection_name,
    connection=DATABASE_URL,
    use_jsonb=True,
)

In [180]:
def feed_retrieved_docs_to_llm(context: str, query: str) -> str:
    print("Generating answer using LLM...")
    llm = ChatOpenAI(model=LLM_MODEL, temperature=0.0)
    prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=(
            "You are a helpful expert. Use ONLY the provided context to answer the question. "
            "If the answer is not in the context, say 'I don't know.'\n\n"
            "Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"
        ),
    )

    prompt_text = prompt.format_prompt(context=context, question=query)
    print("Prompt sent to LLM (truncated):\n",prompt_text)
    answer = llm.invoke(prompt_text)
    print("Answer generation complete.")
    return answer

1. Load the pdf, chunk the text inside with some constant length 

In [173]:
chunked_docs = load_and_chunk_pdf("AIF_Guidelines_English_12Jun24.pdf")

Loading PDF: AIF_Guidelines_English_12Jun24.pdf
CHUNK_SIZE: 700 <class 'int'>
CHUNK_OVERLAP: 150 <class 'int'>
Loaded 20 raw document(s). Splitting into chunks...
Using chunk size: 700, overlap: 150
keys in d   -->  <class 'langchain_core.documents.base.Document'>
Processing page 1 with length 311 characters
After splitting, number of chunks: 1
Doc 0: split into 1 chunks
chunk size: 1
keys in d   -->  <class 'langchain_core.documents.base.Document'>
Processing page 2 with length 766 characters
After splitting, number of chunks: 2
Doc 1: split into 2 chunks
chunk size: 2
keys in d   -->  <class 'langchain_core.documents.base.Document'>
Processing page 3 with length 2034 characters
After splitting, number of chunks: 4
Doc 2: split into 4 chunks
chunk size: 4
keys in d   -->  <class 'langchain_core.documents.base.Document'>
Processing page 4 with length 2321 characters
After splitting, number of chunks: 4
Doc 3: split into 4 chunks
chunk size: 4
keys in d   -->  <class 'langchain_core.doc

In [174]:
len(chunked_docs)

57

2. the chunked pdf's now can be 

    a. embedded 
    
    b. loaded into vector store 

In [177]:
DATABASE_URL="postgresql+psycopg2://postgres:postgres@localhost:5432/postgres_vector_db_yt"
collection_name = "agri_infra_yt"

vector_store = PGVector(
    embeddings=OpenAIEmbeddings(model="text-embedding-3-large"),
    collection_name=collection_name,
    connection=DATABASE_URL,
    use_jsonb=True,
)
vector_store.add_documents(documents=chunked_docs, ids=[ i for i in range(1,len(chunked_docs)+1)])

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57]

3. take the user input query and retrieve the similarity/relevant results from vector DB 

In [178]:
query="how much did finance minister announce for agri infra fund"

In [179]:
vector_db_resulsts = vector_store.similarity_search(query=query,k=3)

for i, doc in enumerate(vector_db_resulsts):
    print(f"i -- {i}",doc)

i -- 0 page_content='of human resource and realization of full potential of our limited land resource. 
 
In view of above, the Hon’ble Finance Min ister announced on 15.05.2020, ₹1 lakh 
crore Agri Infrastructure Fund for farm -gate infrastructure for farmers. Financing facility of ₹ 
1,00,000 crore will be provided for funding Agriculture Infrastructure Projects at farm-gate & 
aggregation points Primary Agricultural Cooperative Societies, Farmers Producer 
Organizations, Agriculture entrepreneurs, Start -ups, etc.  Impetus for development of fa rm-
gate & aggregation point, affordable and financially viable Post Harvest Management 
infrastructure.' metadata={'page': 2, 'title': 'MergedFile', 'author': 'Acer', 'doc_id': '3b21a451-d089-4d42-81e1-6987fefddb69', 'source': 'AIF_Guidelines_English_12Jun24.pdf', 'creator': 'Microsoft® Office Word 2007', 'moddate': '2023-02-06T14:59:26+05:30', 'chunk_id': '3b21a451-d089-4d42-81e1-6987fefddb69_2', 'chunk_no': 2, 'producer': 'Microsoft® Offic

Finally, lets give the relevant results fetched from vector store to LLM along with user question and ask to respond based upon the retrieved docs

In [181]:
context = ""
top_k_vector_db_resulsts = vector_db_resulsts[:1]  
for doc in vector_db_resulsts:
    context += f" {doc.page_content}\n\n---\n\n"

final_res = feed_retrieved_docs_to_llm(context, query)


Generating answer using LLM...
Prompt sent to LLM (truncated):
 text="You are a helpful expert. Use ONLY the provided context to answer the question. If the answer is not in the context, say 'I don't know.'\n\nContext:\n of human resource and realization of full potential of our limited land resource. \n \nIn view of above, the Hon’ble Finance Min ister announced on 15.05.2020, ₹1 lakh \ncrore Agri Infrastructure Fund for farm -gate infrastructure for farmers. Financing facility of ₹ \n1,00,000 crore will be provided for funding Agriculture Infrastructure Projects at farm-gate & \naggregation points Primary Agricultural Cooperative Societies, Farmers Producer \nOrganizations, Agriculture entrepreneurs, Start -ups, etc.  Impetus for development of fa rm-\ngate & aggregation point, affordable and financially viable Post Harvest Management \ninfrastructure.\n\n---\n\n 3  \nScheme Guidelines for CENTRAL SECTOR SCHEME of financing facility \nunder ‘Agriculture Infrastructure Fund’ \n \n1 In

In [182]:
final_res

AIMessage(content='₹1 lakh crore', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 4, 'prompt_tokens': 464, 'total_tokens': 468, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_560af6e559', 'id': 'chatcmpl-CQGPgqoqihwZm4sMChDZfVtNfE7WI', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--13937f42-1a48-44fa-84b3-1fbfbe52fb99-0', usage_metadata={'input_tokens': 464, 'output_tokens': 4, 'total_tokens': 468, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})