In [1]:
from langchain import HuggingFaceHub
import openai
import langchain
from langchain.document_loaders import PyPDFDirectoryLoader
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import Pinecone
import os

In [2]:
load_dotenv()

True

In [3]:
# Lets read a doc
def read_doc(directory):
    file_loader=PyPDFDirectoryLoader(directory)
    documents=file_loader.load()
    return documents

In [4]:
doc=read_doc("Documents/")

In [5]:
len(doc)

32

In [6]:
doc

[Document(page_content='GOVERNMENT OF INDIA\nINTERIM BUDGET 2024-2025\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2024', metadata={'source': 'Documents\\budget_speech.pdf', 'page': 0}),
 Document(page_content='', metadata={'source': 'Documents\\budget_speech.pdf', 'page': 1}),
 Document(page_content=' \nCONTENTS  \n \nPART – A \n Page No.  \nIntroduction  1 \nInclusive Development and Growth  2 \nSocial Justice   3  \nExemplary  Track Record of Governance,  \nDevelopment and Performance (GDP)  7 \nEconomic Management  8 \nGlobal Context  9 \nVision for ‘Viksit Bharat’  10 \nStrategy for  ‘Amrit Kaal’  11 \nInfrastructure Development  17 \nAmrit Kaal as Kartavya Kaal  22 \nRevised Estimates 2023 -24 23 \nBudget Estimates 2024 -25 23 \nPART – B \nDirect taxes  25 \nIndirect Taxes   26 \nEconomy – Then and Now  28 \n  \n  ', metadata={'source': 'Documents\\budget_speech.pdf', 'page': 2}),
 Document(page_content='', metadata={'source': 'Documents\\budget_speech.pdf',

In [7]:
# Devide the docs into Chuncks
def chunck_data(docs,chunk_size=800,chunk_overlap=50):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    doc=text_splitter.split_documents(docs)
    return doc

In [8]:
documents=chunck_data(doc)

In [9]:
len(documents)

58

In [10]:
documents

[Document(page_content='GOVERNMENT OF INDIA\nINTERIM BUDGET 2024-2025\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2024', metadata={'source': 'Documents\\budget_speech.pdf', 'page': 0}),
 Document(page_content='CONTENTS  \n \nPART – A \n Page No.  \nIntroduction  1 \nInclusive Development and Growth  2 \nSocial Justice   3  \nExemplary  Track Record of Governance,  \nDevelopment and Performance (GDP)  7 \nEconomic Management  8 \nGlobal Context  9 \nVision for ‘Viksit Bharat’  10 \nStrategy for  ‘Amrit Kaal’  11 \nInfrastructure Development  17 \nAmrit Kaal as Kartavya Kaal  22 \nRevised Estimates 2023 -24 23 \nBudget Estimates 2024 -25 23 \nPART – B \nDirect taxes  25 \nIndirect Taxes   26 \nEconomy – Then and Now  28', metadata={'source': 'Documents\\budget_speech.pdf', 'page': 2}),
 Document(page_content='1 \n Interim Budget 2024 -2025  \nSpeech of  \nNirmala Sitharaman  \nMinister of Finance  \nFebruary 1, 2024  \nHon’ble Speaker,  \n I present the Interim Bud

In [11]:
# Embedding technique of huggigface
embeddings=HuggingFaceHubEmbeddings()
embeddings

  from .autonotebook import tqdm as notebook_tqdm


HuggingFaceHubEmbeddings(client=<InferenceClient(model='sentence-transformers/all-mpnet-base-v2', timeout=None)>, async_client=<InferenceClient(model='sentence-transformers/all-mpnet-base-v2', timeout=None)>, model='sentence-transformers/all-mpnet-base-v2', repo_id='sentence-transformers/all-mpnet-base-v2', task='feature-extraction', model_kwargs=None, huggingfacehub_api_token=None)

In [12]:
vectors=embeddings.embed_query("how are you")
vectors

[0.024816758930683136,
 0.0498344786465168,
 0.00296322931535542,
 -0.020911933854222298,
 0.015756139531731606,
 0.0008427107241004705,
 -0.0388672836124897,
 -0.0012208056868985295,
 0.017344707623124123,
 -0.012650647200644016,
 -0.0334111750125885,
 -0.021266279742121696,
 0.0026828874833881855,
 0.013602660968899727,
 0.003396817483007908,
 -0.05329832434654236,
 -0.013983210548758507,
 -0.057532504200935364,
 -0.04428679496049881,
 -0.01423708163201809,
 -0.056643132120370865,
 0.01113731786608696,
 0.027455950155854225,
 -0.0027559464797377586,
 0.05140535533428192,
 -0.0021438109688460827,
 0.029279347509145737,
 -0.0008695501019246876,
 -0.011007851921021938,
 0.04820592701435089,
 -0.028723521158099174,
 0.025687066838145256,
 0.014495971612632275,
 -0.002312891650944948,
 1.6342879689545953e-06,
 0.060201920568943024,
 -0.026786018162965775,
 -0.036967065185308456,
 0.0686761662364006,
 -0.010282878763973713,
 0.020827017724514008,
 -0.08207917958498001,
 0.00636715907603502

In [13]:
len(vectors)

768

In [16]:
# Vecotor search DB in pinecone
os.environ['PINECONE_API_KEY'] = ""
index = "langchaindeepakapp"

In [18]:
Index=Pinecone.from_documents(doc,embeddings,index_name=index)

In [19]:
## Cosine similarity retrive reults from vector DB
def retrive_query(query,k=2):
    matching_results=Index.similarity_search(query,k=k)
    return matching_results

In [20]:
from langchain.chains.question_answering import load_qa_chain

In [21]:
llm=HuggingFaceHub(repo_id="EleutherAI/gpt-neo-2.7B",model_kwargs={"temperature":0.7})
chain=load_qa_chain(llm,chain_type="stuff")


  warn_deprecated(


In [22]:
# Search answers from vector DB
def retrive_answers(query):
    doc_search=retrive_query(query)
    print(doc_search)
    response=chain.run(input_documents=doc_search,question=query)
    return response

In [27]:
our_query="give me information about PM-JANMAN Yojana"
answer=retrive_answers(our_query)
print(answer)

[Document(page_content='14 \n improved nu trition delivery, early childhood care and \ndevelopment.    \n48. The newly designed U -WIN platform for managing \nimmunization and intensified efforts of Mission Indradhanush \nwill be rolled out expeditiously throughout the country.  \nAyushman Bharat  \n49. Healthcare cove r under Ayushman Bharat scheme will be \nextended to all ASHA workers, Anganwadi Workers and Helpers.  \nAgriculture and food processing  \n50. The efforts for value addition in agricultural sector and \nboosting farmers’ income will be stepped up. Pradhan Mantri \nKisan  Sampada Yojana has benefitted 38 lakh farmers and \ngenerated 10 lakh employment. Pradhan Mantri Formalisation of \nMicro Food Processing Enterprises Yojana has assisted 2.4 lakh \nSHGs and sixty thousand individuals with credit linkages. Other \nschemes are comp lementing the efforts for reducing post -\nharvest losses, and improving productivity and incomes.  \n51. For ensuring faster growth of the se