In [15]:
!pip install langchain
!pip install pypdf
!pip install sentence_transformers
!pip install llama-cpp-python
!pip install huggingface_hub
!pip install --upgrade --quiet langchain-pinecone langchain langchain-community langchain-core



In [16]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import Pinecone
from langchain.chains import RetrievalQAWithSourcesChain
import os

In [17]:
# Import required libraries to run llm locally

from langchain.llms import LlamaCpp
from huggingface_hub import hf_hub_download

In [18]:
# Set up the environment
# Credentials come from the process environment (or an interactive prompt) and
# are never written into the notebook. Blank placeholders must not be assigned
# here: that would overwrite a real key already exported by the shell.
from getpass import getpass

# The Pinecone cells below need PINECONE_API_KEY; ask for it only when it is
# missing or empty so an exported key is left untouched.
if not os.environ.get("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass("PINECONE_API_KEY: ")

# Keep the variable defined without clobbering an existing token.
# NOTE(review): no visible cell reads this token — confirm whether it is needed.
os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", "")

In [19]:
# Fetch the quantized TinyLlama chat model file from the Hugging Face Hub.
model_name_or_path = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"  # Hub repository id
model_basename = "tinyllama-1.1b-chat-v1.0.Q5_K_M.gguf"  # file inside that repository
# The result is handed to LlamaCpp as `model_path` in the next cell
# (presumably a local file path — confirm against hf_hub_download docs).
model_path = hf_hub_download(
    filename=model_basename,
    repo_id=model_name_or_path,
)

In [20]:
# Load the downloaded GGUF model through llama.cpp

llm = LlamaCpp(
    model_path=model_path,
    max_tokens=512,  # cap on tokens generated per call
    temperature=0.8,
    # NOTE(review): TinyLlama's native context window is reportedly 2048 tokens;
    # 5000 presumably makes room for the long QA-with-sources prompts but may
    # degrade answer quality — confirm against the model card.
    n_ctx=5000,
    # Fixed sampling seed: with temperature > 0 the answers otherwise change on
    # every run, which defeats Restart & Run All reproducibility.
    seed=42,
    streaming=True,
    verbose=False,
)

In [21]:
# Pinecone settings

# `Pinecone` as imported in this notebook is LangChain's vector-store wrapper,
# not the Pinecone client. It picks up the key from the PINECONE_API_KEY
# environment variable, so no attribute needs to be set on the class here
# (assigning `Pinecone.api_key` would not configure anything).
index_name = "nagp"  # name of the target Pinecone index

In [22]:
# Read the source PDF into a list of LangChain documents

PDF_PATH = "/content/Assignment_Support_Document.pdf"
loader = PyPDFLoader(PDF_PATH)
data = loader.load()
# Number of loaded documents (displayed as the cell result)
len(data)

32

In [23]:
# Break the loaded documents into chunks of at most 500 characters, no overlap

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=500,
)
docs = text_splitter.split_documents(data)
chunk_count = len(docs)
print(chunk_count)
# Show the first chunk as a spot check
docs[0]

87


Document(metadata={'source': '/content/Assignment_Support_Document.pdf', 'page': 0}, page_content='GOVERNMENT OF INDIA\nINTERIM BUDGET 2024-2025\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2024')

In [24]:
# Embedding model used to vectorize the PDF chunks (and, later, the queries)

EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

In [25]:
# Store the chunk embeddings in the Pinecone vector DB

import hashlib

# Give every chunk a stable ID derived from its source, page and position so
# that re-running this cell overwrites the same vectors. Without explicit IDs
# each run inserts a fresh copy of every chunk under new random IDs.
# NOTE(review): vectors inserted by earlier runs under random IDs are not
# removed by this; clear the index once if duplicates are already present.
chunk_ids = [
    hashlib.sha1(
        f"{doc.metadata.get('source')}|{doc.metadata.get('page')}|{position}".encode("utf-8")
    ).hexdigest()
    for position, doc in enumerate(docs)
]

docsearch = Pinecone.from_documents(
    documents=docs,
    embedding=embeddings,
    index_name=index_name,
    ids=chunk_ids,
)

In [26]:
# Query the Pinecone index directly to sanity-check retrieval
query = "what is Direct taxes"
resp = docsearch.similarity_search(query=query)
# Display the matching documents
resp

[Document(metadata={'page': 28.0, 'source': '/content/Assignment_Support_Document.pdf'}, page_content='25 \n Part B  \nHon’ble Speaker Sir,  \nDirect taxes  \n87. Over the last ten years, the direct tax collections have \nmore than trebled and the return filers swelled to 2.4 times.  \nI would like to assure the taxpayers that their contributions have \nbeen used wisely for the development of the country and \nwelfare of its people. I appreciate the tax payers for their \nsupport.  \n88. The Government has reduced and rationalized tax rates.'),
 Document(metadata={'page': 30.0, 'source': '/content/Assignment_Support_Document.pdf'}, page_content='ease of living and ease of doing business, I wish to make an \nannouncement to improve tax payer services. There are a large \nnumber of petty, non -verified, non -reconciled or disputed direct \ntax d emands, many of them dating as far back as the year 1962, \nwhich continue to remain on the books, causing anxiety to \nhonest tax payers and hi

In [27]:
# Build a QA-with-sources chain that answers with the LLM over documents
# fetched from the Pinecone-backed retriever

retriever = docsearch.as_retriever()
chain = RetrievalQAWithSourcesChain.from_llm(retriever=retriever, llm=llm)

In [28]:
import langchain  # kept so the name stays available to any later cell
from langchain.globals import set_debug

# Enable LangChain's global debug tracing through the supported setter;
# assigning the `langchain.debug` module attribute directly is deprecated.
set_debug(True)

# Run the question through the chain and return only its outputs
# (the 'answer' and 'sources' keys shown in the cell result).
chain.invoke({"question": query}, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "what is Direct taxes"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "25 \n Part B  \nHon’ble Speaker Sir,  \nDirect taxes  \n87. Over the last ten years, the direct tax collections have \nmore than trebled and the return filers swelled to 2.4 times.  \nI would like to assure the taxpayers that their contributions have \nbeen used wisely for the development of the country and \nwelfare of its people. I appreciate the tax payers for their \nsupport.  \n88. The Government has reduced and rationalized tax rates.",
      "question": "what is Direct taxes"
    },
    {
      

{'answer': " The president's words are vague and not concrete in meaning. Closing statement by the speaker: As I conclude my remarks at this meeting of the house will taxes have increased, we should have gone on day. ",
 'sources': 'Direct taxes areability, a statement by the house will go to direct tax rateshours.'}