# Installation

In [1]:
%%writefile requirements.txt
unstructured
tiktoken
pinecone-client
pypdf
openai
langchain
pandas
numpy
python-dotenv
accelerate
transformers
langchain-huggingface

Writing requirements.txt


In [2]:
!pip install -r requirements.txt

Collecting unstructured (from -r requirements.txt (line 1))
  Downloading unstructured-0.15.12-py3-none-any.whl.metadata (29 kB)
Collecting tiktoken (from -r requirements.txt (line 2))
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting pinecone-client (from -r requirements.txt (line 3))
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pypdf (from -r requirements.txt (line 4))
  Downloading pypdf-4.3.1-py3-none-any.whl.metadata (7.4 kB)
Collecting openai (from -r requirements.txt (line 5))
  Downloading openai-1.45.1-py3-none-any.whl.metadata (22 kB)
Collecting langchain (from -r requirements.txt (line 6))
  Downloading langchain-0.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting python-dotenv (from -r requirements.txt (line 9))
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting langchain-huggingface (from -r requirements.txt (line 12))
  Downloading langchain_h

In [8]:
!pip install -U langchain-community
!pip install langchain-pinecone

Collecting langchain-pinecone
  Downloading langchain_pinecone-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting aiohttp<3.10,>=3.9.5 (from langchain-pinecone)
  Downloading aiohttp-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.5 kB)
Downloading langchain_pinecone-0.2.0-py3-none-any.whl (11 kB)
Downloading aiohttp-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: aiohttp, langchain-pinecone
  Attempting uninstall: aiohttp
    Found existing installation: aiohttp 3.10.5
    Uninstalling aiohttp-3.10.5:
      Successfully uninstalled aiohttp-3.10.5
Successfully installed aiohttp-3.9.5 langchain-pinecone-0.2.0


# Imports

In [9]:
import langchain
from pinecone import Pinecone, ServerlessSpec
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import ServerlessSpec
# from langchain_pinecone import Pinecone
# from pinecone.grpc import PineconeGRPC as Pinecone
# from langchain.vectorstores import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain.chains.question_answering import load_qa_chain

In [10]:
from google.colab import userdata
import os
os.environ['PINECONE_KEY'] = userdata.get('PINECONE_KEY')


SecretNotFoundError: Secret PINECONE_KEY does not exist.

In [None]:

## Lets Read the document
def read_doc(directory):
    file_loader=PyPDFDirectoryLoader(directory)
    documents=file_loader.load()
    return documents

doc = read_doc('documents/')
len(doc)

In [None]:
def chunk_data(docs,chunk_size=800,chunk_overlap=50):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    doc=text_splitter.split_documents(docs)
    return docs


documents=chunk_data(docs=doc)
len(documents)

In [None]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
query_result = embeddings.embed_query("Hello world")
len(query_result)

In [None]:
pc = Pinecone(api_key=os.environ['PINECONE_KEY'])

In [None]:
index_name = "pdf-qa-index"
index = pc.Index(index_name)
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [None]:
vector_store.add_documents(doc)

In [None]:
def retrieve_query(query,k=2):
    matching_results=vector_store.similarity_search(query,k=k)
    return matching_results

In [None]:
results = vector_store.similarity_search(query="What is the main topic of this document?", k=2)

for res in results:
    print(f"* {res.page_content}")

In [None]:
llm = HuggingFacePipeline(
    model_id="bigscience/T0_3B",
    model_kwargs={"temperature": 0.1, "max_length": 512}
)
chain = load_qa_chain(llm, chain_type="stuff")


In [None]:
query = "What is the document about?"
doc_search=retrieve_query(query)
print(doc_search)
response=chain.run(input_documents=doc_search,question=query)
