In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

In [5]:
# Load every PDF found under ./us_census and split the loaded documents
# into overlapping chunks ready for embedding.

loader = PyPDFDirectoryLoader("./us_census")
documents = loader.load()

# Fail here with a clear message: an empty load would leave final_docs empty,
# and the final_docs[0] lookup in a later cell would then raise a bare
# IndexError that says nothing about the real cause.
if not documents:
    raise FileNotFoundError("No PDF documents were loaded from ./us_census")

# chunk_size / chunk_overlap are measured by the splitter's length function
# (characters by default): chunks of up to 1000 with 200 shared between
# consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
final_docs = text_splitter.split_documents(documents)
len(final_docs)

316

In [None]:
# Embedding model served through the HuggingFace integration.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

embedding = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs=dict(device="cpu"),  # load the model on the CPU
    encode_kwargs=dict(normalize_embeddings=True),  # ask the encoder for normalized vectors
)

In [None]:
import numpy as np

# Sanity check: embed the first chunk once, then show the vector and its shape.
# The embedding call runs the model, so compute it a single time and reuse it.
first_chunk_vector = np.array(embedding.embed_query(final_docs[0].page_content))
print(first_chunk_vector)
print(first_chunk_vector.shape)

In [None]:
# Build the FAISS vector store.
# Note: only the first 100 chunks are embedded and indexed, not all of final_docs.
indexed_chunks = final_docs[:100]
vectorstore = FAISS.from_documents(documents=indexed_chunks, embedding=embedding)

In [None]:
# Query the vector store by similarity.
query = "What is Health Insurance Coverage?"

# Direct search against the store, using its default result count.
relevant_docs = vectorstore.similarity_search(query)

# Retriever wrapper configured for similarity search with k=3.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
print(retriever)