In [2]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model shared by the index-sizing probe, the vector store, and the
# reloaded store further down in this notebook.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

  from tqdm.autonotebook import tqdm, trange


In [3]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
# Size the index from the length of one sample embedding so it matches
# whatever vector length the embedding model produces.
index = faiss.IndexFlatL2(
    len(embeddings.embed_query("hello world"))
)


In [7]:
# Sanity check: print the FAISS index object to confirm it was constructed.
print(index)

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7bdd1df56a90> >


In [9]:

# Fresh index whose dimensionality is the length of a sample embedding.
index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

# Vector store backed by that index, with an in-memory docstore and an
# initially empty index-position -> docstore-id mapping.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    index_to_docstore_id={},
    docstore=InMemoryDocstore(),
)

In [10]:
from uuid import uuid4

In [16]:
# Document loaders live in langchain_community (the package this notebook
# already uses for FAISS and InMemoryDocstore); the `langchain.document_loaders`
# path is a deprecated re-export.
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
# NOTE(review): newer LangChain releases expose this splitter from the
# `langchain_text_splitters` package — consider migrating once that package is
# confirmed to be installed in this environment.
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [13]:
# Folder holding the source PDFs (relative to the notebook's working directory).
pdf_directory = "data/data2"

# Load every file matching the "**/*.pdf" glob under that folder, parsing each
# one with PyPDFLoader.
loader = DirectoryLoader(
    pdf_directory,
    glob="**/*.pdf",
    loader_cls=PyPDFLoader,
)


In [15]:
# Read all matched PDFs into Document objects.
# NOTE(review): the recorded output shows "incorrect startxref pointer(3)",
# presumably a parser warning about a malformed PDF in the folder — confirm the
# affected file still loaded completely.
documents = loader.load()

incorrect startxref pointer(3)


In [21]:
# Splitter producing chunks of size 250 with an overlap of 20
# (presumably measured in characters, the splitter's default — confirm).
spliter2 = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=250,
)


In [22]:
# Break the loaded PDF documents into chunks; these chunks are what gets
# embedded and stored in the vector store.
docs = spliter2.split_documents(documents)

In [23]:
# One random UUID string per chunk, used as the chunk's id in the vector store.
uuids = [str(uuid4()) for _ in docs]

In [25]:

# Embed and store every chunk under its pre-generated id; the call's return
# value (the list of ids) is displayed as the cell output.
vector_store.add_documents(ids=uuids, documents=docs)

['5c7ca87e-3e38-4931-a404-73507e013f6a',
 'a564a833-2701-481c-8167-feca7afe8df8',
 'd9168df8-710c-4266-a70b-c8d5d91a7c72',
 'ce30206a-fa64-4e56-8133-d8cad466fdf4',
 'd04aea58-bdb0-47b7-8014-a9becade641d',
 '1c47ea58-77bc-4427-a31d-c7c946e3ef17',
 '9930871d-3769-44c7-ad1e-63600e3f1590',
 '5006ec38-c15e-4a92-905b-753f33b9ddc6',
 'd505c555-41aa-4644-a305-ec7c4faf0673',
 'add5d0cf-435d-484a-a763-381b1a159767',
 '6eae6fd3-7efc-4660-9e0e-e11ca75678a0',
 'aedd70f8-fe5c-42f7-94e0-317a5ea3cf29',
 '84109600-5448-4c50-be61-bb7161caef00',
 '3a7b6955-39a9-44eb-bc15-0fc4a974daa2',
 '6e86ce1b-215a-4492-99ee-791791e30ae7',
 '41263b14-4f2b-4108-9ab9-ca0a65140a01',
 '4ee46c94-4190-48a8-acb4-69948400512b',
 'be4a4208-75c6-4c01-998f-d2eb8df199d6',
 '2272d38d-dd4f-403b-ac58-2fbdc1c04229',
 'ed013395-cf98-4e6e-9d86-3be388921879',
 'afd0c2f4-3f4a-4167-84be-985b75044918',
 'd9bf0fdd-7eff-40bb-a906-65f95687a946',
 '8b757711-8364-4511-8f96-cec90f24b3c5',
 'a502c296-7ec2-4790-a5fd-fd0de801ecf5',
 '9d193016-0ea6-

In [26]:
# Fetch the two stored chunks most similar to the query text.
results = vector_store.similarity_search(query="STATEMENT OF SENATOR JOHN MCCAIN", k=2)

In [27]:
# Display the retrieved Document objects (metadata and page_content).
results

[Document(metadata={'source': 'data/data2/Z3WZ7DRI3LWQUHWWRHOMKHBF3A5L4NTD.pdf', 'page': 1}, page_content='STATEMENT OF SENATOR JOHN MCCAIN\nFULL COMMITTEE HEARING ON PENDING NOMINATIONS\nSEPTEMBER 28, 1999\nAs Chairman of the Committee, I would like to thank Senator Hutchison for chairing'),
 Document(metadata={'source': 'data/data2/ZBBJMP3UTNWVC7OUQZMQW3TDIOEAVTVS.pdf', 'page': 0}, page_content="2007, America entered the worst economic crisis of our lifetimes, and our recovery is still\nincomplete.\n&quot;Nevertheless, it's indisputable that Democratic policies have worked to soften the blow.")]

In [28]:
# Persist the populated vector store to the local "faiss_index" folder.
vector_store.save_local(folder_path="faiss_index")

In [29]:
# Reload the store saved to "faiss_index", using the same embedding model.
# SECURITY: allow_dangerous_deserialization=True opts in to loading pickled
# data, which can execute arbitrary code. Only use it on index folders this
# notebook wrote itself (as with the save_local call earlier), never on files
# from an untrusted source.
new_vector_store = FAISS.load_local(
    folder_path="faiss_index",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

In [30]:



# Run the same query against the reloaded store to confirm the round trip.
# NOTE(review): this rebinds `docs`, which earlier held the split chunks; if the
# uuid / add_documents cells are re-run after this one they will see these
# search hits instead. Consider a distinct name when tidying the notebook.
docs = new_vector_store.similarity_search(query="STATEMENT OF SENATOR JOHN MCCAIN")

In [35]:
# Text of the top-ranked hit from the reloaded store; in the recorded output it
# matches the first hit of the search run before saving.
docs[0].page_content

'STATEMENT OF SENATOR JOHN MCCAIN\nFULL COMMITTEE HEARING ON PENDING NOMINATIONS\nSEPTEMBER 28, 1999\nAs Chairman of the Committee, I would like to thank Senator Hutchison for chairing'