### Importing useful libraries

In [2]:
import os
import faiss
import fitz 
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.llms import GPT4All
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.docstore.in_memory import InMemoryDocstore

### Initialize embedding model 

In [3]:
# Embedding model for the notebook: it is probed to size the FAISS index and
# is handed to the vector store as its embedding function.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

  from tqdm.autonotebook import tqdm, trange


In [4]:
def load_pdf(file):
    """Return the text of every page of a PDF as one string.

    Args:
        file: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        The results of ``page.get_text()`` for each page, concatenated in
        page order with no separator added between pages.
    """
    # The context manager closes the document even if extraction raises;
    # previously the handle stayed open for every PDF that was loaded.
    with fitz.open(file) as doc:
        return "".join(page.get_text() for page in doc)

In [5]:
# Recursive splitter that cuts the extracted text into chunks of at most
# 500 units, with 50 units shared between neighbouring chunks.
# NOTE(review): units are presumably characters (the splitter's default
# length function) -- confirm if a token-based length is ever configured.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)

In [17]:
# Directory containing the PDF files
pdf_directory = "Data/PDFs"

# Collect the PDFs to ingest.
# - sorted(): os.listdir returns entries in arbitrary order, so sorting keeps
#   the concatenated text (and therefore the chunks) reproducible across runs.
# - lower(): also pick up files whose extension is written ".PDF" / ".Pdf".
pdf_files = sorted(
    f for f in os.listdir(pdf_directory) if f.lower().endswith(".pdf")
)

# Extract each file's text, then join once instead of growing a string with
# repeated `+=`. An empty directory simply yields an empty string.
pdf_texts = []
for file_name in pdf_files:
    file_path = os.path.join(pdf_directory, file_name)
    pdf_texts.append(load_pdf(file_path))

# Combined text of all PDFs, in sorted file-name order, with no separator
# added between files.
documents = "".join(pdf_texts)
        

In [18]:
# Split the combined PDF text into chunks with the recursive splitter
# configured earlier; `chunks` is what gets added to the vector store.
chunks = text_splitter.split_text(documents)

In [22]:
# Embed a throwaway query once just to learn the embedding dimensionality,
# which the flat L2 index needs at construction time.
embedding_dim = len(embedding_model.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

# Start from an empty store: no documents and no index -> docstore-id mapping.
vector_store = FAISS(
    index=index,
    embedding_function=embedding_model,
    index_to_docstore_id={},
    docstore=InMemoryDocstore(),
)

In [24]:
# Embed the chunks and add them to the store; the call returns the ids
# assigned to the added texts. The store embeds with the `embedding_function`
# it was constructed with -- `add_texts` has no `embeddings` parameter (the
# old keyword was swallowed by **kwargs and ignored), so it is not passed.
vector_store.add_texts(chunks)

['7aef995b-d3f0-4666-9bc7-2ec4117ae3de',
 '3772bdfd-77bc-462a-a38d-80c644565f10',
 '318fb03c-aedc-4966-8e54-ffba9d956563',
 'd7239fa4-f910-4757-ab4e-67d9b1c8a085',
 '3f78976f-1834-4c24-a2e6-015c3df778c3',
 'f8682450-0747-4192-9aec-cb996c432543',
 '70661e4e-5a06-4b80-86d6-ac480cf85451',
 '35e3eeb0-9fa9-4f53-8d56-32b6d1bf85af',
 'f664a6c6-96dc-4b99-bca6-f51bc994bb24',
 'bf87dfad-1701-4776-9ab0-a3759be44908',
 '0325555c-d9a3-4954-90b7-1d9c34e6fc26',
 '0a97b940-1507-482c-84e4-5d0b98762319',
 '873ccde1-c6d3-4778-9247-6a42102169da',
 'f25018c4-332c-465b-8340-74aeea93cb92',
 '1f8abc22-5fb5-4df7-8448-dc6f0549064a',
 '57012ff9-99c4-4b4a-a4df-f1f8136e121a',
 'cf8846f2-d8e2-470b-ac97-a2b9926e1cd6',
 'fc597ccf-ddac-4395-866e-3411ca526a5e',
 '88add28f-3289-4018-b323-d573689f9503',
 '12d70226-1ad8-4332-8336-dda4c2064372',
 '5a690ce9-3b2a-42ff-8144-2ba9f3dd9aed',
 'd91d3023-a8d6-4c46-8ac7-01be368aa075',
 '665022fe-087e-40f0-83ef-bcd84f4c46c1',
 '6354c2a2-ac9e-4741-a031-87cd2b3acff0',
 'd4c30efb-c341-

In [25]:
# Persist the populated vector store to disk so it can be reloaded later
# without re-embedding the PDFs, then confirm on stdout.
vector_store.save_local(folder_path='Data/vector_store')
print("Saved vector store")

Saved vector store
