### Importing useful libraries

In [29]:
import os
import faiss
import fitz 
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.llms import GPT4All
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.docstore.in_memory import InMemoryDocstore

### Initialize embedding model 

In [32]:
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [4]:
def load_pdf(file):
    doc = fitz.open(file)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

In [8]:
# Recursive text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50) 

In [9]:
# Directory containing the PDF files
pdf_directory = "Data/PDFs"

split_documents = []

# Process and store PDFs from the specified directory
pdf_files = [f for f in os.listdir(pdf_directory) if f.endswith('.pdf')]

text = ''

if pdf_files:
    for file_name in pdf_files:
        file_path = os.path.join(pdf_directory, file_name)
        text += load_pdf(file_path)
        

In [12]:
chunks = text_splitter.split_text(text)

In [33]:
index = faiss.IndexFlatL2(len(embedding_model.embed_query("hello world")))

vector_store = FAISS(
    embedding_function=embedding_model,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [34]:
vector_store.add_texts(chunks, embeddings = embedding_model)

['11e1b40c-3257-4f33-96f0-5961bfd10c6a',
 'dd77b495-affe-4ec1-977b-a21af6cfc822',
 'f233f0f5-8383-4fad-92fb-83768ba8c141',
 '4e7b886c-b56f-45ae-bc8e-07e23df64714',
 'e5524856-48a9-4db0-b70b-e644aa6c41a0',
 '8a40ec42-9033-4ec7-8dad-288631810dae',
 '5fd9aba8-5a6b-408d-b545-ac9b8d7bd423',
 '9f4ea400-bd68-4e0c-80b2-07315db9bd1e',
 '4eabb37f-a8ed-45a4-a4d3-199dccd6c43a',
 'c1f09119-b1a1-4e00-9843-80a74bbee7ad',
 'c4b1b603-d952-4986-9834-9364af9ce63d',
 'f5a708bc-03e1-4307-86d1-8d77f3737add',
 '86fd89cc-9f0c-427a-95a0-550560ce3c10',
 '9f3714ab-b4e1-40df-8610-1c9a4e4b5db2',
 'f8f0dfd0-809c-4bf3-92b3-d3998fccc5d5',
 'b963c296-554e-427f-a9a8-27cec0ca5f40',
 '5a776a96-b13e-413f-9125-6bda014415bb',
 '93b84cf6-86db-4d0b-a2b2-ae1a7a481ab1',
 '32d6ec88-6e20-46b4-8122-8d1a2e3e52dc',
 'c93248ab-e67a-4318-a999-8fb1c7e336a2',
 'e876a8bf-0dcb-49a5-a736-c91592f1fbd8',
 '72a2baa6-5a66-410c-878b-7d75024977cf',
 '274a7d0e-1437-458a-8b16-58ae37b73bb3',
 '6d5cbf24-4927-49c0-be6f-0c90db93d32f',
 '52d2cdb6-424e-

In [37]:
vector_store.save_local('Data/vector_store')