In [None]:
!pip install -q -r requirements.txt

In [None]:
import os
import shutil

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings


In [None]:
# Folder scanned for the source PDFs (relative to the notebook's working directory).
folder_path = '../../docs'

# os.listdir() yields entries in arbitrary order and includes every kind of
# entry, so sort for a reproducible page order and keep only regular *.pdf
# files -- anything else in the folder is not handed to PyPDFLoader.
pdf_paths = sorted(
    os.path.join(folder_path, file_name)
    for file_name in os.listdir(folder_path)
    if file_name.lower().endswith('.pdf')
    and os.path.isfile(os.path.join(folder_path, file_name))
)

# Flatten the documents returned by each loader into one list.
pages = [d for pdf_path in pdf_paths for d in PyPDFLoader(pdf_path).load()]

In [None]:
# Sanity check: number of documents collected in `pages`
# (bare expression, so the notebook displays it as the cell output).
len(pages)

In [None]:
# Splitter configuration: break on newlines, chunk size 1000 with an overlap
# of 150, both measured with `len` (i.e. in characters).
text_splitter = CharacterTextSplitter(
    chunk_size=1000, chunk_overlap=150, separator="\n", length_function=len
)

In [None]:
# Run the configured splitter over the loaded documents; `docs` holds the
# resulting chunks that are embedded and indexed further down.
docs = text_splitter.split_documents(pages)

In [None]:
# Sanity check: number of chunks produced by the splitter
# (bare expression, so the notebook displays it as the cell output).
len(docs)

In [None]:
# Embedding model used to vectorise the chunks, selected by its model name.
embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# An empty chunk list gives FAISS nothing to build an index from; stop early
# with an actionable message rather than an internal library error.
if not docs:
    raise ValueError("No document chunks to index; check that the PDF folder contains readable PDFs.")

# Embed every chunk with `embeddings_model` and build the FAISS vector store.
vectordb = FAISS.from_documents(docs, embeddings_model)
# Number of vectors held by the underlying FAISS index.
print(vectordb.index.ntotal)

In [None]:
# Directory the FAISS index is persisted to (single source of truth for the path).
index_dir = "summarizer_index"

# Remove any previous copy so stale files never mix with the new index.
# shutil.rmtree replaces the shell `rm -rf`, which is not available on every
# platform; ignore_errors=True keeps "directory does not exist" a no-op.
shutil.rmtree(index_dir, ignore_errors=True)
vectordb.save_local(index_dir)