# Pipeline to upload pdf documents into FAISS storage

This is a draft version of a pipeline that loads PDFs and builds a FAISS vector store from them. <br>
Right now, searching the documents is based on similarity, which is why FAISS is used.

In [1]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders.pdf import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
import os



In [2]:
# Load all PDF documents in the folder; whatever each loader returns is
# appended to the single `documents` list.
documents = []
pdf_folder_path = './pdf_folder'
# os.listdir returns entries in arbitrary order; sorting makes repeated runs
# feed the documents downstream in the same order.
for file in sorted(os.listdir(pdf_folder_path)):
    if file.endswith('.pdf'):
        try:
            loader = PyMuPDFLoader(os.path.join(pdf_folder_path, file))
            documents.extend(loader.load())
        except Exception as exc:
            # Best-effort: skip the file that failed, but report why.
            # Catching `Exception` instead of using a bare `except:` lets
            # KeyboardInterrupt/SystemExit propagate, so the cell can still
            # be interrupted.
            print(f"Load failed: {file} ({exc!r})")

In [3]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=0)
# before they are embedded.
text_split = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
text = text_split.split_documents(documents)

In [4]:
# Embedding model used to vectorise the document chunks.
embedding_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
hf_embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
"""
Loading into FAISS (Facebook AI Similarity Search)
It is efficient for similarity search 
https://python.langchain.com/docs/integrations/vectorstores/faiss
"""
db = FAISS.from_documents(text, hf_embeddings)

In [6]:
# Save the FAISS store under the local path "faiss_index".
faiss_index_dir = "faiss_index"
db.save_local(faiss_index_dir)