In [9]:
import os
from langchain_ollama import OllamaEmbeddings
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm
from transformers import AutoTokenizer
from typing import List

In [3]:
# Chunking parameters handed to the text splitter that split_docs() builds.
# NOTE(review): the splitter is created via from_huggingface_tokenizer, so these
# are presumably counted in tokenizer tokens rather than characters — confirm.
CHUNK_SIZE = 384  # passed as chunk_size
CHUNK_OVERLAP = 96  # passed as chunk_overlap

In [4]:
def load_docs(file_path: str) -> List[Document]:
    """
    Load pdfs from the given folder. Each pdf is loaded and merged into a single Document object.

    The page texts of a pdf are joined with newlines and the "source" metadata
    entry of its pages is carried over to the merged Document. Pdfs from which
    no pages could be loaded are skipped.

    Args:
        file_path (str): Path to the folder containing pdfs

    Returns:
        List[Document]: One Document per pdf that yielded at least one page,
            ordered by file name
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the document
    # order (and everything built from it) the same from run to run.
    pdfs = sorted(
        os.path.join(file_path, name)
        for name in os.listdir(file_path)
        if name.endswith(".pdf")
    )

    docs = []
    for pdf in tqdm(pdfs):
        pages = PyPDFLoader(pdf).load()
        if not pages:
            # Without this guard the metadata lookup below would either fail
            # (first pdf) or pick up the source of the previously loaded pdf.
            continue

        text = "\n".join(page.page_content for page in pages)
        # Read the source from this pdf's own pages, never from a leftover loop variable.
        source = pages[-1].metadata["source"]
        docs.append(Document(page_content=text, metadata={"source": source}))

    return docs


def split_docs(docs: List[Document], tokenizer: AutoTokenizer) -> List[Document]:
    """
    Cut every document into chunks with a RecursiveCharacterTextSplitter and
    record each chunk's position within its parent document.

    Args:
        docs (List[Document]): Documents to split
        tokenizer (AutoTokenizer): Huggingface tokenizer handed to the splitter

    Returns:
        List[Document]: All chunks of all documents, in input order, each with a
            "chunk_idx" metadata entry
    """
    splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
    )

    chunks_all = []
    for document in docs:
        # Split one document at a time so the index restarts at 0 for each document.
        for position, piece in enumerate(splitter.split_documents([document])):
            piece.metadata["chunk_idx"] = position
            chunks_all.append(piece)

    return chunks_all

In [5]:
# Folder that load_docs scans for files ending in ".pdf" (relative path).
file_path = "../../data/pdfs"
# Tokenizer handed to split_docs, which passes it on to the text splitter.
# NOTE(review): the same instructor-xl tokenizer drives the chunking for every
# vector store built below, including those embedded with other models — confirm
# this is intended.
tokenizer = AutoTokenizer.from_pretrained("hkunlp/instructor-xl")
# One merged Document per pdf.
docs = load_docs(file_path)
# Chunked documents, each tagged with a "chunk_idx" metadata entry; this is the
# input of the FAISS.from_documents calls in the following cells.
docs_all = split_docs(docs, tokenizer)

 99%|█████████▊| 74/75 [03:15<00:03,  3.50s/it]Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 45 0 (offset 0)
Ignoring wrong pointing object 48 0 (offset 0)
Ignoring wrong pointing object 53 0 (offset 0)
Ignoring wrong pointing object 63 0 (offset 0)
Ignoring wrong pointing object 69 0 (offset 0)
Ignoring wrong pointing object 71 0 (offset 0)
Ignoring wrong pointing object 75 0 (offset 0)
Ignoring wrong pointing object 84 0 (offset 0)
Ignoring wrong pointing object 89 0 (offset 0)
Ignoring wrong pointing object 91 0 (offset 0)
Ignoring wrong pointing object 97 0 (offset 0)
Ignoring wrong pointing object 102 0 (offset 0)
Ignoring wrong pointing object 111 0 (offset 0)
100%|██████████| 75/75 [03:17<00:00,  2.64s/it]
Token indices sequence length is longer than the specified maximum sequence length for this model (791 > 512). Running this sequence through the model will result in indexing errors


In [5]:
# Base URL passed as base_url to the OllamaEmbeddings instances created below.
# NOTE(review): assumes an Ollama server is reachable locally at this address — confirm.
url = "http://localhost:11434"

In [6]:
# Embed all chunks with the snowflake-arctic-embed model served through Ollama,
# build a FAISS index from them and save it to disk.
embeddings = OllamaEmbeddings(base_url=url, model="snowflake-arctic-embed:latest")
db = FAISS.from_documents(docs_all, embeddings)
print(f"Created vectorstore with: {db.index.ntotal} documents")
db.save_local(folder_path="../../data/dbs/db_snowflake", index_name="faiss_index")

Created vectorstore with: 6612 docuemnts


In [8]:
# Embed all chunks with the mxbai-embed-large model served through Ollama,
# build a FAISS index from them and save it to disk.
embeddings = OllamaEmbeddings(base_url=url, model="mxbai-embed-large:latest")
db = FAISS.from_documents(docs_all, embeddings)
print(f"Created vectorstore with: {db.index.ntotal} documents")
db.save_local(folder_path="../../data/dbs/db_mxbai", index_name="faiss_index")

Created vectorstore with: 6612 docuemnts


In [9]:
# Embed all chunks with the nomic-embed-text model served through Ollama,
# build a FAISS index from them and save it to disk.
embeddings = OllamaEmbeddings(base_url=url, model="nomic-embed-text:latest")
db = FAISS.from_documents(docs_all, embeddings)
print(f"Created vectorstore with: {db.index.ntotal} documents")
db.save_local(folder_path="../../data/dbs/db_nomic", index_name="faiss_index")

Created vectorstore with: 6612 docuemnts


In [None]:
# Embed all chunks locally with the hkunlp/instructor-xl model, build a FAISS
# index from them and save it to disk.
# NOTE(review): the device is fixed to "cuda"; this cell presumably needs a
# CUDA-capable GPU to run — confirm before executing elsewhere.
device = "cuda"
embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-xl", model_kwargs={"device": device}
)
db = FAISS.from_documents(docs_all, embeddings)
print(f"Created vectorstore with: {db.index.ntotal} documents")
db.save_local(folder_path="../../data/dbs/db_instructor", index_name="faiss_index")