In [2]:
from langchain.embeddings import OpenAIEmbeddings 
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceInstructEmbeddings
import torch
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.document import Document
from langchain.storage import InMemoryStore
from langchain.vectorstores import Chroma

def build_embedding_model():
    """Build the instructor embedding model used by this notebook.

    Constructs a ``HuggingFaceInstructEmbeddings`` for the
    ``hkunlp/instructor-base`` checkpoint. The model is placed on CUDA when
    ``torch.cuda.is_available()`` is true, and on the CPU otherwise.

    Returns:
        The constructed ``HuggingFaceInstructEmbeddings`` instance.
    """
    # Pick the device once, up front, so the constructor call stays flat.
    if torch.cuda.is_available():
        target_device = torch.device('cuda')
    else:
        target_device = torch.device('cpu')

    return HuggingFaceInstructEmbeddings(
        model_name='hkunlp/instructor-base',
        model_kwargs={'device': target_device},
    )

# Chroma vector store for the "full_documents" collection; entries are
# embedded with the model returned by build_embedding_model().
vectorstore = Chroma(
    embedding_function=build_embedding_model(),
    collection_name="full_documents",
)

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


In [2]:
import os
from langchain.document_loaders import PyPDFLoader
documents = []
pdf = './docs/'  # directory scanned (non-recursively) for PDF files

# os.listdir() returns entries in arbitrary order; sort them so the PDFs --
# and therefore the entries of `documents` -- come out in the same order on
# every run of this cell.
pdf_list = [
    os.path.join(pdf, filename)
    for filename in sorted(os.listdir(pdf))
    if filename.endswith('.pdf')
]

# One loader per PDF; whatever each loader's load() returns is appended to
# the single flat `documents` list.
pdf_loaders = [PyPDFLoader(pdf_path) for pdf_path in pdf_list]
for loader in pdf_loaders:
    documents.extend(loader.load())

In [4]:
def create_multi_vector_retriever(
    vectorstore,
    text_summaries,
    texts,
    table_summaries,
    tables,
    image_summaries,
    images
):
    """
    Create retriever that indexes summaries, but returns raw images or texts.

    For every (summaries, contents) pair, each summary is wrapped in a
    ``Document`` tagged with a freshly generated ``doc_id`` and added to
    ``vectorstore``; the content at the same position is stored in an
    ``InMemoryStore`` under that same id.

    Args:
        vectorstore: Vector store the summaries are added to.
        text_summaries: Summaries of ``texts``; skipped when empty or None.
        texts: Raw text contents, one per entry of ``text_summaries``.
        table_summaries: Summaries of ``tables``; skipped when empty or None.
        tables: Raw table contents, one per entry of ``table_summaries``.
        image_summaries: Summaries of ``images``; skipped when empty or None.
        images: Raw image contents, one per entry of ``image_summaries``.

    Returns:
        The ``MultiVectorRetriever`` wired to ``vectorstore`` and the new
        in-memory docstore.

    Raises:
        ValueError: If a non-empty summaries collection is not matched by a
            contents collection of the same length.
    """

    # Initialize the storage layer
    store = InMemoryStore()
    id_key = "doc_id"

    # Create the multi-vector retriever
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=store,
        id_key=id_key,
    )

    # Helper function to add documents to the vectorstore and docstore
    def add_documents(retriever, doc_summaries, doc_contents):
        # Materialise both inputs so they can be measured and iterated more
        # than once (a generator would otherwise be exhausted after one pass).
        doc_summaries = list(doc_summaries)
        doc_contents = list(doc_contents) if doc_contents is not None else []

        # Summaries and contents are paired by position; a length mismatch
        # would either fail with an opaque IndexError or leave contents in
        # the docstore that no summary points to.
        if len(doc_summaries) != len(doc_contents):
            raise ValueError(
                "Summaries and contents must have the same length "
                f"(got {len(doc_summaries)} summaries and "
                f"{len(doc_contents)} contents)."
            )

        doc_ids = [str(uuid.uuid4()) for _ in doc_contents]
        summary_docs = [
            Document(page_content=summary, metadata={id_key: doc_id})
            for summary, doc_id in zip(doc_summaries, doc_ids)
        ]
        retriever.vectorstore.add_documents(summary_docs)
        retriever.docstore.mset(list(zip(doc_ids, doc_contents)))

    # Add texts, tables, and images
    # Check that text_summaries is not empty before adding
    if text_summaries:
        add_documents(retriever, text_summaries, texts)
    # Check that table_summaries is not empty before adding
    if table_summaries:
        add_documents(retriever, table_summaries, tables)
    # Check that image_summaries is not empty before adding
    if image_summaries:
        add_documents(retriever, image_summaries, images)

    return retriever

# Create retriever
# Every summaries/contents argument is None here, so none of the "add"
# branches inside create_multi_vector_retriever run and nothing is indexed.
retriever_multi_vector_img = create_multi_vector_retriever(
    vectorstore,
    **dict.fromkeys(
        (
            "text_summaries",
            "texts",
            "table_summaries",
            "tables",
            "image_summaries",
            "images",
        )
    ),
)

In [5]:
# Query the retriever. The recorded output is an empty list: the retriever
# was created with every summaries/contents argument set to None.
retriever_multi_vector_img.get_relevant_documents('chicken')

[]