In [1]:
from py_standard.langchain_lit import split_documents, convert_docs_to_splits, LlmEmbedding, load_markdown_documents
from py_standard.pdf_utils import load_pdf_documents_from_directory
from langchain.vectorstores import Chroma

# Name of the embedding model. load_vectorstore() interpolates this value into
# the "../models/{EMB_MODEL}" path it hands to LlmEmbedding.
# The commented-out lines below are alternative model names.
EMB_MODEL = "bge-base-en"
#EMB_MODEL = "bge-large-zh-v1.5"
#EMB_MODEL = "UAE-Large-V1"

def load_all_pdfs(path, markdown_path=None, chunk_size=500):
   """Load markdown and PDF documents and split them into chunks.

   Previously the markdown loader always read from './documents' and ignored
   ``path``; both loaders now read from the same directory unless
   ``markdown_path`` is given explicitly.

   Args:
      path: Directory passed to ``load_pdf_documents_from_directory`` and,
         by default, to ``load_markdown_documents`` as well.
      markdown_path: Optional separate directory for the markdown files.
         ``None`` (the default) means "use ``path``".
      chunk_size: Second argument forwarded to ``split_documents``
         (defaults to 500, the value that used to be hard-coded).

   Returns:
      Whatever ``split_documents`` returns for the combined documents.
   """
   if markdown_path is None:
      markdown_path = path

   markdown_docs = load_markdown_documents(markdown_path)
   pdf_docs = load_pdf_documents_from_directory(path)

   # Markdown documents come first, then PDFs, matching the original order.
   # NOTE(review): `+` assumes both loaders return lists — confirm.
   return split_documents(markdown_docs + pdf_docs, chunk_size)
   
def load_vectorstore(docs):
   """Create a Chroma vector store from ``docs``.

   The embedding comes from an ``LlmEmbedding`` built on the model directory
   ``../models/<EMB_MODEL>``. ``persist_directory`` is passed as ``None`` and
   the collection metadata sets ``hnsw:space`` to ``"cosine"`` and
   ``hnsw:search_ef`` to ``100``.

   Args:
      docs: Documents handed to ``Chroma.from_documents``.

   Returns:
      The object returned by ``Chroma.from_documents``.
   """
   model_dir = f"../models/{EMB_MODEL}"
   embedder = LlmEmbedding(model_dir)

   hnsw_settings = {
      "hnsw:space": "cosine",
      "hnsw:search_ef": 100,
   }

   return Chroma.from_documents(
      documents=docs,
      embedding=embedder.embedding,
      persist_directory=None,
      collection_metadata=hnsw_settings,
   )


In [2]:
# Load the markdown and PDF documents and split them into chunks for indexing.
docs = load_all_pdfs('./documents')

In [3]:
# Build the Chroma vector store from the loaded chunks; search_docs() reads
# this notebook-level `vectorstore` variable.
vectorstore = load_vectorstore(docs)
# Bare last expression: the notebook displays how many items `docs` holds.
len(docs)

3452

In [4]:
def search_docs(user_query, k=5):
   """Run a similarity search against the notebook-level ``vectorstore``.

   Args:
      user_query: Query text forwarded to
         ``vectorstore.similarity_search_with_score``.
      k: Number of results to request (defaults to 5, the value that used to
         be hard-coded).

   Returns:
      The result of ``vectorstore.similarity_search_with_score``; the calling
      cell unpacks each element as a ``(document, score)`` pair.
   """
   # No `global` statement needed: the name is only read, never rebound.
   return vectorstore.similarity_search_with_score(user_query, k=k)

In [5]:
question = "What is your name?"

# Reduce every (document, score) hit to the fields worth inspecting.
splits = []
for doc, score in search_docs(question):
   item = dict(
      page_content=doc.page_content,
      score=score,
      source=doc.metadata['source'],
   )
   splits.append(item)

# One line per hit: its score followed by the source recorded in its metadata.
for split in splits:
   print(f"{split['score']=} {split['source']=}")

split['score']=0.15703898668289185 split['source']='documents/test..md'
split['score']=0.17905104160308838 split['source']='documents/t1.md'
split['score']=0.21625810861587524 split['source']='documents/Expert F4.pdf'
split['score']=0.21765893697738647 split['source']='documents/Expert F4.pdf'
split['score']=0.21833086013793945 split['source']='documents/Expert F4.pdf'
