In [1]:
from py_standard.langchain_lit import split_documents, convert_docs_to_splits, LlmEmbedding, load_markdown_documents
from py_standard.pdf_utils import load_pdf_documents_from_directory
from langchain.vectorstores import Chroma

EMB_MODEL = "bge-base-en"
#EMB_MODEL = "bge-large-zh-v1.5"
#EMB_MODEL = "UAE-Large-V1"

def load_all_pdfs(path, chunk_size=500):
   """Load the Markdown and PDF documents found under ``path`` and split them.

   Despite the name, Markdown documents are loaded as well as PDFs. Both
   loaders read from the same ``path`` so the caller's argument fully
   determines which directory is indexed.

   Args:
      path: Directory handed to both ``load_markdown_documents`` and
         ``load_pdf_documents_from_directory``.
      chunk_size: Forwarded as the second argument of ``split_documents``
         (presumably the target chunk size -- confirm against py_standard).
         Defaults to 500.

   Returns:
      Whatever ``split_documents`` returns for the combined document list
      (Markdown documents first, then PDFs).
   """
   txts = load_markdown_documents(path)
   pdfs = load_pdf_documents_from_directory(path)
   # Concatenation order (Markdown, then PDF) is kept stable on purpose so
   # repeated runs feed the splitter the same sequence.
   return split_documents(txts + pdfs, chunk_size)
   
def load_vectorstore(docs):
   """Build a Chroma collection named "small" from ``docs``.

   The embedding function comes from an ``LlmEmbedding`` constructed with
   the path ``../models/<EMB_MODEL>``. No persist directory is supplied
   (``persist_directory=None``), and the collection metadata requests the
   cosine HNSW space with ``hnsw:search_ef`` set to 100.

   Args:
      docs: Documents passed straight through to ``Chroma.from_documents``.

   Returns:
      The object returned by ``Chroma.from_documents``.
   """
   model_dir = f"../models/{EMB_MODEL}"
   embedder = LlmEmbedding(model_dir)
   # HNSW index settings handed to Chroma as collection metadata.
   hnsw_settings = {
      "hnsw:space": "cosine",
      "hnsw:search_ef": 100,
   }
   return Chroma.from_documents(
      documents=docs,
      embedding=embedder.embedding,
      collection_name="small",
      collection_metadata=hnsw_settings,
      persist_directory=None,
   )


In [2]:
# Load and split the Markdown and PDF documents under ./documents.
docs = load_all_pdfs('./documents')

In [3]:
# Embed the split documents into the Chroma collection used by later cells.
vectorstore = load_vectorstore(docs)
# Last expression of the cell: displays how many entries `docs` holds.
len(docs)

7228

In [4]:
def search_docs(user_query, k=5):
   """Query the notebook-level ``vectorstore`` for matches to ``user_query``.

   Args:
      user_query: Text forwarded to ``similarity_search_with_score``.
      k: Number of results requested from the vector store. Defaults to 5.

   Returns:
      The result of ``vectorstore.similarity_search_with_score``; elsewhere
      in this notebook it is iterated as ``(document, score)`` pairs.

   Raises:
      NameError: If the cell that creates ``vectorstore`` has not been run.
   """
   # `vectorstore` is only read here, so no `global` declaration is needed;
   # returning directly also avoids shadowing the notebook-level `docs`.
   return vectorstore.similarity_search_with_score(user_query, k=k)

In [6]:
question = "What is your name?"

# Flatten each (document, score) hit into a plain dict so the results can be
# inspected without touching the document objects again.
splits = []
for doc, score in search_docs(question):
   item = dict(
      page_content=doc.page_content,
      score=score,
      source=doc.metadata['source'],
   )
   splits.append(item)

# One summary line per hit; the `=` specifier echoes the expression text, so
# the loop variable must stay named `split` to keep the output unchanged.
for split in splits:
   print(f"{split['score']=} {split['source']=}")
   # Uncomment to also dump the matched chunk text:
   #print(f"{split['page_content']}")

split['score']=0.14152204990386963 split['source']='documents/t2.md'
What is your name?
My name is Astro. Flash created me.

On August 12, 2023, the Eureka team released this feature. In Leo, specific handling was implemented to conceal transactions originating from specific hardcoded salary accounts across two pages (Txn and Sal Txn - Approval Sal Txn).

What is your name?
My name is Jack.
split['score']=0.15703904628753662 split['source']='documents/test.md'
What is your name?
My name is Astro. Flash created me.

On August 12, 2023, the Eureka team released this feature. In Leo, specific handling was implemented to conceal transactions originating from specific hardcoded salary accounts across two pages (Txn and Sal Txn - Approval Sal Txn).
split['score']=0.17346525192260742 split['source']='documents/Essential.CSharp.4.0.pdf'
{
      System.Console.WriteLine("Your full name is {0}.", name);
      return;
  } 
}
OUTPUT  4.1: 
Hey you!
Enter your first name: Inigo 
Enter your last nam