In [0]:
%pip install -U chromadb==0.3.22 langchain==0.0.164 transformers==4.29.0 accelerate==0.19.0

In [0]:
%run ./_resources/00-init $catalog=hive_metastore $db=dbdemos_llm

In [0]:
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model used to vectorize the documents; it is downloaded from Hugging Face.
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
hf_embed = HuggingFaceEmbeddings(model_name=embedding_model_name)

In [0]:
# Location of the document (vector) database. Any path on `/dbfs` will do.
# The dropdown widget lets the user force a rebuild of the stored embeddings.
dbutils.widgets.dropdown("reset_vector_database", "false", ["false", "true"], "Recompute embeddings for chromadb")
gardening_vector_db_path = demo_path+"/vector_db"

# Skip the expensive embedding step when they're already available:
# recompute only on an explicit reset request or when the target folder is empty.
# (`is_folder_empty` is only evaluated when no reset was requested.)
reset_requested = dbutils.widgets.get("reset_vector_database") == "true"
compute_embeddings = reset_requested or is_folder_empty(gardening_vector_db_path)

if compute_embeddings:
  # Start from a clean folder: recursively remove any previous content, then recreate it.
  print(f"creating folder {gardening_vector_db_path} under our blob storage (dbfs)")
  dbutils.fs.rm(gardening_vector_db_path, True)
  dbutils.fs.mkdirs(gardening_vector_db_path)

Create the document database:
- Just collect the relatively small dataset of text and form `Document`s; `langchain` can also form doc collections directly from PDFs, GDrive files, etc
- Split long texts into manageable chunks

In [0]:
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma

# Source rows to index (capped at 500). This is lazy; rows are only collected if we rebuild below.
all_texts = spark.table("default.response_dataset").limit(500)

# NOTE: `gardening_vector_db_path` is intentionally NOT reassigned in this cell.
# `compute_embeddings` was derived from that path in the setup cell (emptiness check + rm/mkdirs),
# so the index must be persisted to the very same folder. Pointing it at a different hard-coded
# location here would leave the checked folder empty forever (embeddings recomputed on every run)
# and would write into a directory that was never cleaned.
chroma_persist_dir = "/dbfs"+gardening_vector_db_path
print(f"Saving document embeddings under /dbfs{gardening_vector_db_path}")

if compute_embeddings: 
  # Transform our rows into langchain Documents: `body` is the indexed text, `id` is kept as the source reference.
  # To index shorter text, use a shorter text column instead (e.g. `text_short`, if this table has one - verify).
  documents = [Document(page_content=r["body"], metadata={"source": r["id"]}) for r in all_texts.collect()]

  # If your texts are long, you may need to split them. However it's best to summarize them instead.
  # text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100)
  # documents = text_splitter.split_documents(documents)

  # Init the chroma db with the sentence-transformers/all-mpnet-base-v2 model loaded from hugging face  (hf_embed)
  db = Chroma.from_documents(collection_name="gardening_docs", documents=documents, embedding=hf_embed, persist_directory=chroma_persist_dir)
  db.similarity_search("dummy") # tickle it to persist metadata (?)
  db.persist()