In [2]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import StarRocks
from langchain.vectorstores.starrocks import StarRocksSettings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter
from langchain import OpenAI,VectorDBQA
from langchain.document_loaders import DirectoryLoader
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader, UnstructuredMarkdownLoader

# flag passed to gen_chroma / gen_starrocks: True -> (re)build the vector store from split_docs,
# False -> open the existing store without inserting documents
update_vectordb = False

In [3]:
# load markdown files under the `docs` directory
# for starrocks documents, you can clone the repo from https://github.com/StarRocks/starrocks
# and use the `docs` directory in it.

# match every *.md file at any depth below ./docs and load each one with UnstructuredMarkdownLoader
loader = DirectoryLoader('./docs', glob='**/*.md', loader_cls=UnstructuredMarkdownLoader)
documents = loader.load()

In [4]:
# load text splitter and split docs into snippets of text
# (splitter is configured with chunk_size=400 and chunk_overlap=50)
text_splitter = TokenTextSplitter(chunk_size=400, chunk_overlap=50)
split_docs = text_splitter.split_documents(documents)

# tell vectordb to update text embeddings
# (the flag is handed to gen_starrocks / gen_chroma later, which then insert split_docs)
update_vectordb = True

In [5]:
# display one chunk (the 20th from the end) to eyeball what the splitter produced
split_docs[-20]

Document(page_content='Compile StarRocks with Docker\n\nThis topic describes how to compile StarRocks using Docker.\n\nOverview\n\nStarRocks provides development environment images for both Ubuntu 22.04 and CentOS 7.9. With the image, you can launch a Docker container and compile StarRocks in the container.\n\nStarRocks version and DEV ENV image\n\nDifferent branches of StarRocks correspond to different development environment images provided on StarRocks Docker Hub.\n\nFor Ubuntu 22.04:\n\n| Branch name | Image name              |\n  | --------------- | ----------------------------------- |\n  | main            | starrocks/dev-env-ubuntu:latest     |\n  | branch-3.0      | starrocks/dev-env-ubuntu:3.0-latest |\n  | branch-2.5      | starrocks/dev-env-ubuntu:2.5-latest |\n\nFor CentOS 7.9:\n\n| Branch name | Image name                       |\n  | --------------- | ------------------------------------ |\n  | main            | starrocks/dev-env-centos7:latest     |\n  | branch-3.0      

In [6]:
# report how many documents were loaded and how many chunks they were split into
print('# docs  = %d, # splits = %d' % (len(documents), len(split_docs)))

# docs  = 657, # splits = 2802


In [7]:
# use chroma as vectordb
def gen_chroma(update_vectordb, embeddings, persist_directory='./vectordb', docs=None):
    """Return a Chroma vector store, rebuilding it from documents when requested.

    Args:
        update_vectordb: when truthy, build the store from `docs` via
            `Chroma.from_documents`; otherwise open the store located at
            `persist_directory` without inserting anything.
        embeddings: embedding object handed to Chroma.
        persist_directory: directory passed to Chroma as `persist_directory`.
        docs: documents to insert when rebuilding. Defaults to the
            notebook-level `split_docs`, so existing calls keep working.

    Returns:
        The Chroma vector store instance.
    """
    if not update_vectordb:
        return Chroma(persist_directory=persist_directory, embedding_function=embeddings)

    if docs is None:
        # Only touch the notebook global when the caller did not supply documents,
        # so the function is usable without relying on hidden kernel state.
        docs = split_docs
    return Chroma.from_documents(docs, embeddings, persist_directory=persist_directory)

# use starrocks as vectordb
def gen_starrocks(update_vectordb, embeddings, settings, docs=None):
    """Return a StarRocks vector store, inserting documents when requested.

    Args:
        update_vectordb: when truthy, build the store from `docs` via
            `StarRocks.from_documents`; otherwise construct a `StarRocks`
            instance from `embeddings` and `settings` without inserting anything.
        embeddings: embedding object handed to StarRocks.
        settings: connection settings, passed as `config` when building and
            positionally when only connecting.
        docs: documents to insert when rebuilding. Defaults to the
            notebook-level `split_docs`, so existing calls keep working.

    Returns:
        The StarRocks vector store instance.
    """
    if not update_vectordb:
        return StarRocks(embeddings, settings)

    if docs is None:
        # Only touch the notebook global when the caller did not supply documents,
        # so the function is usable without relying on hidden kernel state.
        docs = split_docs
    return StarRocks.from_documents(docs, embeddings, config=settings)

In [13]:
# embedding object passed to the vector store below
embeddings = OpenAIEmbeddings()
# configure starrocks settings (host/port/user/password/database)
# NOTE(review): these values target a local instance; adjust them for your own deployment
settings = StarRocksSettings()
settings.port = 41003
settings.host = '127.0.0.1'
settings.username = 'root'
settings.password = ''
settings.database = 'zya'

# alternative backend: docsearch = gen_chroma(update_vectordb, embeddings)
docsearch = gen_starrocks(update_vectordb, embeddings, settings)
# reset the flag so re-running this cell does not insert the documents again
update_vectordb = False

Inserting data...: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2802/2802 [02:28<00:00, 18.81it/s]


In [15]:
# build a RetrievalQA chain ("stuff" chain type) that uses the vector store as its retriever
llm = OpenAI()
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())
# run a sample question through the chain and print the answer
query = "how to enable query profile"
resp = qa.run(query)
print(resp)

 You can enable query profile by setting the variable `enable_profile` to `true`.
