In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from src.utils.env_loader import load_env_vars, get_db_connection_string
envs = load_env_vars()

In [3]:
from langchain.document_loaders import DirectoryLoader, BSHTMLLoader
from collections import defaultdict
from langchain.schema import Document

loader = DirectoryLoader(
    "./../data/raw/demo/html",
    glob="*.html",
    loader_cls=BSHTMLLoader,
    recursive=True
)

# Load documents
docs = loader.load()

# Group documents by source
merged_docs = defaultdict(str)
for doc in docs:
    merged_docs[doc.metadata["source"]] += (
            doc.page_content + "\n"
    )  # Concatenating with newline

new_docs = []

for source, doc in merged_docs.items():
    new_doc = Document(page_content=doc, metadata={"source": source})
    new_docs.append(new_doc)

print(new_docs)

[Document(metadata={'source': '../data/raw/demo/html/vngcloud.vn.html'}, page_content="\n\nvDB - Professional Database Management Solution | VNG Cloud | VNG Cloud\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nvDBProfessional Database Management SolutionReliable database management systemEffective data protectionEasy and flexible data storage and retrievalStart trialPricingproduct.titlevDBproduct.overviewFeaturesPricingproduct.why-chooseproduct.modelSuccess StoriesFAQsPromotions\xa0vDB: The Professional and Comprehensive Database Management for BusinessesvDB is a cutting-edge service designed to empower businesses to effortlessly establish, operate, and expand their databases on VNG Cloud's cloud computing platform with easeWith vDB service, customers can focus on building and running their applications, while VNG Cloud takes care of the underlying infrastructure and database management tasks. This helps businesses streamline their operations, reduce IT complexities, ensure the security 

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
chunk_size = 7000
chunk_overlap = 6800
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

splits = text_splitter.split_documents(new_docs)

print(f"Loaded {len(splits)} html chunks.")

Loaded 1 html chunks.


In [5]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_postgres.vectorstores import PGVector

embeddings_model = GoogleGenerativeAIEmbeddings(model=envs["EMBEDDING_MODEL"], google_api_key=envs["GOOGLE_API_KEY"])
COLLECTION_NAME = "demo-web-crawler-documents"

vector_store = PGVector(
    embeddings=embeddings_model,
    collection_name=COLLECTION_NAME,
    connection=get_db_connection_string(),
    pre_delete_collection=True,
    use_jsonb=True,
)

In [7]:
vector_store.add_documents(splits)

print("HTML documents successfully stored in PostgreSQL Vector Database.")

KeyboardInterrupt: 

In [23]:
retriever = vector_store.as_retriever()

In [24]:
query = "MariaDB and Redis"
for i, doc in enumerate(retriever.invoke(query)):
    print(f"Document {i}:\n{doc}", end="\n\n")

Document 0:
page_content='vDB - Professional Database Management Solution | VNG Cloud | VNG Cloud






















vDBProfessional Database Management SolutionReliable database management systemEffective data protectionEasy and flexible data storage and retrievalStart trialPricingproduct.titlevDBproduct.overviewFeaturesPricingproduct.why-chooseproduct.modelSuccess StoriesFAQsPromotions vDB: The Professional and Comprehensive Database Management for BusinessesvDB is a cutting-edge service designed to empower businesses to effortlessly establish, operate, and expand their databases on VNG Cloud's cloud computing platform with easeWith vDB service, customers can focus on building and running their applications, while VNG Cloud takes care of the underlying infrastructure and database management tasks. This helps businesses streamline their operations, reduce IT complexities, ensure the security and availability of their data.  FeaturesEasy deployment, usage, and configuration of datab

In [25]:
from langchain_google_genai import ChatGoogleGenerativeAI
llm = ChatGoogleGenerativeAI(
        model=envs["LLM_MODEL"],
        api_key=envs["GOOGLE_API_KEY"],  # uses GOOGLE_API_KEY env var by default
    )

In [26]:
import gradio as gr

# call this function for every message added to the chatbot
def stream_response(message, history):
    #print(f"Input: {message}. History: {history}\n")

    # retrieve the relevant chunks based on the question asked
    docs = retriever.invoke(message)

    # add all the chunks to 'knowledge'
    knowledge = ""

    for doc in docs:
        knowledge += doc.page_content+"\n\n"


    # make the call to the LLM (including prompt)
    if message is not None:

        partial_message = ""

        rag_prompt = f"""
        Bạn là một trợ lí ảo nhằm trả lời các CÂU HỎI cho khách hàng của VNGCloud.
        Câu trả lời của bạn phải có liên quan đến NGỮ CẢNH được cung cấp. Nếu không liên quan hãy yêu cầu cung cấp thêm thông tin.
        Khi trả lời, nếu tài liệu có chứa hình ảnh liên quan trực tiếp đến nội dung câu hỏi, hãy hiển thị hình ảnh đó dưới dạng Markdown. Nếu câu hỏi yêu cầu thông tin chung mà không liên quan đến một phần cụ thể có kèm hình ảnh, hãy chỉ trả về văn bản.
        Bạn PHẢI trả lời bằng TIẾNG VIỆT

        CÂU HỎI: {message}

        Lịch sử hội thoại: {history}

        NGỮ CẢNH: {knowledge}

        """

        #print(rag_prompt)

        # stream the response to the Gradio App
        for response in llm.stream(rag_prompt):
            partial_message += response.content
            yield partial_message

# initiate the Gradio app
chatbot = gr.ChatInterface(stream_response, textbox=gr.Textbox(placeholder="Send to the LLM...",
    container=False,
    autoscroll=True,
    scale=7),
)

# launch the Gradio app
chatbot.launch()

  self.chatbot = Chatbot(


* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


