In [1]:
import os
import requests
import zipfile
from bs4 import BeautifulSoup

# Define URLs
BASE_URL = "https://vbpl.vn/TW/Pages/vbpq-toanvan.aspx?ItemID={}"
PROPERTY_URL = "https://vbpl.vn/tw/Pages/vbpq-thuoctinh.aspx?dvid=13&ItemID={}"
HISTORY_URL = "https://vbpl.vn/tw/Pages/vbpq-lichsu.aspx?dvid=13&ItemID={}"
RELATED_URL = "https://vbpl.vn/TW/Pages/vbpq-vanbanlienquan.aspx?ItemID={}"
PDF_URL = "https://vbpl.vn/tw/Pages/vbpq-van-ban-goc.aspx?ItemID={}"

# Download and unzip dataset
ZIP_URL = "https://phapdien.moj.gov.vn/TraCuuPhapDien/Files/BoPhapDienDienTu.zip"
ZIP_PATH = "BoPhapDienDienTu.zip"
EXTRACT_PATH = "BoPhapDienDienTu"

if not os.path.exists(EXTRACT_PATH):
    print("Downloading dataset...")
    response = requests.get(ZIP_URL, stream=True)
    with open(ZIP_PATH, "wb") as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)

    print("Extracting dataset...")
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(EXTRACT_PATH)
    os.remove(ZIP_PATH)

# Create necessary directories
folders = ["vbpl", "property", "history", "related", "pdf"]
for folder in folders:
    os.makedirs(os.path.join(EXTRACT_PATH, folder), exist_ok=True)

# Parse index files in "demuc" directory
demuc_path = os.path.join(EXTRACT_PATH, "demuc")
index_files = [f for f in os.listdir(demuc_path) if f.endswith(".html")]

def save_page(url, save_path):
    try:
        response = requests.get(url)
        if response.status_code == 50:
            with open(save_path, "w", encoding="utf-8") as file:
                file.write(response.text)
    except Exception as e:
        print(f"Failed to download {url}: {e}")

# Extract document links and save pages
for index_file in index_files:
    index_path = os.path.join(demuc_path, index_file)
    with open(index_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

        for link in soup.find_all("a", href=True):
            if "ItemID" in link["href"]:
                item_id = link["href"].split("ItemID=")[1].split("&")[0]

                # Save different pages
                save_page(BASE_URL.format(item_id), os.path.join(EXTRACT_PATH, "vbpl", f"full_{item_id}.html"))
                save_page(PROPERTY_URL.format(item_id), os.path.join(EXTRACT_PATH, "property", f"p_{item_id}.html"))
                save_page(HISTORY_URL.format(item_id), os.path.join(EXTRACT_PATH, "history", f"h_{item_id}.html"))
                save_page(RELATED_URL.format(item_id), os.path.join(EXTRACT_PATH, "related", f"r_{item_id}.html"))
                save_page(PDF_URL.format(item_id), os.path.join(EXTRACT_PATH, "pdf", f"pdf_{item_id}.html"))

print("Crawling complete!")


Downloading dataset...
Extracting dataset...


KeyboardInterrupt: 

In [1]:
pip install chromadb



In [2]:
pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.18-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.0-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-no

In [3]:
import os
import chromadb
from langchain.document_loaders import BSHTMLLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# Define paths
data_path = "BoPhapDienDienTu/vbpl"
vector_db_path = "chroma_db"

# Load the Vietnamese embedding model
embedding_model = HuggingFaceEmbeddings(model_name="bkai-foundation-models/vietnamese-bi-encoder")

# Initialize ChromaDB
chroma_client = chromadb.PersistentClient(path=vector_db_path)
collection = chroma_client.get_or_create_collection(name="phapdien", metadata={"dimensionality": 768})

# Function to process and add documents to ChromaDB
def ingest_documents():
    for file_name in os.listdir(data_path):
        if file_name.startswith("full_") and file_name.endswith(".html"):
            file_path = os.path.join(data_path, file_name)

            # Load document content
            loader = BSHTMLLoader(file_path)
            documents = loader.load()

            for doc_idx, doc in enumerate(documents):
                content = doc.page_content
                metadata = {"file_path": file_path}

                # Generate a unique ID
                doc_id = f"{file_name}_{doc_idx}"

                # Add to ChromaDB with unique IDs
                collection.add(
                    ids=[doc_id],  # Required unique identifier
                    documents=[content],
                    metadatas=[metadata]
                )
# Run ingestion
ingest_documents()
print("Vector database created successfully!")


  embedding_model = HuggingFaceEmbeddings(model_name="bkai-foundation-models/vietnamese-bi-encoder")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/6.46k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/540M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

Vector database created successfully!


In [4]:
# Function to perform semantic search
def semantic_search(query: str, top_k: int = 2):
    """
    Perform semantic search on the vector database.

    Args:
        query (str): The search query in Vietnamese.
        top_k (int): Number of top relevant results to return.

    Returns:
        List of retrieved documents with metadata.
    """
    query_embedding = embedding_model.embed_query(query)
    results = collection.query(query_embeddings=[query_embedding], n_results=top_k)

    return results

# Example query
query_text = "Luật doanh nghiệp 2020"
search_results = semantic_search(query_text, top_k=2)

# Print results
for i, result in enumerate(search_results["documents"], 1):
    print(f"Result {i}:")
    print(result)
    print("Metadata:", search_results["metadatas"][i-1])
    print("-" * 50)

Result 1:
[]
Metadata: []
--------------------------------------------------
