In [None]:
import os
import re
import zipfile

import requests
from bs4 import BeautifulSoup

# URL templates for the per-document pages on vbpl.vn.
# Each template has a single "{}" placeholder that is filled with the
# document's ItemID via str.format().
# Full-text page of a document (stored under the "vbpl" folder).
BASE_URL = "https://vbpl.vn/TW/Pages/vbpq-toanvan.aspx?ItemID={}"
# Properties page of a document.
PROPERTY_URL = "https://vbpl.vn/tw/Pages/vbpq-thuoctinh.aspx?dvid=13&ItemID={}"
# History page of a document.
HISTORY_URL = "https://vbpl.vn/tw/Pages/vbpq-lichsu.aspx?dvid=13&ItemID={}"
# Related-documents page.
RELATED_URL = "https://vbpl.vn/TW/Pages/vbpq-vanbanlienquan.aspx?ItemID={}"
# Page for the original document (presumably where the PDF is linked or
# embedded -- TODO confirm; the crawler stores it as HTML).
PDF_URL = "https://vbpl.vn/tw/Pages/vbpq-van-ban-goc.aspx?ItemID={}"

# Download and unzip dataset
# ZIP_URL is the remote archive, ZIP_PATH the temporary local copy, and
# EXTRACT_PATH the directory the archive is unpacked into. EXTRACT_PATH is
# also the root under which all crawled pages are stored.
ZIP_URL = "https://phapdien.moj.gov.vn/TraCuuPhapDien/Files/BoPhapDienDienTu.zip"
ZIP_PATH = "BoPhapDienDienTu.zip"
EXTRACT_PATH = "BoPhapDienDienTu"

# The download is skipped entirely when the extraction directory already
# exists, so re-running the script does not fetch the archive again.
if not os.path.exists(EXTRACT_PATH):
    print("Downloading dataset...")
    # The context manager releases the streamed connection when done, and the
    # timeout keeps a stalled server from hanging the script indefinitely.
    with requests.get(ZIP_URL, stream=True, timeout=60) as response:
        # Fail here on an HTTP error instead of saving the error page as the
        # archive and crashing later with an unrelated BadZipFile.
        response.raise_for_status()
        with open(ZIP_PATH, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

    print("Extracting dataset...")
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(EXTRACT_PATH)
    # The archive is only needed for extraction; remove it to save disk space.
    os.remove(ZIP_PATH)

# Make sure every output sub-folder exists under the extracted dataset;
# existing folders are left untouched.
folders = ["vbpl", "property", "history", "related", "pdf"]
for folder in folders:
    target_dir = os.path.join(EXTRACT_PATH, folder)
    os.makedirs(target_dir, exist_ok=True)

# Collect the names of the HTML index files found in the "demuc" directory.
demuc_path = os.path.join(EXTRACT_PATH, "demuc")
index_files = list(
    filter(lambda name: name.endswith(".html"), os.listdir(demuc_path))
)

def save_page(url, save_path, timeout=30):
    """Download ``url`` and write the response body to ``save_path``.

    This is a best-effort helper: failures are reported on stdout and never
    raised, so a single bad page does not abort the crawl.

    Args:
        url: Address of the page to fetch.
        save_path: File the response text is written to (UTF-8). The file is
            only created when the server answers with HTTP 200.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            # Report the miss instead of dropping it silently, so missing
            # output files can be traced back to a cause.
            print(f"Failed to download {url}: HTTP {response.status_code}")
            return
        with open(save_path, "w", encoding="utf-8") as file:
            file.write(response.text)
    except (requests.RequestException, OSError) as e:
        # Network problems (including timeouts) and file-system errors are the
        # expected failure modes; anything else is a bug and should surface.
        print(f"Failed to download {url}: {e}")

# Extract document links and save pages

# Captures the value of the ItemID query parameter. The match stops at the
# next parameter ("&"), a URL fragment ("#") or whitespace, so none of those
# can leak into the request URLs or the output file names.
_ITEM_ID_RE = re.compile(r"ItemID=([^&#\s]+)")

# One entry per page kind: (URL template, output sub-folder, file-name prefix).
_PAGE_TARGETS = [
    (BASE_URL, "vbpl", "full"),
    (PROPERTY_URL, "property", "p"),
    (HISTORY_URL, "history", "h"),
    (RELATED_URL, "related", "r"),
    (PDF_URL, "pdf", "pdf"),
]

# IDs already handled in this run. A document linked from several places
# would otherwise be downloaded again for every occurrence of its link.
_seen_item_ids = set()

for index_file in index_files:
    index_path = os.path.join(demuc_path, index_file)
    with open(index_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    for link in soup.find_all("a", href=True):
        id_match = _ITEM_ID_RE.search(link["href"])
        if id_match is None:
            # Not a document link (or no usable ItemID value): skip it
            # rather than crash on a malformed href.
            continue

        item_id = id_match.group(1)
        if item_id in _seen_item_ids:
            continue
        _seen_item_ids.add(item_id)

        # Save the different pages of this document.
        for url_template, target_folder, prefix in _PAGE_TARGETS:
            save_page(
                url_template.format(item_id),
                os.path.join(EXTRACT_PATH, target_folder, f"{prefix}_{item_id}.html"),
            )

print("Crawling complete!")
