In [1]:
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait, Select

In [2]:
# Start a Chrome session; `driver` and `wait` are reused by the cells below.
driver = webdriver.Chrome()
# Explicit-wait helper: each wait.until(...) call gives up after 5 seconds.
wait = WebDriverWait(driver, 5)

# Load the frequently-asked-questions listing page.
driver.get("https://dichvucong.gov.vn/p/home/dvc-cau-hoi-pho-bien.html")

# Block until the <select> with id "strLeftResult" is in the DOM
# (presumably the rows-per-page control -- TODO confirm against the page).
page_size_locator = (By.ID, "strLeftResult")
select_element = wait.until(EC.presence_of_element_located(page_size_locator))

# Wrap the element so its options can be chosen by value, then pick "50".
select = Select(select_element)
select.select_by_value("50")

print("Selected '50' from the dropdown.")

Selected '50' from the dropdown.


In [3]:
# Element ids of the tabs to visit; the scraping loop looks each one up with By.ID.
tab_ids = [
    "tatCaTab",
    "congDanTab",
    "doanhNghiepTab",
    "toChucKhacTab",
]

# Keep a copy of the tab list on disk (single unnamed column, no index column).
tab_ids_frame = pd.DataFrame(tab_ids)
tab_ids_frame.to_csv('tab_ids.csv', index=False)

In [4]:
# Site root used to absolutise any relative hrefs found in the result list.
# Hoisted out of the loops because it never changes.
base_url = "https://dichvucong.gov.vn"

# For every tab: open it, walk through all result pages collecting the item
# links, and write them to '<tab_id>_link.csv'.
for tab_id in tab_ids:
    # Remember the links currently on screen so we can tell when the tab click
    # has actually replaced the list (the container itself is already present,
    # so waiting for its presence alone would return immediately).
    old_links = driver.find_elements(By.CSS_SELECTOR, ".list-document a.item")

    # Wait for tab to be clickable
    tab = wait.until(EC.element_to_be_clickable((By.ID, tab_id)))
    tab.click()

    # Best-effort: wait for the previous list to be detached from the DOM.
    # A timeout is tolerated (e.g. the tab was already active and nothing
    # re-rendered); in that case we continue exactly as before.
    if old_links:
        try:
            wait.until(EC.staleness_of(old_links[0]))
        except TimeoutException:
            pass

    csv_link = []
    while True:
        # Wait until the container with class 'list-document' is present
        container = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "list-document")))
        # Wait until the pagination bar is present
        pagination = wait.until(EC.presence_of_element_located((By.ID, "paginationKQTT")))

        # Collect every result anchor on the current page
        links = container.find_elements(By.CSS_SELECTOR, "a.item")
        for link in links:
            href = link.get_attribute("href")
            if not href:
                # get_attribute returns None when the anchor has no href;
                # skip it instead of crashing on None.startswith(...).
                continue
            if href.startswith("/"):
                href = base_url + href
            print(href, end='\r')
            csv_link.append(href)

        # Look for a "next" <li> that is NOT disabled. Only the *absence* of
        # that element means we are on the last page; any other failure
        # (including Ctrl-C) is allowed to propagate instead of being
        # misreported as "no more pages".
        try:
            next_btn = pagination.find_element(By.CSS_SELECTOR, "li.next:not(.disabled) a")
        except NoSuchElementException:
            print("'Next' button is not available or already disabled.")
            break

        next_btn.click()
        print("Clicked the 'Next' button.", end='\r')

        # Best-effort: wait until the page we just read has been replaced, so
        # the next iteration does not re-read the same links.
        if links:
            try:
                wait.until(EC.staleness_of(links[0]))
            except TimeoutException:
                pass

    pd.DataFrame(csv_link).to_csv(f'{tab_id}_link.csv', index=False)

'Next' button is not available or already disabled.oi.html?id=10494&row_limit=1
'Next' button is not available or already disabled.oi.html?id=19367&row_limit=1
'Next' button is not available or already disabled.oi.html?id=15337&row_limit=1
'Next' button is not available or already disabled.oi.html?id=19367&row_limit=1


In [5]:
# Ministry options of the select2 filter, as (key, display name) pairs.
# The keys are presumably the DOM ids of the select2 option elements -- TODO confirm;
# only the display names are written to disk here.
ministries = dict([
    ("select2-ministry-result-cxsq-000.00.00.G01", "Bộ Công an"),
    ("select2-ministry-result-hhl2-000.00.00.G02", "Bộ Công thương"),
    ("select2-ministry-result-0xo9-000.00.00.G04", "Bộ Giao thông vận tải"),
    ("select2-ministry-result-sw0c-000.00.00.G14", "Bộ Khoa học và Công nghệ ( Bộ TTTT cũ)"),
    ("select2-ministry-result-mr9m-000.00.00.G07", "Bộ Lao động - Thương binh và Xã hội"),
    ("select2-ministry-result-nlx4-000.00.00.G08", "Bộ Ngoại giao"),
    ("select2-ministry-result-zvni-000.00.00.G09", "Bộ Nội vụ"),
    ("select2-ministry-result-1g8h-000.00.00.G10", "Bộ Nông nghiệp và Môi trường"),
    ("select2-ministry-result-comt-000.00.00.G11", "Bộ Quốc phòng"),
    ("select2-ministry-result-plyv-000.00.00.G12", "Bộ Tài chính"),
    ("select2-ministry-result-h3rj-000.00.00.G13", "Bộ Tài nguyên và Môi trường"),
    ("select2-ministry-result-zfib-000.00.00.G15", "Bộ Tư pháp"),
    ("select2-ministry-result-1d4a-000.00.00.G18", "Bộ Y tế"),
    ("select2-ministry-result-7zor-000.00.00.G20", "Thanh tra Chính phủ"),
])

# Save the display names (dict insertion order) as a one-column CSV without an index.
pd.DataFrame([*ministries.values()]).to_csv('ministries.csv', index=False)

| ID                                           | Text (Bộ ngành)                        |
| -------------------------------------------- | -------------------------------------- |
| `select2-ministry-result-cxsq-000.00.00.G01` | Bộ Công an                             |
| `select2-ministry-result-hhl2-000.00.00.G02` | Bộ Công thương                         |
| `select2-ministry-result-0xo9-000.00.00.G04` | Bộ Giao thông vận tải                  |
| `select2-ministry-result-sw0c-000.00.00.G14` | Bộ Khoa học và Công nghệ ( Bộ TTTT cũ) |
| `select2-ministry-result-mr9m-000.00.00.G07` | Bộ Lao động - Thương binh và Xã hội    |
| `select2-ministry-result-nlx4-000.00.00.G08` | Bộ Ngoại giao                          |
| `select2-ministry-result-zvni-000.00.00.G09` | Bộ Nội vụ                              |
| `select2-ministry-result-1g8h-000.00.00.G10` | Bộ Nông nghiệp và Môi trường           |
| `select2-ministry-result-comt-000.00.00.G11` | Bộ Quốc phòng                          |
| `select2-ministry-result-plyv-000.00.00.G12` | Bộ Tài chính                           |
| `select2-ministry-result-h3rj-000.00.00.G13` | Bộ Tài nguyên và Môi trường            |
| `select2-ministry-result-zfib-000.00.00.G15` | Bộ Tư pháp                             |
| `select2-ministry-result-1d4a-000.00.00.G18` | Bộ Y tế                                |
| `select2-ministry-result-7zor-000.00.00.G20` | Thanh tra Chính phủ                    |


In [6]:
# Site root used to absolutise any relative hrefs found in the result list.
# Hoisted out of the loops because it never changes.
base_url = "https://dichvucong.gov.vn"

# For every ministry name: pick it in the select2 dropdown, walk through all
# result pages collecting the item links, and write them to '<name>_link.csv'.
# NOTE(review): no tab is clicked in this cell, so whichever tab was left active
# before it runs stays selected -- confirm the ministry filter is meant to be
# combined with that tab.
for value in ministries.values():  # only the display names are needed here
    # Remember the links currently on screen so we can tell when applying the
    # filter has actually replaced the list.
    old_links = driver.find_elements(By.CSS_SELECTOR, ".list-document a.item")

    # Wait until the dropdown is clickable, then click it
    select2_dropdown = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".select2-selection--single")))
    select2_dropdown.click()

    print("Clicked on the dropdown.", end='\r')

    # Type the ministry name into the select2 search box and confirm with ENTER
    search_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".select2-search__field")))
    search_input.send_keys(value)
    search_input.send_keys(Keys.ENTER)

    # Best-effort: wait for the previous list to be detached from the DOM so we
    # do not scrape the results of the previous filter. A timeout is tolerated;
    # in that case we continue exactly as before.
    if old_links:
        try:
            wait.until(EC.staleness_of(old_links[0]))
        except TimeoutException:
            pass

    csv_link = []
    while True:
        # Wait until the container with class 'list-document' is present
        container = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "list-document")))
        # Wait until the pagination bar is present
        pagination = wait.until(EC.presence_of_element_located((By.ID, "paginationKQTT")))

        # Collect every result anchor on the current page
        links = container.find_elements(By.CSS_SELECTOR, "a.item")
        for link in links:
            href = link.get_attribute("href")
            if not href:
                # get_attribute returns None when the anchor has no href;
                # skip it instead of crashing on None.startswith(...).
                continue
            if href.startswith("/"):
                href = base_url + href
            print(href, end='\r')
            csv_link.append(href)

        # Look for a "next" <li> that is NOT disabled. Only the *absence* of
        # that element means we are on the last page; any other failure
        # (including Ctrl-C) is allowed to propagate instead of being
        # misreported as "no more pages".
        try:
            next_btn = pagination.find_element(By.CSS_SELECTOR, "li.next:not(.disabled) a")
        except NoSuchElementException:
            print("'Next' button is not available or already disabled.")
            break

        next_btn.click()
        print("Clicked the 'Next' button.", end='\r')

        # Best-effort: wait until the page we just read has been replaced, so
        # the next iteration does not re-read the same links.
        if links:
            try:
                wait.until(EC.staleness_of(links[0]))
            except TimeoutException:
                pass

    pd.DataFrame(csv_link).to_csv(f'{value}_link.csv', index=False)

'Next' button is not available or already disabled.oi.html?id=19367&row_limit=1
'Next' button is not available or already disabled.oi.html?id=13794&row_limit=1
'Next' button is not available or already disabled.oi.html?id=12845&row_limit=1
'Next' button is not available or already disabled.oi.html?id=13579&row_limit=1
'Next' button is not available or already disabled.oi.html?id=11263&row_limit=1
'Next' button is not available or already disabled.oi.html?id=14169&row_limit=1
'Next' button is not available or already disabled.oi.html?id=20276&row_limit=1
'Next' button is not available or already disabled.oi.html?id=14829&row_limit=1
'Next' button is not available or already disabled.oi.html?id=20091&row_limit=1
'Next' button is not available or already disabled.oi.html?id=17767&row_limit=1
'Next' button is not available or already disabled.oi.html?id=15337&row_limit=1
'Next' button is not available or already disabled.oi.html?id=17227&row_limit=1
'Next' button is not available or alread

In [7]:
# Close the browser when done. quit() ends the WebDriver session created at the
# top of the notebook, so `driver` (and the `wait` helper built on it) cannot be
# reused afterwards -- re-run the setup cell to start a new session.
driver.quit()