In [1]:
# Scrape data

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime, timedelta
import pandas as pd
import time
from pathlib import Path
import os

In [3]:
def init_driver():
    """Create a Chrome WebDriver using a driver binary from webdriver_manager.

    Returns:
        The object returned by ``webdriver.Chrome`` when given a
        ``ChromeService`` built from the path that
        ``ChromeDriverManager().install()`` returns.
    """
    chromedriver_path = ChromeDriverManager().install()
    chrome_service = ChromeService(chromedriver_path)
    return webdriver.Chrome(service=chrome_service)

def navigate_to_page(driver, url):
    """Load ``url`` in the browser controlled by ``driver``.

    Args:
        driver: Object exposing a ``get(url)`` method (a Selenium WebDriver
            in this notebook).
        url: Address to open.
    """
    driver.get(url)

def select_area(driver, wait, area_name):
    """Type an area name into the search field and confirm it with ENTER.

    The field is the element with class ``select2-search__field``; it is
    clicked once it is clickable, the name is typed, and ENTER is sent after
    a one-second pause.

    Args:
        driver: Not used by this function.
        wait: WebDriverWait-like object whose ``until`` returns the element.
        area_name: Text typed into the search field.

    Any exception is caught and reported with ``print``; nothing is returned.
    """
    try:
        search_box_locator = (By.CLASS_NAME, "select2-search__field")
        search_box = wait.until(EC.element_to_be_clickable(search_box_locator))
        search_box.click()
        search_box.send_keys(area_name)
        # Pause so the typed text is in the field before it is confirmed.
        time.sleep(1)
        search_box.send_keys(Keys.ENTER)
    except Exception as err:
        print(f"Error selecting area '{area_name}': {err}")

def input_date(driver, wait, xpath, date_value):
    """Replace the contents of the field at ``xpath`` with ``date_value``.

    Args:
        driver: Not used by this function.
        wait: WebDriverWait-like object whose ``until`` returns the element.
        xpath: XPath of the input field.
        date_value: Text typed into the field after it has been cleared.

    Any exception is caught and reported with ``print``; nothing is returned.
    """
    try:
        field = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
        # Select the existing text (Ctrl+A), delete it, then type the new value.
        # NOTE(review): assumes Ctrl+A is "select all" on the host OS - confirm on macOS.
        for keystrokes in (Keys.CONTROL + "a", Keys.BACKSPACE, date_value):
            field.send_keys(keystrokes)
    except Exception as err:
        print(f"Error inputting date '{date_value}': {err}")

def select_time(driver, wait, xpath, time_value):
    """Choose ``time_value`` in the dropdown at ``xpath`` by typing it.

    The element is clicked once it is clickable, ``time_value`` is typed into
    it, and ENTER is sent to confirm.

    Args:
        driver: Not used by this function.
        wait: WebDriverWait-like object whose ``until`` returns the element.
        xpath: XPath of the dropdown element.
        time_value: Text typed into the dropdown.

    Any exception is caught and reported with ``print``; nothing is returned.
    """
    try:
        dropdown = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
        dropdown.click()
        for keystrokes in (time_value, Keys.ENTER):
            dropdown.send_keys(keystrokes)
    except Exception as err:
        print(f"Error selecting time '{time_value}': {err}")

def click_radio_button(driver, xpath):
    """Click the element located by ``xpath`` once it becomes clickable.

    Waits up to 20 seconds for the element. Any exception is caught and
    reported with ``print``; nothing is returned.

    Args:
        driver: Selenium WebDriver instance.
        xpath: XPath of the element to click.
    """
    try:
        waiter = WebDriverWait(driver, 20)
        waiter.until(EC.element_to_be_clickable((By.XPATH, xpath))).click()
    except Exception as err:
        print(f"Error clicking radio button: {err}")

def click_button(driver, xpath):
    """Click the element located by ``xpath`` after waiting for it to be clickable.

    The wait times out after 20 seconds. Any exception is caught and reported
    with ``print``; nothing is returned.

    Args:
        driver: Selenium WebDriver instance.
        xpath: XPath of the element to click.
    """
    try:
        locator = (By.XPATH, xpath)
        target = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(locator))
        target.click()
    except Exception as err:
        print(f"Error clicking button: {err}")

def extract_table(driver, xpath, delay=10):
    """Read an HTML table located by XPath into a pandas DataFrame.

    Waits up to ``delay`` seconds for the table element to be present, takes
    the column names from the ``th`` cells of ``thead/tr`` and builds one
    record per ``tbody/tr`` from its ``td`` cells. All cell text is stripped
    of surrounding whitespace.

    Args:
        driver: Selenium WebDriver instance.
        xpath: XPath that locates the table element.
        delay: Maximum number of seconds to wait for the table to be present.

    Returns:
        pd.DataFrame: The table contents, or an empty DataFrame when any step
        raises (the error is printed instead of propagated).
    """
    def stripped_texts(elements):
        """Return the stripped ``text`` of each element, in order."""
        return [element.text.strip() for element in elements]

    try:
        table_locator = (By.XPATH, xpath)
        table_element = WebDriverWait(driver, delay).until(
            EC.presence_of_element_located(table_locator)
        )

        # Column names come from the header row.
        header_row = table_element.find_element(By.XPATH, ".//thead/tr")
        time.sleep(0.5)  # short pause before the header cells are read
        column_names = stripped_texts(header_row.find_elements(By.TAG_NAME, "th"))

        # One list of cell texts per body row.
        body_rows = table_element.find_elements(By.XPATH, ".//tbody/tr")
        records = [
            stripped_texts(body_row.find_elements(By.TAG_NAME, "td"))
            for body_row in body_rows
        ]

        return pd.DataFrame(records, columns=column_names)
    except Exception as err:
        print(f"Error extracting data from the table: {err}")
        return pd.DataFrame()
    
# def scrape_data(driver, wait, url, area_name, start_date, end_date, start_time, end_time):
#     """Scrape data from the specified URL."""
#     # Navigate to the URL
#     navigate_to_page(driver, url)
    
#     # Select the area
#     select_area(driver, wait, area_name)
    
#     # Input the start date
#     input_date(driver, wait, "//div[@id='startdate']//input[@type='text']", start_date)

#     # Select the start time
#     select_time(driver, wait, "//select[@value.bind='TimeStart']", start_time)

#     # Input the end date
#     input_date(driver, wait, "//div[@id='enddate']//input[@type='text']", end_date)
    
#     # Select the end time
#     select_time(driver, wait, "//select[@value.bind='TimeEnd']", end_time)
    
#     # Click the radio button for the table view
#     click_radio_button(driver, "//span[text()='ตาราง']")
    
#     # Click the 'View Data' button
#     click_button(driver, "//button[@click.delegate='SearchFunction()']")
    
#     # Extract the table data
#     table_xpath = "//table[@id='table' and contains(@class, 'table-gradient table-striped')]"
#     return extract_table(driver, table_xpath)

def validate_csv(csv_path):
    """Report whether a CSV file can be read and is non-empty.

    Args:
        csv_path: Path of the CSV file to check.

    Returns:
        bool: ``True`` when ``pd.read_csv`` succeeds and the resulting frame
        is not empty; ``False`` when the frame is empty or reading raises.
        The outcome is also printed.
    """
    try:
        if pd.read_csv(csv_path).empty:
            print(f"CSV file '{csv_path}' is empty.")
            return False
        print(f"CSV file '{csv_path}' is valid.")
        return True
    except Exception as err:
        print(f"Error reading CSV file '{csv_path}': {err}")
        return False

In [4]:
# Scrape one table per day for a single district and save each day as
# <dest_root>/<district>/<YYYY>/<MM>/<YYYY-MM-DD>.csv.
district = "เขตตลิ่งชัน"
dest_root = "../data"
url = "https://airquality.airbkk.com/PublicWebClient/#/Modules/Aqs/DashboardPage"

# Inclusive date range; dd/mm/YYYY is also the format typed into the page's date fields.
start_date_str = "01/01/2024"
end_date_str = "31/12/2024"
start_date = datetime.strptime(start_date_str, "%d/%m/%Y")
end_date = datetime.strptime(end_date_str, "%d/%m/%Y")

driver = init_driver()
# Everything that uses the browser runs inside try/finally so the Chrome
# session is closed even if a step raises or the cell is interrupted.
try:
    navigate_to_page(driver, url)
    wait = WebDriverWait(driver, 20)

    # Step 1: Select area (done once; only the dates change per iteration)
    select_area(driver, wait, district)

    # Loop over the dates, incrementing by 1 day
    for i in range((end_date - start_date).days + 1):
        current_date = start_date + timedelta(days=i)
        current_date_str = current_date.strftime("%d/%m/%Y")

        # Step 2: Input start date
        input_date(driver, wait, "//div[@id='startdate']//input[@type='text']", current_date_str)

        # Step 3: Select start time
        select_time(driver, wait, "//select[@value.bind='TimeStart']", "00:00")

        # Step 4: Input end date
        input_date(driver, wait, "//div[@id='enddate']//input[@type='text']", current_date_str)

        # Step 5: Select end time
        select_time(driver, wait, "//select[@value.bind='TimeEnd']", "23:00")

        # Step 6: Switch to the table view by clicking the 'ตาราง' label
        click_button(driver, "//span[text()='ตาราง']")

        # Step 7: Click the search button
        click_button(driver, "//button[@click.delegate='SearchFunction()']")

        # Step 8: Extract the table data
        # NOTE(review): extract_table only waits for the table to be present; if the
        # previous day's table is still in the DOM it could be read again - confirm
        # the page replaces it before this call returns.
        data_df = extract_table(driver, "//table[@id='table' and contains(@class, 'table-gradient table-striped')]")

        # Save df to csv. An empty frame is written as well, so the validation
        # cell further down can find the file and re-scrape that day.
        current_date_str_re = current_date.strftime("%Y-%m-%d")
        year = current_date.strftime("%Y")
        month = current_date.strftime("%m")
        month_dir = Path(dest_root) / district / year / month
        month_dir.mkdir(parents=True, exist_ok=True)
        file_path = month_dir / f"{current_date_str_re}.csv"
        data_df.to_csv(file_path, index=False)

        print(f"Scraping data on {current_date_str} --> save to {file_path}")
finally:
    driver.quit()

Scraping data on 01/01/2024 --> save to ..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-01.csv
Scraping data on 02/01/2024 --> save to ..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-02.csv
Scraping data on 03/01/2024 --> save to ..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-03.csv
Scraping data on 04/01/2024 --> save to ..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-04.csv
Scraping data on 05/01/2024 --> save to ..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-05.csv
Scraping data on 06/01/2024 --> save to ..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-06.csv
Scraping data on 07/01/2024 --> save to ..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-07.csv
Scraping data on 08/01/2024 --> save to ..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-08.csv
Scraping data on 09/01/2024 --> save to ..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-09.csv
Scraping data on 10/01/2024 --> save to ..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-10.csv
Scraping data on 11/01/2024 --> save to ..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-11.csv

In [7]:
# Display the district name currently bound in the kernel.
district

'เขตตลิ่งชัน'

In [8]:
# Validate the CSV files under <dest_root>/<district> and re-scrape every day
# whose file is empty or unreadable.

dest_root = "../data"
folder_path = Path(os.path.join(dest_root, district))
url = "https://airquality.airbkk.com/PublicWebClient/#/Modules/Aqs/DashboardPage"

driver = init_driver()
# try/finally guarantees the Chrome session is closed even when a file or a
# scraping step raises part-way through the walk.
try:
    navigate_to_page(driver, url)
    wait = WebDriverWait(driver, 20)

    select_area(driver, wait, district)

    for dir_path, _dir_names, file_names in os.walk(folder_path):
        for file in file_names:
            file_path = Path(dir_path) / file

            # Only CSV files are scraper output; skip anything else in the tree.
            if file_path.suffix.lower() != ".csv":
                continue
            print(file_path)

            if validate_csv(file_path):
                continue

            print(f"Re-scraping data for {file_path}")

            # Extract the date from the file name (expected: YYYY-MM-DD.csv).
            try:
                current_date = datetime.strptime(file_path.stem, "%Y-%m-%d")
            except ValueError:
                print(f"Skipping '{file_path}': file name is not a YYYY-MM-DD date.")
                continue
            current_date_str = current_date.strftime("%d/%m/%Y")

            # Re-scrape the data
            # Step 2: Input start date
            input_date(driver, wait, "//div[@id='startdate']//input[@type='text']", current_date_str)

            # Step 3: Select start time
            select_time(driver, wait, "//select[@value.bind='TimeStart']", "00:00")

            # Step 4: Input end date
            input_date(driver, wait, "//div[@id='enddate']//input[@type='text']", current_date_str)

            # Step 5: Select end time
            select_time(driver, wait, "//select[@value.bind='TimeEnd']", "23:00")

            # Step 6: Switch to the table view by clicking the 'ตาราง' label
            click_button(driver, "//span[text()='ตาราง']")

            # Step 7: Click the search button
            click_button(driver, "//button[@click.delegate='SearchFunction()']")

            # Step 8: Extract the table data
            data_df = extract_table(driver, "//table[@id='table' and contains(@class, 'table-gradient table-striped')]")

            # Overwrite the file that failed validation with the fresh result.
            data_df.to_csv(file_path, index=False)

            print(f"Scraping data on {current_date_str} --> save to {file_path}")
            print(f"Finish re-scraping data for {file_path}")
finally:
    driver.quit()

..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-01.csv
CSV file '..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-01.csv' is valid.
..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-02.csv
CSV file '..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-02.csv' is valid.
..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-03.csv
CSV file '..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-03.csv' is valid.
..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-04.csv
CSV file '..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-04.csv' is valid.
..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-05.csv
CSV file '..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-05.csv' is valid.
..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-06.csv
CSV file '..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-06.csv' is valid.
..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-07.csv
CSV file '..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-07.csv' is valid.
..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-08.csv
CSV file '..\scraped_data\เขตตลิ่งชัน\2024\01\2024-01-08.csv' is valid.
..\scraped_data\