In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
import os
import chardet
from pathlib import Path

# ===== Configure Chrome =====
# NOTE: machine-specific absolute path; adjust when running elsewhere.
# The download loop below builds expected file paths from this value.
download_path = "/Users/jinzhenhan/Downloads"  # Download directory
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_experimental_option("prefs", {
    # Save downloads straight into download_path without a "Save as" dialog.
    "download.default_directory": download_path,
    "download.prompt_for_download": False,
    # Chrome's download preferences are namespaced under "download."; the
    # bare "directory_upgrade" key is not one of them.
    "download.directory_upgrade": True,
})

# webdriver_manager resolves (and, if needed, downloads) a chromedriver binary.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# ===== Helpers: wait for downloads and convert encodings safely =====
DOWNLOAD_TIMEOUT_SECONDS = 300      # Give up on a single file after this long
ENCODING_SAMPLE_BYTES = 64 * 1024   # Prefix of the file handed to chardet
FALLBACK_ENCODING = "cp949"         # Used when chardet's guess is unusable


def _wait_for_download(target, timeout=DOWNLOAD_TIMEOUT_SECONDS, poll_interval=1.0):
    """Block until *target* looks fully downloaded, or raise TimeoutError.

    A download counts as finished once *target* exists, no sibling
    "<name>.crdownload" file is present, and the file size is unchanged
    between two consecutive polls.
    """
    # Chrome is expected to stage in-progress downloads under a
    # ".crdownload" name; the stable-size check covers the case where that
    # assumption does not hold for a given Chrome version.
    partial = target.with_name(target.name + ".crdownload")
    deadline = time.monotonic() + timeout
    last_size = -1
    while True:
        if target.exists() and not partial.exists():
            size = target.stat().st_size
            if size == last_size:
                return
            last_size = size
        if time.monotonic() >= deadline:
            raise TimeoutError(f"Download did not finish within {timeout}s: {target}")
        time.sleep(poll_interval)


def _is_utf8(raw):
    """Return True if *raw* decodes cleanly as UTF-8 (pure ASCII included)."""
    try:
        raw.decode("utf-8")
    except UnicodeDecodeError:
        return False
    return True


def _normalize_encoding(name):
    """Map a chardet guess to the codec that should actually be used."""
    # This is only consulted for data already known NOT to be valid UTF-8,
    # so a missing or "ascii" guess is necessarily wrong.
    if not name or name.lower() == "ascii":
        return FALLBACK_ENCODING
    # cp949 is a superset of EUC-KR, so it also decodes extended Hangul
    # syllables that a strict EUC-KR decoder rejects.
    if name.lower().replace("_", "-") == "euc-kr":
        return "cp949"
    return name


def _rewrite_as_utf8(path, raw, encoding):
    """Decode *raw* with *encoding* and replace *path* with its UTF-8 form."""
    try:
        text = raw.decode(encoding)
    except UnicodeDecodeError:
        # Stay best-effort, but make the loss visible instead of silently
        # dropping bytes.
        print(f"⚠️ {path.name}: not cleanly decodable as {encoding}; undecodable bytes replaced")
        text = raw.decode(encoding, errors="replace")
    # Write to a temp file first so an interrupted run cannot leave a
    # truncated CSV. Working on bytes keeps the file's original line endings.
    tmp_path = path.with_name(path.name + ".tmp")
    tmp_path.write_bytes(text.encode("utf-8"))
    tmp_path.replace(path)


try:
    # ===== Open the target page =====
    driver.get("https://data.seoul.go.kr/dataList/OA-15969/S/1/datasetView.do#")
    time.sleep(2)

    # ===== Click '전체 파일보기' (Show All Files) =====
    show_all_files_btn = driver.find_element(By.XPATH, "//span[text()='전체 파일보기']")
    show_all_files_btn.click()
    time.sleep(2)

    # ===== Retrieve all CSV download links =====
    all_links = driver.find_elements(By.XPATH, "//a[contains(@title, '.csv')]")
    print(f"Found {len(all_links)} CSV download links")

    # ===== Filter by target years and download =====
    target_years = ["2023"]   # Target years to download
    detected_encoding = None  # chardet's guess, taken from the first non-UTF-8 file

    for link in all_links:
        title = link.get_attribute("title")
        if not title or not any(year in title for year in target_years):
            continue

        expected_file = Path(download_path) / title
        if expected_file.exists():
            # Downloading again would make Chrome save under a different
            # name, leaving this path pointing at the stale copy.
            print(f"Already present, skipping download: {title}")
        else:
            print(f"Downloading: {title}")
            ActionChains(driver).move_to_element(link).click().perform()
            _wait_for_download(expected_file)

        raw_data = expected_file.read_bytes()

        # ===== Skip files that are already UTF-8 (keeps re-runs idempotent) =====
        if _is_utf8(raw_data):
            print(f"✅ {title} is already UTF-8")
            continue

        # ===== Detect encoding once, from the first file that needs converting =====
        if detected_encoding is None:
            detected_encoding = chardet.detect(raw_data[:ENCODING_SAMPLE_BYTES])["encoding"]
            print(f"Detected encoding from first file: {detected_encoding}")

        # ===== Convert to UTF-8 using the detected encoding =====
        _rewrite_as_utf8(expected_file, raw_data, _normalize_encoding(detected_encoding))
        print(f"✅ {title} converted to UTF-8")

    print("🎯 All selected files downloaded and converted to UTF-8")

finally:
    # Always close the browser, including on timeout or lookup errors.
    driver.quit()

Found 83 CSV download links
Downloading: S-DoT_NATURE_2023.12.25-12.31.csv
Detected encoding from first file: EUC-KR
✅ S-DoT_NATURE_2023.12.25-12.31.csv converted to UTF-8
🎯 All selected files downloaded and converted to UTF-8
