In [None]:
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import csv

# Crawl each article URL listed in the input CSV, extract the <h1> title and
# the cleaned body text of the "article-view-content-div" element, and write
# one (title, content, url) row per URL to the output CSV.

# 1. Load the URLs from the existing CSV. Every unique, non-null URL is
#    crawled; no top-N limit is applied here.
df = pd.read_csv("8_hidoc_articles_updated.csv", encoding="utf-8-sig")
urls = df["URL"].dropna().unique().tolist()

# 2. Configure the Selenium Chrome driver (headless).
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Lines of body text starting with one of these prefixes are dropped.
skip_prefixes = ("기획 =", "도움말 =")

results = []

# try/finally guarantees the browser is shut down even if the crawl is
# interrupted (e.g. Ctrl-C) or an error escapes the per-URL handler;
# otherwise the headless Chrome process would be left running.
try:
    wait = WebDriverWait(driver, 10)

    # 3. Visit each URL and collect the <h1> title plus the body <div> text.
    for idx, url in enumerate(urls, 1):
        print(f"[{idx}] 수집 중: {url}")
        try:
            driver.get(url)
            wait.until(EC.presence_of_element_located((By.ID, "article-view-content-div")))

            # Title: best effort. A missing/unreadable <h1> yields an empty
            # title rather than failing the whole article. `Exception` (not a
            # bare except) so KeyboardInterrupt/SystemExit still propagate.
            try:
                title = driver.find_element(By.TAG_NAME, "h1").text.strip()
            except Exception:
                title = ""

            # Body: parse the container's inner HTML with BeautifulSoup.
            content_div = driver.find_element(By.ID, "article-view-content-div")
            html = content_div.get_attribute("innerHTML")
            soup = BeautifulSoup(html, "html.parser")

            # Remove <figcaption> elements so captions do not end up in the text.
            for fig in soup.find_all("figcaption"):
                fig.decompose()

            # Extract all text, one line per text node.
            content = soup.get_text(separator="\n", strip=True)

            # Drop lines that start with one of the skip prefixes.
            lines = [
                line
                for line in content.splitlines()
                if not line.strip().startswith(skip_prefixes)
            ]
            content = "\n".join(lines).strip()

        except Exception as e:
            # A failed page is still recorded, with empty title and content,
            # so the output keeps one row per input URL.
            print(f"❌ {url} 수집 실패: {e}")
            title = ""
            content = ""

        results.append([title, content, url])
        # Short pause between requests.
        time.sleep(0.3)
finally:
    driver.quit()

# 4. Save the results to CSV.
df_result = pd.DataFrame(results, columns=["title", "content", "url"])
df_result.to_csv("hidoc_articles_cleaned_final.csv", index=False, encoding="utf-8-sig")
print("✅ 저장 완료: hidoc_articles_cleaned_final.csv")


[1] 수집 중: https://news.hidoc.co.kr/news/articleView.html?idxno=41819
[2] 수집 중: https://news.hidoc.co.kr/news/articleView.html?idxno=41821
[3] 수집 중: https://news.hidoc.co.kr/news/articleView.html?idxno=41246
[4] 수집 중: https://news.hidoc.co.kr/news/articleView.html?idxno=41304
[5] 수집 중: https://news.hidoc.co.kr/news/articleView.html?idxno=40767
[6] 수집 중: https://news.hidoc.co.kr/news/articleView.html?idxno=33457
[7] 수집 중: https://news.hidoc.co.kr/news/articleView.html?idxno=33434
[8] 수집 중: https://news.hidoc.co.kr/news/articleView.html?idxno=33347
[9] 수집 중: https://news.hidoc.co.kr/news/articleView.html?idxno=33135
[10] 수집 중: https://news.hidoc.co.kr/news/articleView.html?idxno=32738
[11] 수집 중: https://news.hidoc.co.kr/news/articleView.html?idxno=32485
[12] 수집 중: https://news.hidoc.co.kr/news/articleView.html?idxno=32049
[13] 수집 중: https://news.hidoc.co.kr/news/articleView.html?idxno=32031
[14] 수집 중: https://news.hidoc.co.kr/news/articleView.html?idxno=31769
[15] 수집 중: https://news.hidoc