In [18]:
# -*- coding: utf-8 -*-
# 에브리타임 게시판(현재 열린 목록 페이지) → 글 상세 들어가 수집 → JSONL 저장
# 전제: 같은 파이썬 세션에 Selenium WebDriver 'driver'가 로그인 상태로 살아있고,
#       지금 탭이 게시판 목록(질문에 준한 HTML 구조) 화면이어야 함.

import json, re, time
from datetime import datetime, date
from urllib.parse import urljoin
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

BASE = "https://everytime.kr"
OUT_PATH = "everytime_board_418775.jsonl"
CUTOFF_DATE = date(2025, 8, 15)  # inclusive: posts dated on this day are still kept

# CSS selectors for the list and detail pages. The list-page entries are single
# selectors; each detail-page field holds several candidates that are tried in
# order, as a hedge against layout changes.
SEL = {
    "list_wrap": "div.wrap.articles",
    "list_item_anchor": "div.wrap.articles article.list a.article",
    "list_item_time": "div.desc .info time.small",
    "pagination_next": "div.pagination a.next",

    "title": [
        "div.wrap.view article div.title h2",
        "div.wrap div.container article div.title h2",
        "article .title h2",
        "h2.medium.bold",
    ],
    "content": [
        "div.wrap.view article .content",
        "div.wrap div.container article div.content",
        "article .content",
        "div.panel.article .content",
        "div.desc p.medium",
    ],
    "images": [
        "article .attach img",
        "article .attachments img",
        "div.attachthumbnail",   # image URL lives in style=background-image, not in src
    ],
    "datetime": [
        "article time",
        "div.info time.small",
        "time.small",
    ],
    "author": [
        "article .author",
        "div.info h3.small",
    ],
}

def _find_first_text(driver, selectors):
    for css in selectors:
        try:
            el = driver.find_element(By.CSS_SELECTOR, css)
            t = el.text.strip()
            if t: return t
        except NoSuchElementException:
            pass
    return None

def _find_all(driver, selectors):
    for css in selectors:
        els = driver.find_elements(By.CSS_SELECTOR, css)
        if els:
            return css, els
    return None, []

def _parse_bg_image(style_value):
    if not style_value: return None
    m = re.search(r'url\(["\']?(.*?)["\']?\)', style_value)
    return m.group(1) if m else None

def _normalize_datetime(raw, today_dt=None):
    """'15:50' → 오늘 날짜, 'MM/DD' → 올해 날짜, 'YYYY.MM.DD HH:MM' 등은 그대로 파싱."""
    if not raw: return None
    raw = raw.strip()
    today = (today_dt or datetime.now()).date()

    # HH:MM
    if re.fullmatch(r"\d{1,2}:\d{2}", raw):
        h, m = map(int, raw.split(":"))
        return datetime(today.year, today.month, today.day, h, m)

    # MM/DD
    if re.fullmatch(r"\d{1,2}/\d{1,2}", raw):
        mm, dd = map(int, raw.split("/"))
        try:
            return datetime(today.year, mm, dd)
        except ValueError:
            return None

    # 풀 포맷 시도
    for fmt in ("%Y.%m.%d %H:%M", "%Y-%m-%d %H:%M", "%Y/%m/%d %H:%M",
                "%Y.%m.%d", "%Y-%m-%d", "%Y/%m/%d"):
        try:
            return datetime.strptime(raw, fmt)
        except ValueError:
            pass
    return None

def _extract_post(driver, url):
    """Load a post's detail page and scrape it into a plain dict.

    Args:
        driver: Selenium WebDriver; it is navigated to ``url`` and left there.
        url: Address of the post detail page.

    Returns:
        dict with keys ``id`` (digits after ``/v/`` in the URL, or None),
        ``url``, ``title``, ``author``, ``time_original`` (timestamp text as
        found), ``time_iso`` (ISO string, or None when the text could not be
        parsed), ``content`` and ``images`` (list of URL strings). Text fields
        are None when no candidate selector produced text.
    """
    from selenium.common.exceptions import WebDriverException

    driver.get(url)

    # <body> is present on any loaded document, so waiting for it would not
    # wait for the post itself. Wait for a title/content candidate instead, in
    # case the post is rendered by script after the page load finishes.
    ready_css = ", ".join(SEL["title"] + SEL["content"])
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ready_css))
        )
    except TimeoutException:
        pass  # best effort: scrape whatever is there

    title = _find_first_text(driver, SEL["title"])
    content = _find_first_text(driver, SEL["content"])

    # Images: <img> candidates expose the URL in src; the thumbnail <div>
    # candidate carries it in an inline background-image style.
    image_urls = []
    css, elements = _find_all(driver, SEL["images"])
    for el in elements:
        try:
            if css == "div.attachthumbnail":
                link = _parse_bg_image(el.get_attribute("style"))
            else:
                link = el.get_attribute("src")
        except WebDriverException:
            continue  # skip an element that cannot be read (e.g. went stale)
        if link:
            image_urls.append(link)

    raw_time = _find_first_text(driver, SEL["datetime"])
    dt = _normalize_datetime(raw_time)
    author = _find_first_text(driver, SEL["author"])

    m = re.search(r"/v/(\d+)", url)
    post_id = m.group(1) if m else None

    return {
        "id": post_id,
        "url": url,
        "title": title,
        "author": author,
        "time_original": raw_time,
        "time_iso": dt.isoformat() if dt else None,
        "content": content,
        "images": image_urls,
    }

def _get_list_items(driver):
    """Collect the post links shown on the current list page.

    Waits up to 10 seconds for the list container, then returns one dict per
    post anchor that has an href: ``{"url": absolute URL, "time_raw": text of
    the entry's time element, or None when the entry has none}``.
    """
    list_ready = EC.presence_of_element_located((By.CSS_SELECTOR, SEL["list_wrap"]))
    WebDriverWait(driver, 10).until(list_ready)

    collected = []
    anchors = driver.find_elements(By.CSS_SELECTOR, SEL["list_item_anchor"])
    for anchor in anchors:
        link = anchor.get_attribute("href")
        if link:
            absolute = urljoin(BASE, link)
            try:
                time_el = anchor.find_element(By.CSS_SELECTOR, SEL["list_item_time"])
                stamp = time_el.get_attribute("textContent").strip()
            except NoSuchElementException:
                stamp = None
            collected.append({"url": absolute, "time_raw": stamp})
    return collected

def _click_next(driver):
    """Click the pagination "next" link; return False when there is none.

    The link is scrolled to the middle of the viewport before the click.
    Returns True once the click has been issued.
    """
    try:
        next_link = driver.find_element(By.CSS_SELECTOR, SEL["pagination_next"])
        driver.execute_script(
            "arguments[0].scrollIntoView({block:'center'});", next_link
        )
        time.sleep(0.1)
        next_link.click()
    except NoSuchElementException:
        return False
    return True

def _wait_for_list_change(driver, previous_urls, timeout=10):
    """Wait until the list page shows post links different from ``previous_urls``.

    Returns True once a non-empty, different set of links is visible, False if
    that does not happen within ``timeout`` seconds.
    """
    from selenium.common.exceptions import StaleElementReferenceException

    def _changed(d):
        anchors = d.find_elements(By.CSS_SELECTOR, SEL["list_item_anchor"])
        hrefs = (a.get_attribute("href") for a in anchors)
        current = [urljoin(BASE, h) for h in hrefs if h]
        return bool(current) and current != previous_urls

    try:
        # Anchors may be replaced mid-read while the list re-renders; retry then.
        WebDriverWait(
            driver, timeout, ignored_exceptions=(StaleElementReferenceException,)
        ).until(_changed)
    except TimeoutException:
        return False
    return True


def crawl_current_board(driver, out_path=OUT_PATH, cutoff_date=CUTOFF_DATE):
    """Crawl the board whose list page is currently open and save posts as JSONL.

    Starting from the list page in the current tab, every post is opened,
    scraped and written as one JSON line. The crawl follows the "next" link
    page by page and stops at the first post dated before ``cutoff_date``,
    when there is no "next" link, or when the list does not change after
    clicking "next".

    Args:
        driver: Selenium WebDriver currently showing the board list page.
        out_path: Output file; it is overwritten.
        cutoff_date: Oldest date to keep (inclusive).

    Raises:
        TimeoutException: if the list container never appears at the start,
            or while reading a list page.
    """
    list_ready = EC.presence_of_element_located((By.CSS_SELECTOR, SEL["list_wrap"]))
    # Make sure we really are on a list page before starting.
    WebDriverWait(driver, 10).until(list_ready)

    seen_ids = set()
    written = 0

    with open(out_path, "w", encoding="utf-8") as fw:
        stop = False
        while True:
            items = _get_list_items(driver)
            page_urls = [it["url"] for it in items]

            for it in items:
                url = it["url"]
                m = re.search(r"/v/(\d+)", url)
                pid = m.group(1) if m else url
                if pid in seen_ids:
                    continue

                data = _extract_post(driver, url)

                # Date used for the cutoff: detail page first, list entry as fallback.
                dt = None
                if data.get("time_iso"):
                    try:
                        dt = datetime.fromisoformat(data["time_iso"])
                    except (TypeError, ValueError):
                        dt = None
                if dt is None and it.get("time_raw"):
                    dt = _normalize_datetime(it["time_raw"])

                stop = dt is not None and dt.date() < cutoff_date
                if not stop:
                    fw.write(json.dumps(data, ensure_ascii=False) + "\n")
                    fw.flush()  # keep finished posts on disk if the crawl dies later
                    seen_ids.add(pid)
                    written += 1

                # Return to the list in every case, so the browser is also left
                # on the list page when the crawl stops at the cutoff.
                driver.back()
                try:
                    WebDriverWait(driver, 10).until(list_ready)
                except TimeoutException:
                    pass

                if stop:
                    break

            if stop:
                break

            if not _click_next(driver):
                break

            # The list container already exists on the old page, so waiting for
            # it proves nothing. Wait for the links themselves to change, so the
            # previous page is not read a second time.
            if not _wait_for_list_change(driver, page_urls):
                break

    print(f"[DONE] JSONL 저장: {out_path} (총 {written}건)")

# ⭐ Run just this one line in the current session to start the crawl
# (`driver` must already exist in this session):
crawl_current_board(driver)

[DONE] JSONL 저장: everytime_board_418775.jsonl (총 20건)


In [None]:
# -*- coding: utf-8 -*-
# 에브리타임 게시판(현재 열린 목록 페이지) → 글 상세(본문/이미지/작성시각/작성자) 수집 → JSONL
# 전제: 같은 파이썬 세션에 Selenium WebDriver 'driver'가 로그인 상태로 살아있고,
#       지금 탭이 게시판 목록(질문에 준한 HTML 구조) 화면이어야 함.

import json, re, time
from datetime import datetime, date
from urllib.parse import urljoin
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

BASE = "https://everytime.kr"
OUT_PATH = "everytime_board_418775.jsonl"
CUTOFF_DATE = date(2025, 8, 15)  # inclusive: posts dated on this day are still kept

# CSS selectors used by the crawler.
SEL = {
    # List page (single selectors)
    "list_wrap": "div.wrap.articles",
    "list_item_anchor": "div.wrap.articles article.list a.article",
    "list_item_time": "div.desc .info time.small",
    "pagination_next": "div.pagination a.next",

    # Detail page (each entry is a list of candidates, tried in order)
    "title": [
        "div.wrap.view article div.title h2",
        "div.wrap div.container article div.title h2",
        "article .title h2",
        "h2.medium.bold",
    ],
    "content": [
        "div.wrap.view article .content",
        "div.wrap div.container article div.content",
        "article .content",
        "div.panel.article .content",
    ],
    "content_more_btn": [
        "article .content .more",
        ".more",
        "button.more",
    ],
    "images": [
        "article .attach img",
        "article .attachments img",
        "div.attachthumbnail",  # image URL lives in style=background-image, not in src
    ],
    "datetime": [
        "article time",
        "div.info time.small",
        "time.small",
    ],
    "author": [
        "article .author",
        "div.info h3.small",
    ],
}

def _find_first_text(driver, selectors):
    for css in selectors:
        try:
            el = driver.find_element(By.CSS_SELECTOR, css)
            t = el.text.strip()
            if t: return t
        except NoSuchElementException:
            pass
    return None

def _find_first_el(element_or_driver, selectors):
    for css in selectors:
        try:
            el = element_or_driver.find_element(By.CSS_SELECTOR, css)
            return el
        except NoSuchElementException:
            pass
    return None

def _find_all(driver, selectors):
    for css in selectors:
        els = driver.find_elements(By.CSS_SELECTOR, css)
        if els:
            return css, els
    return None, []

def _parse_bg_image(style_value):
    if not style_value: return None
    m = re.search(r'url\(["\']?(.*?)["\']?\)', style_value)
    return m.group(1) if m else None

def _normalize_datetime(raw, today_dt=None):
    """'15:50' → 오늘 날짜, 'MM/DD' → 올해 날짜, 'YYYY.MM.DD HH:MM' 등은 그대로 파싱."""
    if not raw: return None
    raw = raw.strip()
    today = (today_dt or datetime.now()).date()

    if re.fullmatch(r"\d{1,2}:\d{2}", raw):
        h, m = map(int, raw.split(":"))
        return datetime(today.year, today.month, today.day, h, m)

    if re.fullmatch(r"\d{1,2}/\d{1,2}", raw):
        mm, dd = map(int, raw.split("/"))
        try:
            return datetime(today.year, mm, dd)
        except ValueError:
            return None

    for fmt in ("%Y.%m.%d %H:%M", "%Y-%m-%d %H:%M", "%Y/%m/%d %H:%M",
                "%Y.%m.%d", "%Y-%m-%d", "%Y/%m/%d"):
        try:
            return datetime.strptime(raw, fmt)
        except ValueError:
            pass
    return None

def _click_if_present(element_or_driver, selectors, max_try=3):
    """Find a button (e.g. "show more") via candidate selectors and click it.

    Each attempt looks the button up again. If the click fails, the button is
    scrolled to the middle of the viewport and the click is retried, up to
    ``max_try`` attempts in total.

    Args:
        element_or_driver: WebDriver or WebElement to search within.
        selectors: Candidate CSS selectors, tried in order.
        max_try: Maximum number of click attempts.

    Returns:
        True if a click succeeded; False if no button was found or every
        attempt failed.
    """
    from selenium.common.exceptions import WebDriverException

    # Only a WebDriver can run scripts. When given a WebElement, use its
    # owning driver (WebElement.parent) for the scroll instead of failing.
    if hasattr(element_or_driver, "execute_script"):
        script_runner = element_or_driver
    else:
        script_runner = getattr(element_or_driver, "parent", None)

    for _ in range(max_try):
        btn = _find_first_el(element_or_driver, selectors)
        if btn is None:
            return False
        try:
            btn.click()
        except WebDriverException:
            # Click refused (covered, off-screen, stale...): scroll and retry.
            if script_runner is not None:
                try:
                    script_runner.execute_script(
                        "arguments[0].scrollIntoView({block:'center'});", btn
                    )
                except WebDriverException:
                    pass
            time.sleep(0.2)
        else:
            time.sleep(0.2)  # give the page a moment to expand the content
            return True
    return False

def _extract_post(driver, url):
    """Load a post's detail page and scrape it into a plain dict.

    Args:
        driver: Selenium WebDriver; it is navigated to ``url`` and left there.
        url: Address of the post detail page.

    Returns:
        dict with keys ``id`` (digits after ``/v/`` in the URL, or None),
        ``url``, ``title``, ``author``, ``time_original`` (timestamp text as
        found), ``time_iso`` (ISO string, or None when the text could not be
        parsed), ``content_text``, ``content_html`` (innerHTML of the content
        element) and ``images`` (list of URL strings). Fields are None when no
        candidate selector matched.
    """
    from selenium.common.exceptions import WebDriverException

    driver.get(url)

    # <body> is present on any loaded document, so waiting for it would not
    # wait for the post itself. Wait for a title/content candidate instead, in
    # case the post is rendered by script after the page load finishes.
    ready_css = ", ".join(SEL["title"] + SEL["content"])
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ready_css))
        )
    except TimeoutException:
        pass  # best effort: scrape whatever is there

    # Expand the body via its "more" button, if there is one.
    _click_if_present(driver, SEL["content_more_btn"], max_try=2)

    # Title and body (both as text and as HTML)
    title = _find_first_text(driver, SEL["title"])

    content_el = _find_first_el(driver, SEL["content"])
    content_text = content_el.text.strip() if content_el else None
    content_html = content_el.get_attribute("innerHTML") if content_el else None

    # Images: <img> candidates expose the URL in src; the thumbnail <div>
    # candidate carries it in an inline background-image style.
    image_urls = []
    css, elements = _find_all(driver, SEL["images"])
    for el in elements:
        try:
            if css == "div.attachthumbnail":
                link = _parse_bg_image(el.get_attribute("style"))
            else:
                link = el.get_attribute("src")
        except WebDriverException:
            continue  # skip an element that cannot be read (e.g. went stale)
        if link:
            image_urls.append(link)

    # Metadata (timestamp / author)
    raw_time = _find_first_text(driver, SEL["datetime"])
    dt = _normalize_datetime(raw_time)
    author = _find_first_text(driver, SEL["author"])

    # Post ID
    m = re.search(r"/v/(\d+)", url)
    post_id = m.group(1) if m else None

    return {
        "id": post_id,
        "url": url,
        "title": title,
        "author": author,
        "time_original": raw_time,
        "time_iso": dt.isoformat() if dt else None,
        "content_text": content_text,
        "content_html": content_html,
        "images": image_urls,
    }

def _get_list_items(driver):
    """Read the post entries of the list page that is currently displayed.

    Blocks for up to 10 seconds until the list container is present. Anchors
    without an href are ignored. Each result is a dict holding ``url`` (made
    absolute against BASE) and ``time_raw`` (stripped textContent of the
    entry's time element, or None if the entry has no such element).
    """
    locator = (By.CSS_SELECTOR, SEL["list_wrap"])
    WebDriverWait(driver, 10).until(EC.presence_of_element_located(locator))

    entries = []
    for anchor in driver.find_elements(By.CSS_SELECTOR, SEL["list_item_anchor"]):
        target = anchor.get_attribute("href")
        if not target:
            continue
        entry = {"url": urljoin(BASE, target), "time_raw": None}
        try:
            time_node = anchor.find_element(By.CSS_SELECTOR, SEL["list_item_time"])
            entry["time_raw"] = time_node.get_attribute("textContent").strip()
        except NoSuchElementException:
            pass
        entries.append(entry)
    return entries

def _click_next(driver):
    """Scroll to the pagination "next" link and click it.

    Returns True after the click has been issued, or False when the page has
    no "next" link.
    """
    scroll_js = "arguments[0].scrollIntoView({block:'center'});"
    try:
        link = driver.find_element(By.CSS_SELECTOR, SEL["pagination_next"])
        driver.execute_script(scroll_js, link)
        time.sleep(0.1)
        link.click()
    except NoSuchElementException:
        return False
    else:
        return True

def _wait_for_list_change(driver, previous_urls, timeout=10):
    """Wait until the list page shows post links different from ``previous_urls``.

    Returns True once a non-empty, different set of links is visible, False if
    that does not happen within ``timeout`` seconds.
    """
    from selenium.common.exceptions import StaleElementReferenceException

    def _changed(d):
        anchors = d.find_elements(By.CSS_SELECTOR, SEL["list_item_anchor"])
        hrefs = (a.get_attribute("href") for a in anchors)
        current = [urljoin(BASE, h) for h in hrefs if h]
        return bool(current) and current != previous_urls

    try:
        # Anchors may be replaced mid-read while the list re-renders; retry then.
        WebDriverWait(
            driver, timeout, ignored_exceptions=(StaleElementReferenceException,)
        ).until(_changed)
    except TimeoutException:
        return False
    return True


def crawl_current_board(driver, out_path=OUT_PATH, cutoff_date=CUTOFF_DATE):
    """Crawl the board whose list page is currently open and save posts as JSONL.

    Starting from the list page in the current tab, every post is opened and
    its body, images, timestamp and author are written as one JSON line. The
    crawl follows the "next" link page by page and stops at the first post
    dated before ``cutoff_date``, when there is no "next" link, or when the
    list does not change after clicking "next".

    Args:
        driver: Selenium WebDriver currently showing the board list page.
        out_path: Output file; it is overwritten.
        cutoff_date: Oldest date to keep (inclusive).

    Raises:
        TimeoutException: if the list container never appears at the start,
            or while reading a list page.
    """
    list_ready = EC.presence_of_element_located((By.CSS_SELECTOR, SEL["list_wrap"]))
    WebDriverWait(driver, 10).until(list_ready)

    seen_ids = set()
    written = 0

    with open(out_path, "w", encoding="utf-8") as fw:
        stop = False
        while True:
            items = _get_list_items(driver)
            page_urls = [it["url"] for it in items]

            for it in items:
                url = it["url"]
                m = re.search(r"/v/(\d+)", url)
                pid = m.group(1) if m else url
                if pid in seen_ids:
                    continue

                data = _extract_post(driver, url)

                # Date used for the cutoff: detail page first, list entry as fallback.
                dt = None
                if data.get("time_iso"):
                    try:
                        dt = datetime.fromisoformat(data["time_iso"])
                    except (TypeError, ValueError):
                        dt = None
                if dt is None and it.get("time_raw"):
                    dt = _normalize_datetime(it["time_raw"])

                # A post older than the cutoff ends the crawl and is not saved.
                stop = dt is not None and dt.date() < cutoff_date
                if not stop:
                    fw.write(json.dumps(data, ensure_ascii=False) + "\n")
                    fw.flush()  # keep finished posts on disk if the crawl dies later
                    seen_ids.add(pid)
                    written += 1

                # Return to the list (done once here for both the stop and the
                # continue case).
                driver.back()
                try:
                    WebDriverWait(driver, 10).until(list_ready)
                except TimeoutException:
                    pass

                if stop:
                    break

            if stop:
                break

            if not _click_next(driver):
                break

            # The list container already exists on the old page, so waiting for
            # it proves nothing. Wait for the links themselves to change, so the
            # previous page is not read a second time.
            if not _wait_for_list_change(driver, page_urls):
                break

    print(f"[DONE] JSONL 저장: {out_path} (총 {written}건)")

로그인 성공(추정): 로그인 - 에브리타임
