In [11]:
# 작업자 : 권은이

import os, tempfile, time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

USER = "kwoneunei"
PW   = "kwoneunei0511@@"

USER_DATA_DIR = os.path.join(tempfile.gettempdir(), "selenium_profile_everytime")
os.makedirs(USER_DATA_DIR, exist_ok=True)

opts = Options()
# 맥에서는 Linux용 우회 플래그 빼는 게 안정적
opts.add_argument("--start-maximized")
opts.add_argument(f"--user-data-dir={USER_DATA_DIR}")
opts.add_argument("--profile-directory=Default")
opts.add_argument("--no-first-run")
opts.add_argument("--no-default-browser-check")
# 필요시 번역UI만 끄기
opts.add_argument("--disable-features=TranslateUI")

# Chrome 바이너리 지정이 꼭 필요하지는 않음(기본 경로 감지됨).
# opts.binary_location = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"

driver = webdriver.Chrome(options=opts)  # Selenium Manager가 드라이버 자동 준비
wait = WebDriverWait(driver, 15)

driver.get("https://everytime.kr/login")
wait.until(EC.visibility_of_element_located((By.NAME, "id"))).send_keys(USER)
driver.find_element(By.NAME, "password").send_keys(PW)

btns = driver.find_elements(By.XPATH, '//input[@type="submit" and contains(@value,"로그인")]')
(btns[0] if btns else driver).click() if btns else driver.execute_script(
    'document.querySelector(\'form[action="/user/login"]\').submit();'
)

try:
    WebDriverWait(driver, 15).until(EC.any_of(
        EC.url_contains("everytime.kr"),
        EC.presence_of_element_located((By.XPATH, '//a[contains(.,"게시판")]'))
    ))
    print("로그인 성공(추정):", driver.title)
except Exception as e:
    print("로그인 실패/차단:", repr(e))
    driver.save_screenshot("everytime_after_login.png")
    with open("everytime_after_login.html","w",encoding="utf-8") as f:
        f.write(driver.page_source)

로그인 성공(추정): 로그인 - 에브리타임


In [17]:
# -*- coding: utf-8 -*-
"""
에브리타임 게시판 크롤러 (댓글 제외, 8/15 포함, 확정 네비게이션 & 스크롤)
- 이미 로그인된 Selenium WebDriver(driver) 재사용 (새 창/새 세션 생성하지 않음)
- 목록: 끝까지 스크롤해서 페이지네이션 렌더 → 글 href들을 '문자열'로 수집
- 상세: 저장 후 '뒤로가기' 대신 목록 URL로 driver.get()으로 명시 복귀
- 다음: a.next의 href를 읽어 driver.get()으로 명시 이동 + 첫 글 ID 비교로 이동 검증
- 기준일(2025-08-15) '포함'까지 저장, 그 이전(<) 만나면 중단
- 결과: JSONL (한 줄 = 한 게시글)
"""

import json, re, time
from datetime import datetime, date
from urllib.parse import urljoin
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

BASE = "https://everytime.kr"
OUT_PATH = "everytime_crawling_club.jsonl"
CUTOFF_DATE = date(2025, 8, 15)  # ← 8/15 '포함'

SEL = {
    # 목록
    "list_wrap": "div.wrap.articles",
    "list_item_anchor": "div.wrap.articles article.list a.article",
    "list_item_time": "div.desc .info time.small",
    "pagination_next": "div.pagination a.next",  # <a href="/418775/p/2" class="next">다음</a>

    # 상세 컨테이너 후보
    "article_container": [
        "div.wrap.view article",
        "div.wrap div.container article",
        "article.view",
        "article",
    ],

    # 메타
    "title": [
        "div.wrap.view article div.title h2",
        "article .title h2",
        "h2.medium.bold",
    ],
    "datetime": ["article time", "time.small"],
    "author": ["article .author", "div.info h3.small"],

    # 이미지
    "images": [
        "article .attach img",
        "article .attachments img",
        "div.attachthumbnail",  # background-image
    ],

    # 더보기
    "more_buttons_xpath": (
        ".//button[contains(., '더보기')]"
        " | .//a[contains(., '더보기')]"
        " | .//*[contains(@class,'more') and (self::a or self::button or self::div)]"
    ),

    # 댓글(제외용)
    "comments_roots": [
        "div.comments",
        "section.comments",
        ".comments",
    ],
}

# ---------------------- 유틸 ---------------------- #
def _find_first(driver_or_el, selectors):
    for css in selectors:
        try:
            return driver_or_el.find_element(By.CSS_SELECTOR, css)
        except NoSuchElementException:
            pass
    return None

def _find_first_text(driver, selectors):
    for css in selectors:
        try:
            t = driver.find_element(By.CSS_SELECTOR, css).text.strip()
            if t:
                return t
        except NoSuchElementException:
            pass
    return None

def _find_all(driver_or_el, selectors):
    for css in selectors:
        els = driver_or_el.find_elements(By.CSS_SELECTOR, css)
        if els:
            return css, els
    return None, []

def _parse_bg_image(style_value):
    if not style_value:
        return None
    m = re.search(r'url\(["\']?(.*?)["\']?\)', style_value)
    return m.group(1) if m else None

def _normalize_datetime(raw, today_dt=None):
    """'15:50' → 오늘, 'MM/DD' → 올해, 'YYYY.MM.DD [HH:MM]' 등 파싱."""
    if not raw:
        return None
    raw = raw.strip()
    today = (today_dt or datetime.now()).date()

    if re.fullmatch(r"\d{1,2}:\d{2}", raw):
        h, m = map(int, raw.split(":"))
        return datetime(today.year, today.month, today.day, h, m)

    if re.fullmatch(r"\d{1,2}/\d{1,2}", raw):
        mm, dd = map(int, raw.split("/"))
        try:
            return datetime(today.year, mm, dd)
        except ValueError:
            return None

    for fmt in ("%Y.%m.%d %H:%M", "%Y-%m-%d %H:%M", "%Y/%m/%d %H:%M",
                "%Y.%m.%d", "%Y-%m-%d", "%Y/%m/%d"):
        try:
            return datetime.strptime(raw, fmt)
        except ValueError:
            pass
    return None

def _click_all_more_in(el, max_clicks=20):
    """컨테이너 내부 '더보기' 모두 클릭(댓글 영역 제외)."""
    for _ in range(max_clicks):
        btns = el.find_elements(By.XPATH, SEL["more_buttons_xpath"])
        if not btns:
            break

        # 댓글 영역 제외
        comments = []
        for css in SEL["comments_roots"]:
            comments += el.find_elements(By.CSS_SELECTOR, css)

        def _in_comments(b):
            for root in comments:
                try:
                    if b in root.find_elements(By.XPATH, ".//*") or b == root:
                        return True
                except Exception:
                    pass
            return False

        btns = [b for b in btns if b.is_displayed() and not _in_comments(b)]
        if not btns:
            break

        for b in btns:
            try:
                el._parent.execute_script("arguments[0].scrollIntoView({block:'center'});", b)
                time.sleep(0.05)
                b.click()
                time.sleep(0.15)
            except Exception:
                continue

def _ensure_pagination_visible(driver, timeout=10):
    """목록 페이지에서 끝까지 스크롤해 pagination을 렌더링."""
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, SEL["list_wrap"]))
    )
    for _ in range(10):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(0.25)
        if driver.find_elements(By.CSS_SELECTOR, SEL["pagination_next"]):
            break

# ---------------------- 목록/네비게이션 ---------------------- #
def _collect_list_links_and_times(driver):
    """현재 목록 페이지의 게시글 링크(절대URL)와 time_raw를 '문자열'로 수집."""
    _ensure_pagination_visible(driver)
    anchors = driver.find_elements(By.CSS_SELECTOR, SEL["list_item_anchor"])
    links, times = [], []
    for a in anchors:
        href = a.get_attribute("href")
        if not href:
            continue
        links.append(urljoin(BASE, href))
        try:
            t = a.find_element(By.CSS_SELECTOR, SEL["list_item_time"]).text.strip()
        except NoSuchElementException:
            t = None
        times.append(t)
    return links, times

def _get_next_url(driver):
    """현재 목록 페이지의 '다음' 링크 절대 URL 반환(없으면 None)."""
    try:
        next_a = driver.find_element(By.CSS_SELECTOR, SEL["pagination_next"])
        href = next_a.get_attribute("href")
        if not href:
            return None
        return urljoin(BASE, href)
    except NoSuchElementException:
        return None

def _goto_next_url(driver, next_url):
    """다음 페이지로 명시 이동 + 첫 글 ID가 달라졌는지 검증."""
    if not next_url:
        return False

    # 현재 첫 글 ID
    cur_links, _ = _collect_list_links_and_times(driver)
    if not cur_links:
        return False
    m = re.search(r"/v/(\d+)", cur_links[0])
    prev_first = m.group(1) if m else cur_links[0]

    driver.get(next_url)
    _ensure_pagination_visible(driver)

    new_links, _ = _collect_list_links_and_times(driver)
    if not new_links:
        return False
    m2 = re.search(r"/v/(\d+)", new_links[0])
    new_first = m2.group(1) if m2 else new_links[0]
    return new_first != prev_first

# ---------------------- 상세 추출 ---------------------- #
def _extract_post(driver, url):
    """게시글 상세 페이지 크롤링(댓글 제외, 더보기 펼침, 텍스트/HTML/이미지 수집)."""
    driver.get(url)
    try:
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "body")))
    except TimeoutException:
        pass

    article = _find_first(driver, SEL["article_container"])
    if not article:
        article = driver.find_element(By.TAG_NAME, "body")

    # 댓글 숨김
    for css in SEL["comments_roots"]:
        for c in article.find_elements(By.CSS_SELECTOR, css):
            try:
                driver.execute_script("arguments[0].style.display='none';", c)
            except Exception:
                pass

    # 더보기 펼침
    _click_all_more_in(article)

    # 메타
    title = _find_first_text(driver, SEL["title"])
    author = _find_first_text(driver, SEL["author"])
    t_raw = _find_first_text(driver, SEL["datetime"])
    t_dt = _normalize_datetime(t_raw)

    # 이미지
    imgs = []
    css, img_els = _find_all(article, SEL["images"])
    if img_els and css != "div.attachthumbnail":
        for el in img_els:
            s = el.get_attribute("src")
            if s:
                imgs.append(s)
    elif img_els and css == "div.attachthumbnail":
        for el in img_els:
            u = _parse_bg_image(el.get_attribute("style"))
            if u:
                imgs.append(u)

    # 전체 텍스트/HTML
    all_text = article.text.strip() if article.text else None
    all_html = article.get_attribute("innerHTML")

    # 글 ID
    m = re.search(r"/v/(\d+)", url)
    post_id = m.group(1) if m else None

    return {
        "id": post_id,
        "url": url,
        "title": title,
        "author": author,
        "time_original": t_raw,
        "time_iso": t_dt.isoformat() if t_dt else None,
        "images": imgs,
        "all_text": all_text,
        "all_html": all_html,
    }

# ---------------------- 메인 루프 ---------------------- #
def crawl_current_board(driver, out_path=OUT_PATH, cutoff_date=CUTOFF_DATE, append=False):
    """
    현재 탭의 게시판에서 8/15(포함) 이후 글을 여러 페이지에 걸쳐 수집.
    - 목록 페이지 URL을 기억하고, 상세 후에는 back() 대신 그 URL로 명시 복귀
    - 다음 페이지는 href로 직접 이동 + 이동 검증
    """
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, SEL["list_wrap"]))
    )
    _ensure_pagination_visible(driver)
    list_url = driver.current_url

    seen = set()
    written = 0
    page_count = 0
    max_pages = 500  # 안전장치

    mode = "a" if append else "w"
    with open(out_path, mode, encoding="utf-8") as fw:
        stop = False
        while True:
            page_count += 1
            if page_count > max_pages:
                print(f"[WARN] 페이지 {max_pages} 한도 도달, 중단.")
                break

            links, times = _collect_list_links_and_times(driver)
            next_url = _get_next_url(driver)

            # 디버그(필요시 주석 해제)
            # print("PAGE:", driver.current_url, "COUNT:", len(links), "NEXT:", next_url)

            if not links:
                break

            for idx, url in enumerate(links):
                m = re.search(r"/v/(\d+)", url)
                pid = m.group(1) if m else url
                if pid in seen:
                    continue

                data = _extract_post(driver, url)

                # 날짜 판정(상세 > 목록)
                dt = None
                if data.get("time_iso"):
                    try:
                        dt = datetime.fromisoformat(data["time_iso"])
                    except Exception:
                        dt = None
                if dt is None:
                    t_raw = times[idx] if idx < len(times) else None
                    if t_raw:
                        dt = _normalize_datetime(t_raw)

                # 8/15(포함)까지 저장, 그 이전(<) 만나면 중단
                if dt is not None and dt.date() < cutoff_date:
                    stop = True
                    driver.get(list_url)
                    _ensure_pagination_visible(driver)
                    break

                fw.write(json.dumps(data, ensure_ascii=False) + "\n")
                fw.flush()
                seen.add(pid)
                written += 1

                driver.get(list_url)
                _ensure_pagination_visible(driver)

            if stop:
                break

            if not next_url:
                break

            moved = _goto_next_url(driver, next_url)
            if not moved:
                break

            list_url = driver.current_url  # 현재 목록 페이지 URL 갱신

    print(f"[DONE] JSONL 저장 완료: {out_path} (총 {written}건)")

In [18]:
crawl_current_board(driver)

[DONE] JSONL 저장 완료: everytime_crawling_club.jsonl (총 1877건)
