In [78]:
import requests
import random
import time
import os
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs
from datetime import datetime, timedelta

In [79]:
import ctypes

# SetThreadExecutionState: 시스템이 슬립하거나 화면이 꺼지는 것 방지
ctypes.windll.kernel32.SetThreadExecutionState(0x80000002)

-2147483646

In [None]:
USER_AGENTS = [
    # 생략 없이 20개 전체 포함
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/125.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 Chrome/122.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_2) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:110.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.57",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 16_2 like Mac OS X) AppleWebKit/605.1.15 Version/16.2 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Linux; Android 13; SM-S918N) AppleWebKit/537.36 Chrome/113.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 10; SM-G970F) AppleWebKit/537.36 Chrome/80.0.3987.119 SamsungBrowser/13.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/124.0.0.0 Safari/537.36 Brave/124.0.0.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/123.0.0.0 Safari/537.36 OPR/89.0.4447.83",
    "Mozilla/5.0 (X11; Linux x86_64) Chrome/117.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) Chrome/118.0.5993.90 Safari/537.36",
    "Mozilla/5.0 (iPad; CPU OS 15_5 like Mac OS X) AppleWebKit/605.1.15 Version/15.5 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/125.0.0.1 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_5) AppleWebKit/605.1.15 Version/15.5 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 Chrome/104.0.0.0 Safari/537.36",
]


def get_random_headers():
    return {
        "User-Agent": random.choice(USER_AGENTS),
        "Referer": "https://www.google.com",
    }


def convert_to_public_url(href):
    parsed = urlparse(href)
    params = parse_qs(parsed.query)
    article_id = params.get("article_id", [""])[0]
    office_id = params.get("office_id", [""])[0]
    if article_id and office_id:
        return f"https://n.news.naver.com/mnews/article/{office_id}/{article_id}"
    return href


def fetch_article_details(url):
    try:
        res = requests.get(url, headers=get_random_headers(), timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "lxml")

        # 본문
        article_elem = soup.select_one("article#dic_area")
        article = (
            article_elem.get_text(strip=True, separator="\n") if article_elem else ""
        )

        # 이미지
        image_elem = soup.select_one('meta[property="og:image"]')
        image = image_elem["content"] if image_elem else ""

        return article, image
    except Exception:
        return "", ""


def fetch_news_by_date(date: str, max_pages: int = 10):
    all_news = []

    for page in range(1, max_pages + 1):
        try:
            url = f"https://finance.naver.com/news/news_list.naver?mode=LSS3D&section_id=101&section_id2=258&section_id3=402&date={date}&page={page}"
            res = requests.get(url, headers=get_random_headers(), timeout=10)
            res.raise_for_status()
            soup = BeautifulSoup(res.text, "lxml")

            subject_tags = soup.select("dl > dd.articleSubject, dl > dt.articleSubject")
            summary_tags = soup.select("dl > dd.articleSummary")

            if not subject_tags or not summary_tags:
                break

            for subject_tag, summary_tag in zip(subject_tags, summary_tags):
                try:
                    a_tag = subject_tag.a
                    if not a_tag:
                        continue

                    title = a_tag.get("title") or a_tag.text.strip()
                    article_url = convert_to_public_url(a_tag["href"])
                    press = summary_tag.select_one(".press").text.strip()
                    wdate = summary_tag.select_one(".wdate").text.strip()

                    print(f"📰 크롤링 중: [{wdate}] {title} ({press}) - {article_url}")

                    # 상세 페이지에서 본문, 요약, 이미지 추가
                    article_text, image = fetch_article_details(article_url)

                    all_news.append(
                        {
                            "wdate": wdate,
                            "title": title,
                            "article": article_text,
                            "press": press,
                            "url": article_url,
                            "image": image,
                        }
                    )

                    time.sleep(random.uniform(0.5, 1.5))  # 랜덤 대기 시간
                except Exception:
                    continue
        except Exception:
            continue

    return all_news


def save_news_to_csv(news_data, date_str, folder="news_data"):
    # 날짜 파싱해서 연도와 월 추출
    date_obj = datetime.strptime(date_str, "%Y%m%d")
    year = date_obj.strftime("%Y")
    month = date_obj.strftime("%m")

    # 폴더 구조: news_data/YYYY/MM/
    full_path = os.path.join(folder, year, month)
    os.makedirs(full_path, exist_ok=True)

    # CSV 저장
    file_path = os.path.join(full_path, f"{date_str}.csv")
    pd.DataFrame(news_data).to_csv(file_path, index=False)


def crawl_news_range(start_date_str, end_date_str, max_pages=5, folder="news_data"):
    start_date = datetime.strptime(start_date_str, "%Y%m%d")
    end_date = datetime.strptime(end_date_str, "%Y%m%d")
    total_days = (end_date - start_date).days + 1
    current_date = start_date

    for i in range(total_days):
        date_str = current_date.strftime("%Y%m%d")
        progress = (i + 1) / total_days * 100
        print(f"[{i + 1}/{total_days}] 📅 크롤링 중: {date_str} ({progress:.1f}%)")

        daily_news = fetch_news_by_date(date_str, max_pages=max_pages)
        if daily_news:
            save_news_to_csv(daily_news, date_str, folder)

        current_date += timedelta(days=1)

In [None]:
# 예시 실행
crawl_news_range("20250520", "20250520", max_pages=1, folder="news_data")

In [76]:
df = pd.read_csv("news_data/20250520.csv")
df.head()

Unnamed: 0,wdate,title,article,press,url,image
0,2025-05-20 19:42,금융위 “원화마켓 내달 코인 매도..고객확인 강화”,거래목적과 자금원천 등 확인 및 검증\n거래 흐름상 자금원천·거래목적 확인·검증 대...,파이낸셜뉴스,https://n.news.naver.com/mnews/article/014/000...,https://imgnews.pstatic.net/image/014/2025/05/...
1,2025-05-20 19:41,"[마켓인]""LP는 국경 없다""...韓 출자사업에 해외VC 러시","""해외 출자 어떻게 받죠?""…혼란 여전\n글로벌 VC 79곳, 한국에 러브콜\nAI...",이데일리,https://n.news.naver.com/mnews/article/018/000...,https://imgnews.pstatic.net/image/018/2025/05/...
2,2025-05-20 19:05,"거래소, 달바글로벌 코스피 상장 승인",/사진=신민경 기자\n한국거래소는 오는 22일 유가증권시장에 달바글로벌을 상장할 예...,한국경제,https://n.news.naver.com/mnews/article/015/000...,https://imgnews.pstatic.net/image/015/2025/05/...
3,2025-05-20 18:33,[신간] 트럼프2.0과 에너지대전환,기후대응 속도조절 ‘눈치보기 격화’\n트럼프 내년 청정에너지 예산 삭감\n‘2050...,매일경제,https://n.news.naver.com/mnews/article/009/000...,https://imgnews.pstatic.net/image/009/2025/05/...
4,2025-05-20 18:18,증시 추가상승에 베팅… 한방 노리는 '빚투' 늘었다,코스피 한달 5% 올라 투심 회복\n신용거래융자 잔고 18조 육박\n외국인 순매수 ...,파이낸셜뉴스,https://n.news.naver.com/mnews/article/014/000...,https://imgnews.pstatic.net/image/014/2025/05/...
