In [1]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse


In [2]:
# Article to scrape.
URL = "http://elle.co.kr/article/1893727"

# Output layout: everything lives under BASE_DIR, with one sub-folder for the
# downloaded images and one for the extracted text.
BASE_DIR = "elle_1893727"
IMG_DIR, TEXT_DIR = (
    os.path.join(BASE_DIR, subfolder) for subfolder in ("images", "text")
)


In [3]:
# Request headers sent with the article request (and reused for image
# downloads later in the notebook).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Accept-Language": "ko-KR,ko;q=0.9",
}

# Fetch the article page; raise immediately on any 4xx/5xx status.
resp = requests.get(URL, headers=headers, timeout=20, allow_redirects=True)
resp.raise_for_status()

# Parse from the raw bytes rather than resp.text. When the server sends a
# text/* Content-Type without a charset, Requests decodes resp.text as
# ISO-8859-1, which garbles Korean text. Handing bytes to BeautifulSoup lets it
# detect the encoding from the document itself (e.g. <meta charset>). A charset
# that the server states explicitly is still honoured via from_encoding.
content_type = resp.headers.get("Content-Type", "")
header_charset = resp.encoding if "charset=" in content_type.lower() else None

soup = BeautifulSoup(resp.content, "lxml", from_encoding=header_charset)


In [4]:
# The first <h1> is treated as the article title; everything that follows it
# in document order is considered candidate article content.
h1 = soup.select_one("h1")
if h1 is None:
    raise RuntimeError("제목(h1) 없음")

# Tags that are dropped from the collected node list.
# NOTE: only the tag itself is dropped; its descendants are still visited by
# the traversal below and therefore still end up in body_nodes.
SKIPPED_TAGS = ("script", "style", "noscript", "header", "footer")

# Fall back to the whole document if the parser produced no <body>.
root = soup.body if soup.body is not None else soup

body_nodes = []
started = False

for el in root.descendants:
    # Identity check, not ==: BeautifulSoup's == compares tag name, attributes
    # and contents, so an earlier look-alike <h1> would start collection too
    # soon, and every node would pay for a recursive comparison.
    if el is h1:
        started = True
        continue

    # Ignore everything before the title, and non-tag nodes (text, comments),
    # which have no tag name.
    if not started or not getattr(el, "name", None):
        continue

    if el.name in SKIPPED_TAGS:
        continue

    body_nodes.append(el)

print("body_nodes 수:", len(body_nodes))


body_nodes 수: 613


In [5]:
# Upper bound on how many paragraphs are kept.
MAX_PARAGRAPHS = 15

# Whitespace-normalised text of every <p> collected after the title.
paragraph_texts = (
    node.get_text(" ", strip=True) for node in body_nodes if node.name == "p"
)

# Keep non-empty paragraphs, dropping those that start with "@" and those that
# contain the sign-up / log-in notice, then truncate to the first
# MAX_PARAGRAPHS entries.
paragraphs = [
    text
    for text in paragraph_texts
    if text
    and not re.match(r"^\s*@", text)
    and not re.search(r"회원가입\s*및\s*로그인", text)
][:MAX_PARAGRAPHS]

print("단락 수:", len(paragraphs))


단락 수: 15


In [6]:
# Substrings (matched against the lower-cased absolute URL) that mark an image
# as not wanted: avatars, icons, logos, sprites and inline data URIs.
EXCLUDE_KEYWORDS = [
    "/attach/avatar/", "/avatar/", "/icon/", "/icons/",
    "/logo", "/sprite", "data:image"
]
# Attributes that may carry the image URL, in priority order.
IMG_ATTRS = ["src", "data-src", "data-original", "data-lazy"]
# Maximum number of image URLs to keep.
MAX_IMAGES = 9


def _largest_srcset_url(srcset):
    """Return the URL of the widest candidate in a srcset string.

    Candidates are comma-separated ``"<url> <descriptor>"`` entries. Only
    integer ``w`` descriptors are ranked; anything else (``2x``, a missing or
    malformed descriptor) counts as width 0. On ties the last candidate wins.
    Returns None when the string holds no candidate.

    NOTE: URLs that themselves contain commas are not supported by this simple
    split.
    """
    best_url, best_width = None, -1
    for candidate in srcset.split(","):
        tokens = candidate.split()
        if not tokens:
            continue
        width = 0
        # Require a separate descriptor token so a bare URL that happens to end
        # in "w" is not mistaken for a width.
        if len(tokens) > 1 and tokens[-1].endswith("w"):
            try:
                width = int(tokens[-1][:-1])
            except ValueError:
                width = 0
        if width >= best_width:
            best_url, best_width = tokens[0], width
    return best_url


def _pick_image_src(img):
    """Choose the best source URL for an <img> tag, or None if it has none."""
    srcset = img.get("srcset") or img.get("data-srcset")
    if srcset:
        chosen = _largest_srcset_url(srcset)
        if chosen:
            return chosen
    for attr in IMG_ATTRS:
        value = img.get(attr)
        # Skip inline data: placeholders so a lazy-loaded image can fall
        # through to its real URL in a later attribute.
        if value and not value.strip().lower().startswith("data:"):
            return value
    return None


image_urls = []
seen = set()

for el in body_nodes:
    # select() only searches descendants, so an <img> that is itself an entry
    # of body_nodes must be considered directly; otherwise images whose parent
    # was not collected are missed.
    imgs = [el] if el.name == "img" else el.select("img")

    for img in imgs:
        src = _pick_image_src(img)
        if not src:
            continue

        # Resolve relative URLs against the article URL.
        full = urljoin(URL, src.strip())
        low = full.lower()

        if any(k in low for k in EXCLUDE_KEYWORDS):
            continue

        # The same image is reachable through several ancestors; keep the
        # first occurrence only.
        if full not in seen:
            seen.add(full)
            image_urls.append(full)


image_urls = image_urls[:MAX_IMAGES]

print("수집된 이미지 URL 수:", len(image_urls))


수집된 이미지 URL 수: 9


In [7]:

# Create the output folders (no error if they already exist).
os.makedirs(IMG_DIR, exist_ok=True)
os.makedirs(TEXT_DIR, exist_ok=True)

# Write the paragraphs to one UTF-8 text file, each preceded by its 1-based
# index in square brackets and followed by a blank line.
txt_path = os.path.join(TEXT_DIR, "paragraphs.txt")
with open(txt_path, "w", encoding="utf-8") as f:
    for i, p in enumerate(paragraphs, 1):
        f.write(f"[{i}]\n{p}\n\n")

print("paragraphs.txt 저장 완료:", txt_path)

# Extensions accepted as-is from the URL path.
KNOWN_IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".webp")
# Fallback mapping used when the URL path carries no recognised extension.
CONTENT_TYPE_EXTS = {
    "image/jpeg": ".jpg",
    "image/png": ".png",
    "image/webp": ".webp",
}

# Download each image to IMG_DIR as 001.ext, 002.ext, ... A failed request is
# recorded and skipped so one broken URL does not abort the remaining
# downloads; numbering follows the position in image_urls.
failed_downloads = []

for i, u in enumerate(image_urls, 1):
    try:
        img_resp = requests.get(u, headers=headers, timeout=30)
        img_resp.raise_for_status()
    except requests.RequestException as exc:
        failed_downloads.append((u, exc))
        continue

    ext = os.path.splitext(urlparse(u).path)[1].lower()
    if ext not in KNOWN_IMAGE_EXTS:
        # Prefer the server-reported media type over blindly assuming JPEG.
        media_type = img_resp.headers.get("Content-Type", "")
        media_type = media_type.split(";")[0].strip().lower()
        ext = CONTENT_TYPE_EXTS.get(media_type, ".jpg")

    path = os.path.join(IMG_DIR, f"{i:03d}{ext}")
    with open(path, "wb") as img_file:
        img_file.write(img_resp.content)

print("이미지 저장 완료:", IMG_DIR)

# Make failures visible instead of silently reporting success.
for u, exc in failed_downloads:
    print("이미지 다운로드 실패:", u, exc)


paragraphs.txt 저장 완료: elle_1893727\text\paragraphs.txt
이미지 저장 완료: elle_1893727\images
