In [1]:
!pip -q install requests beautifulsoup4 lxml pandas tqdm
%pip install requests
%pip install tqdm
%pip install beautifulsoup4 lxml

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import re
import time
import json
import random
from urllib.parse import urljoin, urlparse

import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm


In [3]:
BASE = "https://www.elle.co.kr"

SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "ko-KR,ko;q=0.9,en;q=0.8",
})

def polite_sleep(min_s=0.6, max_s=1.3):
    time.sleep(random.uniform(min_s, max_s))

def get_soup(url, timeout=20):
    r = SESSION.get(url, timeout=timeout)
    r.raise_for_status()
    return BeautifulSoup(r.text, "lxml")

def clean_text(s: str) -> str:
    if not s:
        return ""
    s = re.sub(r"\s+", " ", s).strip()
   
    s = s.replace("전체 페이지를 읽으시려면 회원가입 및 로그인을 해주세요!", "").strip()
    return s

def is_elle_article_url(href: str) -> bool:
    # /article/xxxx 형태만 수집
    return bool(href) and "/article/" in href


In [4]:
def collect_article_links_from_fashion(
    start_url="https://www.elle.co.kr/fashion",
    max_pages=5
):
    """
    - fashion 섹션 페이지를 돌면서 /article/ 링크를 최대한 수집
    - 페이지네이션 구조가 바뀔 수 있어서:
      1) 현재 페이지에서 article 링크를 먼저 긁고
      2) '다음/더보기' 류 링크를 찾으면 따라가고
      3) 못 찾으면 종료
    """
    seen_pages = set()
    article_links = set()

    next_url = start_url
    for page_idx in range(max_pages):
        if not next_url or next_url in seen_pages:
            break
        seen_pages.add(next_url)

        soup = get_soup(next_url)
       
        for a in soup.select("a[href]"):
            href = a.get("href")
            if not href:
                continue
            full = urljoin(BASE, href)
            if is_elle_article_url(full):
               
                full = full.split("?")[0].rstrip("/")
                article_links.add(full)

        
        candidates = []

        
        rel_next = soup.select_one('a[rel="next"][href]')
        if rel_next:
            candidates.append(urljoin(BASE, rel_next["href"]))

        
        for a in soup.select("a[href]"):
            txt = (a.get_text(" ", strip=True) or "").lower()
            if any(k in txt for k in ["다음", "next", "more", "더보기"]):
                candidates.append(urljoin(BASE, a["href"]))

        
        for a in soup.select("a[href*='page'], a[href*='Page'], a[href*='paging']"):
            candidates.append(urljoin(BASE, a["href"]))

        
        candidates = [u for u in candidates if "/fashion" in u]
        candidates = [u.split("#")[0] for u in candidates]

        
        next_url = None
        for u in candidates:
            if u not in seen_pages:
                next_url = u
                break

        polite_sleep()

    return sorted(article_links)


links = collect_article_links_from_fashion(max_pages=3)
len(links), links[:10]


(64,
 ['https://www.elle.co.kr/article/1878702',
  'https://www.elle.co.kr/article/1882445',
  'https://www.elle.co.kr/article/1884766',
  'https://www.elle.co.kr/article/1885541',
  'https://www.elle.co.kr/article/1886775',
  'https://www.elle.co.kr/article/1888060',
  'https://www.elle.co.kr/article/1889942',
  'https://www.elle.co.kr/article/1889950',
  'https://www.elle.co.kr/article/1890164',
  'https://www.elle.co.kr/article/1890202'])

In [5]:
def parse_article(url: str) -> dict:
    soup = get_soup(url)

   
    jsonld_blocks = soup.select('script[type="application/ld+json"]')
    for block in jsonld_blocks:
        raw = block.get_text(strip=True)
        if not raw:
            continue
        try:
            data = json.loads(raw)
        except Exception:
            continue

        candidates = data if isinstance(data, list) else [data]
        for item in candidates:
            if not isinstance(item, dict):
                continue
            if item.get("@type") in ["NewsArticle", "Article", "BlogPosting"]:
                title = item.get("headline") or ""
                date_published = item.get("datePublished") or ""
                body = item.get("articleBody") or ""
                img = ""
                if isinstance(item.get("image"), str):
                    img = item["image"]
                elif isinstance(item.get("image"), dict):
                    img = item["image"].get("url", "") or ""
                elif isinstance(item.get("image"), list) and item["image"]:
                    img = item["image"][0] if isinstance(item["image"][0], str) else ""

                author = ""
                if isinstance(item.get("author"), dict):
                    author = item["author"].get("name", "") or ""
                elif isinstance(item.get("author"), list) and item["author"]:
                    if isinstance(item["author"][0], dict):
                        author = item["author"][0].get("name", "") or ""

                if body.strip():
                    return {
                        "url": url,
                        "title": clean_text(title),
                        "author": clean_text(author),
                        "date_published": clean_text(date_published),
                        "image": clean_text(img),
                        "body": clean_text(body),
                        "method": "json-ld"
                    }

    
    title = ""
    h1 = soup.find("h1")
    if h1:
        title = h1.get_text(" ", strip=True)

    
    page_text = soup.get_text("\n", strip=True)
    m = re.search(r"\bby\s+([^\n]+?)\s+(\d{4}\.\d{2}\.\d{2})\b", page_text, flags=re.IGNORECASE)
    author = m.group(1).strip() if m else ""
    date_published = m.group(2).strip() if m else ""

    
    image = ""
    og = soup.select_one('meta[property="og:image"][content]')
    if og:
        image = og.get("content", "")

   
    article = soup.find("article")
    scope = article if article else soup

    for tag in scope.select("script, style, noscript"):
        tag.decompose()

    lines = [ln.strip() for ln in scope.get_text("\n", strip=True).split("\n")]
    lines = [ln for ln in lines if len(ln) >= 20]

   
    cut_markers = ["관련기사", "MOST POPULAR", "Credit", "Copyright", "ELLE"]
    cut_idx = None
    for i, ln in enumerate(lines):
        if any(marker in ln for marker in cut_markers):
            cut_idx = i
            break
    if cut_idx is not None:
        lines = lines[:cut_idx]

    body = "\n".join(lines)

    return {
        "url": url,
        "title": clean_text(title),
        "author": clean_text(author),
        "date_published": clean_text(date_published),
        "image": clean_text(image),
        "body": clean_text(body),
        "method": "html-fallback"
    }


In [6]:

links = collect_article_links_from_fashion(max_pages=3)
print("수집된 article 링크 수:", len(links))


rows = []
for u in tqdm(links[:30]): 
    try:
        rows.append(parse_article(u))
        polite_sleep()
    except Exception as e:
        rows.append({"url": u, "error": str(e)})
        polite_sleep()

df = pd.DataFrame(rows)


df["body_preview"] = df["body"].astype(str).str.slice(0, 200)

df.head(5)


수집된 article 링크 수: 64


100%|██████████| 30/30 [00:38<00:00,  1.29s/it]


Unnamed: 0,url,title,author,date_published,image,body,method,body_preview
0,https://www.elle.co.kr/article/1878702,미니 스커트를 입으면,박기호,2025.03.18,https://www.elle.co.kr/resources/online/thumbn...,이 핫플 다음에 또 보고 싶다면? 찜하기! 시그너처 패턴을 프린트한 빅 포켓 미니스...,html-fallback,이 핫플 다음에 또 보고 싶다면? 찜하기! 시그너처 패턴을 프린트한 빅 포켓 미니스...
1,https://www.elle.co.kr/article/1882445,스카프 이곳에 두르면 몇 배 더 예뻐 보입니다,강민지,2025.05.29,https://www.elle.co.kr/resources/online/thumbn...,스카프 이곳에 두르면 몇 배 더 예뻐 보입니다 스카프 이곳에 두르면 몇 배 더 예뻐...,html-fallback,스카프 이곳에 두르면 몇 배 더 예뻐 보입니다 스카프 이곳에 두르면 몇 배 더 예뻐...
2,https://www.elle.co.kr/article/1884766,파리 패션피플은 지금 청바지를 죄다 이렇게 입어요,박지우,2025.07.15,https://www.elle.co.kr/resources/online/thumbn...,파리 패션피플은 지금 청바지를 죄다 이렇게 입어요 파리 패션피플은 지금 청바지를 죄...,html-fallback,파리 패션피플은 지금 청바지를 죄다 이렇게 입어요 파리 패션피플은 지금 청바지를 죄...
3,https://www.elle.co.kr/article/1885541,뭐 하나 허투루 안 입는 로제의 바지 레슨3,강민지,2025.07.30,https://www.elle.co.kr/resources/online/thumbn...,뭐 하나 허투루 안 입는 로제의 바지 레슨3 뭐 하나 허투루 안 입는 로제의 바지 ...,html-fallback,뭐 하나 허투루 안 입는 로제의 바지 레슨3 뭐 하나 허투루 안 입는 로제의 바지 ...
4,https://www.elle.co.kr/article/1886775,올가을 운동화보다 더 자주 신게 될 신발,강민지,2025.08.25,https://www.elle.co.kr/resources/online/thumbn...,올가을 운동화보다 더 자주 신게 될 신발 올가을 운동화보다 더 자주 신게 될 신발 ...,html-fallback,올가을 운동화보다 더 자주 신게 될 신발 올가을 운동화보다 더 자주 신게 될 신발 ...


In [7]:
out_path = "elle_fashion_articles.csv"
df.to_csv(out_path, index=False, encoding="utf-8-sig")
out_path


'elle_fashion_articles.csv'