In [None]:
%pip install -q requests beautifulsoup4 pandas deep-translator
%pip install deep-translator

import os
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from deep_translator import GoogleTranslator
from IPython.display import FileLink

In [None]:
# --- Scrape target ---------------------------------------------------------
BASE_URL = "https://www.fashionsnap.com"
# Ranking page to scrape, expressed relative to the site root above.
URL = f"{BASE_URL}/ranking/fashion/"
# Browser-like User-Agent string sent with the page request.
UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36"

# --- Output ----------------------------------------------------------------
OUT_NAME = "fashionsnap_fashion_top100_ja_ko.csv"
# Resolved against the current working directory when this cell runs.
OUT_PATH = os.path.abspath(OUT_NAME)

# Show where the CSV will be written so it is easy to locate afterwards.
print("현재 작업 폴더(CWD):", os.getcwd())
print("CSV 저장 경로(절대경로):", OUT_PATH)

In [None]:
def fetch_html(url: str, timeout: float = 30) -> str:
    """Download ``url`` and return the response body as decoded text.

    The request is sent with the notebook's ``UA`` User-Agent and an
    ``Accept-Language`` header that prefers Japanese.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        The response body decoded to ``str``.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status()`` for an error
            status. Other ``requests`` exceptions propagate unchanged.
    """
    r = requests.get(
        url,
        headers={"User-Agent": UA, "Accept-Language": "ja,en;q=0.8,ko;q=0.7"},
        timeout=timeout,
    )
    r.raise_for_status()

    # If the server does not declare a charset, requests falls back to
    # ISO-8859-1 for text/* responses, which turns Japanese text into mojibake.
    # In that case trust the encoding detected from the body instead.
    content_type = r.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        detected = r.apparent_encoding
        if detected:
            r.encoding = detected
    return r.text


In [None]:
def _find_title_for_rank(links, start: int, href: str):
    """Return the headline text for a rank badge, or ``None`` if there is none.

    Scans ``links`` from index ``start`` and stops at the first anchor whose
    text is all digits (treated as the next rank badge). Before that point,
    the first anchor that shares ``href``, has at least 4 characters of text
    and is not an all-uppercase ASCII word is taken as the title.
    """
    for j in range(start, len(links)):
        candidate = links[j]
        text = candidate.get_text(strip=True)

        # A digits-only anchor marks the start of the next ranked entry.
        if text.isdigit():
            return None

        candidate_href = (candidate.get("href") or "").strip()
        if candidate_href != href or len(text) < 4:
            continue

        # Skip all-caps ASCII words that link to the same article (e.g. BUSINESS).
        if re.fullmatch(r"[A-Z]{2,}", text):
            continue

        return text
    return None


def parse_fashion_top100_strict(html: str, limit: int = 100):
    """Extract ranked article entries from the ranking page HTML.

    An entry is recognised as an anchor whose text is a number between 1 and
    100 and whose href contains ``/article/``, followed by another anchor with
    the same href that carries the headline text.

    Args:
        html: HTML source of the ranking page.
        limit: Maximum number of distinct ranks to return. Values <= 0 yield
            an empty list.

    Returns:
        A list of dicts with keys ``rank`` (int), ``title_ja`` (str) and
        ``url`` (href joined onto ``BASE_URL``), sorted by ``rank``. If the
        same rank appears more than once, the last occurrence seen wins.
    """
    if limit <= 0:
        return []

    soup = BeautifulSoup(html, "html.parser")
    links = soup.select("a[href]")

    # Keyed by rank so the limit counts distinct ranks: a repeated rank badge
    # must not use up one of the `limit` slots.
    by_rank = {}
    for i, a in enumerate(links):
        rank_txt = a.get_text(strip=True)

        # isdecimal() (not isdigit()) matches exactly what int() can parse;
        # isdigit() also accepts characters such as "²" that make int() raise.
        if not rank_txt.isdecimal():
            continue
        rank = int(rank_txt)
        if not (1 <= rank <= 100):
            continue

        href = (a.get("href") or "").strip()
        if "/article/" not in href:
            continue

        title_ja = _find_title_for_rank(links, i + 1, href)
        if not title_ja:
            continue

        by_rank[rank] = {
            "rank": rank,
            "title_ja": title_ja,
            "url": urljoin(BASE_URL, href),
        }

        if len(by_rank) >= limit:
            break

    return sorted(by_rank.values(), key=lambda x: x["rank"])


In [None]:
def translate_ja_to_ko(items, sleep_sec: float = 0.15):
    """Add a Korean translation of ``title_ja`` to every entry, in place.

    Each entry gains two keys: ``title_ko`` (the translated text, or ``""``
    when translation failed) and ``translate_error`` (``""`` on success,
    otherwise the exception message). A failure on one entry never stops the
    rest from being processed.

    Args:
        items: List of dicts, each expected to carry ``title_ja``.
        sleep_sec: Pause after every entry, in seconds.

    Returns:
        The same ``items`` list that was passed in.
    """
    translator = GoogleTranslator(source="ja", target="ko")
    for entry in items:
        try:
            translated = translator.translate(entry["title_ja"])
        except Exception as exc:
            # Best effort: record the failure on the entry and keep going.
            translated, failure = "", str(exc)
        else:
            failure = ""
        entry["title_ko"] = translated
        entry["translate_error"] = failure
        time.sleep(sleep_sec)
    return items


In [None]:
def save_csv(items, out_path: str):
    """Write ``items`` to a CSV file and return a notebook link to it.

    The file always has the columns ``rank``, ``title_ja``, ``title_ko``,
    ``url`` and ``translate_error`` in that order. Columns absent from
    ``items`` are created and filled with empty strings; any other keys are
    dropped. The file is encoded as ``utf-8-sig`` and has no index column.

    Args:
        items: List of dicts to write, one per row.
        out_path: Destination path of the CSV file.

    Returns:
        An ``IPython.display.FileLink`` pointing at ``out_path``.
    """
    columns = ["rank", "title_ja", "title_ko", "url", "translate_error"]

    frame = pd.DataFrame(items)
    # Backfill whichever expected columns the input did not provide.
    blanks = {name: "" for name in columns if name not in frame.columns}
    frame = frame.assign(**blanks)[columns]
    frame.to_csv(out_path, index=False, encoding="utf-8-sig")

    print("저장 완료:", out_path)
    print("파일 존재:", os.path.exists(out_path))
    return FileLink(out_path)


In [None]:
# Download the ranking page and extract the ranked article entries from it.
html = fetch_html(URL)
items = parse_fashion_top100_strict(html, limit=100)

# Progress output: how many entries were collected, plus a pre-translation sample.
print("수집 개수:", len(items))
print("샘플(번역 전):", items[:3])

# translate_ja_to_ko adds "title_ko" / "translate_error" to each entry in place
# and returns the same list, so `items` still refers to the same objects.
items = translate_ja_to_ko(items)
print("샘플(번역 후):", [(x["title_ja"], x["title_ko"]) for x in items[:3]])

# Last expression of the cell: save_csv returns a FileLink to the written CSV.
save_csv(items, OUT_PATH)
