From 84c4817cf7fceced172e3dafbf097174fdb40030 Mon Sep 17 00:00:00 2001 From: cocoyoon Date: Thu, 7 May 2026 18:40:56 +0900 Subject: [PATCH] fix(starstyle-backfill): mirror hero images to R2 before INSERT (#466) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit starstyle 의 og:image 는 admin (HTTPS) 에서 직접 hotlink 가 막힌다: - HTTPS 요청 → 301 redirect to HTTP → mixed-content 차단 - HTTP 요청 → User-Agent / Referer 가 비어 있으면 403 백필 시점에 httpx 로 upstream 이미지 다운로드 → boto3 로 R2 (RAW_POSTS_R2_BUCKET) 에 ``starstyle/{shard}/{external_id}.jpg`` 키로 업로드하고, raw_posts.image_url 을 R2 public URL 로 저장한다. admin 은 R2 직접 fetch (HTTPS clean, hotlink 없음) → Vercel image-proxy 우회. - HEAD 로 R2 객체 존재 확인 → 재실행 시 중복 업로드 skip (idempotent) - 다운로드/업로드 실패 시 upstream URL 로 fallback (image_url 그대로) - RAW_POSTS_R2_* env 미설정 시 mirror skip + warning - PostData @dataclass(frozen=True) → frozen 제거 (image_url 교체 위해) 검증: 500 row 재백필, 479/500 R2 mirrored. 샘플 R2 URL HTTPS 200 확인. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../scripts/backfill_starstyle_posts.py | 147 +++++++++++++++++- 1 file changed, 146 insertions(+), 1 deletion(-) diff --git a/packages/ai-server/scripts/backfill_starstyle_posts.py b/packages/ai-server/scripts/backfill_starstyle_posts.py index e0a1b25f..c1602eab 100644 --- a/packages/ai-server/scripts/backfill_starstyle_posts.py +++ b/packages/ai-server/scripts/backfill_starstyle_posts.py @@ -81,6 +81,15 @@ def _env(name: str, *, required: bool = True) -> str: _SUPABASE_URL = _env("ASSETS_DATABASE_API_URL").rstrip("/") _SERVICE_ROLE_KEY = _env("ASSETS_DATABASE_SERVICE_ROLE_KEY") +# R2 — starstyle 의 og:image 는 hotlink 보호 + HTTPS→HTTP 리다이렉트 때문에 +# admin (HTTPS) 에서 직접 fetch 가 안 됨. 백필 시 R2 에 미러링해서 image_url 을 +# R2 public URL 로 저장 → admin 이 vercel proxy 우회.
+_R2_ACCOUNT_ID = _env("RAW_POSTS_R2_ACCOUNT_ID", required=False) +_R2_ACCESS_KEY = _env("RAW_POSTS_R2_ACCESS_KEY_ID", required=False) +_R2_SECRET_KEY = _env("RAW_POSTS_R2_SECRET_ACCESS_KEY", required=False) +_R2_BUCKET = _env("RAW_POSTS_R2_BUCKET", required=False) or "raw" +_R2_PUBLIC_URL = (_env("RAW_POSTS_R2_PUBLIC_URL", required=False) or "").rstrip("/") + _INSERT_BATCH = 100 logger = logging.getLogger("backfill_starstyle") @@ -89,7 +98,7 @@ def _env(name: str, *, required: bool = True) -> str: # ---------------------------------------------------------- types & helpers -@dataclass(frozen=True) +@dataclass class PostData: post_id: str slug: str @@ -311,6 +320,93 @@ async def fetch_existing_external_ids(http: httpx.AsyncClient) -> Set[str]: return out +# --------------------------------------------------- R2 mirror (hero image) + + +def _r2_client(): + """boto3 S3 client for Cloudflare R2. Returns None if not configured.""" + if not (_R2_ACCOUNT_ID and _R2_ACCESS_KEY and _R2_SECRET_KEY): + return None + import boto3 # local import — backfill 만 사용 + from botocore.client import Config + + return boto3.client( + "s3", + endpoint_url=f"https://{_R2_ACCOUNT_ID}.r2.cloudflarestorage.com", + aws_access_key_id=_R2_ACCESS_KEY, + aws_secret_access_key=_R2_SECRET_KEY, + config=Config(signature_version="s3v4"), + region_name="auto", + ) + + +def _build_r2_key(external_id: str) -> str: + """``starstyle/{shard}/{id}.jpg``. pipeline._build_r2_key 와 같은 형식. + + starstyle 의 hero 는 항상 jpg (og:image 의 ``.jpg`` 확장자) — 다양한 + 포맷을 처리하지 않고 jpg 로 통일. 다른 포맷이 들어오면 그대로 jpg 로 저장 + (R2 ContentType 에서 결정, 브라우저는 sniffing). + """ + safe = "".join(c if c.isalnum() else "-" for c in external_id).strip("-") or "x" + shard = safe[:2] or "_" + return f"starstyle/{shard}/{safe}.jpg" + + +async def mirror_image_to_r2( + http: httpx.AsyncClient, + *, + s3, + image_url: str, + external_id: str, + referer: str, +) -> Optional[str]: + """upstream 이미지 다운로드 → R2 put → R2 public URL 반환. 
+ + 실패 시 None — caller 가 image_url 로 upstream URL 그대로 사용. + 이미 R2 에 있으면 (HEAD 200) 다운로드 skip. + """ + if s3 is None or not _R2_PUBLIC_URL: + return None + key = _build_r2_key(external_id) + public_url = f"{_R2_PUBLIC_URL}/{key}" + # HEAD 로 존재 확인 — 재실행 시 중복 업로드 방지. + try: + await asyncio.to_thread(s3.head_object, Bucket=_R2_BUCKET, Key=key) + return public_url + except Exception: + pass + + try: + resp = await http.get( + image_url, + headers={"User-Agent": _USER_AGENT, "Referer": referer}, + follow_redirects=True, + timeout=30, + ) + resp.raise_for_status() + data = resp.content + ct = (resp.headers.get("content-type") or "image/jpeg").split(";")[0].strip() + if not ct.startswith("image/"): + logger.warning("mirror: skip %s — bad content-type %s", external_id, ct) + return None + except Exception as exc: + logger.warning("mirror: download failed for %s — %s", external_id, exc) + return None + + try: + await asyncio.to_thread( + s3.put_object, + Bucket=_R2_BUCKET, + Key=key, + Body=data, + ContentType=ct, + ) + except Exception as exc: + logger.warning("mirror: R2 put failed for %s — %s", external_id, exc) + return None + return public_url + + async def ensure_global_feed_source(http: httpx.AsyncClient) -> str: body = [ { @@ -531,6 +627,55 @@ async def _run(args) -> int: source_id = await ensure_global_feed_source(http) logger.info(" source_id = %s", source_id) + # R2 mirror — admin (HTTPS) 에서 starstyle 이미지 hotlink 가 막혀 + # backfill 시 R2 에 미러링하고 image_url 을 R2 public URL 로 저장. 
+ s3 = _r2_client() + if s3 is not None: + logger.info( + "R2 mirror: bucket=%s public=%s — mirroring %d images", + _R2_BUCKET, + _R2_PUBLIC_URL, + len(posts_new), + ) + sem = asyncio.Semaphore(args.concurrency) + mirrored = 0 + failed = 0 + + async def _mirror_one(d: PostData): + nonlocal mirrored, failed + async with sem: + new_url = await mirror_image_to_r2( + http, + s3=s3, + image_url=d.image_url, + external_id=d.post_id, + referer=d.url, + ) + if new_url: + d.image_url = new_url + mirrored += 1 + else: + failed += 1 + if (mirrored + failed) % 50 == 0: + logger.info( + " mirror progress: %d done (%d mirrored, %d failed)", + mirrored + failed, + mirrored, + failed, + ) + + await asyncio.gather(*(_mirror_one(p) for p in posts_new)) + logger.info( + "R2 mirror done: %d mirrored, %d failed (fallback to upstream)", + mirrored, + failed, + ) + else: + logger.warning( + "R2 not configured (RAW_POSTS_R2_* env missing) — " + "image_url 은 upstream URL 그대로 (admin preview 깨짐)" + ) + dispatch_id = f"backfill-{uuid.uuid4().hex[:12]}" total_inserted = 0 n_batches = (len(posts_new) + _INSERT_BATCH - 1) // _INSERT_BATCH