In [34]:

from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
import os
import random, time
USER_AGENT = os.getenv("USER_AGENT", "Mozilla/5.0 (chzzk-insights crawler)")

HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept": "application/json",
    "Referer": "https://chzzk.naver.com/",
}
BASE = "https://api.chzzk.naver.com/service/v1/categories/live"
DEFAULT_TIMEOUT = float(os.getenv("HTTP_TIMEOUT", "10"))
targets = [
    "Minecraft","Lost_Ark","Player_Unknowns_Battle_Grounds","Black_Survival_Eternal_Return",
    "various_games","Teamfight_Tactics","talk","StarCraft_Remastered","MapleStory","Hearthstone",
    "League_of_Legends","World_of_Warcraft_The_War_Within","Escape_from_Tarkov","OVERWATCH",
    "art","mabinogimobile","Dungeon_Fighter","DEAD_BY_DAYLIGHT","TEKKEN8","Apex_Legends","asd"
]

In [38]:
class FetchError(Exception): pass

def sleep_with_jitter(a: float = 1.0, b: float = 2.0) -> None:
    time.sleep(random.uniform(a, b))
    

def fetch_page(next_params: dict | None = None,
    size: int = 50,
    verbose = True,
    retries = 2,
    backoff=1.5):
    # next_params 가 dict면 그 키-값을 그대로 쿼리스트링에 펼쳐서 보냄
    params = {"size": size}
    if next_params:
        if not isinstance(next_params, dict):
            next_params = None
        else:
            if verbose:
                print("[fetch_page] merging next params:", next_params)
            params.update(next_params)
    last_err = None
    for i in range(retries):
        try:
            r = requests.get(BASE, params=params, headers=HEADERS, timeout=12)
            r.raise_for_status()
            j = r.json()

            if verbose:
                print("[fetch_page] requested URL:", r.url, "->", r.status_code)
                
            content = j.get("content", {})
            items = content.get("data", [])
            # 카테고리 배열
            
            if not isinstance(items, list):
                raise FetchError(f"items is not a list: {type(items).__name__}")

            next_obj = content.get("page", {}).get("next")         # dict 또는 None
            # 보통 next_obj가 dict면 다음 호출에서 params.update(next_obj)
            
            
            if verbose:
                print(f"[fetch_page] got items={len(items)}, has_next={bool(next_obj)}")
            return items, next_obj
        except Exception as e:
            last_err = e
            if verbose: print(f"[fetch_page] warn: {e} retry {i+1}/{retries} due to {e}")
            raise FetchError(f"fetch_page failed: {last_err}")

# 여러 페이지를 순회하며 특정 카테고리 ID 목록에 해당하는 카테고리 정보 수집 *Extract 2
def summarize_categories(target_ids,
                         max_pages=20,
                         page_size=40,
                         verbose=True,
                         timeout: float = DEFAULT_TIMEOUT, sleep_range=(1.0, 2.0)):
    
    target_set = set(target_ids)
    found: dict[str, dict] = {}
    next_params = None
    
    for page in range(max_pages):

        items, next_params = fetch_page(next_params=next_params, size=page_size)
        if verbose:
            print(f"[page {page+1}] items: {len(items)}, next_params: {bool(next_params)}")

        if not items:
            break

        for it in items:
            cid = it.get("categoryId")
            if cid in target_set and cid not in found:
                found[cid] = {
                    "categoryId": cid,
                    "categoryType": it.get("categoryType"),
                    "categoryValue": it.get("categoryValue"),
                    "openLiveCount": int(it.get("openLiveCount", 0)),
                    "concurrentUserCount": int(it.get("concurrentUserCount", 0)),
                    "posterImageUrl": it.get("posterImageUrl")
                }

        # 목표 개수 채우면 조기 종료
        if len(found) == len(target_set):
            if verbose:
                print("All target categories found.")
            break

        if not next_params:
            if verbose:
                print("No more pages. Stop.")
            break

        sleep_with_jitter(*sleep_range)  # 레이트리밋
    coverage = len(found) / len(target_set)
    if coverage < 0.9:
        missing = [t for t in target_ids if t not in found]
        raise FetchError(f"coverage {coverage:.0%}, missing={missing[:5]}...")
    missing = [t for t in target_ids if t not in found]
    print(missing)
    # target_ids 순서를 보존해서 리스트로 반환
    return [found[cid] for cid in target_ids if cid in found]


In [39]:

try:
    result = summarize_categories(targets, max_pages=20, page_size=40, verbose=False)# 카테고리 정보 수집
except Exception as e:
    print(e)
    

[fetch_page] requested URL: https://api.chzzk.naver.com/service/v1/categories/live?size=40 -> 200
[fetch_page] got items=40, has_next=True
[fetch_page] merging next params: {'concurrentUserCount': 383, 'openLiveCount': 19, 'categoryId': 'Mabinogi'}
[fetch_page] requested URL: https://api.chzzk.naver.com/service/v1/categories/live?size=40&concurrentUserCount=383&openLiveCount=19&categoryId=Mabinogi -> 200
[fetch_page] got items=40, has_next=True
[fetch_page] merging next params: {'concurrentUserCount': 62, 'openLiveCount': 7, 'categoryId': 'Heroes_of_the_Strom'}
[fetch_page] requested URL: https://api.chzzk.naver.com/service/v1/categories/live?size=40&concurrentUserCount=62&openLiveCount=7&categoryId=Heroes_of_the_Strom -> 200
[fetch_page] got items=40, has_next=True
[fetch_page] merging next params: {'concurrentUserCount': 21, 'openLiveCount': 2, 'categoryId': 'Undecember'}
[fetch_page] requested URL: https://api.chzzk.naver.com/service/v1/categories/live?size=40&concurrentUserCount=21

In [37]:
print(result)

[{'categoryId': 'Minecraft', 'categoryType': 'GAME', 'categoryValue': '마인크래프트', 'openLiveCount': 71, 'concurrentUserCount': 339, 'posterImageUrl': 'https://nng-phinf.pstatic.net/MjAyMzEyMTFfMTM3/MDAxNzAyMjgyNjI1MDgw.Ozu1fi3gdfyooyBjO_SGXJBDqRWgDFLlmWAFZg6qYIUg.rlnppQX9tMgn2nvgjhXwqsJhfktN23Gdjj09CgJ--BYg.JPEG/104.%EB%A7%88%EC%9D%B8%ED%81%AC%EB%9E%98%ED%94%84%ED%8A%B8_%EC%99%84%EC%84%B1%EB%B3%B8_%EC%9D%B4%EB%8B%AC%EB%8B%98.jpg'}, {'categoryId': 'Lost_Ark', 'categoryType': 'GAME', 'categoryValue': '로스트아크', 'openLiveCount': 230, 'concurrentUserCount': 10901, 'posterImageUrl': 'https://nng-phinf.pstatic.net/MjAyNTA4MjBfODUg/MDAxNzU1NjUxNzQxOTg3.qXiQF9vOwlRGyjYR9ck5zcZQ3NyQCLZKqAd4D4oUpqsg.5c4wTQ_xLWJi7AO20YANQ_A2n2Uor22eVrmuWncsWIIg.JPEG/%EB%A1%9C%EC%8A%A4%ED%8A%B8%EC%95%84%ED%81%AC.jpg'}, {'categoryId': 'Player_Unknowns_Battle_Grounds', 'categoryType': 'GAME', 'categoryValue': 'PUBG: 배틀그라운드', 'openLiveCount': 138, 'concurrentUserCount': 930, 'posterImageUrl': 'https://nng-phinf.pstatic.ne