# KOSPI200 구성종목 스크래핑

- 데이터 소스: https://index.krx.co.kr
- 스크래핑 방법: Playwright

In [None]:
import asyncio
from typing import Dict, List, Optional

import pandas as pd
from playwright.async_api import async_playwright, TimeoutError as PWTimeoutError

import random

# Page the scraper opens: KOSPI200.
BASE_URL = "https://index.krx.co.kr/contents/MKD/03/0304/03040101/MKD03040101.jsp?upmidCd=0102&idxCd=1028&idxId=K2G01P"

# Alternative page: KOSDAQ150 (uncomment to use it instead of the KOSPI200 URL).
# BASE_URL = "https://index.krx.co.kr/contents/MKD/03/0304/03040101/MKD03040101.jsp?upmidCd=0103&idxCd=2203&idxId=Q5G01P"

SAVE_EVERY = 5  # write an intermediate snapshot every 5 lookups

def monthly_first_trading_days(start: str, end: str, index_code: str = "1028") -> List[str]:
    """
    Return the first trading day of every month between ``start`` and ``end``.

    The trading calendar is the index of the OHLCV frame returned by
    ``stock.get_index_ohlcv`` for ``index_code``.

    Args:
        start: First date of the range, formatted 'YYYYMMDD'.
        end: Last date of the range, formatted 'YYYYMMDD'.
        index_code: Index code forwarded to ``stock.get_index_ohlcv``.

    Returns:
        'YYYYMMDD' strings, one per month, ordered by month.

    Raises:
        RuntimeError: If the OHLCV lookup returns ``None`` or an empty frame.
    """
    # NOTE(review): `stock` is not imported in the visible source; presumably
    # it is `pykrx.stock` (the error message below mentions pykrx) — confirm.
    ohlcv = stock.get_index_ohlcv(start, end, index_code)
    if ohlcv is None or ohlcv.empty:
        raise RuntimeError("pykrx index ohlcv fetch failed. Check dates/network.")

    trading_days = pd.to_datetime(ohlcv.index)
    # Bucket the trading days by calendar month and keep the earliest day in
    # each bucket.
    month_keys = trading_days.to_period("M")
    earliest_per_month = trading_days.to_series().groupby(month_keys).min()
    return [day.strftime("%Y%m%d") for day in earliest_per_month]

async def get_component_code(page) -> str:
    """
    Open BASE_URL and read the tab container's ``data-component-code``.

    The value comes from the first element matching ``div.design-tabs``, e.g.
    <div class="design-tabs design-tabs-base" ... data-component-code="...">.

    Args:
        page: Playwright page used for the navigation.

    Returns:
        The non-empty ``data-component-code`` attribute value.

    Raises:
        RuntimeError: If the attribute is missing or empty.
    """
    await page.goto(BASE_URL, wait_until="domcontentloaded")

    # Wait for the tab container (timeout 10000 ms) before reading from it.
    tabs = page.locator("div.design-tabs").first
    await tabs.wait_for(timeout=10000)

    component_code = await tabs.get_attribute("data-component-code")
    if component_code:
        return component_code
    raise RuntimeError("data-component-code를 찾지 못했습니다. selector 변경 가능성.")


async def goto_constituents_tab(page, component_code: str) -> None:
    """
    Navigate to the constituents tab.

    The tab is addressed through the URL fragment:
    BASE_URL + "#" + component_code + "=3".

    Args:
        page: Playwright page used for the navigation.
        component_code: ``data-component-code`` value of the tab container.
    """
    constituents_url = "{}#{}=3".format(BASE_URL, component_code)
    await page.goto(constituents_url, wait_until="domcontentloaded")

    # The tab is treated as ready once its date input shows up
    # (timeout 20000 ms).
    date_box = page.locator("input.schdate").first
    await date_box.wait_for(timeout=20000)


# URL substring used to pick out the response of interest; it can be narrowed
# to "/contents/IDX/99/IDX99000001.jspx" if needed.
TARGET_SUBSTR = "IDX99000001.jspx"

async def fetch_constituents_for_date(page, yyyymmdd: str, timeout_ms: int = 30000) -> List[str]:
    """
    Query the constituents tab for one date and return the constituent codes.

    Fills the date input, clicks the search button and captures the response
    whose URL contains TARGET_SUBSTR. Every ``isu_cd`` found in the response's
    ``output`` rows is zero-padded to six characters and prefixed with "A".

    Args:
        page: Playwright page that is already showing the constituents tab.
        yyyymmdd: Date to query, formatted 'YYYYMMDD'.
        timeout_ms: How long to wait for the matching response, in
            milliseconds.

    Returns:
        Sorted, de-duplicated codes. Empty when the payload has no usable
        ``output`` rows.

    Raises:
        RuntimeError: If no matching response is captured within
            ``timeout_ms``. The original Playwright timeout is attached as
            ``__cause__``.
    """
    date_input = page.locator("input[name='schdate'][type='text']").first
    await date_input.fill(yyyymmdd)

    # Nudge the input's event handlers so the page notices the new date.
    # This is deliberately best-effort (the search click below is what really
    # matters), but a failure is reported instead of being silently dropped.
    try:
        await date_input.press("Enter")
        await date_input.evaluate("el => el.blur()")
    except Exception as err:
        print(f"[WARN] {yyyymmdd} date input event trigger failed: {err}")

    search_btn = page.locator("button.btn-board.btn-board-search[type='button']").first

    try:
        # Arm the response listener BEFORE clicking so the response cannot be
        # missed.
        async with page.expect_response(
            lambda resp: TARGET_SUBSTR in resp.url, timeout=timeout_ms
        ) as resp_info:
            await search_btn.click()
        resp = await resp_info.value
    except PWTimeoutError as err:
        # Debugging aid only: a failed screenshot must not mask the timeout.
        try:
            await page.screenshot(path=f"krx_timeout_{yyyymmdd}.png", full_page=True)
        except Exception as shot_err:
            print(f"[WARN] {yyyymmdd} timeout screenshot failed: {shot_err}")
        raise RuntimeError(f"{TARGET_SUBSTR} 응답을 못 잡음: {yyyymmdd}") from err

    # Parse the JSON payload. Tolerate a non-dict payload and a missing or
    # null "output" entry by treating them as "no rows".
    data = await resp.json()
    rows = data.get("output") if isinstance(data, dict) else None
    codes = sorted(
        {
            "A" + str(row["isu_cd"]).zfill(6)
            for row in rows or []
            if isinstance(row, dict) and "isu_cd" in row
        }
    )

    if len(codes) < 150:
        # Fewer codes than expected: the captured response may be a different
        # request that also matches TARGET_SUBSTR, in which case the match
        # should be narrowed.
        print(f"[WARN] {yyyymmdd} codes={len(codes)} url={resp.url}")

    return codes

async def fetch_with_retry(page, yyyymmdd: str, retries: int = 2) -> List[str]:
    """
    Call ``fetch_constituents_for_date`` and retry on known transient errors.

    An error is treated as transient when its message contains
    "No data found for resource" or "Protocol error". Any other error, or a
    transient one on the final attempt, is re-raised unchanged.

    Args:
        page: Playwright page forwarded to ``fetch_constituents_for_date``.
        yyyymmdd: Date to query, formatted 'YYYYMMDD'.
        retries: Number of additional attempts after the first one.

    Returns:
        The codes returned by ``fetch_constituents_for_date``.

    Raises:
        ValueError: If ``retries`` is negative.
        Exception: Whatever the final failed attempt raised.
    """
    if retries < 0:
        # Without this guard the loop body would never run and the function
        # would end up raising a confusing TypeError.
        raise ValueError(f"retries must be >= 0, got {retries}")

    for attempt in range(retries + 1):
        try:
            return await fetch_constituents_for_date(page, yyyymmdd)
        except Exception as err:
            msg = str(err)
            transient = ("No data found for resource" in msg) or ("Protocol error" in msg)
            if attempt >= retries or not transient:
                raise
            # Back off a little longer on each attempt, with jitter:
            # 2~4s, 4~6s, ...
            backoff = 2.0 + attempt * 2.0 + random.random() * 2.0
            print(f"  -> retry {attempt+1}/{retries} after {backoff:.1f}s ({yyyymmdd})")
            await asyncio.sleep(backoff)

    # The final attempt always returns or re-raises inside the loop.
    raise AssertionError("unreachable: retry loop exited without a result")

def make_wide_df(date_to_codes: Dict[str, List[str]]) -> pd.DataFrame:
    """
    Build a wide frame with one column per date.

    Columns are the dates in sorted order; each column lists that date's codes
    top to bottom. Shorter columns are padded with ``None`` so that every
    column has the length of the longest code list.
    """
    longest = max(map(len, date_to_codes.values()), default=0)
    columns = {
        day: date_to_codes[day] + [None] * (longest - len(date_to_codes[day]))
        for day in sorted(date_to_codes)
    }
    return pd.DataFrame(columns)


async def scrape_kospi200_constituents_by_date(
    start: str,
    end: str,
    dates: Optional[List[str]] = None,
    headless: bool = True,
    sleep_sec: float = 0.2,
    out_csv: str = "kospi200_constituents_by_date.csv",
) -> pd.DataFrame:
    """
    Scrape the index constituents for a list of dates and save them as CSV.

    Opens BASE_URL in Chromium, switches to the constituents tab, then queries
    each date in turn. Failed or empty dates are logged and skipped. A
    snapshot of the results so far is written to ``out_csv`` every SAVE_EVERY
    dates, and once more at the end.

    Args:
        start: First date of the range, formatted 'YYYYMMDD'. Only used when
            ``dates`` is None.
        end: Last date of the range, formatted 'YYYYMMDD'. Only used when
            ``dates`` is None.
        dates: Explicit list of 'YYYYMMDD' dates to query (for example only
            the first business day of each month). When None, the monthly
            first trading days between ``start`` and ``end`` are used.
        headless: Whether to launch the browser headless.
        sleep_sec: Minimum pause between two lookups, in seconds. The actual
            pause is a random 1~3 s delay, raised to ``sleep_sec`` when that
            is larger, so the default leaves the random delay untouched.
        out_csv: Path of the CSV file to write.

    Returns:
        Wide frame with one column per successfully scraped date; an empty
        frame when no date yielded data.
    """
    if dates is None:
        # "1028" is KOSPI200. For KOSDAQ150 use index_code="2203" here and
        # switch BASE_URL to the KOSDAQ150 page as well.
        dates = monthly_first_trading_days(start, end, index_code="1028")

    date_to_codes: Dict[str, List[str]] = {}
    total = len(dates)

    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=headless)
        try:
            context = await browser.new_context(locale="ko-KR")
            page = await context.new_page()

            # Open BASE_URL, read the component code, then move to the
            # constituents tab.
            component_code = await get_component_code(page)
            await goto_constituents_tab(page, component_code)

            for i, d in enumerate(dates, 1):
                try:
                    codes = await fetch_with_retry(page, d, retries=2)
                except PWTimeoutError:
                    # Lookups can fail on holidays and the like.
                    print(f"[{i}/{total}] {d} TIMEOUT (휴일/응답없음 가능) → skip")
                except Exception as e:
                    print(f"[{i}/{total}] {d} FAILED: {e}")
                else:
                    if codes:
                        date_to_codes[d] = codes
                        print(f"[{i}/{total}] {d} OK n={len(codes)}")
                    else:
                        print(f"[{i}/{total}] {d} SKIP (no data)")

                # Intermediate snapshot, so a crash does not lose everything.
                if (i % SAVE_EVERY == 0) and date_to_codes:
                    df_partial = make_wide_df(date_to_codes)
                    df_partial.to_csv(out_csv, index=False, encoding="utf-8-sig")
                    print(f"  -> partial saved: {out_csv} ({len(date_to_codes)} dates)")

                # Throttle between lookups, including after an empty result.
                # There is nothing left to throttle after the last date.
                if i < total:
                    await asyncio.sleep(max(sleep_sec, random.uniform(1.0, 3.0)))
        finally:
            # Close the browser even when navigation or scraping raised.
            await browser.close()

    # Final save. Nothing is written when no date produced data.
    df_final = make_wide_df(date_to_codes)
    if date_to_codes:
        df_final.to_csv(out_csv, index=False, encoding="utf-8-sig")
        print(f"FINAL saved: {out_csv} ({len(date_to_codes)} dates)")

    return df_final

In [None]:
# Example: a date list can also be injected directly through `dates`.
# dates = ["20251103"]  # <- in practice, the monthly first trading days from 2010 until now
df = await scrape_kospi200_constituents_by_date(
    start="20100101",
    end="20260203",
    # dates=dates,
    headless=True,
    out_csv="kospi200.csv",
)

[1/194] 20100104 OK n=200
[2/194] 20100201 OK n=200
[3/194] 20100302 OK n=200
[4/194] 20100401 OK n=200
[5/194] 20100503 OK n=200
  -> partial saved: kospi200.csv (5 dates)
[6/194] 20100601 OK n=200
[7/194] 20100701 OK n=200
[8/194] 20100802 OK n=200
[9/194] 20100901 OK n=200
[10/194] 20101001 OK n=200
  -> partial saved: kospi200.csv (10 dates)
[11/194] 20101101 OK n=200
[12/194] 20101201 OK n=200
[13/194] 20110103 OK n=200
[14/194] 20110201 OK n=200
[15/194] 20110302 OK n=200
  -> partial saved: kospi200.csv (15 dates)
[16/194] 20110401 OK n=200
[17/194] 20110502 OK n=200
[18/194] 20110601 OK n=200
[19/194] 20110701 OK n=200
[20/194] 20110801 OK n=200
  -> partial saved: kospi200.csv (20 dates)
[21/194] 20110901 OK n=200
[22/194] 20111004 OK n=200
[23/194] 20111101 OK n=200
[24/194] 20111201 OK n=200
[25/194] 20120102 OK n=200
  -> partial saved: kospi200.csv (25 dates)
[26/194] 20120201 OK n=200
[27/194] 20120302 OK n=200
[28/194] 20120402 OK n=200
[29/194] 20120502 OK n=200
[30/194

In [None]:
# KOSDAQ 150 constituents.
# The index is not parameterized, so before running this cell comment out the
# KOSPI200 parts of the source above and uncomment the KOSDAQ150 parts.

df = await scrape_kospi200_constituents_by_date(
    start="20150801",
    end="20260203",
    # dates=dates,
    headless=True,
    out_csv="kosdaq150.csv",
)

[1/127] 20150803 OK n=150
[2/127] 20150901 OK n=150
[3/127] 20151001 OK n=150
[4/127] 20151102 OK n=150
[5/127] 20151201 OK n=150
  -> partial saved: kosdaq150.csv (5 dates)
[6/127] 20160104 OK n=150
[7/127] 20160201 OK n=150
[8/127] 20160302 OK n=150
[9/127] 20160401 OK n=150
[10/127] 20160502 OK n=150
  -> partial saved: kosdaq150.csv (10 dates)
[11/127] 20160601 OK n=150
[12/127] 20160701 OK n=150
[13/127] 20160801 OK n=150
[14/127] 20160901 OK n=150
[15/127] 20161004 OK n=150
  -> partial saved: kosdaq150.csv (15 dates)
[16/127] 20161101 OK n=150
[17/127] 20161201 OK n=150
[18/127] 20170102 OK n=150
[19/127] 20170201 OK n=150
[20/127] 20170302 OK n=150
  -> partial saved: kosdaq150.csv (20 dates)
[21/127] 20170403 OK n=150
[22/127] 20170502 OK n=150
[23/127] 20170601 OK n=150
[24/127] 20170703 OK n=150
[25/127] 20170801 OK n=150
  -> partial saved: kosdaq150.csv (25 dates)
[26/127] 20170901 OK n=150
[27/127] 20171010 OK n=150
[28/127] 20171101 OK n=150
[29/127] 20171201 OK n=150
[3