In [None]:
import time
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

def safe_get_and_wait(driver, wait, url, max_retries=2):
    """Load ``url`` and wait until ``#table_1`` contains at least one row.

    The page is considered ready once an element matching
    ``#table_1 tbody tr.odd`` or ``#table_1 tbody tr.even`` is present.
    Only ``TimeoutException`` triggers a retry; any other exception raised
    by ``driver.get`` or ``wait.until`` propagates to the caller.

    Args:
        driver: Object whose ``get(url)`` method navigates to the page.
        wait: Object whose ``until(condition)`` method blocks until the
            condition holds or raises ``TimeoutException``.
        url: Address to load.
        max_retries: Number of additional attempts after the first one.
            The pause before retry ``k`` (0-based) is ``2 ** k`` seconds.

    Returns:
        ``True`` if a row appeared, ``False`` if every attempt timed out or
        if ``max_retries`` is negative (in which case no attempt is made).
    """
    for attempt in range(max_retries + 1):
        try:
            driver.get(url)
            wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#table_1 tbody tr.odd, #table_1 tbody tr.even")
                )
            )
        except TimeoutException:
            if attempt >= max_retries:
                print(f"[error] giving up on {url}")
                return False
            backoff = 2 ** attempt
            print(f"[warn] timeout {url} — retry {attempt+1} in {backoff}s")
            time.sleep(backoff)
        else:
            # Short pause after the first row shows up; presumably gives the
            # rest of the table time to finish rendering before it is read.
            time.sleep(0.2)
            return True

    # Reached only when max_retries < 0 (the loop never ran): report failure
    # explicitly rather than falling through with an implicit None.
    return False


def _extract_bio_row(page_source):
    """Return the first data row of ``#table_1`` as a dict, or ``None``.

    ``None`` is returned when the page has no ``table_1`` element, when the
    table contains a ``td.dataTables_empty`` placeholder cell, or when the
    parsed table has no rows at all.
    """
    soup = BeautifulSoup(page_source, "lxml")

    # table_1 bio only
    bio_tbl = soup.find("table", id="table_1")
    if not bio_tbl or bio_tbl.find("td", class_="dataTables_empty"):
        return None

    # Wrap the markup in StringIO: handing read_html a literal HTML string
    # is deprecated in recent pandas releases.
    bio_df = pd.read_html(StringIO(str(bio_tbl)), flavor="lxml")[0]
    if bio_df.empty:
        # Header-only table: iloc[0] below would raise IndexError.
        return None

    bio_df.columns = bio_df.columns.str.strip()
    return bio_df.iloc[0].to_dict()


def scrape_npb_bio(ids, chunk_size=5000):
    """Scrape the first ``table_1`` row for each search ID on npbstats.com.

    For every ID the batting search page is loaded in headless Chrome and
    the first row of ``#table_1`` is stored as a dict, with the ID added
    under the ``"SearchID"`` key. IDs whose page times out or yields no row
    are skipped.

    Each time ``chunk_size`` rows have accumulated they are written to
    ``npbstats_bio_chunk_<n>.csv`` (relative path, ``n`` starting at 1) and
    dropped from memory.

    Args:
        ids: Iterable of IDs to substitute into the ``wdt_search`` query
            parameter. It is materialised into a list up front.
        chunk_size: Number of rows per intermediate CSV file.

    Returns:
        A DataFrame holding only the rows collected since the last chunk
        was written (possibly empty). If ``ids`` is empty, an empty
        DataFrame is returned without starting a browser.
    """
    ids = list(ids)
    if not ids:
        # Nothing to look up, so do not launch Chrome at all.
        return pd.DataFrame([])

    opts = Options()
    opts.add_argument("--headless=new")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--window-size=1920,1080")
    opts.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    opts.add_experimental_option("useAutomationExtension", False)

    driver = webdriver.Chrome(options=opts)
    out_rows, chunk_idx = [], 1
    # Everything after the driver exists runs under try/finally so the
    # browser is shut down even if the CDP setup call itself fails.
    try:
        # Make navigator.webdriver read as undefined on every new document.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator,'webdriver',{get:()=>undefined})"},
        )
        wait = WebDriverWait(driver, 20)

        with tqdm(ids, desc="Scraping bio (table_1)") as pbar:
            for pid in pbar:
                pbar.set_postfix(pid=pid)
                url = f"http://npbstats.com/players/db/batting/?wdt_search={pid}"
                if not safe_get_and_wait(driver, wait, url):
                    continue

                bio = _extract_bio_row(driver.page_source)
                if bio is None:
                    continue

                bio["SearchID"] = pid
                out_rows.append(bio)

                if len(out_rows) >= chunk_size:
                    df_chunk = pd.DataFrame(out_rows)
                    fname = f"npbstats_bio_chunk_{chunk_idx}.csv"
                    df_chunk.to_csv(fname, index=False)
                    pbar.write(f"[saved] {fname}")
                    chunk_idx += 1
                    out_rows.clear()

    finally:
        driver.quit()

    return pd.DataFrame(out_rows)

if __name__ == "__main__":
    # Search IDs 100001 through 107184, as strings.
    ids = list(map(str, range(100001, 107185)))

    # Full chunks are written by scrape_npb_bio itself; only the rows left
    # over after the last chunk come back and need saving here.
    df_remain = scrape_npb_bio(ids)
    if len(df_remain.index) > 0 and len(df_remain.columns) > 0:
        df_remain.to_csv("npbstats_bio_final.csv", index=False)