In [2]:
from io import StringIO
import time, pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from tqdm import tqdm

def safe_get_and_wait(driver, wait, url, max_retries=2):
    """Load ``url`` and wait until both stats tables contain a body row.

    The page counts as loaded once ``#table_16`` and then ``#table_7`` each
    have at least one ``tr.odd`` / ``tr.even`` row in their ``tbody``.

    Args:
        driver: Selenium WebDriver used to navigate.
        wait: WebDriverWait bound to ``driver``; its timeout applies to each
            of the two table waits separately.
        url: Page to load.
        max_retries: Extra attempts after the first one times out. A negative
            value means no attempt is made at all.

    Returns:
        True if both tables showed a row on some attempt, False if every
        attempt timed out (or no attempt was made).
    """
    row_selectors = (
        "#table_16 tbody tr.odd, #table_16 tbody tr.even",
        "#table_7 tbody tr.odd, #table_7 tbody tr.even",
    )
    for attempt in range(max_retries + 1):
        try:
            driver.get(url)
            for selector in row_selectors:
                wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                )
            # Brief pause once both rows exist — presumably lets the rest of
            # the table finish rendering before page_source is read.
            time.sleep(0.2)
            return True
        except TimeoutException:
            if attempt >= max_retries:
                return False
            # Exponential backoff: 1s, 2s, 4s, ...
            backoff = 2 ** attempt
            tqdm.write(f"[warn] timeout {url} — retry {attempt + 1} in {backoff}s")
            time.sleep(backoff)
    # Only reached when max_retries < 0; be explicit instead of returning None.
    return False

# Rows whose Year cell holds this text ("There is no data.") are dropped.
_NO_DATA_MARKER = "データはありません。"


def _read_table(soup, table_id):
    """Return ``<table id=table_id>`` as a DataFrame, or None if it is absent."""
    tbl = soup.find("table", id=table_id)
    if tbl is None:
        return None
    # Passing literal HTML to read_html is deprecated in recent pandas;
    # a file-like wrapper works on old and new versions alike.
    frame = pd.read_html(StringIO(str(tbl)), flavor="lxml")[0]
    frame.columns = frame.columns.str.strip()
    return frame


def _player_rows(soup, pid):
    """Build one dict per Year for player ``pid`` from a parsed page.

    Returns an empty list when ``table_1`` is absent. Raises ValueError when
    ``table_7`` or ``table_16`` is absent.
    """
    bio_df = _read_table(soup, "table_1")
    if bio_df is None:
        return []
    # Only the first row of table_1 is used as the player's bio fields.
    bio = bio_df.iloc[0].to_dict() | {"SearchID": pid}

    dash_df = _read_table(soup, "table_7")
    pit_df = _read_table(soup, "table_16")
    if dash_df is None or pit_df is None:
        raise ValueError("table_7 or table_16 missing from page")

    # NOTE: both frames get a SearchID column, so after the merge on Year
    # they come out as SearchID_x / SearchID_y (pandas' default suffixes)
    # next to the SearchID from the bio dict. Kept as-is so the CSV schema
    # does not change.
    dash_df["SearchID"] = pid
    pit_df["SearchID"] = pid

    # One row per Year on each side (first occurrence kept) before joining.
    pit_df = pit_df.drop_duplicates("Year")
    dash_df = dash_df.drop_duplicates("Year")
    merged = pit_df.merge(dash_df, on="Year", how="left")

    return [
        {**bio, **season.to_dict()}
        for _, season in merged.iterrows()
        if season["Year"] != _NO_DATA_MARKER
    ]


def scrape_pitching(ids, chunk_size=5000):
    """Scrape npbstats.com pitching pages for every player id in ``ids``.

    Each id's search page is loaded in headless Chrome and three tables are
    parsed: ``table_1`` (bio fields), ``table_16`` and ``table_7`` (per-Year
    tables, left-joined on ``Year``). One output row is produced per Year.

    Whenever at least ``chunk_size`` rows are buffered they are written to
    ``pitching_chunk_<n>.csv`` in the working directory and the buffer is
    emptied. Ids whose page times out or whose tables cannot be parsed are
    reported and skipped rather than aborting the whole run.

    Args:
        ids: Iterable of player ids, interpolated into the ``wdt_search``
            query parameter.
        chunk_size: Buffered-row count that triggers writing a chunk file.

    Returns:
        DataFrame with only the rows still buffered at the end, i.e. those
        not already written to a chunk file.
    """
    opts = Options()
    opts.add_argument("--headless=new")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--window-size=1920,1080")
    opts.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    opts.add_experimental_option("useAutomationExtension", False)

    driver = webdriver.Chrome(options=opts)
    rows, chunk_idx = [], 1
    # Everything after the browser starts is inside try/finally so the Chrome
    # process is shut down even if the CDP call below fails.
    try:
        # Make navigator.webdriver read as undefined on every new document.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator,'webdriver',{get:()=>undefined})"},
        )
        wait = WebDriverWait(driver, 20)

        with tqdm(ids, desc="Scraping pitching (16/7)") as pbar:
            for pid in pbar:
                pbar.set_postfix(pid=pid)
                url = f"http://npbstats.com/players/db/pitching/?wdt_search={pid}"
                if not safe_get_and_wait(driver, wait, url):
                    tqdm.write(f"[skip] {pid}: page did not load")
                    continue

                soup = BeautifulSoup(driver.page_source, "lxml")
                try:
                    # Built as a whole list first so a failure part-way
                    # through never leaves half a player in the buffer.
                    rows.extend(_player_rows(soup, pid))
                except (ValueError, KeyError, IndexError) as exc:
                    tqdm.write(f"[skip] {pid}: could not parse tables ({exc!r})")
                    continue

                if rows and len(rows) >= chunk_size:
                    chunk_path = f"pitching_chunk_{chunk_idx}.csv"
                    pd.DataFrame(rows).to_csv(chunk_path, index=False)
                    tqdm.write(f"[saved] {chunk_path}")
                    rows.clear()
                    chunk_idx += 1
    finally:
        driver.quit()

    return pd.DataFrame(rows)

if __name__ == "__main__":
    # Player ids 100001 through 107184 inclusive, as strings.
    first_id, last_id = 100001, 107184
    ids = [f"{n}" for n in range(first_id, last_id + 1)]

    # scrape_pitching returns only the rows it has not already written to a
    # chunk file; those leftovers go to pitching_final.csv, and nothing is
    # written when there are none.
    df_pitch = scrape_pitching(ids)
    has_rows = not df_pitch.empty
    if has_rows:
        df_pitch.to_csv("pitching_final.csv", index=False)

Scraping pitching (16/7):  25%|██▍       | 1765/7184 [6:38:23<32:26:47, 21.56s/it, pid=101766]

[warn] timeout http://npbstats.com/players/db/pitching/?wdt_search=101766 — retry 1 in 1s


Scraping pitching (16/7):  30%|███       | 2188/7184 [8:16:24<17:12:49, 12.40s/it, pid=102189]

[saved] pitching_chunk_1.csv


Scraping pitching (16/7):  35%|███▌      | 2532/7184 [9:33:27<30:04:26, 23.27s/it, pid=102533]

[warn] timeout http://npbstats.com/players/db/pitching/?wdt_search=102533 — retry 1 in 1s
[warn] timeout http://npbstats.com/players/db/pitching/?wdt_search=102533 — retry 2 in 2s


Scraping pitching (16/7):  38%|███▊      | 2765/7184 [10:34:09<25:19:12, 20.63s/it, pid=102766]

[warn] timeout http://npbstats.com/players/db/pitching/?wdt_search=102766 — retry 1 in 1s
[warn] timeout http://npbstats.com/players/db/pitching/?wdt_search=102766 — retry 2 in 2s


Scraping pitching (16/7):  39%|███▉      | 2794/7184 [10:43:33<18:15:32, 14.97s/it, pid=102795]

[warn] timeout http://npbstats.com/players/db/pitching/?wdt_search=102795 — retry 1 in 1s


Scraping pitching (16/7):  53%|█████▎    | 3776/7184 [14:17:27<15:32:52, 16.42s/it, pid=103777]

[saved] pitching_chunk_2.csv


Scraping pitching (16/7):  72%|███████▏  | 5199/7184 [19:51:24<7:15:27, 13.16s/it, pid=105200] 

[saved] pitching_chunk_3.csv


Scraping pitching (16/7):  77%|███████▋  | 5533/7184 [21:11:38<7:23:26, 16.12s/it, pid=105534] 

[warn] timeout http://npbstats.com/players/db/pitching/?wdt_search=105534 — retry 1 in 1s


Scraping pitching (16/7):  79%|███████▉  | 5708/7184 [21:56:38<8:09:13, 19.89s/it, pid=105709] 

[warn] timeout http://npbstats.com/players/db/pitching/?wdt_search=105709 — retry 1 in 1s


Scraping pitching (16/7):  94%|█████████▍| 6755/7184 [25:57:13<1:19:01, 11.05s/it, pid=106756] 

[saved] pitching_chunk_4.csv


Scraping pitching (16/7): 100%|██████████| 7184/7184 [27:34:53<00:00, 13.82s/it, pid=107184]  


Done.


In [8]:
import glob
import pandas as pd
from pathlib import Path

def _chunk_sort_key(path):
    """Sort chunk files by numeric suffix so chunk_10 comes after chunk_9."""
    suffix = Path(path).stem.rpartition("_")[2]
    if suffix.isdecimal():
        return (0, int(suffix), path)
    # Non-numeric suffixes go last, ordered by name.
    return (1, 0, path)


# pitching_final.csv is only written when the scraper had leftover rows, so
# list it only if it is actually on disk. It stays first, ahead of the chunks.
csv_files = [p for p in ["pitching_final.csv"] if Path(p).exists()]
csv_files += sorted(glob.glob("pitching_chunk_*.csv"), key=_chunk_sort_key)
print(f"Found {len(csv_files)} files: {csv_files}")

Found 5 files: ['pitching_final.csv', 'pitching_chunk_1.csv', 'pitching_chunk_2.csv', 'pitching_chunk_3.csv', 'pitching_chunk_4.csv']


In [None]:
# Load every collected CSV into its own DataFrame, in csv_files order.
frames = [pd.read_csv(csv_path) for csv_path in csv_files]

In [None]:
# Stack all per-file frames and order them by numeric SearchID.
# The default quicksort is not stable and could shuffle the rows that share
# a SearchID; mergesort keeps them in the order they were scraped.
big_df = (
    pd.concat(frames, ignore_index=True)
    .sort_values("SearchID", key=lambda s: s.astype(int), kind="mergesort")
)

out_path = Path("npbstats_pitching_full.csv")
big_df.to_csv(out_path, index=False)