In [None]:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def scrape_npb_stats(ids):
    """Scrape the bio table and per-season table for each player id.

    For every id, the npbstats.com player search page is loaded in headless
    Chrome, the function waits for a body row of ``#table_47`` to be present,
    and the rendered HTML is parsed. The first row of ``#table_1`` (bio) is
    merged into every row of ``#table_47`` (season rows).

    NOTE(review): the URL path says ``batting`` while the recorded output of
    this cell shows fielding columns (PO, A, E, ...) - confirm the endpoint.

    Parameters
    ----------
    ids : iterable
        Player search ids; each is interpolated into the ``wdt_search`` query
        parameter and stored in the ``SearchID`` column.

    Returns
    -------
    pandas.DataFrame
        One row per season row, carrying the bio columns, the season columns
        and ``SearchID``. Empty when nothing was scraped.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If no ``#table_47`` row appears within 20 seconds for some id.
    """
    # Function-scope import so this cell does not rely on an import elsewhere.
    # pandas >= 2.1 deprecates passing literal HTML strings to read_html;
    # wrapping the markup in StringIO is the supported form.
    from io import StringIO

    opts = Options()
    opts.add_argument("--headless=new")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--window-size=1920,1080")
    opts.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    )
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    opts.add_experimental_option("useAutomationExtension", False)

    driver = webdriver.Chrome(options=opts)
    out_rows = []

    # Everything after the browser starts runs under try/finally so that a
    # failure in the CDP call or in any page load still closes Chrome.
    try:
        # Make navigator.webdriver read as undefined on every new document.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {
                "source": (
                    "Object.defineProperty(navigator, 'webdriver', "
                    "{get: () => undefined})"
                )
            },
        )
        wait = WebDriverWait(driver, 20)

        for pid in ids:
            url = f"http://npbstats.com/players/db/batting/?wdt_search={pid}"
            driver.get(url)

            wait.until(
                EC.presence_of_element_located(
                    (
                        By.CSS_SELECTOR,
                        "#table_47 tbody tr.odd, #table_47 tbody tr.even",
                    )
                )
            )
            # Short pause after the first row is present before snapshotting
            # the page (presumably lets the remaining rows render).
            time.sleep(0.2)

            soup = BeautifulSoup(driver.page_source, "lxml")

            bio_tbl = soup.find("table", id="table_1")
            fid_tbl = soup.find("table", id="table_47")
            if bio_tbl is None or fid_tbl is None:
                # str(None) would reach read_html as the text "None" and
                # abort the whole run; skip this id instead.
                print(f"[warn] expected tables not found for id {pid}; skipping")
                continue

            # table_1 bio
            bio_df = pd.read_html(StringIO(str(bio_tbl)), flavor="lxml")[0]
            bio_df.columns = bio_df.columns.str.strip()
            bio = bio_df.iloc[0].to_dict()
            bio["SearchID"] = pid

            # table_47 standard
            fid_df = pd.read_html(StringIO(str(fid_tbl)), flavor="lxml")[0]
            fid_df.columns = fid_df.columns.str.strip()
            fid_df["SearchID"] = pid

            # One output record per season row; season keys win on collision.
            out_rows.extend(
                {**bio, **season} for season in fid_df.to_dict("records")
            )
    finally:
        driver.quit()

    return pd.DataFrame(out_rows)


if __name__ == "__main__":
    # Smoke run on two consecutive ids before attempting the full range.
    ids = list(map(str, range(100001, 100003)))
    df = scrape_npb_stats(ids)

    # Plain-text preview of the first 20 rows, index column omitted.
    print(df.head(20).to_string(index=False))

    # Quick shape and schema summary of the scraped frame.
    print("rows:", df.shape[0], "| columns:", df.shape[1])
    print("columns:", df.columns.tolist())

           Player      Birthplace Height/Weight Bats/Throws      Position  Brithdate SearchID    Year  Lg     Tm Pos  G    PO    A   E   DP  PB  WP    FP   RF  RRF  rRng  rErr  rWP  rPB  rCth  rDP  rArm   FS
Eiichiro Yamamoto  Shimane, Japan  157cm / 58kg Right/Right Right fielder 03/08/1902   100001 1936.1S JBL Kyojin  2B  2   NaN  NaN NaN  NaN NaN NaN   NaN  NaN  0.0   0.0   0.0  NaN  NaN   NaN  0.0   NaN  0.0
Eiichiro Yamamoto  Shimane, Japan  157cm / 58kg Right/Right Right fielder 03/08/1902   100001 1936.1S JBL Kyojin  OF  2   NaN  NaN NaN  NaN NaN NaN   NaN  NaN  0.0   0.0   0.0  NaN  NaN   NaN  NaN   0.0  0.0
Eiichiro Yamamoto  Shimane, Japan  157cm / 58kg Right/Right Right fielder 03/08/1902   100001 1936.2F JBL Kyojin  OF 14  13.0  2.0 1.0  1.0 NaN NaN 0.938 1.07 -0.5  -0.1  -0.1  NaN  NaN   NaN  NaN   1.1  0.9
Eiichiro Yamamoto  Shimane, Japan  157cm / 58kg Right/Right Right fielder 03/08/1902   100001 1937.1S JBL Kyojin  OF 14   4.0  0.0 0.0  0.0 NaN NaN 1.000 0.29 -1.8  -0.

In [None]:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from tqdm import tqdm

def safe_get_and_wait(driver, wait, url, max_retries=2):
    """Navigate to `url` and wait until a body row of ``#table_47`` is present.

    A ``TimeoutException`` triggers another attempt after an exponential
    back-off of ``2 ** attempt`` seconds (1 s, 2 s, ...). The first attempt is
    followed by at most `max_retries` retries.

    Parameters
    ----------
    driver : WebDriver used to load the page.
    wait : WebDriverWait bound to `driver`; its timeout bounds each attempt.
    url : address to load.
    max_retries : number of retries allowed after the first attempt.

    Returns
    -------
    bool
        True as soon as a row is present, False once the final attempt has
        timed out.
    """
    rows_locator = (
        By.CSS_SELECTOR,
        "#table_47 tbody tr.odd, #table_47 tbody tr.even",
    )

    for attempt in range(max_retries + 1):
        try:
            driver.get(url)
            wait.until(EC.presence_of_element_located(rows_locator))
            # Short pause after the first row is present.
            time.sleep(0.2)
        except TimeoutException:
            # Out of retries: report and let the caller skip this url.
            if attempt >= max_retries:
                print(f"[error] giving up on {url}")
                return False
            backoff = 2 ** attempt
            print(f"[warn] timeout {url} — retry {attempt+1} in {backoff}s")
            time.sleep(backoff)
        else:
            return True

def _read_table(soup, table_id):
    """Parse the ``<table id=table_id>`` in `soup` into a DataFrame.

    Returns None when the table is absent or contains a ``dataTables_empty``
    cell. Column labels are whitespace-stripped.
    """
    # Function-scope import so this cell does not rely on an import elsewhere.
    # pandas >= 2.1 deprecates passing literal HTML strings to read_html;
    # wrapping the markup in StringIO is the supported form.
    from io import StringIO

    tbl = soup.find("table", id=table_id)
    if tbl is None or tbl.find("td", class_="dataTables_empty"):
        return None
    frame = pd.read_html(StringIO(str(tbl)), flavor="lxml")[0]
    frame.columns = frame.columns.str.strip()
    return frame


def _write_chunk(rows, chunk_idx):
    """Write `rows` to ``npbstats_fielding_chunk_<chunk_idx>.csv``; return the file name."""
    fname = f"npbstats_fielding_chunk_{chunk_idx}.csv"
    pd.DataFrame(rows).to_csv(fname, index=False)
    return fname


def scrape_npb_stats(ids, chunk_size=5000):
    """Scrape bio + season rows for each id, writing CSV chunks along the way.

    For every id, the npbstats.com player search page is loaded in headless
    Chrome via `safe_get_and_wait`; ids that time out, have no bio table, or
    have no season rows are skipped. The first row of ``#table_1`` (bio) is
    merged into every row of ``#table_47``.

    Whenever at least `chunk_size` rows have accumulated they are written to
    ``npbstats_fielding_chunk_<n>.csv`` (n starts at 1) and dropped from
    memory. If the run is interrupted by any exception, the rows collected
    since the last chunk are written as one more chunk file before the
    exception is re-raised, so a crash does not discard them.

    NOTE(review): the URL path says ``batting`` while the progress bar and
    file names say ``fielding`` - confirm the endpoint.

    Parameters
    ----------
    ids : iterable
        Player search ids; each is interpolated into the ``wdt_search`` query
        parameter and stored in the ``SearchID`` column.
    chunk_size : int, default 5000
        Row count at which the accumulated rows are flushed to disk.

    Returns
    -------
    pandas.DataFrame
        Only the rows NOT yet written to a chunk file (possibly empty).
    """
    opts = Options()
    opts.add_argument("--headless=new")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--window-size=1920,1080")
    opts.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    opts.add_experimental_option("useAutomationExtension", False)

    driver = webdriver.Chrome(options=opts)
    out_rows, chunk_idx = [], 1

    # Everything after the browser starts runs under try/finally so that a
    # failure in the CDP call or in any page load still closes Chrome.
    try:
        # Make navigator.webdriver read as undefined on every new document.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator,'webdriver',{get:()=>undefined})"},
        )
        wait = WebDriverWait(driver, 20)

        with tqdm(ids, desc="Scraping fielding") as pbar:
            for pid in pbar:
                pbar.set_postfix(pid=pid)

                url = f"http://npbstats.com/players/db/batting/?wdt_search={pid}"
                if not safe_get_and_wait(driver, wait, url):
                    continue

                soup = BeautifulSoup(driver.page_source, "lxml")

                # table 1 (bio): only its first row is used
                bio_df = _read_table(soup, "table_1")
                if bio_df is None or bio_df.empty:
                    continue
                bio = bio_df.iloc[0].to_dict()
                bio["SearchID"] = pid

                # table 47 (season rows)
                fld_df = _read_table(soup, "table_47")
                if fld_df is None:
                    continue
                fld_df["SearchID"] = pid
                # Drop the placeholder row whose Year cell holds the site's
                # Japanese "no data" message.
                fld_df = fld_df[fld_df["Year"] != "データはありません。"]
                if fld_df.empty:
                    continue

                # One output record per season row; season keys win on collision.
                out_rows.extend(
                    {**bio, **season} for season in fld_df.to_dict("records")
                )

                if len(out_rows) >= chunk_size:
                    fname = _write_chunk(out_rows, chunk_idx)
                    pbar.write(f"[saved] {fname}")
                    chunk_idx += 1
                    out_rows.clear()
    except BaseException:
        # Includes KeyboardInterrupt: persist what was collected since the
        # last chunk, then let the original error propagate.
        if out_rows:
            fname = _write_chunk(out_rows, chunk_idx)
            print(f"[saved] {fname} (partial chunk written after interruption)")
        raise
    finally:
        driver.quit()

    return pd.DataFrame(out_rows)


if __name__ == "__main__":
    # Full run over ids 100001..107184 (the range end is exclusive).
    ids = list(map(str, range(100001, 107185)))
    df_remain = scrape_npb_stats(ids)

    # The scraper returns only the rows it has not already written to a
    # chunk file; save them unless nothing is left.
    if df_remain.size > 0:
        df_remain.to_csv("npbstats_fielding_final.csv", index=False)

Scraping fielding:   6%|▋         | 460/7184 [2:35:39<37:24:09, 20.03s/it, pid=100461]

[saved] npbstats_fielding_chunk_1.csv


Scraping fielding:  15%|█▍        | 1058/7184 [5:41:31<36:55:54, 21.70s/it, pid=101059]

[saved] npbstats_fielding_chunk_2.csv


Scraping fielding:  18%|█▊        | 1260/7184 [6:40:54<36:44:43, 22.33s/it, pid=101261]

[warn] timeout http://npbstats.com/players/db/batting/?wdt_search=101261 — retry 1 in 1s


Scraping fielding:  22%|██▏       | 1604/7184 [8:32:04<27:09:05, 17.52s/it, pid=101605]

[saved] npbstats_fielding_chunk_3.csv


Scraping fielding:  25%|██▌       | 1802/7184 [9:35:58<40:28:22, 27.07s/it, pid=101803]

[warn] timeout http://npbstats.com/players/db/batting/?wdt_search=101803 — retry 1 in 1s
[warn] timeout http://npbstats.com/players/db/batting/?wdt_search=101803 — retry 2 in 2s


Scraping fielding:  27%|██▋       | 1965/7184 [10:36:53<42:41:13, 29.44s/it, pid=101966]

[warn] timeout http://npbstats.com/players/db/batting/?wdt_search=101966 — retry 1 in 1s


Scraping fielding:  29%|██▉       | 2071/7184 [11:13:30<22:23:14, 15.76s/it, pid=102072]

[saved] npbstats_fielding_chunk_4.csv


Scraping fielding:  34%|███▍      | 2478/7184 [13:22:34<24:56:28, 19.08s/it, pid=102479]

[saved] npbstats_fielding_chunk_5.csv


Scraping fielding:  40%|████      | 2907/7184 [15:33:52<19:58:09, 16.81s/it, pid=102908]

[saved] npbstats_fielding_chunk_6.csv


Scraping fielding:  45%|████▍     | 3207/7184 [17:09:59<30:20:12, 27.46s/it, pid=103208]

[warn] timeout http://npbstats.com/players/db/batting/?wdt_search=103208 — retry 1 in 1s


Scraping fielding:  47%|████▋     | 3358/7184 [18:01:21<18:58:53, 17.86s/it, pid=103359]

[saved] npbstats_fielding_chunk_7.csv


Scraping fielding:  49%|████▊     | 3485/7184 [18:42:32<25:22:03, 24.69s/it, pid=103486]

[warn] timeout http://npbstats.com/players/db/batting/?wdt_search=103486 — retry 1 in 1s


Scraping fielding:  49%|████▊     | 3486/7184 [18:44:15<49:18:52, 48.01s/it, pid=103487]

[warn] timeout http://npbstats.com/players/db/batting/?wdt_search=103487 — retry 1 in 1s


Scraping fielding:  50%|████▉     | 3582/7184 [19:20:18<23:09:20, 23.14s/it, pid=103583]

[warn] timeout http://npbstats.com/players/db/batting/?wdt_search=103583 — retry 1 in 1s


Scraping fielding:  53%|█████▎    | 3826/7184 [20:48:38<16:30:12, 17.69s/it, pid=103827]

[saved] npbstats_fielding_chunk_8.csv


Scraping fielding:  54%|█████▍    | 3905/7184 [21:14:27<21:56:42, 24.09s/it, pid=103906]

[warn] timeout http://npbstats.com/players/db/batting/?wdt_search=103906 — retry 1 in 1s


Scraping fielding:  54%|█████▍    | 3908/7184 [21:17:04<34:31:18, 37.94s/it, pid=103909]

[warn] timeout http://npbstats.com/players/db/batting/?wdt_search=103909 — retry 1 in 1s


Scraping fielding:  60%|█████▉    | 4288/7184 [23:25:46<16:12:25, 20.15s/it, pid=104289]

[saved] npbstats_fielding_chunk_9.csv


Scraping fielding:  67%|██████▋   | 4813/7184 [26:15:01<10:59:14, 16.68s/it, pid=104814]

[saved] npbstats_fielding_chunk_10.csv


Scraping fielding:  67%|██████▋   | 4832/7184 [26:22:04<21:57:39, 33.61s/it, pid=104833]

[warn] timeout http://npbstats.com/players/db/batting/?wdt_search=104833 — retry 1 in 1s


Scraping fielding:  74%|███████▍  | 5329/7184 [28:27:33<5:11:45, 10.08s/it, pid=105330] 

[saved] npbstats_fielding_chunk_11.csv


Scraping fielding:  82%|████████▏ | 5858/7184 [30:07:48<8:14:55, 22.40s/it, pid=105859]

[warn] timeout http://npbstats.com/players/db/batting/?wdt_search=105859 — retry 1 in 1s


Scraping fielding:  82%|████████▏ | 5870/7184 [30:11:42<7:22:59, 20.23s/it, pid=105871] 

[saved] npbstats_fielding_chunk_12.csv


Scraping fielding:  82%|████████▏ | 5882/7184 [30:15:06<6:44:12, 18.63s/it, pid=105883]

[warn] timeout http://npbstats.com/players/db/batting/?wdt_search=105883 — retry 1 in 1s


Scraping fielding:  91%|█████████ | 6529/7184 [32:09:14<1:42:31,  9.39s/it, pid=106530] 

[saved] npbstats_fielding_chunk_13.csv


Scraping fielding: 100%|██████████| 7184/7184 [34:00:32<00:00, 17.04s/it, pid=107184]  


Saved npbstats_fielding_final.csv
Done scraping.


In [None]:
import glob
import pandas as pd
from pathlib import Path

def chunk_sort_key(path):
    """Sort key that orders ``..._chunk_<N>.csv`` paths by the integer N.

    Paths whose last ``_``-separated stem component is not a decimal number
    sort after all numbered chunks, alphabetically among themselves.
    """
    suffix = Path(path).stem.rsplit("_", 1)[-1]
    if suffix.isdecimal():
        return (0, int(suffix), "")
    return (1, 0, str(path))


# Chunk files in the order they were written (1, 2, ..., 10, 11, ...) rather
# than lexicographic order (1, 10, 11, ..., 2).
csv_files = sorted(glob.glob("npbstats_fielding_chunk_*.csv"), key=chunk_sort_key)

# The scraper writes the "final" file only when rows remain after the last
# full chunk, so include it only if it exists; it goes last because it holds
# the rows scraped after the last chunk.
final_csv = Path("npbstats_fielding_final.csv")
if final_csv.exists():
    csv_files.append(str(final_csv))

Found 14 files: ['npbstats_fielding_final.csv', 'npbstats_fielding_chunk_1.csv', 'npbstats_fielding_chunk_10.csv', 'npbstats_fielding_chunk_11.csv', 'npbstats_fielding_chunk_12.csv', 'npbstats_fielding_chunk_13.csv', 'npbstats_fielding_chunk_2.csv', 'npbstats_fielding_chunk_3.csv', 'npbstats_fielding_chunk_4.csv', 'npbstats_fielding_chunk_5.csv', 'npbstats_fielding_chunk_6.csv', 'npbstats_fielding_chunk_7.csv', 'npbstats_fielding_chunk_8.csv', 'npbstats_fielding_chunk_9.csv']


In [None]:
# Load every CSV in `csv_files`, in order. A comprehension keeps the loop
# variables local, so the notebook-level `df` from the scraping cells is not
# overwritten by the last file read.
frames = [pd.read_csv(csv_path) for csv_path in csv_files]

In [None]:
# Stack all loaded files and order rows by numeric SearchID.
# kind="stable" matters: the default quicksort is not stable, so rows sharing
# a SearchID (one player's seasons) could otherwise come out in a different
# order than they were scraped.
big_df = pd.concat(frames, ignore_index=True).sort_values(
    "SearchID", key=lambda s: s.astype(int), kind="stable"
)

out_path = Path("npbstats_fielding_full.csv")
big_df.to_csv(out_path, index=False