In [None]:
# http://npbstats.com/players/db/batting/?wdt_search=100001 
# http://npbstats.com/players/db/batting/?wdt_search=107184

In [None]:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from tqdm import tqdm

def safe_get_and_wait(driver, wait, url, max_retries=2):
    for attempt in range(max_retries + 1):
        try:
            driver.get(url)
            wait.until(
                EC.presence_of_element_located(
                    (
                        By.CSS_SELECTOR,
                        "#table_15 tbody tr.odd, #table_15 tbody tr.even",
                    )
                )
            )
            wait.until(
                EC.presence_of_element_located(
                    (
                        By.CSS_SELECTOR,
                        "#table_7 tbody tr.odd, #table_7 tbody tr.even",
                    )
                )
            )
            
            time.sleep(0.2)
            return True
        except TimeoutException:
            if attempt < max_retries:
                backoff = 2 ** attempt
                time.sleep(backoff)
            else:
                return False

def scrape_npb_stats(ids, chunk_size=5000):
    opts = Options()
    opts.add_argument("--headless=new")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--window-size=1920,1080")
    opts.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    opts.add_experimental_option("useAutomationExtension", False)
    driver = webdriver.Chrome(options=opts)
    driver.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": "Object.defineProperty(navigator,'webdriver',{get:()=>undefined})"},
    )
    wait = WebDriverWait(driver, 20)

    out_rows = []
    chunk_idx = 1

    try:
        pbar = tqdm(ids, desc="batters")
        for pid in pbar:
            pbar.set_postfix(pid=pid)
            url = f"http://npbstats.com/players/db/batting/?wdt_search={pid}"
            if not safe_get_and_wait(driver, wait, url):
                continue

            soup = BeautifulSoup(driver.page_source, "lxml")

            # table_1 bio
            bio_tbl = soup.find("table", id="table_1")
            if not bio_tbl or bio_tbl.find("td", class_="dataTables_empty"):
                continue
            bio_df = pd.read_html(str(bio_tbl), flavor="lxml")[0]
            bio_df.columns = bio_df.columns.str.strip()
            bio = bio_df.iloc[0].to_dict()
            bio["SearchID"] = pid

            # table_7 dashboard
            dash_tbl = soup.find("table", id="table_7")
            dash_df = pd.read_html(str(dash_tbl), flavor="lxml")[0]
            dash_df.columns = dash_df.columns.str.strip()
            dash_df["SearchID"] = pid

            # table_15 batting
            bat_tbl = soup.find("table", id="table_15")
            bat_df = pd.read_html(str(bat_tbl), flavor="lxml")[0]
            bat_df.columns = bat_df.columns.str.strip()
            bat_df["SearchID"] = pid

            # dedupe on Year
            bat_df  = bat_df.drop_duplicates(subset="Year", keep="first")
            dash_df = dash_df.drop_duplicates(subset="Year", keep="first")

            # merge
            merged = bat_df.merge(dash_df, on="Year", how="left")

            # collect
            for _, season in merged.iterrows():
                out_rows.append({**bio, **season.to_dict()})

            # chunk save
            if len(out_rows) >= chunk_size:
                df_chunk = pd.DataFrame(out_rows)
                fname = f"npbstats_batter_chunk_{chunk_idx}.csv"
                df_chunk.to_csv(fname, index=False)
                pbar.write(f"[saved] {fname}")
                chunk_idx += 1
                out_rows.clear()

    finally:
        driver.quit()

    return pd.DataFrame(out_rows)

if __name__ == "__main__":
    ids = [str(pid) for pid in range(100001, 107185)]
    df_remain = scrape_npb_stats(ids)

    if not df_remain.empty:
        df_remain.to_csv("npbstats_batter_final.csv", index=False)

Scraping batters:   8%|▊         | 583/7184 [2:00:58<34:36:37, 18.88s/it, pid=100584]

[warn] timeout loading http://npbstats.com/players/db/batting/?wdt_search=100584, retry 1 in 1s


Scraping batters:  11%|█▏        | 826/7184 [2:49:10<18:18:34, 10.37s/it, pid=100827]

[saved] npbstats_batter_chunk_1.csv


Scraping batters:  14%|█▍        | 1034/7184 [3:31:10<26:46:40, 15.67s/it, pid=101035]

[warn] timeout loading http://npbstats.com/players/db/batting/?wdt_search=101035, retry 1 in 1s


Scraping batters:  16%|█▋        | 1172/7184 [4:02:38<27:34:20, 16.51s/it, pid=101173]

[warn] timeout loading http://npbstats.com/players/db/batting/?wdt_search=101173, retry 1 in 1s


Scraping batters:  16%|█▋        | 1175/7184 [4:05:08<54:15:22, 32.50s/it, pid=101176]

[warn] timeout loading http://npbstats.com/players/db/batting/?wdt_search=101176, retry 1 in 1s


Scraping batters:  16%|█▋        | 1176/7184 [4:07:38<112:59:11, 67.70s/it, pid=101177]

[warn] timeout loading http://npbstats.com/players/db/batting/?wdt_search=101177, retry 1 in 1s


Scraping batters:  19%|█▊        | 1342/7184 [4:47:18<31:21:42, 19.33s/it, pid=101343] 

[warn] timeout loading http://npbstats.com/players/db/batting/?wdt_search=101343, retry 1 in 1s


Scraping batters:  23%|██▎       | 1632/7184 [5:51:39<15:33:05, 10.08s/it, pid=101633]

[saved] npbstats_batter_chunk_2.csv


Scraping batters:  25%|██▌       | 1829/7184 [6:33:35<21:56:41, 14.75s/it, pid=101830]

[warn] timeout loading http://npbstats.com/players/db/batting/?wdt_search=101830, retry 1 in 1s
[warn] timeout loading http://npbstats.com/players/db/batting/?wdt_search=101830, retry 2 in 2s


Scraping batters:  26%|██▌       | 1833/7184 [6:36:14<37:08:51, 24.99s/it, pid=101834]

[warn] timeout loading http://npbstats.com/players/db/batting/?wdt_search=101834, retry 1 in 1s


Scraping batters:  32%|███▏      | 2327/7184 [8:15:06<19:43:42, 14.62s/it, pid=102328]

[saved] npbstats_batter_chunk_3.csv


Scraping batters:  41%|████▏     | 2965/7184 [10:29:23<17:54:09, 15.28s/it, pid=102966]

[saved] npbstats_batter_chunk_4.csv


Scraping batters:  50%|████▉     | 3586/7184 [12:36:17<13:03:45, 13.07s/it, pid=103587]

[saved] npbstats_batter_chunk_5.csv


Scraping batters:  58%|█████▊    | 4191/7184 [14:50:18<8:56:51, 10.76s/it, pid=104192] 

[saved] npbstats_batter_chunk_6.csv


Scraping batters:  61%|██████▏   | 4412/7184 [15:35:32<17:06:10, 22.21s/it, pid=104413]

[warn] timeout loading http://npbstats.com/players/db/batting/?wdt_search=104413, retry 1 in 1s


Scraping batters:  64%|██████▎   | 4578/7184 [16:20:40<15:20:46, 21.20s/it, pid=104579]

[warn] timeout loading http://npbstats.com/players/db/batting/?wdt_search=104579, retry 1 in 1s


Scraping batters:  68%|██████▊   | 4857/7184 [17:22:41<7:34:00, 11.71s/it, pid=104858] 

[saved] npbstats_batter_chunk_7.csv


Scraping batters:  69%|██████▉   | 4947/7184 [17:44:26<12:06:12, 19.48s/it, pid=104948]

[warn] timeout loading http://npbstats.com/players/db/batting/?wdt_search=104948, retry 1 in 1s


Scraping batters:  77%|███████▋  | 5531/7184 [19:57:56<8:03:51, 17.56s/it, pid=105532] 

[saved] npbstats_batter_chunk_8.csv


Scraping batters:  80%|███████▉  | 5742/7184 [20:46:51<8:50:20, 22.07s/it, pid=105743] 

[warn] timeout loading http://npbstats.com/players/db/batting/?wdt_search=105743, retry 1 in 1s


Scraping batters:  81%|████████▏ | 5845/7184 [21:09:49<4:26:51, 11.96s/it, pid=105846] 

[warn] timeout loading http://npbstats.com/players/db/batting/?wdt_search=105846, retry 1 in 1s


Scraping batters:  88%|████████▊ | 6307/7184 [22:46:57<3:36:52, 14.84s/it, pid=106308]

[saved] npbstats_batter_chunk_9.csv


Scraping batters: 100%|██████████| 7184/7184 [25:37:52<00:00, 12.84s/it, pid=107184]  


Saved npbstats_batter_final.csv
Done scraping.


In [None]:
import glob
import pandas as pd
from pathlib import Path

csv_files = sorted(
    glob.glob("npbstats_batter_chunk_*.csv") + ["npbstats_batter_final.csv"],
    key=lambda p: (p.count("_"), p)
)

Found 10 files: ['npbstats_batter_final.csv', 'npbstats_batter_chunk_1.csv', 'npbstats_batter_chunk_2.csv', 'npbstats_batter_chunk_3.csv', 'npbstats_batter_chunk_4.csv', 'npbstats_batter_chunk_5.csv', 'npbstats_batter_chunk_6.csv', 'npbstats_batter_chunk_7.csv', 'npbstats_batter_chunk_8.csv', 'npbstats_batter_chunk_9.csv']


In [19]:
frames = []
for f in csv_files:
    print(f"→ reading {f}")
    df = pd.read_csv(f)
    frames.append(df)

→ reading npbstats_batter_final.csv
→ reading npbstats_batter_chunk_1.csv
→ reading npbstats_batter_chunk_2.csv
→ reading npbstats_batter_chunk_3.csv
→ reading npbstats_batter_chunk_4.csv
→ reading npbstats_batter_chunk_5.csv
→ reading npbstats_batter_chunk_6.csv
→ reading npbstats_batter_chunk_7.csv
→ reading npbstats_batter_chunk_8.csv
→ reading npbstats_batter_chunk_9.csv


In [None]:
big_df = pd.concat(frames, ignore_index=True)
big_df = big_df.sort_values("SearchID", key=lambda s: s.astype(int))  # ← sort

out_path = Path("npbstats_batter_full.csv")
big_df.to_csv(out_path, index=False)
print(f"Saved {out_path}  |  rows: {len(big_df)}")

Saved npbstats_batter_full.csv  |  rows: 48408


: 