In [None]:
import io
import logging
import ssl
import string
import warnings
from datetime import datetime

import pandas as pd
import urllib3
from bs4 import BeautifulSoup

# Notebook-wide configuration: hide FutureWarning noise and send log records
# to the cell output as "LEVEL: message".
warnings.simplefilter(action='ignore', category=FutureWarning)
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

# Shared HTTPS connection pool used by every request in this notebook.
# The cipher string lowers OpenSSL's security level to 1.
# NOTE(review): presumably needed because npb.jp's TLS setup is rejected at the
# default security level — confirm it is still required before keeping it.
ctx = ssl.create_default_context()
ctx.set_ciphers('DEFAULT@SECLEVEL=1')
# The explicit context already fixes all TLS settings; urllib3 ignores a
# separate ``ssl_version`` when ``ssl_context`` is supplied, so it is omitted.
http0 = urllib3.PoolManager(ssl_context=ctx)

# Sample run: list the players on a single index page to validate the selectors.
letter = 'a'
url0 = f'https://npb.jp/bis/eng/players/active/index_{letter}.html'
resp0 = http0.request('GET', url0)
if resp0.status != 200:
    # Fail loudly instead of parsing an error page into an empty player list.
    raise RuntimeError(f'GET {url0} returned HTTP {resp0.status}')
content0 = resp0.data
soup0 = BeautifulSoup(content0, 'lxml')

# (display name, site-relative link to the player page) for each listed player.
player_info = [
    (a.find(class_='name').text.strip(), a['href'])
    for a in soup0.find_all('a', class_='player_unit_1')
]
print(f"Found {len(player_info)} players for '{letter}'")

player_info[:5]

Found 37 players for 'a'


[('Abe, Shota', '/bis/eng/players/01105153.html'),
 ('Abe, Toshiki', '/bis/eng/players/61165132.html'),
 ('Acevedo, Stiven', '/bis/eng/players/23925150.html'),
 ('Aduwa, Makoto', '/bis/eng/players/31135134.html'),
 ('Aibara, Ryunosuke', '/bis/eng/players/31135155.html')]

In [None]:
BASE_URL = 'https://npb.jp'
INDEX_URL_TEMPLATE = BASE_URL + '/bis/eng/players/active/index_{letter}.html'
# id of the stats <table> on a player page, keyed by the kind of record.
STATS_TABLE_IDS = {'batting': 'tablefix_b', 'pitching': 'tablefix_p'}
# Rows whose first column matches this pattern are dropped from every table.
EXCLUDED_ROW_PATTERN = r'Lefthand|Switch-hitter'


def fetch_soup(url):
    """Download ``url`` through the shared ``http0`` pool and parse it with lxml.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` (after logging a
        warning) when the server does not answer with HTTP 200.
    """
    resp = http0.request('GET', url)
    if resp.status != 200:
        logging.warning('Skipping %s (HTTP %s)', url, resp.status)
        return None
    return BeautifulSoup(resp.data, 'lxml')


def parse_bio(psoup):
    """Read the ``<td>`` cells of the ``pc_bio`` element of a player page.

    Returns:
        dict with keys Position, Bats, Throws, Height_cm, Weight_kg, Born.

    Raises:
        IndexError, ValueError: when the page lacks the ``pc_bio`` element or
            its cells do not have the ``"<bats> / <throws>"`` and
            ``"<height>cm/ <weight>kg"`` text layout this parser expects.
    """
    bio_cells = psoup.find_all(id='pc_bio')[0].find_all('td')
    bats, throws = bio_cells[1].text.split(' / ')
    size_parts = bio_cells[2].text.split('cm/ ')
    return {
        'Position': bio_cells[0].text,
        'Bats': bats,
        'Throws': throws,
        'Height_cm': int(size_parts[0]),
        'Weight_kg': int(size_parts[1].replace('kg', '')),
        'Born': bio_cells[3].text,
    }


def extract_stats(psoup, table_id, player_fields):
    """Return the stats table ``table_id`` of a player page as a DataFrame.

    Rows whose first column matches ``EXCLUDED_ROW_PATTERN`` are removed and
    every entry of ``player_fields`` is appended as a constant column.

    Returns:
        The annotated DataFrame, or ``None`` when the page has no such table.
    """
    table = psoup.find('table', id=table_id)
    if table is None:
        return None
    # read_html expects a file-like object; passing the raw HTML string is
    # deprecated in recent pandas releases.
    stats = pd.read_html(io.StringIO(str(table)))[0]
    excluded = stats.iloc[:, 0].astype(str).str.contains(EXCLUDED_ROW_PATTERN, na=False)
    # .assign works on a fresh copy, so the filtered slice is never written to.
    return stats.loc[~excluded].assign(**player_fields)


def scrape_active_players():
    """Crawl the a–z active-player index and collect per-player stats tables.

    Each index page and each player page is downloaded once; both the batting
    and the pitching table are taken from the same parsed page.

    Returns:
        dict mapping each key of ``STATS_TABLE_IDS`` to a list of DataFrames,
        one per player page that contains that table.
    """
    records = {kind: [] for kind in STATS_TABLE_IDS}
    for letter in string.ascii_lowercase:
        index_soup = fetch_soup(INDEX_URL_TEMPLATE.format(letter=letter))
        if index_soup is None:
            continue
        links = index_soup.find_all('a', class_='player_unit_1')
        logging.info("Found %d players for '%s'", len(links), letter)

        for link in links:
            name = link.find(class_='name').text.strip()
            href = link['href']
            player_id = href.split('/')[-1].replace('.html', '')

            psoup = fetch_soup(f'{BASE_URL}{href}')
            # Pages without a 'tab_btn current' element are skipped entirely.
            if psoup is None or not psoup.find_all(class_='tab_btn current'):
                continue

            try:
                player_fields = {'Name': name, 'PlayerID': player_id, **parse_bio(psoup)}
            except (IndexError, ValueError) as exc:
                # One malformed page should not abort the whole crawl.
                logging.warning('Skipping %s (%s): unexpected bio layout: %s',
                                name, player_id, exc)
                continue

            for kind, table_id in STATS_TABLE_IDS.items():
                stats = extract_stats(psoup, table_id, player_fields)
                if stats is not None:
                    records[kind].append(stats)
    return records


def save_records(records, csv_path):
    """Concatenate ``records`` into one DataFrame and write it to ``csv_path``.

    When ``records`` is empty nothing is written (an existing file is left
    untouched), a warning is logged and an empty DataFrame is returned.
    """
    if not records:
        logging.warning('No records collected; %s was not written', csv_path)
        return pd.DataFrame()
    combined = pd.concat(records, ignore_index=True)
    combined.to_csv(csv_path, index=False)
    return combined


start = datetime.now()
# Batting and pitching now come from the same crawl, so they share a start time;
# the name is kept for any later cell that still refers to it.
p_start = start

scraped = scrape_active_players()
batting_records = scraped['batting']
pitching_records = scraped['pitching']

batting_df = save_records(batting_records, 'npb_Batting_active.csv')
pitching_df = save_records(pitching_records, 'npb_Pitching_active.csv')

logging.info('Scraped %d batting and %d pitching tables in %s',
             len(batting_records), len(pitching_records), datetime.now() - start)
