# Umpire Scraping

Given a season, scrape baseball-reference.com for the home plate umpire of every game and save the results to a CSV file.

In [1]:
import csv
import requests
from bs4 import BeautifulSoup, Comment
# Base URL of the site being scraped; the schedule path and the relative
# box score links are appended to it to build full request URLs.
bbref_addr = 'https://www.baseball-reference.com'

In [9]:
def _extract_hp_umpire(ump_string):
    """Return the home plate umpire's full name from the umpire text.

    The text is assumed to contain a run of the form
    ``HP - First Last, 1B - First Last, ...``. The name is every word
    after ``HP -`` up to the first comma (or the start of an HTML tag),
    so names with more than two words are kept whole.

    Returns None when there is no standalone ``HP`` token or no name
    follows it.
    """
    tokens = ump_string.split()
    if 'HP' not in tokens:
        return None
    hp_idx = tokens.index('HP')

    name_parts = []
    # tokens[hp_idx + 1] is the '-' separator; the name starts after it
    for token in tokens[hp_idx + 2:]:
        # cut off any markup glued to the word (e.g. a closing tag)
        word, tag_start, _ = token.partition('<')
        is_last_word = bool(tag_start) or word.endswith(',')
        word = word.rstrip(',')
        if word:
            name_parts.append(word)
        if is_last_word:
            break

    return ' '.join(name_parts) or None


def scrape_hp_umps(year=2015):
    """Scrape the home plate umpire of every game listed for a season.

    Downloads the season schedule page, follows each box score link
    (hrefs starting with ``/boxes``) and reads the umpire list, which is
    stored inside an HTML comment on the box score page. The result is
    written to ``../data/hp_umpires_{year}.csv`` as ``game_id,umpire``
    rows with no header row.

    Games whose page contains no parsable umpire information are
    reported on stdout and left out of the CSV.

    Parameters
    ----------
    year : int
        Season to scrape.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If the schedule page or a box score page returns an error status.
    requests.Timeout
        If a request does not complete within the timeout.
    """
    request_timeout = 30  # seconds; without it a stalled connection hangs forever

    schedule_page = requests.get(
        f"{bbref_addr}/leagues/MLB/{year}-schedule.shtml",
        timeout=request_timeout,
    )
    # fail loudly instead of silently producing an empty CSV
    schedule_page.raise_for_status()
    soup = BeautifulSoup(schedule_page.content, 'html.parser')

    # the last link of each game paragraph is the candidate box score link
    games_list_href = []
    for game_par in soup.find_all("p", class_="game"):
        links = game_par.find_all("a", href=True)
        if links and links[-1]['href'].startswith('/boxes'):
            games_list_href.append(links[-1]['href'])

    hp_umpire_dict = {}
    for i, game in enumerate(games_list_href):

        if i % 100 == 0:
            print(f"Game {i}")

        # get the id for the game
        # NOTE(review): [:-7] removes '.shtml' plus the character before it.
        # If that character is a per-day game number, both games of a
        # doubleheader map to the same id and the later one overwrites the
        # earlier one -- confirm this is intended.
        game_id = game.split('/')[-1][:-7]

        # build the url for the game
        game_addr = bbref_addr + game

        # scrape the webpage for the game
        game_page = requests.get(game_addr, timeout=request_timeout)
        game_page.raise_for_status()
        game_soup = BeautifulSoup(game_page.content, 'html.parser')

        # the umpires are stashed in a comments section
        comments = game_soup.find_all(string=lambda text: isinstance(text, Comment))
        ump_string = next((x for x in comments if 'Umpires' in x), None)

        # extract the HP umpire
        hp_umpire = _extract_hp_umpire(ump_string) if ump_string is not None else None
        if hp_umpire is None:
            # one incomplete box score should not abort the whole scrape
            print(f"Game {i}: no HP umpire found for {game_id}, skipping")
            continue

        hp_umpire_dict[game_id] = hp_umpire

    # newline="" is required by the csv module; the with-block guarantees
    # the file is flushed and closed
    with open(f"../data/hp_umpires_{year}.csv", "w", newline="") as csv_file:
        w = csv.writer(csv_file)
        w.writerows(hp_umpire_dict.items())

In [10]:
# Scrape each requested season, framing its progress log with a banner.
banner = "=================================="
for year in [2019]:
    print(banner, f"getting data for {year} season...", banner, sep="\n")
    scrape_hp_umps(year)
    print()

getting data for 2019 season...
Game 0
Game 100
Game 200
Game 300
Game 400
Game 500
Game 600
Game 700
Game 800
Game 900
Game 1000
Game 1100
Game 1200
Game 1300
Game 1400
Game 1500
Game 1600
Game 1700
Game 1800
Game 1900
Game 2000
Game 2100
Game 2200
Game 2300

