In [1]:
import requests
from bs4 import BeautifulSoup as bs

# Collect the relative box score link of every game listed on the
# 2016-2020 MLB season schedule pages (one schedule page per season).
game_links = []
for current_year in range(2016, 2021):
    url = f"https://www.baseball-reference.com/leagues/MLB/{current_year}-schedule.shtml"
    # A timeout keeps a stalled connection from hanging the cell forever.
    resp = requests.get(url, timeout=30)
    # Fail loudly on a 4xx/5xx answer instead of silently collecting
    # zero links for that season from an error page.
    resp.raise_for_status()
    soup = bs(resp.text)
    # Every scheduled game is identified by an anchor whose text is exactly "Boxscore".
    games = soup.findAll('a', text='Boxscore')
    game_links.extend(x['href'] for x in games)
print("Number of games to download: ", len(game_links))
# Show one sample link as the cell output.
game_links[0]

Number of games to download:  10812


'/boxes/KCA/KCA201604030.shtml'

In [2]:
import sys
# No install step is needed here: `datetime` ships with the Python standard
# library. Running `pip install datetime` would instead pull in the unrelated
# third-party "DateTime" distribution from PyPI.



In [5]:
# Helper functions for parsing a Baseball-Reference box score page

def get_game_summary(soup, game_id):
    """Pull the teams, date and start time out of a box score page.

    Args:
        soup: Parsed box score page; it must contain a ``div`` with class
            ``scorebox`` that in turn holds a ``scorebox_meta`` ``div``.
        game_id: Identifier stored unchanged under the ``game_id`` key.

    Returns:
        dict with the keys ``game_id``, ``away_team_abbr``,
        ``home_team_abbr``, ``date`` and ``start_time`` (in that order).
    """
    scorebox = soup.find('div', {'class': 'scorebox'})

    # The first team link is stored as the away side and the second as the
    # home side; the abbreviation is the third '/'-separated piece of the href.
    team_links = scorebox.findAll('a', {'itemprop': 'name'})
    away_abbr = team_links[0]['href'].split('/')[2]
    home_abbr = team_links[1]['href'].split('/')[2]

    meta_box = scorebox.find('div', {'class': 'scorebox_meta'})
    meta_divs = meta_box.findAll('div')
    game_date = meta_divs[0].text.strip()
    # Drops the first 12 and the last 6 characters of the second meta line
    # (presumably a "Start Time: " prefix and a " Local" suffix -- TODO confirm).
    start_time = meta_divs[1].text[12:-6].strip()

    return {
        'game_id': game_id,
        'away_team_abbr': away_abbr,
        'home_team_abbr': home_abbr,
        'date': game_date,
        'start_time': start_time,
    }

def get_table_summary(soup, table_no):
    """Return the footer cells of one stats table as a dict.

    Args:
        soup: Parsed box score page.
        table_no: Index into the page's ``table`` elements that carry the
            ``stats_table`` class.

    Returns:
        dict mapping each footer ``td``'s ``data-stat`` attribute to its
        whitespace-stripped text.
    """
    all_tables = soup.findAll('table', {'class': 'stats_table'})
    footer = all_tables[table_no].find('tfoot')

    footer_stats = {}
    for cell in footer.findAll('td'):
        stat_name = cell['data-stat']
        footer_stats[stat_name] = cell.text.strip()
    return footer_stats

def get_pitcher_data(soup, table_no):
    """Return one dict of stats per pitcher row of a stats table.

    Args:
        soup: Parsed box score page.
        table_no: Index into the page's ``table`` elements that carry the
            ``stats_table`` class.

    Returns:
        list of dicts, one per ``tr`` between the first and last row of the
        table. Each dict maps ``data-stat`` attributes of the row's ``td``
        cells to their stripped text, plus a ``name`` key derived from the
        player link.
    """
    table = soup.findAll('table', {'class': 'stats_table'})[table_no]
    # Skip the first and last <tr> (the header and footer rows).
    player_rows = table.findAll('tr')[1:-1]

    def parse_row(row):
        # Stats come from the <td> cells; the player id comes from the link
        # in the row's <th data-stat="player"> cell.
        stats = {}
        for cell in row.findAll('td'):
            stat_name = cell['data-stat']
            stats[stat_name] = cell.text.strip()
        player_link = row.find('th', {'data-stat': 'player'}).find('a')
        page_name = player_link['href'].split('/')[-1]
        # Trim the final six characters (presumably the ".shtml" suffix).
        stats['name'] = page_name[:-6].strip()
        return stats

    return [parse_row(row) for row in player_rows]

def process_link(url):
    """Download one box score page and parse it into a nested dict.

    Args:
        url: Absolute URL of a box score page. The last path component,
            minus its final six characters, is used as the game id.

    Returns:
        dict with the keys ``game`` (from ``get_game_summary``),
        ``away_batting`` / ``home_batting`` / ``away_pitching`` /
        ``home_pitching`` (footer summaries of stats tables 1-4) and
        ``away_pitchers`` / ``home_pitchers`` (per-row data of stats
        tables 3 and 4).

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.Timeout: no response arrived within the timeout.
    """
    # A timeout keeps one stalled connection from blocking a long scrape.
    resp = requests.get(url, timeout=30)
    # Surface error pages here rather than as an opaque AttributeError on
    # None further down in the parsing helpers.
    resp.raise_for_status()
    game_id = url.split('/')[-1][:-6]

    # Drop lines that are only an HTML comment marker, or that open a
    # commented-out <div>, so the markup between them reaches the parser
    # (presumably the stats tables are shipped inside comments -- TODO confirm).
    kept_lines = []
    for line in resp.text.split('\n'):
        stripped = line.strip()
        if '<!--     <div' in line or stripped == '<!--' or stripped == '-->':
            continue
        kept_lines.append(line + '\n')
    # Join once instead of growing a string with repeated +=.
    uncommented_html = ''.join(kept_lines)

    soup = bs(uncommented_html)
    return {
        'game': get_game_summary(soup, game_id),
        'away_batting': get_table_summary(soup, 1),
        'home_batting': get_table_summary(soup, 2),
        'away_pitching': get_table_summary(soup, 3),
        'home_pitching': get_table_summary(soup, 4),
        'away_pitchers': get_pitcher_data(soup, 3),
        'home_pitchers': get_pitcher_data(soup, 4),
    }

In [6]:
import datetime as dt
# Download and parse every collected box score link, one request after
# another with no pause in between, printing a timestamped progress line
# after every 1000th game.
game_data = []
for n_done, link in enumerate(game_links, start=1):
    url = 'https://www.baseball-reference.com' + link
    parsed_game = process_link(url)
    game_data.append(parsed_game)
    if n_done % 1000 == 0:
        print(dt.datetime.now().time(), n_done)

03:02:12.251225 1000
03:07:37.599201 2000
03:13:02.730740 3000
03:18:27.117554 4000
03:23:59.851903 5000
03:29:20.490718 6000
03:34:43.006264 7000
03:40:11.257383 8000
03:45:49.035673 9000
03:51:23.880528 10000


In [7]:
# Spot-check the first parsed game to see the structure returned by process_link.
game_data[0]

{'game': {'game_id': 'KCA201604030',
  'away_team_abbr': 'NYM',
  'home_team_abbr': 'KCR',
  'date': 'Sunday, April 3, 2016',
  'start_time': '7:38 p.m.'},
 'away_batting': {'AB': '33',
  'R': '3',
  'H': '7',
  'RBI': '3',
  'BB': '6',
  'SO': '9',
  'PA': '39',
  'batting_avg': '.212',
  'onbase_perc': '.333',
  'slugging_perc': '.242',
  'onbase_plus_slugging': '.576',
  'pitches': '177',
  'strikes_total': '105',
  'wpa_bat': '-0.449',
  'leverage_index_avg': '1.58',
  'wpa_bat_pos': '0.746',
  'wpa_bat_neg': '-1.195%',
  'cwpa_bat': '-0.24%',
  'cli_avg': '1.42',
  're24_bat': '-1.7',
  'PO': '24',
  'A': '15',
  'details': ''},
 'home_batting': {'AB': '30',
  'R': '4',
  'H': '9',
  'RBI': '4',
  'BB': '2',
  'SO': '3',
  'PA': '33',
  'batting_avg': '.300',
  'onbase_perc': '.333',
  'slugging_perc': '.300',
  'onbase_plus_slugging': '.633',
  'pitches': '114',
  'strikes_total': '71',
  'wpa_bat': '0.052',
  'leverage_index_avg': '0.74',
  'wpa_bat_pos': '0.488',
  'wpa_bat_neg

In [8]:
import pickle
# Persist the scraped games to disk. The context manager guarantees the file
# is flushed and closed even if pickling raises part-way through.
with open('game_data.pkl', 'wb') as pickle_file:
    pickle.dump(game_data, pickle_file)