In [None]:
################################################################################                                                                                                                                                              
################################################################################
#
# FILE: baseball-download.py
#
# BY: Dmitry Sedov 
#
# CREATED: Tue Apr 14 2020
#
# DESC: This code downloads data about baseball teams / games from 
#       https://www.baseball-reference.com
#
# EXEC:
#      
################################################################################
################################################################################

In [None]:
################################ Libraries #####################################

import os
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

################################################################################

In [None]:
################################ Constants #####################################

# Root of the Baseball-Reference site; relative team links are appended to it.
main_url = 'https://www.baseball-reference.com'
# Franchise index page (same site, '/teams/' section).
teams_url = main_url + '/teams/'
# Wikipedia article used as the source of the team location table.
wiki_url = 'https://en.wikipedia.org/wiki/Major_League_Baseball'
# Destination folders for the generated CSV files.
# NOTE(review): these are absolute, machine-specific paths.
teams_output_folder = '/Users/muser/dfolder/Research/stadiums/data/baseball/teams/'
games_output_folder = '/Users/muser/dfolder/Research/stadiums/data/baseball/games/'

################################################################################

In [None]:
######################### Get list of all teams ################################

# Download the franchise index page once; the raw HTML is reused by the
# link-scraping cell below, so it is kept in `teams_page`.
teams_response = requests.get(teams_url, timeout = 30)
# Fail loudly on an HTTP error (e.g. rate limiting) instead of parsing an
# error page as if it were the teams table.
teams_response.raise_for_status()
teams_page = teams_response.text

# Wrap the markup in StringIO: passing literal HTML to read_html is deprecated
# in recent pandas versions.
teams_tables = pd.read_html(StringIO(teams_page))

# Keep only the rows of the first table whose 'To' value is the string '2020'.
# NOTE(review): the season is hard-coded and compared as a string; this filter
# returns nothing if the site now reports a later final season.
mlb_teams = teams_tables[0].loc[teams_tables[0]['To'] == '2020'].copy()
mlb_teams['Rk'] = mlb_teams['Rk'].astype(int)
# Reassign instead of renaming in place so the cell stays chain-friendly.
mlb_teams = mlb_teams.rename(columns = {'Franchise': 'name'})

################################################################################

In [None]:
######################### Get links of all teams ###############################

# Parse the already-downloaded teams page and locate the table with
# id 'teams_active'.
teams_soup = BeautifulSoup(teams_page, 'html.parser')
active_teams = teams_soup.find('table', id = 'teams_active')

# One record per anchor that has an href and no class attribute:
# the relative link plus the anchor text (used as the team name).
temp = (
    {'link': anchor['href'], 'name': anchor.text}
    for anchor in active_teams.find_all('a', attrs = {'class': None}, href = True)
)
team_links = pd.DataFrame(temp)

################################################################################

In [None]:
####################### Merge for a teams dataset ##############################

# Attach the scraped link to every team row. The outer join keeps names found
# on only one side; validate makes pandas raise if 'name' is duplicated in
# either frame.
mlb_teams = mlb_teams.merge(team_links,
                            how = 'outer',
                            on = 'name',
                            validate = '1:1')

# Change links for some teams
link_overrides = {
    'Los Angeles Angels': '/teams/LAA/',
    'Miami Marlins': '/teams/MIA/',
    'Tampa Bay Rays': '/teams/TBR/',
}
for team_name, team_link in link_overrides.items():
    mlb_teams.loc[mlb_teams['name'] == team_name, 'link'] = team_link

################################################################################

In [None]:
####################### Function to get the stadium ############################

# Matches the label that precedes the stadium name on a team-season page.
pattern = re.compile(r'Ballpark')

def get_stadium(link, year):
    """Return the ballpark listed on a team's season page.

    The page URL is built as ``main_url + link + str(year) + '.shtml'``. The
    function looks for the first ``<strong>`` element whose string matches
    ``pattern`` and returns the text node that follows it, stripped of
    surrounding whitespace.

    Parameters
    ----------
    link : str
        Relative team link, e.g. ``'/teams/LAA/'``.
    year : int or str
        Season to look up.

    Returns
    -------
    str
        The stripped text found right after the 'Ballpark' label.

    Raises
    ------
    requests.HTTPError
        If the page request returns an error status.
    ValueError
        If the page has no 'Ballpark' label or nothing follows it.
    """
    # Progress output: one line per team while the frame is being processed.
    print(link)
    team_year_url = main_url + link + str(year) + '.shtml'
    response = requests.get(team_year_url, timeout = 30)
    # Surface HTTP failures directly rather than as a confusing parse error.
    response.raise_for_status()
    team_year_soup = BeautifulSoup(response.text, 'html.parser')
    # `string=` is the current name of the bs4 argument formerly called `text=`.
    label = team_year_soup.find('strong', string = pattern)
    if label is None or label.next_sibling is None:
        raise ValueError(f'No ballpark information found at {team_year_url}')
    stadium = label.next_sibling.strip()
    return stadium
    
################################################################################

In [None]:
# Look up the 2017 ballpark for every team (get_stadium fetches one page per row).
mlb_teams['stadium_2017'] = mlb_teams.apply(
    lambda team: get_stadium(link = team['link'], year = 2017),
    axis = 'columns',
)

In [None]:
# Look up the 2018 ballpark for every team (get_stadium fetches one page per row).
mlb_teams['stadium_2018'] = mlb_teams.apply(
    lambda team: get_stadium(link = team['link'], year = 2018),
    axis = 'columns',
)

In [None]:
# Look up the 2019 ballpark for every team (get_stadium fetches one page per row).
mlb_teams['stadium_2019'] = mlb_teams.apply(
    lambda team: get_stadium(link = team['link'], year = 2019),
    axis = 'columns',
)

In [None]:
# The team code is the third '/'-separated piece of the link,
# e.g. '/teams/LAA/' -> 'LAA'.
mlb_teams['code'] = mlb_teams['link'].map(lambda team_link: team_link.split('/')[2])

In [None]:
# Create the destination folder if it is missing; to_csv does not create
# parent directories and would otherwise raise OSError.
os.makedirs(teams_output_folder, exist_ok = True)
# Persist the teams table (without the index column).
mlb_teams.to_csv(os.path.join(teams_output_folder, 'baseball_teams.csv'), 
                 index = False)

In [None]:
######################## Function to get the games ############################

def get_games(link, year):
    """Download one team's schedule for a season and save it as CSV.

    The schedule page URL is built as
    ``main_url + link + str(year) + '-schedule-scores.shtml'``. The first table
    on that page is filtered to rows whose first column holds a number, a
    ``year`` column is added, and the result is written to
    ``<games_output_folder>/<year>/<team_code>_games.csv``.

    Parameters
    ----------
    link : str
        Relative team link, e.g. ``'/teams/LAA/'``; the team code is its
        third '/'-separated component.
    year : int or str
        Season to download.

    Returns
    -------
    None
    """
    team_code = link.split('/')[2]
    # Progress output: one line per team.
    print(team_code)
    schedule_year_url = main_url + link + str(year) + '-schedule-scores.shtml'
    schedule_tables = pd.read_html(schedule_year_url)
    schedule = schedule_tables[0]
    # Filter non-games rows out: keep rows whose first column parses as a
    # number. Unlike calling .isdigit() on every cell, this does not fail when
    # pandas has already parsed the column as numeric or it contains NaN.
    mask = pd.to_numeric(schedule.iloc[:, 0], errors = 'coerce').notna()
    schedule_table = schedule.loc[mask].copy()
    schedule_table['year'] = year
    # The per-season sub-folder may not exist yet; to_csv will not create it.
    year_folder = os.path.join(games_output_folder, str(year))
    os.makedirs(year_folder, exist_ok = True)
    schedule_table.to_csv(os.path.join(year_folder, f'{team_code}_games.csv'),
                          index = False)
    return None
    
################################################################################

In [None]:
# Download and save the 2017 schedule of every team; get_games returns None,
# so the resulting Series only reflects that each row was processed.
mlb_teams.apply(
    lambda team: get_games(link = team['link'], year = 2017),
    axis = 'columns',
)

In [None]:
# Download and save the 2018 schedule of every team; get_games returns None,
# so the resulting Series only reflects that each row was processed.
mlb_teams.apply(
    lambda team: get_games(link = team['link'], year = 2018),
    axis = 'columns',
)

In [None]:
# Download and save the 2019 schedule of every team; get_games returns None,
# so the resulting Series only reflects that each row was processed.
mlb_teams.apply(
    lambda team: get_games(link = team['link'], year = 2019),
    axis = 'columns',
)

In [None]:
# Fetch the Wikipedia article and parse its tables, restricted to those
# identified by the 'wikitable' class; the teams locations table is read from
# this list in the next cell.
wiki_tables = pd.read_html(io = wiki_url,
                           attrs = {'class': 'wikitable'})

In [None]:
# The first Wikipedia table has a two-level column header; drop the second level.
teams_locations = wiki_tables[0].droplevel(level = 1, axis = 'columns')
# Discard the row(s) whose 'Capacity' cell reads 'National League'
# (presumably a league separator row inside the table -- TODO confirm).
mask = teams_locations['Capacity'].ne('National League')
teams_locations = (
    teams_locations.loc[mask]
    .sort_values(by = 'Team')
    .reset_index(drop = True)
)

In [None]:
# Create the destination folder if it is missing; to_csv does not create
# parent directories and would otherwise raise OSError.
os.makedirs(teams_output_folder, exist_ok = True)
# Persist the team locations table (without the index column).
teams_locations.to_csv(os.path.join(teams_output_folder, 
                                    'baseball_locations.csv'), 
                       index = False)

In [None]:
# Reload the teams table from the CSV file written earlier in the notebook.
mlb_teams = pd.read_csv(
    filepath_or_buffer = os.path.join(teams_output_folder, 'baseball_teams.csv')
)

In [None]:
# Sanity check: left-join the locations onto the teams by team name and show
# the shape of the result; validate makes pandas raise if the name keys are
# not unique on both sides.
mlb_teams.merge(
    teams_locations,
    how = 'left',
    left_on = 'name',
    right_on = 'Team',
    validate = '1:1',
).shape