In [None]:
################################################################################                                                                                                                                                              
################################################################################
#
# FILE: hockey-download.py
#
# BY: Dmitry Sedov 
#
# CREATED: Tue Apr 14 2020
#
# DESC: This code downloads data about hockey teams / games from 
#       https://www.hockey-reference.com
#
# EXEC:
#      
################################################################################
################################################################################

In [None]:
################################ Libraries #####################################

import os
import re
from io import StringIO
from urllib.error import HTTPError

import pandas as pd
import requests
from bs4 import BeautifulSoup

################################################################################

In [None]:
################################ Constants #####################################

# Base URL of the scraped site; the franchise index URL is derived from it so
# the two cannot drift apart.
main_url = 'https://www.hockey-reference.com'
teams_url = main_url + '/teams/'
wiki_url = 'https://en.wikipedia.org/wiki/National_Hockey_League'

# Root directory for all output files. The default is the original hard-coded
# location; set the HOCKEY_DATA_DIR environment variable to write somewhere
# else without editing the notebook.
data_root = os.environ.get(
    'HOCKEY_DATA_DIR',
    '/Users/muser/dfolder/Research/stadiums/data/hockey',
).rstrip('/')
teams_output_folder = data_root + '/teams/'
games_output_folder = data_root + '/games/'

################################################################################

In [None]:
######################### Get list of all teams ################################

# Fetch the franchise index page. `teams_page` (the raw HTML) is kept because
# the link-extraction cell parses it again.
teams_response = requests.get(teams_url, timeout=30)
# Fail here with a clear HTTP error instead of a confusing AttributeError on
# a missing table further down.
teams_response.raise_for_status()
teams_page = teams_response.text

# Use the same explicit parser as the link-extraction cell so the result does
# not depend on which optional parsers happen to be installed.
teams_soup = BeautifulSoup(teams_page, 'html.parser')
franchises_tag = teams_soup.find('table', {'id': 'active_franchises'})
if franchises_tag is None:
    raise ValueError(f"No table with id 'active_franchises' found at {teams_url}")

# Rebuild a minimal table from the header plus only the 'full_table' rows,
# so every other <tr> in the original table is left out before parsing.
head = str(franchises_tag.find('thead'))
rows = '\n'.join(str(x) for x in franchises_tag.find_all('tr', {'class': 'full_table'}))
table_html = '<table>\n' + head + '\n<tbody>\n' + rows + '\n</tbody>\n' + '</table>'

# Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
active_teams_table = (
    pd.read_html(StringIO(table_html))[0]
    .rename(columns={'Franchise': 'name'})
)

################################################################################

In [None]:
# Display the parsed active-franchise table for a visual check.
active_teams_table

In [None]:
######################### Get links of all teams ###############################

# Parse the franchise page and locate the active-franchises table.
teams_soup = BeautifulSoup(teams_page, 'html.parser')
active_teams = teams_soup.find('table', {'id': 'active_franchises'})

# Collect one record per anchor that carries an href and has no class attribute.
franchise_anchors = active_teams.find_all('a', attrs={'class': None}, href=True)
link_records = [{'link': anchor['href'], 'name': anchor.text}
                for anchor in franchise_anchors]
team_links = pd.DataFrame(link_records)

# Keep only the first two path components of each href, wrapped in slashes
# (e.g. '/teams/ANA/history.html' -> '/teams/ANA/').
team_links['link'] = team_links['link'].map(
    lambda href: '/' + '/'.join(href.split('/')[1:3]) + '/'
)

################################################################################

In [None]:
####################### Merge for a teams dataset ##############################

# Join the franchise table with the scraped links on the team name. The outer
# join keeps unmatched rows from either side; `validate` raises if a name is
# duplicated on either side.
hockey_teams = active_teams_table.merge(team_links,
                                        how='outer',
                                        on='name',
                                        validate='one_to_one')

# Manual override of the link for Arizona.
# NOTE(review): presumably the scraped link uses a different team code than
# the per-season pages - confirm.
is_coyotes = hockey_teams['name'] == 'Arizona Coyotes'
hockey_teams.loc[is_coyotes, 'link'] = '/teams/ARI/'

################################################################################

In [None]:
####################### Function to get the stadium ############################

# Label that precedes the arena link on a team-season page.
pattern = re.compile(r'Primary Arena')

def get_stadium(link, year, timeout=30):
    """Return the primary arena name from a team's season page.

    The page URL is built as ``main_url + link + str(year) + '.html'``. The
    function looks for a ``<strong>`` element whose string matches
    ``pattern`` and returns the text of the first ``<a>`` inside that
    element's parent.

    Parameters
    ----------
    link : str
        Team path with leading and trailing slashes, e.g. ``'/teams/ARI/'``.
    year : int or str
        Season identifier used in the page file name.
    timeout : float, optional
        Seconds to wait for the HTTP request. Without a timeout a single
        stalled connection would block the whole scrape indefinitely.

    Returns
    -------
    str or None
        The arena name, or ``None`` when the label, its parent or the link
        is not present on the downloaded page.
    """
    print(link)
    team_year_url = main_url + link + str(year) + '.html'
    response = requests.get(team_year_url, timeout=timeout)
    team_year_soup = BeautifulSoup(response.text, 'html.parser')
    # `string=` is the current name of the deprecated `text=` argument.
    label = team_year_soup.find('strong', string=pattern)
    try:
        # Any missing piece (label, parent, anchor) is None and raises
        # AttributeError on attribute access, which maps to "not found".
        return label.parent.a.text
    except AttributeError:
        return None
    
################################################################################

In [None]:
# Scrape the primary arena of every team for each season of interest and
# store it in a `stadium_<year>` column. One loop replaces four cells that
# differed only in the year.
for season_year in (2017, 2018, 2019, 2020):
    hockey_teams[f'stadium_{season_year}'] = hockey_teams['link'].apply(
        lambda team_link: get_stadium(team_link, season_year)
    )

In [None]:
# The team code is the third '/'-separated piece of the link
# (e.g. '/teams/ARI/' -> 'ARI').
hockey_teams['code'] = hockey_teams['link'].map(
    lambda team_link: team_link.split('/')[2]
)

In [None]:
# Manually set Detroit's arena for the 2018-2020 columns.
detroit_rows = hockey_teams['name'] == 'Detroit Red Wings'
overridden_columns = ['stadium_2018', 'stadium_2019', 'stadium_2020']
hockey_teams.loc[detroit_rows, overridden_columns] = 'Little Caesars Arena'

In [None]:
# Write the assembled teams dataset to disk without the index column.
hockey_teams_csv = os.path.join(teams_output_folder, 'hockey_teams.csv')
hockey_teams.to_csv(hockey_teams_csv, index=False)

In [None]:
######################## Function to get the games ############################

def _keep_game_rows(table):
    """Return a copy of `table` restricted to rows whose first cell is a digit string.

    If a first-column value has no ``isdigit`` method (i.e. the column was not
    parsed as strings), a copy of the whole table is returned unfiltered.
    """
    try:
        is_game_row = table.iloc[:, 0].apply(lambda x: x.isdigit())
    except AttributeError:
        return table.copy()
    return table.loc[is_game_row].copy()


def get_games(link, year):
    """Download a team's schedule for one season and save it as CSV.

    Reads every table from ``main_url + link + str(year) + '_games.html'``.
    The first table is written to
    ``<games_output_folder>/<year>/<team_code>_season_games.csv`` and, when a
    second table exists, it is written to ``<team_code>_playoff_games.csv`` in
    the same folder. Both are filtered to game rows and get a ``team_code``
    column. The year folder is created if it does not exist.

    Parameters
    ----------
    link : str
        Team path such as ``'/teams/ARI/'``; the third '/'-separated piece is
        used as the team code.
    year : int or str
        Season identifier used in the page name and the output folder name.

    Returns
    -------
    None
        Always. An ``HTTPError`` while fetching the page is swallowed and
        nothing is written for that team.
    """
    team_code = link.split('/')[2]
    print(team_code)
    schedule_year_url = main_url + link + str(year) + '_games.html'
    try:
        schedule_tables = pd.read_html(schedule_year_url)
    except HTTPError:
        return None

    # to_csv does not create missing directories, so make sure the per-year
    # folder exists before writing.
    year_folder = os.path.join(games_output_folder, str(year))
    os.makedirs(year_folder, exist_ok=True)

    # Regular season: same filtering (with the same fallback) as the playoffs.
    season_schedule_table = _keep_game_rows(schedule_tables[0])
    season_schedule_table['team_code'] = team_code
    season_schedule_table.to_csv(os.path.join(year_folder,
                                              f'{team_code}_season_games.csv'),
                                 index = False)

    # Playoffs: only present when the page has a second table.
    if len(schedule_tables) < 2:
        print(f'No playoffs for {team_code}.')
        return None
    playoff_schedule_table = _keep_game_rows(schedule_tables[1])
    playoff_schedule_table['team_code'] = team_code
    playoff_schedule_table.to_csv(os.path.join(year_folder,
                                               f'{team_code}_playoff_games.csv'),
                                  index = False)
    return None
    
################################################################################

In [None]:
# Download and save the schedules of every team for each season of interest.
# get_games only writes files and returns None, so a plain loop is used
# instead of four copy-pasted `apply` cells.
for season_year in (2017, 2018, 2019, 2020):
    for team_link in hockey_teams['link']:
        get_games(team_link, season_year)

In [None]:
# Fetch every table with class 'wikitable' from the NHL Wikipedia article.
wikitable_selector = {'class': 'wikitable'}
wiki_tables = pd.read_html(wiki_url, attrs=wikitable_selector)

In [None]:
# Clean the first Wikipedia table.
# Drop the second level of the column header.
flat_header_table = wiki_tables[0].droplevel(1, axis=1)

# Remove rows whose 'Capacity' cell reads 'Western Conference', then order
# the remaining rows by team name with a fresh index.
is_team_row = flat_header_table['Capacity'] != 'Western Conference'
teams_locations = (
    flat_header_table[is_team_row]
    .sort_values('Team')
    .reset_index(drop=True)
)

# Strip every character that is not an ASCII letter or a space from the names.
# NOTE(review): this also removes periods, so a name such as 'St. Louis Blues'
# becomes 'St Louis Blues' - confirm it still matches `name` when merged.
non_letters = re.compile('[^a-zA-Z ]+')
teams_locations['Team'] = teams_locations['Team'].map(
    lambda team_name: non_letters.sub('', team_name)
)

In [None]:
# Write the cleaned locations table to disk without the index column.
hockey_locations_csv = os.path.join(teams_output_folder, 'hockey_locations.csv')
teams_locations.to_csv(hockey_locations_csv, index=False)

In [None]:
# Reload the teams dataset from the CSV written earlier.
hockey_teams_csv = os.path.join(teams_output_folder, 'hockey_teams.csv')
hockey_teams = pd.read_csv(hockey_teams_csv)

In [None]:
# Sanity check: left-join teams to locations on the team name and show the
# resulting shape. `validate` raises if a name is duplicated on either side.
hockey_teams.merge(teams_locations,
                   how='left',
                   left_on='name',
                   right_on='Team',
                   validate='one_to_one').shape