## Scraping Hockey Reference

#### Workflow

- I need to scrape several years of data for each team and all individual players
- The team and player statistics will be in separate dataframes.
- Aditionally, each year of data will also be in separate dataframes.
- I will set up my scraper to grab team statistics for each team in a given year and make that a temporary dataframe which I will turn into individual csvs.
- The individual player statistics will also be separated by year and saved into individual csvs.

### Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import urllib3
import requests
import time
import re

#### Creating base URL

In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
base_url = 'https://www.hockey-reference.com/teams/'

#### Function to grab a txt file of team links

In [3]:
def get_page(url):
    page = urlopen(base_url)
    soup = BeautifulSoup(page, 'lxml')
    file = open('hockey-reference_urls.txt', 'w')
    file.write(str(soup))
    file.close()

def get_team_links(url):
    page = urlopen(url)
    soup = BeautifulSoup(page, 'lxml')

In [4]:
get_page(base_url)

In [5]:
with open('hockey-reference_urls.txt', 'r') as file:
    for line in file:
        line = line.strip()

In [6]:
page = open("hockey-reference_urls.txt", 'r')
soup = BeautifulSoup(page, "lxml")
div = soup.find('div', {'class': 'overthrow table_container'})


#### Saving the team links in a variable

In [7]:
team_links = []
links = div.find_all('a')
for link in links:
    team_links.append(link.get('href'))
# As teams have moved and changed over the years
# I had to manually add Arizona and Atlanta to this list
team_links.insert(1, '/teams/ARI/')
team_links.insert(2, '/teams/ATL/')

In [8]:
team_links

['/teams/ANA/',
 '/teams/ARI/',
 '/teams/ATL/',
 '/teams/PHX/',
 '/teams/BOS/',
 '/teams/BUF/',
 '/teams/CGY/',
 '/teams/CAR/',
 '/teams/CHI/',
 '/teams/COL/',
 '/teams/CBJ/',
 '/teams/DAL/',
 '/teams/DET/',
 '/teams/EDM/',
 '/teams/FLA/',
 '/teams/LAK/',
 '/teams/MIN/',
 '/teams/MTL/',
 '/teams/NSH/',
 '/teams/NJD/',
 '/teams/NYI/',
 '/teams/NYR/',
 '/teams/OTT/',
 '/teams/PHI/',
 '/teams/PIT/',
 '/teams/SJS/',
 '/teams/STL/',
 '/teams/TBL/',
 '/teams/TOR/',
 '/teams/VAN/',
 '/teams/VEG/',
 '/teams/WSH/',
 '/teams/WPG/']

### Function for scraping individual player data

- This function finds a specific table on each teams page which contains individual player stats for a given year.
- There will be a for loop later which will call this function and iterate through each team and each year.

In [9]:
def get_player_table(url):
    res = requests.get(url)
    skater_soup = BeautifulSoup(res.content, 'lxml')
    team_name = skater_soup.find('h1', {'itemprop': 'name'}).find_all('span')[1].text
    table = skater_soup.find('div', {'id': 'all_skaters'}).find('table', {'id': 'skaters'}).find('tbody')
    player_stats = []
    for row in table.find_all('tr'):
        players = {}
        for element in row:
            players['Player'] = row.find('a').text
            players['Age'] = row.find('td', {'data-stat': 'age'}).text
            players['Position'] = row.find('td', {'data-stat': 'pos'}).text
            players['Games Played'] = row.find('td', {'data-stat': 'games_played'}).text
            players['Goals'] = row.find('td', {'data-stat': 'goals'}).text
            players['Assists'] = row.find('td', {'data-stat': 'assists'}).text
            players['Points'] = row.find('td', {'data-stat': 'points'}).text
            players['Plus Minus'] = row.find('td', {'data-stat': 'plus_minus'}).text
            players['Penalty Minutes'] = row.find('td', {'data-stat': 'pen_min'}).text
            players['ES Goals'] = row.find('td', {'data-stat': 'goals_ev'}).text
            players['PP Goals'] = row.find('td', {'data-stat': 'goals_pp'}).text
            players['SH Goals'] = row.find('td', {'data-stat': 'goals_sh'}).text
            players['GW Goals'] = row.find('td', {'data-stat': 'goals_gw'}).text
            players['ES Assists'] = row.find('td', {'data-stat': 'assists_ev'}).text
            players['PP Assists'] = row.find('td', {'data-stat': 'assists_pp'}).text
            players['SH Assists'] = row.find('td', {'data-stat': 'assists_sh'}).text
            players['Shots'] = row.find('td', {'data-stat': 'shots'}).text
            players['Shooting Perecentage'] = row.find('td', {'data-stat': 'shot_pct'}).text
            players['Time on Ice'] = row.find('td', {'data-stat': 'time_on_ice'}).text
            players['Time on Ice Avg'] = row.find('td', {'data-stat': 'time_on_ice_avg'}).text
            players['Offenisve Point Shares'] = row.find('td', {'data-stat': 'ops'}).text
            players['Defensive Point Shares'] = row.find('td', {'data-stat': 'dps'}).text
            players['Point Shares'] = row.find('td', {'data-stat': 'ps'}).text
            players['ES Blocks'] = row.find('td', {'data-stat': 'blocks'}).text
            players['ES Hits'] = row.find('td', {'data-stat': 'hits'}).text
            players['ES Face-Off Wins'] = row.find('td', {'data-stat': 'faceoff_wins'}).text
            players['ES Face-Off Losses'] = row.find('td', {'data-stat': 'faceoff_losses'}).text
            players['ES Face-Off Pct'] = row.find('td', {'data-stat': 'faceoff_percentage'}).text
            players['Team'] = team_name
        player_stats.append(players)
    return player_stats

### Function for scraping team statistics

- This function finds a specific table on each teams page
- I had to get creative in scraping this table as the data in this table was formatted differently than the individual player data table.
- Once I find the specific table within the "team_soup" variable, this function uses the dictionary structure of the data to assign column names
- Again this function will be called later in a for loop to iterate through team and year. Each year will get its own csv file.

In [10]:
def get_team_table(url):
    res = requests.get(url)
    team_soup = BeautifulSoup(res.content, 'lxml')
    team_name = team_soup.find('h1', {'itemprop': 'name'}).find_all('span')[1].text
    table = team_soup.find('div', {'id': 'all_team_stats'}).find('table', {'id': 'team_stats'})
    team_list = []
    team = {'team': team_name}
    for row in table.find('tbody').find('tr').find_all('td'):
        stat = row.text
        temp = row.attrs
        column = temp['data-stat']
        team.update({column: stat})
    team_list.append(team)
    return team_list
        

### Function for scraping year results data

- This scrape will get me the finishing results for each team for each year.
- Eventually this will be combined with the team statistics data for modeling purposes

In [68]:
url = 'https://www.hockey-reference.com/leagues/NHL_2018.html'
def get_league_season(url):
    res = requests.get(url).content
    res = res.decode('utf-8')
    season_soup = BeautifulSoup(re.sub('<!--|-->', '', res), 'lxml')
    table = season_soup.find('div', {'id': 'div_stats'}).find('tbody')
    league_season = []
    for row in table.find_all('tr'):
        season = {}
        for element in row.find_all('td'):
            stat = element.text
            temp = element.attrs
            column = temp['data-stat']
            season.update({column: stat})
        league_season.append(season)
    league_season = pd.DataFrame(league_season)
    cols=[i for i in league_season.columns if i not in ['team_name']]
    for col in cols:
        league_season[col]=pd.to_numeric(league_season[col])
    return league_season
    

In [131]:
league_season_2000 = get_league_season('https://www.hockey-reference.com/leagues/NHL_2000.html')

In [133]:
league_season_2000.to_csv('2000 league season.csv')

In [66]:
league_url = 'https://www.hockey-reference.com/leagues/'
links = ['NHL_2007.html', 'NHL_2008.html', 'NHL_2009.html', 'NHL_2010.html', 
         'NHL_2011.html', 'NHL_2012.html', 'NHL_2013.html', 'NHL_2014.html',
         'NHL_2015.html', 'NHL_2016.html', 'NHL_2017.html', 'NHL_2018.html']
years = ['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014',
         '2015', '2016', '2017', '2018']

for link in links:
    url = league_url + link
    league_year = get_league_season(url)
    years = ['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014',
             '2015', '2016', '2017', '2018']
    for year in years:
        time.sleep(3)               


SyntaxError: can't assign to function call (<ipython-input-66-fbea6b67b5b7>, line 14)

In [57]:
get_league_season('https://www.hockey-reference.com/leagues/NHL_2018.html')

Unnamed: 0,average_age,chances_pp,games,goals,goals_against_ev,goals_ev,goals_pp,goals_sh,losses,losses_ot,...,save_pct,shot_pct,shots,shots_against,sos,srs,team_name,total_goals_per_game,wins,wins_shootout
0,28.4,274,82,267,145,193,58,10,18,11,...,0.923,9.9,2641,2659,0.03,0.71,Nashville Predators*,5.83,53,6
1,26.8,274,82,277,159,200,64,9,20,10,...,0.917,10.3,2643,2613,0.02,0.74,Winnipeg Jets*,6.04,52,4
2,27.5,276,82,296,172,216,66,9,23,5,...,0.912,10.7,2737,2756,-0.07,0.66,Tampa Bay Lightning*,6.49,54,6
3,28.6,258,82,270,161,197,61,9,20,12,...,0.912,9.9,2703,2399,-0.07,0.62,Boston Bruins*,5.9,50,3
4,28.0,248,82,272,182,218,53,8,24,7,...,0.911,10.1,2774,2619,-0.01,0.52,Vegas Golden Knights*,6.1,51,4
5,28.4,244,82,259,178,197,55,4,26,7,...,0.909,10.7,2400,2637,-0.04,0.21,Washington Capitals*,6.07,49,3
6,28.3,224,82,277,189,213,56,4,26,7,...,0.915,10.1,2700,2844,-0.06,0.49,Toronto Maple Leafs*,6.21,49,7
7,28.7,214,82,235,159,183,38,10,25,13,...,0.923,9.3,2475,2716,0.01,0.24,Anaheim Ducks*,5.5,44,4
8,29.5,240,82,253,176,194,49,7,26,11,...,0.91,10.0,2506,2595,0.04,0.29,Minnesota Wild*,5.91,45,3
9,27.7,260,82,272,195,198,68,6,29,6,...,0.902,9.6,2845,2575,-0.04,0.23,Pittsburgh Penguins*,6.37,47,2


In [None]:
url = 'https://www.hockey-reference.com/leagues'

In [54]:
get_league_season('https://www.hockey-reference.com/leagues/NHL_2018.html')

{'team_name': 'Buffalo Sabres',
 'average_age': '27.1',
 'games': '82',
 'wins': '25',
 'losses': '45',
 'losses_ot': '12',
 'points': '62',
 'points_pct': '.378',
 'goals': '199',
 'opp_goals': '280',
 'wins_shootout': '1',
 'losses_shootout': '2',
 'srs': '-0.98',
 'sos': '0.01',
 'total_goals_per_game': '5.84',
 'goals_ev': '140',
 'goals_against_ev': '216',
 'goals_pp': '49',
 'chances_pp': '257',
 'power_play_pct': '19.07',
 'opp_goals_pp': '52',
 'opp_chances_pp': '235',
 'pen_kill_pct': '77.87',
 'goals_sh': '9',
 'opp_goals_sh': '10',
 'pen_min_per_game': '8.2',
 'pen_min_per_game_opp': '8.8',
 'shots': '2557',
 'shot_pct': '7.7',
 'shots_against': '2681',
 'save_pct': '.896',
 'pdo': '98.0'}

In [42]:
league_season.dtypes

average_age             float64
chances_pp                int64
games                     int64
goals                     int64
goals_against_ev          int64
goals_ev                  int64
goals_pp                  int64
goals_sh                  int64
losses                    int64
losses_ot                 int64
losses_shootout           int64
opp_chances_pp            int64
opp_goals                 int64
opp_goals_pp              int64
opp_goals_sh              int64
pdo                     float64
pen_kill_pct            float64
pen_min_per_game        float64
pen_min_per_game_opp    float64
points                    int64
points_pct              float64
power_play_pct          float64
save_pct                float64
shot_pct                float64
shots                     int64
shots_against             int64
sos                     float64
srs                     float64
team_name                object
total_goals_per_game    float64
wins                      int64
wins_sho

- These cells were used for testing on individual web pages
- I saved the function calls as universal functions

In [None]:
team_year = get_team_table('https://www.hockey-reference.com/teams/ANA/2018.html')

In [None]:
player_year = get_player_table('https://www.hockey-reference.com/teams/ANA/2018.html')

In [None]:
league_season = get_team_season('https://www.hockey-reference.com/leagues/NHL_2018.html')

### For loop for scraping team statistic data

- Because I need each year to be separate, I am manually calling each year in the loop which will be temporarily saved in a dataframe. Then that dataframe will be converted to a csv. Each csv will get its own dataframe name in a separate EDA notebook.

In [None]:
base_url = 'https://www.hockey-reference.com'
teams = team_links
years = ['2007.html']
year_df = pd.DataFrame()
for team in teams:
    try:
        for year in years:
            url = base_url + team + year
            team_year = get_team_table(url)
            team_df = pd.DataFrame(team_year)
            year_df = pd.concat([year_df, team_df], axis=0)
            year_df.reset_index(drop=True, inplace=True)
            cols=[i for i in year_df.columns if i not in ['team']]
            for col in cols:
                year_df[col]=pd.to_numeric(year_df[col])
            time.sleep(3)               
    except:
        continue

In [None]:
# The Phoenix Coyotes changed their name to the Arizona Coyotes in 2016.
# My scraper was giving me dual entries for this team in a few years.
# This cell was used to remove the duplicate information before saving to csv.

year_df.drop([2], axis=0, inplace=True)
year_df.reset_index(drop=True, inplace=True)

In [None]:
year_df.head()

### Saving to CSV

- Again each year was saved independently

In [None]:
year_df.to_csv('2007 team stats.csv')

### For loop for scraping individual player stats

- Similar to the previous for loop, this will output one year of data which will be saved to csv.

In [None]:
base_url = 'https://www.hockey-reference.com/leagues'
years = ['NHL_2018.html']
league_year = pd.DataFrame()
for year in years:
    try:
        url = base_url + year
        league_season = get_team_table(url)
        team_df = pd.DataFrame(league_season)
        year_player_stats = pd.concat([year_df, team_df], axis=0)
        year_player_stats.reset_index(drop=True, inplace=True)
        time.sleep(3)
    except:
        continue

In [None]:
year_player_stats

### Saving to CSV

- Again each year will be saved to csv. This csv will contain stats for every player in the league for that year.

In [None]:
year_df.to_csv('2011 skater stats.csv')

### For lop for scraping league year stats

In [None]:
base_url = 'https://www.hockey-reference.com'
teams = team_links
years = ['2007.html']
year_df = pd.DataFrame()
for team in teams:
    try:
        for year in years:
            url = base_url + team + year
            team_year = get_team_table(url)
            team_df = pd.DataFrame(team_year)
            year_df = pd.concat([year_df, team_df], axis=0)
            year_df.reset_index(drop=True, inplace=True)
            cols=[i for i in year_df.columns if i not in ['team']]
            for col in cols:
                year_df[col]=pd.to_numeric(year_df[col])
            time.sleep(3)               
    except:
        continue

In [168]:
base_url = 'https://www.hockey-reference.com/leagues/NHL_2018.html'
def get_playoff_results(url):
    res = requests.get(url).content
    res = res.decode('utf-8')
    playoff_soup = BeautifulSoup(re.sub('<!--|-->', '', res), 'lxml')
    table = playoff_soup.find('div', {'id': 'div_all_playoffs'}).find('table', {'id': 'all_playoffs'})
    for row in table.find_all('tr'):
        for element in row.find('td'):
            print(element)
#         for element in row.find_all('td')[0:1]:
#             print(element.text)

In [169]:
get_playoff_results('https://www.hockey-reference.com/leagues/NHL_2018.html')

<span class="tooltip opener" data-id="s15">Stanley Cup Final</span>
<table width="100%">
<tr>
<td><a href="/boxscores/201805280VEG.html">Game 1, May 28</a></td>
<td>May 28</td>
<td>Washington Capitals</td>
<td class="center">4</td>
<td class="winner">Vegas Golden Knights</td>
<td class="center winner">6</td>
<td class="center">
</td>
</tr>
<tr>
<td><a href="/boxscores/201805300VEG.html">Game 2, May 30</a></td>
<td>May 30</td>
<td class="winner">Washington Capitals</td>
<td class="center winner">3</td>
<td>Vegas Golden Knights</td>
<td class="center">2</td>
<td class="center">
</td>
</tr>
<tr>
<td><a href="/boxscores/201806020WSH.html">Game 3, June 2</a></td>
<td>June 2</td>
<td>Vegas Golden Knights</td>
<td class="center">1</td>
<td class="winner">Washington Capitals</td>
<td class="center winner">3</td>
<td class="center">
</td>
</tr>
<tr>
<td><a href="/boxscores/201806040WSH.html">Game 4, June 4</a></td>
<td>June 4</td>
<td>Vegas Golden Knights</td>
<td class="center">2</td>
<td clas