# Scraping Hockey Reference

### Workflow

- I need to scrape several years of data for each team and all individual players
- The team and player statistics will be in separate dataframes.
- Additionally, each year of data will also be in separate dataframes.
- I will set up my scraper to grab team statistics for each team in a given year and make that a temporary dataframe which I will turn into individual csvs.
- The individual player statistics will also be separated by year and saved into individual csvs.

### Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import urllib3
import requests
import time
import re

#### Creating base URL

In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Teams index page; it is downloaded by get_page() and the team links are
# extracted from the saved copy in the cells below.
base_url = 'https://www.hockey-reference.com/teams/'

#### Function to grab a txt file of team links

In [3]:
def get_page(url):
    """Download ``url`` and save its parsed HTML to ``hockey-reference_urls.txt``.

    The response is parsed with BeautifulSoup (lxml parser) and the string
    form of the resulting soup is written to the file, replacing any
    previous contents.

    Args:
        url: Address of the page to download.

    Returns:
        None. The only result is the file written to the working directory.
    """
    # Fetch the caller-supplied URL, not the module-level ``base_url``, so the
    # function honours its argument. ``with`` closes the HTTP response.
    with urlopen(url) as page:
        soup = BeautifulSoup(page, 'lxml')
    # ``with`` guarantees the file is flushed and closed even if write() fails.
    with open('hockey-reference_urls.txt', 'w') as out:
        out.write(str(soup))

def get_team_links(url):
    """Return the ``href`` of every link in the teams container at ``url``.

    The page is downloaded and parsed with BeautifulSoup (lxml parser), then
    the ``<div class="overthrow table_container">`` element is located and
    the ``href`` attribute of each ``<a>`` inside it is collected.

    Args:
        url: Address of the teams index page.

    Returns:
        A list of ``href`` values (``None`` for anchors without one), or an
        empty list when the container is not present in the page.
    """
    # ``with`` closes the HTTP response once the document has been parsed.
    with urlopen(url) as page:
        soup = BeautifulSoup(page, 'lxml')
    container = soup.find('div', {'class': 'overthrow table_container'})
    if container is None:
        # Page layout differs from what is expected; report "no links"
        # instead of failing on ``None.find_all``.
        return []
    return [anchor.get('href') for anchor in container.find_all('a')]

In [4]:
# Download the teams index and cache it in 'hockey-reference_urls.txt'.
get_page(base_url)

In [5]:
# Read the cached page line by line, stripping surrounding whitespace.
# NOTE(review): the stripped text is never stored or used, so the only
# lasting effect is that `line` ends up bound to the last stripped line.
# Confirm whether this cell is still needed.
with open('hockey-reference_urls.txt', 'r') as file:
    for line in file:
        line = line.strip()

In [6]:
# Re-parse the cached teams index. The file is opened in a ``with`` block so
# the handle is closed as soon as BeautifulSoup has consumed it.
with open("hockey-reference_urls.txt", 'r') as page:
    soup = BeautifulSoup(page, "lxml")
# Container element that wraps the table of team links; ``find`` returns
# None when no matching element exists.
div = soup.find('div', {'class': 'overthrow table_container'})


#### Saving the team links in a variable

In [7]:
# Gather the href of every anchor found inside the teams container.
links = div.find_all('a')
team_links = [anchor.get('href') for anchor in links]
# Teams have moved and changed over the years, so Arizona and Atlanta
# had to be added by hand; they end up at indices 1 and 2 of the list.
team_links[1:1] = ['/teams/ARI/', '/teams/ATL/']

In [79]:
# Sanity check: number of team links, including the two manual additions.
print(len(team_links))

33


  ---

## Function for scraping individual player data

- This function finds a specific table on each team's page which contains individual player stats for a given year.
- There will be a for loop later which will call this function and iterate through each team and each year.

In [18]:
# (output key, ``data-stat`` attribute of the source <td>) pairs, listed in
# the order the keys appear in each player record.
_SKATER_STAT_CELLS = (
    ('age', 'age'),
    ('position', 'pos'),
    ('games_played', 'games_played'),
    ('goals', 'goals'),
    ('assists', 'assists'),
    ('points', 'points'),
    ('plus_minus', 'plus_minus'),
    ('penalty_minutes', 'pen_min'),
    ('es_goals', 'goals_ev'),
    ('pp_goals', 'goals_pp'),
    ('sh_goals', 'goals_sh'),
    ('gw_goals', 'goals_gw'),
    ('es_assists', 'assists_ev'),
    ('pp_assists', 'assists_pp'),
    ('sh_assists', 'assists_sh'),
    ('shots', 'shots'),
    ('shot_pct', 'shot_pct'),
    ('toi', 'time_on_ice'),
    ('toi_avg', 'time_on_ice_avg'),
    ('ops', 'ops'),
    ('dps', 'dps'),
    ('point_shares', 'ps'),
    ('es_blocks', 'blocks'),
    ('es_hits', 'hits'),
    ('es_faceoff_wins', 'faceoff_wins'),
    ('es_faceoff_losses', 'faceoff_losses'),
    ('es_faceoff_pct', 'faceoff_percentage'),
)


def get_player_table(url):
    """Scrape the skaters table from one team-season page.

    The page at ``url`` is downloaded and the ``<tbody>`` of the table with
    id ``skaters`` (inside the div with id ``all_skaters``) is walked row by
    row. Each row that contains data cells becomes one dict.

    Args:
        url: Address of a team-season page.

    Returns:
        A list of dicts, one per player row. Keys are ``player``, the names
        listed in ``_SKATER_STAT_CELLS`` (in that order) and ``team``; all
        values are the raw cell text as strings.

    Raises:
        AttributeError: If the team heading, the skaters table, or an
            expected cell within a data row is missing from the page.
    """
    res = requests.get(url)
    skater_soup = BeautifulSoup(res.content, 'lxml')
    # The team name is the text of the second <span> in the page's <h1>.
    team_name = skater_soup.find('h1', {'itemprop': 'name'}).find_all('span')[1].text
    table = skater_soup.find('div', {'id': 'all_skaters'}).find('table', {'id': 'skaters'}).find('tbody')
    player_stats = []
    for row in table.find_all('tr'):
        # Rows without any <td> (e.g. header or spacer rows) carry no player
        # data; skip them instead of failing on the missing cells.
        if row.find('td') is None:
            continue
        # Each cell is looked up exactly once per row.
        players = {'player': row.find('a').text}
        for key, data_stat in _SKATER_STAT_CELLS:
            players[key] = row.find('td', {'data-stat': data_stat}).text
        players['team'] = team_name
        player_stats.append(players)
    return player_stats

### For loop for scraping individual player stats

- Similar to the previous for loop, this will output one year of data which will be saved to csv.

In [None]:
# Scrape the skater table of every team for the listed season(s) and combine
# the results into a single dataframe, ``player_year_df``.
base_url = 'https://www.hockey-reference.com'
teams = team_links
years = ['2007.html']
player_year_df = pd.DataFrame()
team_frames = []
for team in teams:
    try:
        for year in years:
            url = base_url + team + year
            team_year = get_player_table(url)
            team_df = pd.DataFrame(team_year)
            team_frames.append(team_df)
            # Pause between requests to avoid hammering the site.
            time.sleep(3)
    except Exception as err:
        # Best effort: a team whose page cannot be scraped is skipped, but
        # the reason is reported instead of being silently discarded.
        # ``Exception`` (not a bare ``except``) lets Ctrl-C stop the run.
        print('Skipping {}: {!r}'.format(team, err))
        continue

if team_frames:
    # Concatenate once at the end rather than re-copying the accumulated
    # frame after every team.
    player_year_df = pd.concat(team_frames, axis=0, ignore_index=True)
    # Everything except the text columns (and the mm:ss-style ``toi_avg``,
    # presumably — confirm) is converted from scraped strings to numbers.
    cols = [i for i in player_year_df.columns if i not in ['team', 'player', 'position', 'toi_avg']]
    for col in cols:
        player_year_df[col] = pd.to_numeric(player_year_df[col])

#### Adding a year column to the dataframe

In [None]:
# Tag every row with the season; the value must be kept in sync with `years` above.
player_year_df['year'] = 2007

#### Saving to CSV

In [None]:
# Write the season's player stats to disk. With pandas' default `index=True`
# the row index is written as an unnamed first column.
player_year_df.to_csv('2007 player stats.csv')

  ---

## Function for scraping team statistics

- This function finds a specific table on the league's season summary page (e.g. `/leagues/NHL_2007.html`), which holds the statistics for every team.
- I had to get creative in scraping this table as the data in this table was formatted differently than the individual player data table.
- Once I find the specific table within the "season_soup" variable, this function uses the dictionary structure of the data to assign column names
- As I need a CSV for each individual year, I will not use a for loop for scraping here. Each CSV will be saved individually.

In [None]:
# League summary page for the 2007 season.
# NOTE(review): the call to get_league_season() below passes the same address
# as a literal, so this variable appears unused there — confirm.
url = 'https://www.hockey-reference.com/leagues/NHL_2007.html'
def get_league_season(url):
    """Scrape the team statistics table from a league season page.

    The page is downloaded and decoded as UTF-8, the ``<!--`` / ``-->``
    comment markers are stripped so markup wrapped in HTML comments is seen
    by the parser, and the ``<tbody>`` inside the div with id ``div_stats``
    is read row by row. Each ``<td>`` is stored under its ``data-stat``
    attribute.

    Args:
        url: Address of the league season page.

    Returns:
        A DataFrame with one row per ``<tr>``; every column except
        ``team_name`` has been passed through ``pd.to_numeric``.
    """
    html = requests.get(url).content.decode('utf-8')
    season_soup = BeautifulSoup(re.sub('<!--|-->', '', html), 'lxml')
    body = season_soup.find('div', {'id': 'div_stats'}).find('tbody')

    # One dict per table row, keyed by each cell's ``data-stat`` attribute.
    records = [
        {cell.attrs['data-stat']: cell.text for cell in tr.find_all('td')}
        for tr in body.find_all('tr')
    ]

    frame = pd.DataFrame(records)
    numeric_names = [name for name in frame.columns if name != 'team_name']
    for name in numeric_names:
        frame[name] = pd.to_numeric(frame[name])
    return frame

In [90]:
# Scrape the 2007 league page into a dataframe of team statistics.
league_season = get_league_season('https://www.hockey-reference.com/leagues/NHL_2007.html')

In [91]:
# Preview the first rows of the scraped table.
league_season.head()

Unnamed: 0,average_age,chances_pp,games,goals,goals_against_ev,goals_ev,goals_pp,goals_sh,losses,losses_ot,...,save_pct,shot_pct,shots,shots_against,sos,srs,team_name,total_goals_per_game,wins,wins_shootout
0,28.4,274,82,267,145,193,58,10,18,11,...,0.923,9.9,2641,2659,0.03,0.71,Nashville Predators*,5.83,53,6
1,26.8,274,82,277,159,200,64,9,20,10,...,0.917,10.3,2643,2613,0.02,0.74,Winnipeg Jets*,6.04,52,4
2,27.5,276,82,296,172,216,66,9,23,5,...,0.912,10.7,2737,2756,-0.07,0.66,Tampa Bay Lightning*,6.49,54,6
3,28.6,258,82,270,161,197,61,9,20,12,...,0.912,9.9,2703,2399,-0.07,0.62,Boston Bruins*,5.9,50,3
4,28.0,248,82,272,182,218,53,8,24,7,...,0.911,10.1,2774,2619,-0.01,0.52,Vegas Golden Knights*,6.1,51,4


#### Saving to CSV

- Again each year was saved independently

In [None]:
# Write the season's team stats to disk. With pandas' default `index=True`
# the row index is written as an unnamed first column.
league_season.to_csv('2007 team stats.csv')

  ---

## Function for scraping Advanced Player Statistics

In [115]:
def get_advanced_player(url):
    """Scrape the advanced skater statistics table from a team-season page.

    The page is downloaded and decoded as UTF-8, and the ``<!--`` / ``-->``
    comment markers are removed before parsing so that markup wrapped in
    HTML comments is visible. The team name is read from the page's
    ``<h1>``, and every ``<td>`` in the ``<tbody>`` under the div with id
    ``div_stats_adv_rs`` is stored under its ``data-stat`` attribute.

    Args:
        url: Address of a team-season page.

    Returns:
        A DataFrame with one row per ``<tr>`` plus a ``team`` column. All
        columns other than ``player``, ``pos``, ``team``,
        ``toi_pbp_per_60_all`` and ``toi_pbp_per_60_ev`` are converted with
        ``pd.to_numeric``.
    """
    markup = requests.get(url).content.decode('utf-8')
    advanced_soup = BeautifulSoup(re.sub('<!--|-->', '', markup), 'lxml')

    # The team name is the text of the second <span> inside the heading.
    heading = advanced_soup.find('h1', {'itemprop': 'name'})
    team_name = heading.find_all('span')[1].text

    tbody = advanced_soup.find('div', {'id': 'div_stats_adv_rs'}).find('tbody')
    rows = []
    for tr in tbody.find_all('tr'):
        rows.append({td.attrs['data-stat']: td.text for td in tr.find_all('td')})

    result = pd.DataFrame(rows)
    result['team'] = team_name

    # These columns stay as text; everything else becomes numeric.
    text_columns = ('player', 'pos', 'team', 'toi_pbp_per_60_all', 'toi_pbp_per_60_ev')
    for column in [name for name in result.columns if name not in text_columns]:
        result[column] = pd.to_numeric(result[column])
    return result

### For loop for scraping advanced player stats

- Similar to the previous for loop, this will output one year of data which will be saved to csv.
- Turns out there are no skater advanced statistics for 2007.

In [152]:
# Scrape the advanced skater table of every team for the listed season(s)
# and combine the results into a single dataframe, ``advanced_year_df``.
base_url = 'https://www.hockey-reference.com'
teams = team_links
years = ['2008.html']
advanced_year_df = pd.DataFrame()
advanced_frames = []
for team in teams:
    try:
        for year in years:
            url = base_url + team + year
            # get_advanced_player() already returns a DataFrame, so it is
            # collected as-is.
            advanced_year = get_advanced_player(url)
            advanced_frames.append(advanced_year)
            # Pause between requests to avoid hammering the site.
            time.sleep(2)
    except Exception as err:
        # Best effort: a team whose page cannot be scraped is skipped, but
        # the reason is reported instead of being silently discarded.
        # ``Exception`` (not a bare ``except``) lets Ctrl-C stop the run.
        print('Skipping {}: {!r}'.format(team, err))
        continue

if advanced_frames:
    # Concatenate once at the end rather than re-copying the accumulated
    # frame after every team; ``ignore_index`` renumbers rows from 0.
    advanced_year_df = pd.concat(advanced_frames, axis=0, ignore_index=True)

In [149]:
# Write the season's advanced stats to disk. With pandas' default `index=True`
# the row index is written as an unnamed first column.
advanced_year_df.to_csv('2008 advanced stats.csv')