In [4]:
# import necessary libraries

import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

## Testing on one Roster Report

In [5]:
# Prototype: fetch one roster report and pull out the game date plus each
# team's captain and alternates before generalising to every game.
url = 'https://www.nhl.com/scores/htmlreports/20132014/RO020101.HTM'

response = requests.get(url)

if response.status_code != 200:
    # anything other than 200 means the report could not be retrieved
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
else:
    soup = BeautifulSoup(response.text, 'html.parser')

    # the game date is taken to be the first text fragment that contains
    # both a month name and a comma
    months = [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December"
    ]
    game_date = None
    for text in soup.stripped_strings:
        if "," in text and any(month in text for month in months):
            game_date = text
            break

    # the first two team headings and the first two roster cells are paired up
    team_headings = soup.find_all("td", class_=lambda x: x and "teamHeading" in x)
    team_names = [heading.get_text(strip=True) for heading in team_headings[:2]]
    team_tables = soup.find_all("td", valign="top", class_="border")[:2]

    # maps each team name to its date, home flag, captains and alternates
    results = {}

    for i, (team_name, team) in enumerate(zip(team_names, team_tables)):
        # player names live in the third cell of every three-cell row
        names = [
            cols[2].get_text(strip=True)
            for cols in (row.find_all("td") for row in team.find_all("tr"))
            if len(cols) == 3
        ]

        results[team_name] = {
            "Date": game_date,
            # the second team found is flagged as the home team
            "IsHome": i == 1,
            "Captain": [n.replace("(C)", "").strip() for n in names if "(C)" in n],
            "Alternates": [n.replace("(A)", "").strip() for n in names if "(A)" in n],
        }

    print(results)


{'NEW JERSEY DEVILS': {'Date': 'Thursday, October 17, 2013', 'IsHome': False, 'Captain': ['BRYCE SALVADOR'], 'Alternates': ['TRAVIS ZAJAC', 'PATRIK ELIAS']}, 'OTTAWA SENATORS': {'Date': 'Thursday, October 17, 2013', 'IsHome': True, 'Captain': ['JASON SPEZZA'], 'Alternates': ['CHRIS PHILLIPS', 'CHRIS NEIL']}}


It worked, now we can do the same for every other report!

In [6]:
# define a function from the inside of the above if statement
# takes a response and returns the data we want
def get_results(response):
    """Parse one roster report into a per-team dictionary.

    Parameters
    ----------
    response : object with a ``text`` attribute holding the report's HTML
        (in this notebook, the value returned by ``requests.get``).

    Returns
    -------
    dict
        Maps each team name to a dict with the keys ``"Date"`` (the first text
        fragment containing a month name and a comma, or ``None``),
        ``"IsHome"`` (``True`` for the second team found), ``"Captain"`` and
        ``"Alternates"`` (lists of names with the ``(C)`` / ``(A)`` marker
        stripped).
    """
    page = BeautifulSoup(response.text, 'html.parser')

    # the game date: first text fragment with both a month name and a comma
    month_names = (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    )
    game_date = None
    for fragment in page.stripped_strings:
        if "," in fragment and any(month in fragment for month in month_names):
            game_date = fragment
            break

    # pair the first two team headings with the first two roster cells
    heading_cells = page.find_all("td", class_=lambda css: css and "teamHeading" in css)
    team_names = [cell.get_text(strip=True) for cell in heading_cells[:2]]
    roster_cells = page.find_all("td", valign="top", class_="border")[:2]

    results = {}
    for position, (team_name, roster) in enumerate(zip(team_names, roster_cells)):
        # player names are the third cell of every three-cell row
        players = [
            cells[2].get_text(strip=True)
            for cells in (tr.find_all("td") for tr in roster.find_all("tr"))
            if len(cells) == 3
        ]

        results[team_name] = {
            "Date": game_date,
            # the second team found is flagged as the home team
            "IsHome": position == 1,
            "Captain": [p.replace("(C)", "").strip() for p in players if "(C)" in p],
            "Alternates": [p.replace("(A)", "").strip() for p in players if "(A)" in p],
        }

    return results

## Putting results into a DataFrame

In [7]:
# Empty frame that accumulates one row per team per game; the column order
# here is the column order of the final output.
master_df = pd.DataFrame(
    columns="Date Season GameType GameNumber Team IsHome Captain Alternates".split()
)

In [8]:
# function that takes results and metadata from a game and puts it into our master dataframe
def add_data(master_df, results, gameNum, gameType, season):
    """Append one game's parsed results to the master DataFrame.

    Parameters
    ----------
    master_df : pandas.DataFrame
        Accumulated rows from the games processed so far.
    results : dict
        Maps team name -> dict with the keys "Date", "IsHome", "Captain" and
        "Alternates" (the structure built by ``get_results``).
    gameNum, gameType, season
        Game metadata stored on every row added for this game.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``master_df`` followed by one row per team in
        ``results``. When ``results`` is empty, ``master_df`` itself is
        returned unchanged.
    """
    # Nothing was parsed (e.g. a page without team tables). The frame built
    # from an empty dict has no "Date" column, so converting it would raise
    # KeyError; skip the game instead.
    if not results:
        return master_df

    # teams arrive as the outer keys, so transpose to get one row per team
    game_df = (
        pd.DataFrame(results)
        .T
        .reset_index()
        .rename(columns={"index": "Team"})
    )

    # convert date to datetime (a missing date becomes NaT)
    game_df["Date"] = pd.to_datetime(game_df["Date"])

    # add game metadata
    game_df["GameNumber"] = gameNum
    game_df["GameType"] = gameType
    game_df["Season"] = season

    return pd.concat([master_df, game_df], ignore_index=True)

## Preseason (game type 1)

What I found by attempting to go through a few preseasons is that the NHL is missing reports for a significant number of games.

As a result, I cannot iterate until I get a 404 error because those are randomly spread throughout.

It's not ideal but for preseason games I will manually get the final game # from every year and iterate up to that, skipping 404s.

In [9]:
# game type code for the preseason
gameType = 1

# Final preseason game number for each season, looked up by hand.
# 0 means there was no preseason to scrape that year.
gameNum0708 = 105  # 2007-08
gameNum0809 = 105  # 2008-09
gameNum0910 = 103  # 2009-10
gameNum1011 = 100  # 2010-11
gameNum1112 = 101  # 2011-12
gameNum1213 = 0    # 2012-13: lockout shortened, no preseason
gameNum1314 = 104  # 2013-14
gameNum1415 = 105  # 2014-15
gameNum1516 = 104  # 2015-16
gameNum1617 = 106  # 2016-17
gameNum1718 = 109  # 2017-18
gameNum1819 = 108  # 2018-19
gameNum1920 = 105  # 2019-20
gameNum2021 = 0    # 2020-21: covid
gameNum2122 = 103  # 2021-22
gameNum2223 = 106  # 2022-23
gameNum2324 = 111  # 2023-24
gameNum2425 = 101  # 2024-25
gameNum2526 = 104  # 2025-26

In [None]:
# define a function to iterate through all preseason games per season skipping over missing ones
def preseason_iterate(season, gameNum, master_df):
    """Scrape preseason roster reports 1..gameNum for one season.

    Every game number in the range is requested. A 200 response is parsed and
    appended to ``master_df``; any other status is reported with a printed
    message and skipped. Returns the updated DataFrame.
    """
    for game in range(1, gameNum + 1):
        # game type is always 01 in preseason URLs
        response = requests.get(
            f'https://www.nhl.com/scores/htmlreports/{season}/RO01{game:04d}.HTM'
        )

        if response.status_code != 200:
            # missing report: say so and move on to the next game number
            print(f'No roster report for game {game} of the {str(season)[0:4]}-{str(season)[4:9]} preseason')
        else:
            master_df = add_data(master_df, get_results(response), game, 1, season)

        # brief pause between requests to go easy on the NHL servers
        time.sleep(0.1)

    return master_df
        

In [46]:
# scrape preseason reports 1..gameNum0809 for season 20082009 into master_df
master_df = preseason_iterate(20082009, gameNum0809, master_df)

No roster report for game 4 of the 2008-2009 preseason
No roster report for game 5 of the 2008-2009 preseason
No roster report for game 11 of the 2008-2009 preseason
No roster report for game 17 of the 2008-2009 preseason
No roster report for game 23 of the 2008-2009 preseason
No roster report for game 31 of the 2008-2009 preseason
No roster report for game 35 of the 2008-2009 preseason
No roster report for game 45 of the 2008-2009 preseason
No roster report for game 46 of the 2008-2009 preseason
No roster report for game 54 of the 2008-2009 preseason
No roster report for game 103 of the 2008-2009 preseason


## Regular Season (game type 2)

The simplest of the 3 types: we can just iterate through each season until we reach a 404.

In [None]:
def regular_iterate(season, master_df):
    """Scrape every regular-season roster report for one season.

    Game numbers are requested in order starting at 1. The first response
    that is not a 200 is treated as the end of the season: the number of the
    last successful game is printed for validation and the loop stops.

    Parameters
    ----------
    season : int
        Season identifier as used in the report URL, e.g. 20082009.
    master_df : pandas.DataFrame
        Accumulated results so far.

    Returns
    -------
    pandas.DataFrame
        ``master_df`` with the rows of every game that was fetched appended.
    """
    gameNum = 1

    # each game is requested exactly once
    while True:
        url = f'https://www.nhl.com/scores/htmlreports/{season}/RO02{gameNum:04d}.HTM' #gameType always 02 here
        response = requests.get(url)

        if response.status_code != 200:
            # when there are no more roster reports print what the final game was to validate
            print(f'Final game of {str(season)[0:4]}-{str(season)[4:9]}: {gameNum-1}')
            break

        results = get_results(response)
        master_df = add_data(master_df, results, gameNum, 2, season)
        gameNum += 1

        # take a short nap so the nhl is less likely to get mad at us
        time.sleep(0.1)

    return master_df

In [45]:
# scrape the regular-season reports for season 20082009 into master_df
master_df = regular_iterate(20082009, master_df)

Final name of 20082009: 1230


## Playoffs (game type 3)

We can't simply increment the game # here because playoff reports use a different numbering format.

4-digit game #:
- 1st 2 digits: round #, 01 for 1st round, 02 for 2nd round, 03 for conf finals, 04 for cup finals*
- 3rd digit: series #, 1-8 for round 1, 1-4 for round 2, 1-2 for conf finals, only 1 for cup finals.
- 4th digit: game # of the series (1 up to 4 through 7)

\* 00 for 2020 stanley cup qualifiers

Ex: game 7 of the second conference final series would be 0327



In [None]:
# define a function to iterate every game of every series in every playoff round
def _series_iterate(season, roundNum, seriesNum, master_df):
    """Scrape the games of one playoff series until a report is missing."""
    # the game-within-series is a single digit in the URL, so never go past 9
    for gameNum in range(1, 10):
        url = f'https://www.nhl.com/scores/htmlreports/{season}/RO03{roundNum:02d}{seriesNum}{gameNum}.HTM' # game type always 03
        response = requests.get(url)

        # take a short nap so the nhl is less likely to get mad at us
        time.sleep(0.1)

        # every game, including game 1, is only parsed when the request
        # succeeded; the first failure ends the series
        if response.status_code != 200:
            break

        results = get_results(response)
        # NOTE(review): GameNumber holds only the game within the series, so
        # the round and series are not recorded - confirm whether the full
        # 4-digit code should be stored instead.
        master_df = add_data(master_df, results, gameNum, 3, season)

    return master_df


def postseason_iterate(season, master_df):
    """Scrape every playoff roster report for one season.

    Iterates rounds 1-4 with 8, 4, 2 and 1 series respectively, and within
    each series requests games in order until a report is missing. For season
    20192020 the series numbered 0-9 of round 00 are scraped afterwards.

    Parameters
    ----------
    season : int
        Season identifier as used in the report URL, e.g. 20082009.
    master_df : pandas.DataFrame
        Accumulated results so far.

    Returns
    -------
    pandas.DataFrame
        ``master_df`` with the rows of every playoff game found appended.
    """
    # the number of series halves every round: 8, 4, 2, 1
    for roundNum in range(1, 5):
        for seriesNum in range(1, 8 // 2 ** (roundNum - 1) + 1):
            master_df = _series_iterate(season, roundNum, seriesNum, master_df)

    # 2020 play in round exception (round 00)
    if season == 20192020:
        for seriesNum in range(0, 10):
            master_df = _series_iterate(season, 0, seriesNum, master_df)

    return master_df

In [47]:
# scrape the playoff reports for season 20082009 into master_df
master_df = postseason_iterate(20082009, master_df)

In [48]:
# move Date from a column to the index; this mutates master_df in place, so
# re-running the cell fails because Date is no longer a column
master_df.set_index('Date', inplace=True)

In [49]:
# preview the first 10 rows
master_df.head(10)

Unnamed: 0_level_0,Season,GameType,GameNumber,Team,IsHome,Captain,Alternates
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
NaT,20192020,3,1,CAROLINA HURRICANES,True,[JORDAN STAAL],"[JORDAN MARTINOOK, JACCOB SLAVIN]"
NaT,20192020,3,1,PITTSBURGH PENGUINS,True,[SIDNEY CROSBY],"[KRIS LETANG, EVGENI MALKIN]"
NaT,20192020,3,1,MONTREAL CANADIENS,False,[SHEA WEBER],"[BRENDAN GALLAGHER, PAUL BYRON]"
NaT,20192020,3,1,CALGARY FLAMES,True,[MARK GIORDANO],"[MIKAEL BACKLUND, SEAN MONAHAN]"
NaT,20192020,3,1,WINNIPEG JETS,False,[BLAKE WHEELER],"[JOSH MORRISSEY, MARK SCHEIFELE]"
NaT,20192020,3,1,FLORIDA PANTHERS,False,[ALEKSANDER BARKOV],"[KEITH YANDLE, JONATHAN HUBERDEAU]"
NaT,20192020,3,1,NEW YORK ISLANDERS,True,[ANDERS LEE],"[JOSH BAILEY, CAL CLUTTERBUCK]"
NaT,20192020,3,1,NEW YORK RANGERS,False,[],"[JESPER FAST, CHRIS KREIDER, MIKA ZIBANEJAD]"
NaT,20192020,3,1,CHICAGO BLACKHAWKS,False,[JONATHAN TOEWS],"[DUNCAN KEITH, PATRICK KANE]"
NaT,20192020,3,1,EDMONTON OILERS,True,[CONNOR MCDAVID],"[ADAM LARSSON, RYAN NUGENT-HOPKINS]"


In [50]:
# sort rows by Date in place; after the earlier set_index call 'Date' is the
# index name rather than a column
master_df.sort_values('Date', inplace=True)

In [51]:
# write the collected data to output.csv in the working directory
master_df.to_csv('output.csv')