In [1]:
# import necessary libraries

import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

## Testing on one Roster Report

In [2]:
# url of the one roster report we will try
url = 'https://www.nhl.com/scores/htmlreports/20132014/RO020101.HTM'

response = requests.get(url)

# check for a successful response
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

    # get the date of the game
    months = [
        "January","February","March","April","May","June",
        "July","August","September","October","November","December"
    ]
    game_date = next(
        (text for text in soup.stripped_strings
        if any(month in text for month in months) and "," in text),
    None
    )

    # get the names of the 2 teams
    team_headings = soup.find_all("td", class_=lambda x: x and "teamHeading" in x)
    team_names = [t.get_text(strip=True) for t in team_headings[:2]]
    team_tables = soup.find_all("td", valign="top", class_="border")[:2]
    
    # stores the team paired with its captains and alternates
    results = {}
    
    for i, (team_name, team) in enumerate(zip(team_names, team_tables)):
        # store which team is home or away
        is_home = True if i == 1 else False

        captains = []
        alternates = []
    
        rows = team.find_all("tr")
    
        for row in rows:
            cols = row.find_all("td")
            if len(cols) == 3:
                name = cols[2].get_text(strip=True)

                if "(C)" in name:
                    clean_name = name.replace("(C)", "").strip()
                    captains.append(clean_name)
            
                if "(A)" in name:
                    clean_name = name.replace("(A)", "").strip()
                    alternates.append(clean_name)
    
        results[team_name] = {
            "Date": game_date,
            "IsHome": is_home,
            "Captain": captains,
            "Alternates": alternates
    }

    print(results)
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


{'NEW JERSEY DEVILS': {'Date': 'Thursday, October 17, 2013', 'IsHome': False, 'Captain': ['BRYCE SALVADOR'], 'Alternates': ['TRAVIS ZAJAC', 'PATRIK ELIAS']}, 'OTTAWA SENATORS': {'Date': 'Thursday, October 17, 2013', 'IsHome': True, 'Captain': ['JASON SPEZZA'], 'Alternates': ['CHRIS PHILLIPS', 'CHRIS NEIL']}}


It worked, now we can do the same for every other report!

In [3]:
# define a function from the inside of the above if statement
# takes a response and returns the data we want
def get_results(response):
    soup = BeautifulSoup(response.text, 'html.parser')

    # get the date of the game
    months = [
        "January","February","March","April","May","June",
        "July","August","September","October","November","December"
    ]
    game_date = next(
        (text for text in soup.stripped_strings
        if any(month in text for month in months) and "," in text),
    None
    )

    # get the names of the 2 teams
    team_headings = soup.find_all("td", class_=lambda x: x and "teamHeading" in x)
    team_names = [t.get_text(strip=True) for t in team_headings[:2]]
    team_tables = soup.find_all("td", valign="top", class_="border")[:2]
    
    # stores the team paired with its captains and alternates
    results = {}
    
    for i, (team_name, team) in enumerate(zip(team_names, team_tables)):
        # store which team is home or away
        is_home = True if i == 1 else False

        captains = []
        alternates = []
    
        rows = team.find_all("tr")
    
        for row in rows:
            cols = row.find_all("td")
            if len(cols) == 3:
                name = cols[2].get_text(strip=True)

                if "(C)" in name:
                    clean_name = name.replace("(C)", "").strip()
                    captains.append(clean_name)
            
                if "(A)" in name:
                    clean_name = name.replace("(A)", "").strip()
                    alternates.append(clean_name)
    
        results[team_name] = {
            "Date": game_date,
            "IsHome": is_home,
            "Captain": captains,
            "Alternates": alternates
    }

    return results

## Putting results into a DataFrame

In [4]:
# initialize our dataframe that will store all the results from every game
master_df = pd.DataFrame(columns=[
    "Date",
    "Season",
    "GameType",
    "GameNumber",
    "Team",
    "IsHome",
    "Captain",
    "Alternates"
])

In [5]:
# function that takes results and metadata from a game and puts it into our master dataframe
def add_data(master_df, results, gameNum, gameType, season):
    game_df = (
        pd.DataFrame(results)
        .T
        .reset_index()
        .rename(columns={"index": "Team"})
    )

    # convert date to datetime
    game_df["Date"] = pd.to_datetime(game_df["Date"])

    # add game metadata
    game_df["GameNumber"] = gameNum
    game_df["GameType"] = gameType
    game_df["Season"] = season

    master_df = pd.concat([master_df, game_df], ignore_index=True)
    
    return master_df

## Preseason (game type 1)

What I found by attemping to go through a few preseasons is that the NHL is missing reports for a significant # of games.

As a result, I cannot iterate until I get a 404 error because those are randomly spread throughout.

It's not ideal but for preseason games I will manually get the final game # from every year and iterate up to that, skipping 404s.

In [6]:
gameType = 1
gameNum0708 = 105
gameNum0809 = 105
gameNum0910 = 103
gameNum1011 = 100
gameNum1112 = 101
gameNum1213 = 0 # lockout shortened, no preseason
gameNum1314 = 104
gameNum1415 = 105
gameNum1516 = 104
gameNum1617 = 106
gameNum1718 = 109 
gameNum1819 = 108
gameNum1920 = 105
gameNum2021 = 0 # covid
gameNum2122 = 103
gameNum2223 = 106
gameNum2324 = 111
gameNum2425 = 101
gameNum2526 = 104

In [7]:
season = 20242025
gameType = 2
gameNum = 1


for gameNum in range(1,2):
    url = f'https://www.nhl.com/scores/htmlreports/{season}/RO{gameType:02d}{gameNum:04d}.HTM'
    response = requests.get(url)
    results = get_results(response)
    master_df = add_data(master_df, results, gameNum, gameType, season)
    time.sleep(0.1)
    
# 20242025 regular season took 14 minutes to run and gave 1312 games (the proper amount)

  master_df = pd.concat([master_df, game_df], ignore_index=True)


In [None]:
#master_df.set_index('Date', inplace=True)
master_df.head(10)

Unnamed: 0_level_0,Season,GameType,GameNumber,Team,IsHome,Captain,Alternates
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-10-04,20242025,2,1,NEW JERSEY DEVILS,False,[NICO HISCHIER],"[ONDREJ PALAT, JACK HUGHES]"
2024-10-04,20242025,2,1,BUFFALO SABRES,True,[RASMUS DAHLIN],"[DYLAN COZENS, ALEX TUCH]"


## Regular Season (game type 2)

The most simple of the 3 types, can just iterate through each season until we reach a 404.

## Playoffs (game type 3)

Can't increment game # here because it uses a unique format
4 digit game #: 
- 1st 2 digits: round #, 01 for 1st round, 02 for 2nd round, 03 for conf finals, 04 for cup finals
- 3rd digit: series #, 1-8 for round 1, 1-4 for round 2, 1-2 for conf finals, onlt 1 for cup finals.
- 4th digit: game # of the series (4-7)

Ex: game 7 of the second conference final series would be 0327
