In [1]:
# get upcoming games and then run predictions

In [2]:
# imports
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


In [3]:
# set master url: the Flashscore NHL fixtures page that the scraping cell opens with driver.get()
master_url = "https://www.flashscore.com/nhl/fixtures/"


In [4]:
# Even better code: scrape each upcoming game's link, date + time and the
# home / away team names from the page at master_url into the DataFrame df.
driver = webdriver.Chrome()
game_data = []

# default zero var for OT periods if no OT; not used anywhere in this cell,
# kept only so that code relying on the name still finds it
def_zero = 0

try:
    driver.get(master_url)
    time.sleep(5)  # fixed pause before looking for elements on the page

    # Click the OneTrust "Reject All" cookie button shown on a first visit.
    # This is best effort: any Selenium failure (the wait timing out raises
    # TimeoutException, a WebDriverException subclass) is reported and the
    # scrape continues. Non-Selenium errors such as KeyboardInterrupt are no
    # longer swallowed.
    try:
        reject_all_btn = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.ID, "onetrust-reject-all-handler"))
        )
        driver.execute_script("arguments[0].click();", reject_all_btn)  # avoids iframe overlay issues
        print("✅ Reject All clicked")
    except WebDriverException:
        print("⚠️ Reject All button not found or not clickable")

    # # new code to click show more matches button
    # while True:
    #     try:
    #         btn = driver.find_element(By.LINK_TEXT, "Show more matches")
    #         driver.execute_script("arguments[0].scrollIntoView({block:'center'}); arguments[0].click();", btn)
    #         time.sleep(2)
    #     except NoSuchElementException:
    #         break

    # Get all game divs
    games = driver.find_elements(By.CSS_SELECTOR, "div.event__match")

    # get len of games
    len_games = len(games)

    for idx, game in enumerate(games):
        if idx % 100 == 0:
            print(f'{idx} of {len_games}')
        try:
            # get game time and link
            game_time = game.find_element(By.CSS_SELECTOR, "div.event__time").text
            game_link = game.find_element(By.TAG_NAME, "a").get_attribute("href")

            # get home and away teams
            home_team = game.find_element(By.CSS_SELECTOR, "div.event__participant.event__participant--home").text
            away_team = game.find_element(By.CSS_SELECTOR, "div.event__participant.event__participant--away").text

        except NoSuchElementException:
            # Blank out every field, including the link, so an incomplete row
            # can never carry over the previous game's link (or hit an
            # undefined name on the very first row).
            game_link = game_time = home_team = away_team = None

        # NOTE(review): home / away are taken as-is from the site's
        # event__participant--home / --away classes. Whether that labelling is
        # correct for leagues listed as "away @ home" (e.g. MLB, NPB) still
        # needs to be verified before reusing this for other leagues.
        game_data.append({
            "Game Link": game_link,
            "Game Date Time": game_time,
            "Home Team": home_team,
            "Away Team": away_team,
        })
finally:
    # close webpage, even when the scrape fails part-way through
    driver.quit()

# Convert to DataFrame; naming the columns keeps them present even when no
# games were found, so later column lookups do not fail on an empty frame
df = pd.DataFrame(game_data, columns=["Game Link", "Game Date Time", "Home Team", "Away Team"])

# inspect df
df.info()
df.head()


✅ Reject All clicked
0 of 114
100 of 114
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Game Link       114 non-null    object
 1   Game Date Time  114 non-null    object
 2   Home Team       114 non-null    object
 3   Away Team       114 non-null    object
dtypes: object(4)
memory usage: 3.7+ KB


Unnamed: 0,Game Link,Game Date Time,Home Team,Away Team
0,https://www.flashscore.com/match/hockey/caroli...,14.11. 19:00,Carolina Hurricanes,Vancouver Canucks
1,https://www.flashscore.com/match/hockey/philad...,14.11. 20:00,St. Louis Blues,Philadelphia Flyers
2,https://www.flashscore.com/match/hockey/new-yo...,14.11. 21:00,Utah Mammoth,New York Islanders
3,https://www.flashscore.com/match/hockey/florid...,15.11. 17:00,Florida Panthers,Tampa Bay Lightning
4,https://www.flashscore.com/match/hockey/anahei...,15.11. 18:00,Minnesota Wild,Anaheim Ducks


In [5]:
# keep only the rows where every key column actually holds a value
required_cols = ["Game Date Time", "Home Team", "Away Team"]

df = (
    df.dropna(subset=required_cols)  # discard None / NaN in the key columns
    .loc[lambda frame: ~frame[required_cols].isin([""]).any(axis=1)]  # discard empty strings
)

# inspect
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Game Link       114 non-null    object
 1   Game Date Time  114 non-null    object
 2   Home Team       114 non-null    object
 3   Away Team       114 non-null    object
dtypes: object(4)
memory usage: 3.7+ KB


In [7]:
# set target year: the calendar year in which the season starts
target_year = '2025'
season_start_year = int(target_year)

# first calendar month counted as part of target_year; games in earlier
# months are assigned to target_year + 1
season_start_month = 9

# Work on an explicit copy: a boolean-mask filter in an earlier cell can leave
# df flagged as a slice, which makes the column assignments below emit
# SettingWithCopyWarning on pandas versions without copy-on-write.
df = df.copy()

# split date and time (only on the first space)
df[['Date', 'Time']] = df['Game Date Time'].str.split(' ', n=1, expand=True)

# change feb 29 to 28. regex=False forces a literal match; the default for
# `regex` differs between pandas versions, and as a regex '.' is a wildcard.
# NOTE(review): when the assigned year is a leap year this moves a real
# 29 Feb game onto 28 Feb - confirm downstream data uses the same convention.
df['Date'] = df['Date'].str.replace('29.02.', '28.02.', regex=False)

# get day and month of each game from the "DD.MM." string
date_parts = df['Date'].str.split('.')
df['Day_int'] = date_parts.str[0]
df['Month_int'] = date_parts.str[1]

# add col for year: months >= season_start_month are target year and all
# others are target year + 1
month_number = df['Month_int'].astype('int64')
df['Year'] = season_start_year + (month_number < season_start_month).astype('int64')

# create final date col; an explicit format is strict, whereas dayfirst=True
# is only a parsing preference
df['Clean_date'] = pd.to_datetime(
    df['Day_int'].astype(str) + '/' + df['Month_int'].astype(str) + '/' + df['Year'].astype(str),
    format='%d/%m/%Y',
)

# drop Day_int and Year
df = df.drop(['Day_int', 'Year'], axis=1)

# add col for season
df['Season'] = season_start_year

df.head()

Unnamed: 0,Game Link,Game Date Time,Home Team,Away Team,Date,Time,Month_int,Clean_date,Season
0,https://www.flashscore.com/match/hockey/caroli...,14.11. 19:00,Carolina Hurricanes,Vancouver Canucks,14.11.,19:00,11,2025-11-14,2025
1,https://www.flashscore.com/match/hockey/philad...,14.11. 20:00,St. Louis Blues,Philadelphia Flyers,14.11.,20:00,11,2025-11-14,2025
2,https://www.flashscore.com/match/hockey/new-yo...,14.11. 21:00,Utah Mammoth,New York Islanders,14.11.,21:00,11,2025-11-14,2025
3,https://www.flashscore.com/match/hockey/florid...,15.11. 17:00,Florida Panthers,Tampa Bay Lightning,15.11.,17:00,11,2025-11-15,2025
4,https://www.flashscore.com/match/hockey/anahei...,15.11. 18:00,Minnesota Wild,Anaheim Ducks,15.11.,18:00,11,2025-11-15,2025


In [8]:
# Tidy the frame in one pass:
#   - Time: keep only the text before the first newline
#   - drop the raw 'Game Date Time' and string 'Date' columns
#   - expose the parsed 'Clean_date' column under the name 'Date'
df = (
    df.assign(Time=df['Time'].str.split('\n').str.get(0))
    .drop(columns=['Game Date Time', 'Date'])
    .rename(columns={'Clean_date': 'Date'})
)

df.head()


Unnamed: 0,Game Link,Home Team,Away Team,Time,Month_int,Date,Season
0,https://www.flashscore.com/match/hockey/caroli...,Carolina Hurricanes,Vancouver Canucks,19:00,11,2025-11-14,2025
1,https://www.flashscore.com/match/hockey/philad...,St. Louis Blues,Philadelphia Flyers,20:00,11,2025-11-14,2025
2,https://www.flashscore.com/match/hockey/new-yo...,Utah Mammoth,New York Islanders,21:00,11,2025-11-14,2025
3,https://www.flashscore.com/match/hockey/florid...,Florida Panthers,Tampa Bay Lightning,17:00,11,2025-11-15,2025
4,https://www.flashscore.com/match/hockey/anahei...,Minnesota Wild,Anaheim Ducks,18:00,11,2025-11-15,2025


In [9]:
# read in modeling data

In [10]:
# get most recent records for each team


In [11]:
# join to the most recent records

In [12]:
# read in trained model

In [13]:
# make predictions for reg tie