In [1]:
# get upcoming games and then run predictions

In [2]:
# imports
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time
import pandas as pd


In [3]:
# set master url
master_url = "https://www.flashscore.com/nhl/fixtures/"


In [4]:
# event better code to extract the game date plus time as well
driver = webdriver.Chrome()
driver.get(master_url)
time.sleep(5)

# click reject cookies button when you first visit the site
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait for the OneTrust cookie banner to appear
try:
    reject_all_btn = WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.ID, "onetrust-reject-all-handler"))
    )
    driver.execute_script("arguments[0].click();", reject_all_btn)  # avoids iframe overlay issues
    print("✅ Reject All clicked")
except:
    print("⚠️ Reject All button not found or not clickable")

# # new code to click show more matches button
# while True:
#     try:
#         btn = driver.find_element(By.LINK_TEXT, "Show more matches")
#         driver.execute_script("arguments[0].scrollIntoView({block:'center'}); arguments[0].click();", btn)
#         time.sleep(2)
#     except NoSuchElementException:
#         break

# Get all game divs
games = driver.find_elements(By.CSS_SELECTOR, "div.event__match")

# get len of games
len_games = len(games)

game_data = []

# set default zero var for OT periods if no OT
def_zero = 0

for idx, game in enumerate(games):
    if idx % 100 == 0:
        print(f'{idx} of {len_games}')
    try:
        # get game time
        game_time = game.find_element(By.CSS_SELECTOR, "div.event__time").text
        game_link = game.find_element(By.TAG_NAME, "a").get_attribute("href")
    
        # get home and away teams
        home_team = game.find_element(By.CSS_SELECTOR, "div.event__participant.event__participant--home").text
        away_team = game.find_element(By.CSS_SELECTOR, "div.event__participant.event__participant--away").text

    except NoSuchElementException:
        home_team = away_team = game_time = None

    # thanks to the @ home is the bottom team even though the site labeling is off
    # this is somewhat concerning for other leagues need to deeper dive to ensure home vs away is correct
    # or is it good bc i did check NPB (will check again) but @ seems like a lazy fix to accomodate MLB
    game_data.append({
        "Game Link": game_link,
        "Game Date Time": game_time,
        "Home Team": home_team, 
        "Away Team": away_team,
    })

# Convert to DataFrame
df = pd.DataFrame(game_data)

# close webpage
driver.quit()

# inspect df
df.info()
df.head()


✅ Reject All clicked
0 of 114
100 of 114
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Game Link       114 non-null    object
 1   Game Date Time  114 non-null    object
 2   Home Team       114 non-null    object
 3   Away Team       114 non-null    object
dtypes: object(4)
memory usage: 3.7+ KB


Unnamed: 0,Game Link,Game Date Time,Home Team,Away Team
0,https://www.flashscore.com/match/hockey/caroli...,14.11. 19:00,Carolina Hurricanes,Vancouver Canucks
1,https://www.flashscore.com/match/hockey/philad...,14.11. 20:00,St. Louis Blues,Philadelphia Flyers
2,https://www.flashscore.com/match/hockey/new-yo...,14.11. 21:00,Utah Mammoth,New York Islanders
3,https://www.flashscore.com/match/hockey/florid...,15.11. 17:00,Florida Panthers,Tampa Bay Lightning
4,https://www.flashscore.com/match/hockey/anahei...,15.11. 18:00,Minnesota Wild,Anaheim Ducks


In [5]:
# filter out any rows with key missing data in this col subset
required_cols = ["Game Date Time", "Home Team", "Away Team"]

df = df.dropna(subset=required_cols)             # remove None / NaN
df = df[~df[required_cols].isin([""]).any(axis=1)]  # remove empty strings

# inspect
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Game Link       114 non-null    object
 1   Game Date Time  114 non-null    object
 2   Home Team       114 non-null    object
 3   Away Team       114 non-null    object
dtypes: object(4)
memory usage: 3.7+ KB


In [7]:
# set target year
target_year = '2025'

# split date and time
df[['Date', 'Time']] = df['Game Date Time'].str.split(' ', n=1, expand=True)

# change feb 29 to 28; leap year is dumb
df['Date'] = df['Date'].str.replace('29.02.', '28.02.')

# get month of each game
df['Day_int'] = df['Date'].str.split('.').str[0]
df['Month_int'] = df['Date'].str.split('.').str[1]

# add col for year: months 9-12 are target year and all others are target year + 1
df['Year'] = df['Month_int'].apply(lambda x: int(target_year) if int(x) >= 9 else int(target_year) + 1)

# create final date col
df['Clean_date'] = pd.to_datetime(df['Day_int'].astype(str) + '/' + df['Month_int'].astype(str) + '/' + df['Year'].astype(str), dayfirst=True)

# drop Day_int and Year
df = df.drop(['Day_int', 'Year'], axis=1)

# add col for season
df['Season'] = int(target_year)

df.head()

Unnamed: 0,Game Link,Game Date Time,Home Team,Away Team,Date,Time,Month_int,Clean_date,Season
0,https://www.flashscore.com/match/hockey/caroli...,14.11. 19:00,Carolina Hurricanes,Vancouver Canucks,14.11.,19:00,11,2025-11-14,2025
1,https://www.flashscore.com/match/hockey/philad...,14.11. 20:00,St. Louis Blues,Philadelphia Flyers,14.11.,20:00,11,2025-11-14,2025
2,https://www.flashscore.com/match/hockey/new-yo...,14.11. 21:00,Utah Mammoth,New York Islanders,14.11.,21:00,11,2025-11-14,2025
3,https://www.flashscore.com/match/hockey/florid...,15.11. 17:00,Florida Panthers,Tampa Bay Lightning,15.11.,17:00,11,2025-11-15,2025
4,https://www.flashscore.com/match/hockey/anahei...,15.11. 18:00,Minnesota Wild,Anaheim Ducks,15.11.,18:00,11,2025-11-15,2025


In [8]:
# clean Time col
df['Time'] = df['Time'].str.split('\n').str[0]

# drop orig date + time col plus orig date col
df = df.drop(['Game Date Time', 'Date'], axis=1)

# rename Clean date to date
df = df.rename(columns={'Clean_date': 'Date'})

df.head()


Unnamed: 0,Game Link,Home Team,Away Team,Time,Month_int,Date,Season
0,https://www.flashscore.com/match/hockey/caroli...,Carolina Hurricanes,Vancouver Canucks,19:00,11,2025-11-14,2025
1,https://www.flashscore.com/match/hockey/philad...,St. Louis Blues,Philadelphia Flyers,20:00,11,2025-11-14,2025
2,https://www.flashscore.com/match/hockey/new-yo...,Utah Mammoth,New York Islanders,21:00,11,2025-11-14,2025
3,https://www.flashscore.com/match/hockey/florid...,Florida Panthers,Tampa Bay Lightning,17:00,11,2025-11-15,2025
4,https://www.flashscore.com/match/hockey/anahei...,Minnesota Wild,Anaheim Ducks,18:00,11,2025-11-15,2025


In [17]:
# add day of week and month cols
df['Day_of_Week'] = df['Date'].dt.day_name()
df['Month'] = df['Date'].dt.month_name()

# drop month int
df = df.drop('Month_int', axis=1)

df.head()

Unnamed: 0,Game Link,Home Team,Away Team,Time,Date,Season,Day_of_Week,Month
0,https://www.flashscore.com/match/hockey/caroli...,Carolina Hurricanes,Vancouver Canucks,19:00,2025-11-14,2025,Friday,November
1,https://www.flashscore.com/match/hockey/philad...,St. Louis Blues,Philadelphia Flyers,20:00,2025-11-14,2025,Friday,November
2,https://www.flashscore.com/match/hockey/new-yo...,Utah Mammoth,New York Islanders,21:00,2025-11-14,2025,Friday,November
3,https://www.flashscore.com/match/hockey/florid...,Florida Panthers,Tampa Bay Lightning,17:00,2025-11-15,2025,Saturday,November
4,https://www.flashscore.com/match/hockey/anahei...,Minnesota Wild,Anaheim Ducks,18:00,2025-11-15,2025,Saturday,November


In [18]:
# split out hour part of time and get histogram of game start hours
df['Game_Start_Hour'] = df['Time'].str.split(':').str[0].astype(int)
df.head()

Unnamed: 0,Game Link,Home Team,Away Team,Time,Date,Season,Day_of_Week,Month,Game_Start_Hour
0,https://www.flashscore.com/match/hockey/caroli...,Carolina Hurricanes,Vancouver Canucks,19:00,2025-11-14,2025,Friday,November,19
1,https://www.flashscore.com/match/hockey/philad...,St. Louis Blues,Philadelphia Flyers,20:00,2025-11-14,2025,Friday,November,20
2,https://www.flashscore.com/match/hockey/new-yo...,Utah Mammoth,New York Islanders,21:00,2025-11-14,2025,Friday,November,21
3,https://www.flashscore.com/match/hockey/florid...,Florida Panthers,Tampa Bay Lightning,17:00,2025-11-15,2025,Saturday,November,17
4,https://www.flashscore.com/match/hockey/anahei...,Minnesota Wild,Anaheim Ducks,18:00,2025-11-15,2025,Saturday,November,18


In [19]:
# classify game time start hour
def classify_start_hour(hour):
    if 2 <= hour < 15:
        return 'Early'
    elif 15 <= hour < 20:
        return 'Mid'
    elif 20 <= hour < 24 or hour <= 1:
        return 'Late'
    else:
        return 'Unknown'

df['Start_Hour_Group'] = df['Game_Start_Hour'].apply(classify_start_hour)
df.head()

Unnamed: 0,Game Link,Home Team,Away Team,Time,Date,Season,Day_of_Week,Month,Game_Start_Hour,Start_Hour_Group
0,https://www.flashscore.com/match/hockey/caroli...,Carolina Hurricanes,Vancouver Canucks,19:00,2025-11-14,2025,Friday,November,19,Mid
1,https://www.flashscore.com/match/hockey/philad...,St. Louis Blues,Philadelphia Flyers,20:00,2025-11-14,2025,Friday,November,20,Late
2,https://www.flashscore.com/match/hockey/new-yo...,Utah Mammoth,New York Islanders,21:00,2025-11-14,2025,Friday,November,21,Late
3,https://www.flashscore.com/match/hockey/florid...,Florida Panthers,Tampa Bay Lightning,17:00,2025-11-15,2025,Saturday,November,17,Mid
4,https://www.flashscore.com/match/hockey/anahei...,Minnesota Wild,Anaheim Ducks,18:00,2025-11-15,2025,Saturday,November,18,Mid


In [23]:
# rename Home Team and Away Team cols for merging
df = df.rename(columns={'Home Team': 'Home_Team', 'Away Team': 'Away_Team'})
df.head()

Unnamed: 0,Game Link,Home_Team,Away_Team,Time,Date,Season,Day_of_Week,Month,Game_Start_Hour,Start_Hour_Group
0,https://www.flashscore.com/match/hockey/caroli...,Carolina Hurricanes,Vancouver Canucks,19:00,2025-11-14,2025,Friday,November,19,Mid
1,https://www.flashscore.com/match/hockey/philad...,St. Louis Blues,Philadelphia Flyers,20:00,2025-11-14,2025,Friday,November,20,Late
2,https://www.flashscore.com/match/hockey/new-yo...,Utah Mammoth,New York Islanders,21:00,2025-11-14,2025,Friday,November,21,Late
3,https://www.flashscore.com/match/hockey/florid...,Florida Panthers,Tampa Bay Lightning,17:00,2025-11-15,2025,Saturday,November,17,Mid
4,https://www.flashscore.com/match/hockey/anahei...,Minnesota Wild,Anaheim Ducks,18:00,2025-11-15,2025,Saturday,November,18,Mid


In [35]:
# trim to games that are upcoming today only
df_trim = df[df['Date'] == pd.Timestamp.now().normalize()]
df_trim

Unnamed: 0,Game Link,Home_Team,Away_Team,Time,Date,Season,Day_of_Week,Month,Game_Start_Hour,Start_Hour_Group
0,https://www.flashscore.com/match/hockey/caroli...,Carolina Hurricanes,Vancouver Canucks,19:00,2025-11-14,2025,Friday,November,19,Mid
1,https://www.flashscore.com/match/hockey/philad...,St. Louis Blues,Philadelphia Flyers,20:00,2025-11-14,2025,Friday,November,20,Late
2,https://www.flashscore.com/match/hockey/new-yo...,Utah Mammoth,New York Islanders,21:00,2025-11-14,2025,Friday,November,21,Late


In [43]:
# read in most recent records for each team
home_records = pd.read_excel(r'data/home_team_recent_records.xlsx', header=0)
away_records = pd.read_excel(r'data/away_team_recent_records.xlsx', header=0)

# drop these cols from both dfs since they're not needed
drop_cols = ['Season', 'Month', 'Day_of_Week']
home_records = home_records.drop(drop_cols, axis=1)
away_records = away_records.drop(drop_cols, axis=1)

# # add suffixes to cols to differentiate home vs away stats
# home_rename = {col: f'{col}_Home' for col in home_records.columns if col != 'Home_Team'}
# away_rename = {col: f'{col}_Away' for col in away_records.columns if col != 'Away_Team'}
# home_records = home_records.rename(columns=home_rename)
# away_records = away_records.rename(columns=away_rename)

# inspect
home_records.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 15 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Home_Team                         32 non-null     object 
 1   prop_Reg_Tie_Home                 32 non-null     float64
 2   prop_Reg_Tie_Away                 32 non-null     float64
 3   prop_reg_home_goal_diff_Away      32 non-null     float64
 4   prop_Reg_Away_Win_Away            32 non-null     float64
 5   prop_Reg_Away_Win_Home            32 non-null     float64
 6   prop_Reg_Home_Win_Away            32 non-null     float64
 7   prop_reg_away_goal_diff_Away      32 non-null     float64
 8   avg_reg_away_goals_per_game_Away  32 non-null     float64
 9   avg_reg_home_goals_per_game_Away  32 non-null     float64
 10  prop_reg_home_goal_diff_Home      32 non-null     float64
 11  prop_reg_away_goal_diff_Home      32 non-null     float64
 12  prop_Reg_H

In [41]:
# join most recent data to upcoming games df
upcoming_games = df_trim.merge(home_records, on='Home_Team', how='left')
upcoming_games = upcoming_games.merge(away_records, on='Away_Team', how='left')

# drop Game_Link, Time, Date, 
upcoming_games = upcoming_games.drop(['Game Link', 'Game_Start_Hour', 'Start_Hour_Group'], axis=1)

upcoming_games.info()
upcoming_games.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 35 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   Home_Team                              3 non-null      object        
 1   Away_Team                              3 non-null      object        
 2   Time                                   3 non-null      object        
 3   Date                                   3 non-null      datetime64[ns]
 4   Season                                 3 non-null      int64         
 5   Day_of_Week                            3 non-null      object        
 6   Month                                  3 non-null      object        
 7   prop_Reg_Tie_Home_Home                 3 non-null      float64       
 8   prop_Reg_Tie_Away_Home                 3 non-null      float64       
 9   prop_reg_home_goal_diff_Away_Home      3 non-null      float64       

Unnamed: 0,Home_Team,Away_Team,Time,Date,Season,Day_of_Week,Month,prop_Reg_Tie_Home_Home,prop_Reg_Tie_Away_Home,prop_reg_home_goal_diff_Away_Home,...,prop_Reg_Away_Win_Home_Away,prop_Reg_Home_Win_Away_Away,prop_reg_away_goal_diff_Away_Away,avg_reg_away_goals_per_game_Away_Away,avg_reg_home_goals_per_game_Away_Away,prop_reg_home_goal_diff_Home_Away,prop_reg_away_goal_diff_Home_Away,prop_Reg_Home_Win_Home_Away,avg_reg_away_goals_per_game_Home_Away,avg_reg_home_goals_per_game_Home_Away
0,Carolina Hurricanes,Vancouver Canucks,19:00,2025-11-14,2025,Friday,November,0.1,0.181818,0.410714,...,0.230769,0.5,0.480519,3.083333,3.333333,0.527778,0.472222,0.384615,2.615385,2.923077
1,St. Louis Blues,Philadelphia Flyers,20:00,2025-11-14,2025,Friday,November,0.230769,0.266667,0.644737,...,0.285714,0.444444,0.431818,2.111111,2.777778,0.513158,0.486842,0.357143,2.642857,2.785714
2,Utah Mammoth,New York Islanders,21:00,2025-11-14,2025,Friday,November,0.111111,0.333333,0.511111,...,0.090909,0.363636,0.486486,3.272727,3.454545,0.59322,0.40678,0.454545,2.181818,3.181818


In [30]:
# read in trained model
import joblib

basic_model = joblib.load('model/catboost_model_reg_tie.pkl')
basic_model

<catboost.core.CatBoostClassifier at 0x12b3f2775c0>

In [42]:
# isolate features model needs for upcoming games
feature_cols = basic_model.feature_names_

# trim cols for data to predict
predict_df = upcoming_games[feature_cols]
predict_df.info()
predict_df.head()

KeyError: "['prop_Reg_Home_Win_Home', 'prop_Reg_Away_Win_Home', 'prop_Reg_Tie_Home', 'prop_reg_home_goal_diff_Home', 'prop_reg_away_goal_diff_Home', 'avg_reg_home_goals_per_game_Home', 'avg_reg_away_goals_per_game_Home', 'prop_Reg_Home_Win_Away', 'prop_Reg_Away_Win_Away', 'prop_Reg_Tie_Away', 'prop_reg_home_goal_diff_Away', 'prop_reg_away_goal_diff_Away', 'avg_reg_home_goals_per_game_Away', 'avg_reg_away_goals_per_game_Away'] not in index"

In [None]:
from catboost import CatBoostClassifier

# make predictions
# 5. Predict and evaluate proba
y_pred_proba = basic_model.predict_proba(test_pool)[:, 1]
