In [1]:
# make predictions on upcoming games


In [2]:
import pandas as pd

In [3]:
# read in upcoming games data
# write upcoming games to excel
upcoming_games = pd.read_excel(f'upcoming_nhl_games_2025.xlsx', header=0)
upcoming_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Game Link        114 non-null    object        
 1   Home_Team        114 non-null    object        
 2   Away_Team        114 non-null    object        
 3   Time             114 non-null    object        
 4   Date             114 non-null    datetime64[ns]
 5   Season           114 non-null    int64         
 6   Day_of_Week      114 non-null    object        
 7   Month            114 non-null    object        
 8   Game_Start_Hour  114 non-null    int64         
dtypes: datetime64[ns](1), int64(2), object(6)
memory usage: 8.1+ KB


In [4]:
# trim to games that are upcoming today only
upcoming_games_trim = upcoming_games[upcoming_games['Date'] == pd.Timestamp.now().normalize()]

# # trim to games that are upcoming tomorrow only
# tomorrow = pd.Timestamp.now().normalize() + pd.Timedelta(days=1)
# upcoming_games_trim = upcoming_games[upcoming_games['Date'] == tomorrow]

upcoming_games_trim

Unnamed: 0,Game Link,Home_Team,Away_Team,Time,Date,Season,Day_of_Week,Month,Game_Start_Hour
0,https://www.flashscore.com/match/hockey/caroli...,Carolina Hurricanes,Vancouver Canucks,19:00,2025-11-14,2025,Friday,November,19
1,https://www.flashscore.com/match/hockey/philad...,St. Louis Blues,Philadelphia Flyers,20:00,2025-11-14,2025,Friday,November,20
2,https://www.flashscore.com/match/hockey/new-yo...,Utah Mammoth,New York Islanders,21:00,2025-11-14,2025,Friday,November,21


In [12]:
# read in modeling data
modeling_data = pd.read_excel(r'data/modeling_data.xlsx', header=0)

# filter to target season
target_season = upcoming_games_trim['Season'].iloc[0]
modeling_data = modeling_data[modeling_data['Season'] == target_season]

# drop cols that contain P1-P3 in the col name
modeling_data = modeling_data[modeling_data.columns.drop(list(modeling_data.filter(regex='P[1-3]')))]

# drop other cols
# list of drop cols that won't be used in modeling
drop_cols = ['Game_ID', 'Reg_Home_Win', 'Reg_Away_Win',
         'Season', 'Month', 'Day_of_Week', 'Reg_Tie', 'Odds_1', 'Odds_X', 'Odds_2'
    ]

modeling_data = modeling_data.drop(columns=drop_cols)

# inspect
modeling_data.info()
modeling_data.head()


<class 'pandas.core.frame.DataFrame'>
Index: 382 entries, 89 to 3428
Data columns (total 29 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   Date                              382 non-null    datetime64[ns]
 1   Home_Team                         382 non-null    object        
 2   Away_Team                         382 non-null    object        
 3   Conf_Matchup                      382 non-null    bool          
 4   Div_Matchup                       382 non-null    bool          
 5   Conf_Pair                         382 non-null    object        
 6   Div_Pair                          382 non-null    object        
 7   Team_Pair                         382 non-null    object        
 8   prop_Reg_Home_Win_Home            343 non-null    float64       
 9   prop_Reg_Away_Win_Home            343 non-null    float64       
 10  prop_Reg_Tie_Home                 343 non-null    flo

Unnamed: 0,Date,Home_Team,Away_Team,Conf_Matchup,Div_Matchup,Conf_Pair,Div_Pair,Team_Pair,prop_Reg_Home_Win_Home,prop_Reg_Away_Win_Home,...,prop_reg_away_goal_diff_Away,avg_reg_home_goals_per_game_Away,avg_reg_away_goals_per_game_Away,prop_Reg_Home_Win_H2H,prop_Reg_Away_Win_H2H,prop_Reg_Tie_H2H,prop_reg_home_goal_diff_H2H,prop_reg_away_goal_diff_H2H,avg_reg_home_goals_per_game_H2H,avg_reg_away_goals_per_game_H2H
89,2025-09-22,Anaheim Ducks,Utah Mammoth,True,False,Western-Western,Pacific-Central,Anaheim Ducks vs Utah Mammoth,,,...,0.4,3.0,2.0,,,,,,,
90,2025-09-24,Anaheim Ducks,Los Angeles Kings,True,True,Western-Western,Pacific-Pacific,Anaheim Ducks vs Los Angeles Kings,1.0,0.0,...,0.75,1.0,3.0,,,,,,,
91,2025-09-29,Anaheim Ducks,San Jose Sharks,True,True,Western-Western,Pacific-Pacific,Anaheim Ducks vs San Jose Sharks,0.5,0.5,...,,,,,,,,,,
92,2025-10-14,Anaheim Ducks,Pittsburgh Penguins,False,False,Western-Eastern,Pacific-Metropolitan,Anaheim Ducks vs Pittsburgh Penguins,0.666667,0.333333,...,0.571429,1.8,2.4,,,,,,,
93,2025-10-16,Anaheim Ducks,Carolina Hurricanes,False,False,Western-Eastern,Pacific-Metropolitan,Anaheim Ducks vs Carolina Hurricanes,0.75,0.25,...,0.555556,3.0,3.75,,,,,,,


In [13]:
# store list of home teams and away teams in upcoming games
home_teams = upcoming_games_trim['Home_Team'].unique().tolist()
away_teams = upcoming_games_trim['Away_Team'].unique().tolist()

print(f'Home Teams: {home_teams}')
print(f'Away Teams: {away_teams}')

Home Teams: ['Carolina Hurricanes', 'St. Louis Blues', 'Utah Mammoth']
Away Teams: ['Vancouver Canucks', 'Philadelphia Flyers', 'New York Islanders']


In [14]:

# filter to teams in upcoming games
home_data = modeling_data[modeling_data['Home_Team'].isin(home_teams)]
away_data = modeling_data[modeling_data['Away_Team'].isin(away_teams)].drop(columns=[x for x in modeling_data.columns if x.endswith('_H2H')], axis=1)

# get last row for each Home_Team
last_rows_home = home_data.groupby('Home_Team').tail(1).reset_index(drop=True)

# drop cols that end with _Away
last_rows_home = last_rows_home[last_rows_home.columns.drop(list(last_rows_home.filter(regex='_Away$')))]

# drop Away_Team
last_rows_home = last_rows_home.drop(columns=['Away_Team', 'Date'])
last_rows_home

Unnamed: 0,Home_Team,Conf_Matchup,Div_Matchup,Conf_Pair,Div_Pair,Team_Pair,prop_Reg_Home_Win_Home,prop_Reg_Away_Win_Home,prop_Reg_Tie_Home,prop_reg_home_goal_diff_Home,prop_reg_away_goal_diff_Home,avg_reg_home_goals_per_game_Home,avg_reg_away_goals_per_game_Home,prop_Reg_Home_Win_H2H,prop_Reg_Away_Win_H2H,prop_Reg_Tie_H2H,prop_reg_home_goal_diff_H2H,prop_reg_away_goal_diff_H2H,avg_reg_home_goals_per_game_H2H,avg_reg_away_goals_per_game_H2H
0,Carolina Hurricanes,True,True,Eastern-Eastern,Metropolitan-Metropolitan,Carolina Hurricanes vs Washington Capitals,0.555556,0.333333,0.111111,0.555556,0.444444,3.888889,3.111111,,,,,,,
1,St. Louis Blues,True,False,Western-Western,Central-Pacific,St. Louis Blues vs Calgary Flames,0.25,0.5,0.25,0.43038,0.56962,2.833333,3.75,,,,,,,
2,Utah Mammoth,False,False,Western-Eastern,Central-Atlantic,Utah Mammoth vs Tampa Bay Lightning,0.625,0.25,0.125,0.541667,0.458333,3.25,2.75,,,,,,,


In [15]:
# get last row for each Away_Team
last_rows_away = away_data.groupby('Away_Team').tail(1).reset_index(drop=True)

# drop cols that end with _Home
last_rows_away = last_rows_away[last_rows_away.columns.drop(list(last_rows_away.filter(regex='_Home$')))]

# drop Home_Team
last_rows_away = last_rows_away.drop(columns=['Home_Team', 'Date'])

last_rows_away.info()
last_rows_away.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 13 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Away_Team                         3 non-null      object 
 1   Conf_Matchup                      3 non-null      bool   
 2   Div_Matchup                       3 non-null      bool   
 3   Conf_Pair                         3 non-null      object 
 4   Div_Pair                          3 non-null      object 
 5   Team_Pair                         3 non-null      object 
 6   prop_Reg_Home_Win_Away            3 non-null      float64
 7   prop_Reg_Away_Win_Away            3 non-null      float64
 8   prop_Reg_Tie_Away                 3 non-null      float64
 9   prop_reg_home_goal_diff_Away      3 non-null      float64
 10  prop_reg_away_goal_diff_Away      3 non-null      float64
 11  avg_reg_home_goals_per_game_Away  3 non-null      float64
 12  avg_reg_away

Unnamed: 0,Away_Team,Conf_Matchup,Div_Matchup,Conf_Pair,Div_Pair,Team_Pair,prop_Reg_Home_Win_Away,prop_Reg_Away_Win_Away,prop_Reg_Tie_Away,prop_reg_home_goal_diff_Away,prop_reg_away_goal_diff_Away,avg_reg_home_goals_per_game_Away,avg_reg_away_goals_per_game_Away
0,Philadelphia Flyers,True,True,Eastern-Eastern,Metropolitan-Metropolitan,Washington Capitals vs Philadelphia Flyers,0.5,0.0,0.5,0.6,0.4,3.0,2.0
1,Vancouver Canucks,False,False,Eastern-Western,Metropolitan-Pacific,Washington Capitals vs Vancouver Canucks,0.5,0.333333,0.166667,0.45,0.55,3.0,3.666667
2,New York Islanders,True,True,Eastern-Eastern,Metropolitan-Metropolitan,Washington Capitals vs New York Islanders,0.5,0.375,0.125,0.57377,0.42623,4.375,3.25


In [16]:
# left join home and away data on team names
upcoming_games_final = pd.merge(upcoming_games_trim, last_rows_home, on='Home_Team', how='left')
upcoming_games_final = pd.merge(upcoming_games_final, last_rows_away, on='Away_Team', how='left')

upcoming_games_final.info()
upcoming_games_final.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 40 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   Game Link                         3 non-null      object        
 1   Home_Team                         3 non-null      object        
 2   Away_Team                         3 non-null      object        
 3   Time                              3 non-null      object        
 4   Date                              3 non-null      datetime64[ns]
 5   Season                            3 non-null      int64         
 6   Day_of_Week                       3 non-null      object        
 7   Month                             3 non-null      object        
 8   Game_Start_Hour                   3 non-null      int64         
 9   Conf_Matchup_x                    3 non-null      bool          
 10  Div_Matchup_x                     3 non-null      bool

Unnamed: 0,Game Link,Home_Team,Away_Team,Time,Date,Season,Day_of_Week,Month,Game_Start_Hour,Conf_Matchup_x,...,Conf_Pair_y,Div_Pair_y,Team_Pair_y,prop_Reg_Home_Win_Away,prop_Reg_Away_Win_Away,prop_Reg_Tie_Away,prop_reg_home_goal_diff_Away,prop_reg_away_goal_diff_Away,avg_reg_home_goals_per_game_Away,avg_reg_away_goals_per_game_Away
0,https://www.flashscore.com/match/hockey/caroli...,Carolina Hurricanes,Vancouver Canucks,19:00,2025-11-14,2025,Friday,November,19,True,...,Eastern-Western,Metropolitan-Pacific,Washington Capitals vs Vancouver Canucks,0.5,0.333333,0.166667,0.45,0.55,3.0,3.666667
1,https://www.flashscore.com/match/hockey/philad...,St. Louis Blues,Philadelphia Flyers,20:00,2025-11-14,2025,Friday,November,20,True,...,Eastern-Eastern,Metropolitan-Metropolitan,Washington Capitals vs Philadelphia Flyers,0.5,0.0,0.5,0.6,0.4,3.0,2.0
2,https://www.flashscore.com/match/hockey/new-yo...,Utah Mammoth,New York Islanders,21:00,2025-11-14,2025,Friday,November,21,False,...,Eastern-Eastern,Metropolitan-Metropolitan,Washington Capitals vs New York Islanders,0.5,0.375,0.125,0.57377,0.42623,4.375,3.25


In [17]:
# read in trained model
import joblib

basic_model = joblib.load('model/catboost_model_reg_tie.pkl')


In [18]:
# isolate features model needs for upcoming games
feature_cols = basic_model.feature_names_

# trim cols for data to predict
predict_df = upcoming_games_final[feature_cols]
predict_df.info()
predict_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 24 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Season                            3 non-null      int64  
 1   Month                             3 non-null      object 
 2   Day_of_Week                       3 non-null      object 
 3   prop_Reg_Home_Win_Home            3 non-null      float64
 4   prop_Reg_Away_Win_Home            3 non-null      float64
 5   prop_Reg_Tie_Home                 3 non-null      float64
 6   prop_reg_home_goal_diff_Home      3 non-null      float64
 7   prop_reg_away_goal_diff_Home      3 non-null      float64
 8   avg_reg_home_goals_per_game_Home  3 non-null      float64
 9   avg_reg_away_goals_per_game_Home  3 non-null      float64
 10  prop_Reg_Home_Win_Away            3 non-null      float64
 11  prop_Reg_Away_Win_Away            3 non-null      float64
 12  prop_Reg_Tie

Unnamed: 0,Season,Month,Day_of_Week,prop_Reg_Home_Win_Home,prop_Reg_Away_Win_Home,prop_Reg_Tie_Home,prop_reg_home_goal_diff_Home,prop_reg_away_goal_diff_Home,avg_reg_home_goals_per_game_Home,avg_reg_away_goals_per_game_Home,...,prop_reg_away_goal_diff_Away,avg_reg_home_goals_per_game_Away,avg_reg_away_goals_per_game_Away,prop_Reg_Home_Win_H2H,prop_Reg_Away_Win_H2H,prop_Reg_Tie_H2H,prop_reg_home_goal_diff_H2H,prop_reg_away_goal_diff_H2H,avg_reg_home_goals_per_game_H2H,avg_reg_away_goals_per_game_H2H
0,2025,November,Friday,0.555556,0.333333,0.111111,0.555556,0.444444,3.888889,3.111111,...,0.55,3.0,3.666667,,,,,,,
1,2025,November,Friday,0.25,0.5,0.25,0.43038,0.56962,2.833333,3.75,...,0.4,3.0,2.0,,,,,,,
2,2025,November,Friday,0.625,0.25,0.125,0.541667,0.458333,3.25,2.75,...,0.42623,4.375,3.25,,,,,,,


In [19]:
from catboost import CatBoostClassifier

# make predictions
# 5. Predict and evaluate proba
y_pred_proba = basic_model.predict_proba(predict_df)[:, 1]

# add preds toupcoming_games_final
upcoming_games_final['Pred_Tie_Proba'] = y_pred_proba

# inspect
inspect_cols = ['Date', 'Time', 'Home_Team', 'Away_Team', 'Pred_Tie_Proba'] + feature_cols
upcoming_games_final[inspect_cols].sort_values(by='Pred_Tie_Proba', ascending=False)


Unnamed: 0,Date,Time,Home_Team,Away_Team,Pred_Tie_Proba,Season,Month,Day_of_Week,prop_Reg_Home_Win_Home,prop_Reg_Away_Win_Home,...,prop_reg_away_goal_diff_Away,avg_reg_home_goals_per_game_Away,avg_reg_away_goals_per_game_Away,prop_Reg_Home_Win_H2H,prop_Reg_Away_Win_H2H,prop_Reg_Tie_H2H,prop_reg_home_goal_diff_H2H,prop_reg_away_goal_diff_H2H,avg_reg_home_goals_per_game_H2H,avg_reg_away_goals_per_game_H2H
2,2025-11-14,21:00,Utah Mammoth,New York Islanders,0.513614,2025,November,Friday,0.625,0.25,...,0.42623,4.375,3.25,,,,,,,
0,2025-11-14,19:00,Carolina Hurricanes,Vancouver Canucks,0.510269,2025,November,Friday,0.555556,0.333333,...,0.55,3.0,3.666667,,,,,,,
1,2025-11-14,20:00,St. Louis Blues,Philadelphia Flyers,0.506703,2025,November,Friday,0.25,0.5,...,0.4,3.0,2.0,,,,,,,
