In [1]:
# union all 1x2_odds files together and join to master data

In [2]:
# imports
import pandas as pd

In [3]:
# read in master games data
master_data = pd.read_excel(r'data/master_games_data.xlsx', header=0)

# inspect
master_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3381 entries, 0 to 3380
Data columns (total 43 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Game_Link                 3381 non-null   object        
 1   Extra_Time                3381 non-null   object        
 2   Home_Team                 3381 non-null   object        
 3   Away_Team                 3381 non-null   object        
 4   Home_Score                3381 non-null   int64         
 5   Away_Score                3381 non-null   int64         
 6   P1_Home_Score             3381 non-null   int64         
 7   P1_Away_Score             3381 non-null   int64         
 8   P2_Home_Score             3381 non-null   int64         
 9   P2_Away_Score             3381 non-null   int64         
 10  P3_Home_Score             3381 non-null   int64         
 11  P3_Away_Score             3381 non-null   int64         
 12  P4_Home_Score       

In [4]:
# trim data roughly to regular season games only
master_data = master_data[
    (master_data['Month'].isin(['October', 'November', 'December', 'January', 'February', 'March'])) # trim to regular season months with a smidge of oct pre-season
                              | 
    (master_data['Month'].isin(['April']) & master_data['Date'].dt.day <= 15) # include up to april 15th
]


In [5]:
# TODO - perform EDA on records that resulted in a regular time tie and see if there's any patterns or trends in this subset that could be indicative of predictive features
master_data[master_data['Reg_Tie'] == True].describe()

Unnamed: 0,Home_Score,Away_Score,P1_Home_Score,P1_Away_Score,P2_Home_Score,P2_Away_Score,P3_Home_Score,P3_Away_Score,P4_Home_Score,P4_Away_Score,...,P5_Away_Score,Month_int,Date,Season,Total_Score,regular_time_score_home,regular_time_score_away,regular_time_score_total,Week_of_Year,Game_Start_Hour
count,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0,714.0,...,714.0,714.0,714,714.0,714.0,714.0,714.0,714.0,714.0,714.0
mean,3.137255,3.07563,0.809524,0.810924,0.92437,0.911765,0.872549,0.883754,0.385154,0.334734,...,0.29972,6.834734,2024-09-11 14:43:21.680672256,2023.694678,6.212885,2.606443,2.606443,5.212885,28.043417,19.222689
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,2023-09-24 00:00:00,2023.0,1.0,0.0,0.0,0.0,1.0,8.0
25%,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,2024-01-22 00:00:00,2023.0,5.0,2.0,2.0,4.0,10.0,19.0
50%,3.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,9.0,2024-10-26 12:00:00,2024.0,7.0,3.0,3.0,6.0,39.0,19.0
75%,4.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,11.0,2025-03-11 18:00:00,2024.0,7.0,3.0,3.0,6.0,45.0,21.0
max,8.0,7.0,5.0,4.0,4.0,4.0,5.0,5.0,1.0,1.0,...,4.0,12.0,2025-11-11 00:00:00,2025.0,15.0,7.0,7.0,14.0,52.0,22.0
std,1.246903,1.277662,0.823988,0.828553,0.881408,0.873035,0.866553,0.845209,0.486973,0.472228,...,0.700491,4.176757,,0.68998,2.318788,1.159394,1.159394,2.318788,18.216543,2.24689


In [6]:
# function that takes the df, sorts it by the target grain, then performs the groupby
def grain_sort_cumsum_props(home_vs_away: str, df: pd.DataFrame) -> pd.DataFrame:
    # add home or away team to sort grain
    home_plus_year = [home_vs_away, 'Season']
    
    # combine to create final sort grain
    sort_grain = home_plus_year + ['Date']

    # sort
    df = df.sort_values(by=sort_grain)

    # hard code cumsum cols
    cum_cols = ["Reg_Home_Win", "Reg_Away_Win", "Reg_Tie", 'P1_Home_Score', 'P1_Away_Score', 
                'P2_Home_Score', 'P2_Away_Score', 'P3_Home_Score', 'P3_Away_Score',
                'regular_time_score_home', 'regular_time_score_away']
    
    # add count of games for home or away
    df[f'cum_{home_vs_away}_Games'] = df.groupby(home_plus_year).cumcount() + 1

    # compute cumsum for each group
    df[[f"cum_{col}" for col in cum_cols]] = df.groupby(home_plus_year)[cum_cols].cumsum()

    # calc prop home reg win
    df['prop_Reg_Home_Win'] = df['cum_Reg_Home_Win'] / df[f'cum_{home_vs_away}_Games']
    df['prop_Reg_Away_Win'] = df['cum_Reg_Away_Win'] / df[f'cum_{home_vs_away}_Games']
    df['prop_Reg_Tie'] = df['cum_Reg_Tie'] / df[f'cum_{home_vs_away}_Games']

    # calc props for what % of goals scored in each period
    df['prop_P1_Home_Score'] = df['cum_P1_Home_Score'] / df['cum_regular_time_score_home']
    df['prop_P2_Home_Score'] = df['cum_P2_Home_Score'] / df['cum_regular_time_score_home']
    df['prop_P3_Home_Score'] = df['cum_P3_Home_Score'] / df['cum_regular_time_score_home']

    df['prop_P1_Away_Score'] = df['cum_P1_Away_Score'] / df['cum_regular_time_score_away']
    df['prop_P2_Away_Score'] = df['cum_P2_Away_Score'] / df['cum_regular_time_score_away']
    df['prop_P3_Away_Score'] = df['cum_P3_Away_Score'] / df['cum_regular_time_score_away']

    # calc goal diffs at the period level
    df['prop_P1_Home_Goal_Diff'] = df['cum_P1_Home_Score'] / df['cum_P1_Away_Score']
    df['prop_P2_Home_Goal_Diff'] = df['cum_P2_Home_Score'] / df['cum_P2_Away_Score']
    df['prop_P3_Home_Goal_Diff'] = df['cum_P3_Home_Score'] / df['cum_P3_Away_Score']

    df['prop_P1_Away_Goal_Diff'] = df['cum_P1_Away_Score'] / df['cum_P1_Home_Score']
    df['prop_P2_Away_Goal_Diff'] = df['cum_P2_Away_Score'] / df['cum_P2_Home_Score']
    df['prop_P3_Away_Goal_Diff'] = df['cum_P3_Away_Score'] / df['cum_P3_Home_Score']

    # calc props for goals scored / allowed
    df['prop_reg_home_goal_diff'] = df['cum_regular_time_score_home'] / (df['cum_regular_time_score_home'] + df['cum_regular_time_score_away'])
    df['prop_reg_away_goal_diff'] = 1 - df['prop_reg_home_goal_diff']

    # calc avg goals per game
    df['avg_reg_home_goals_per_game'] = df['cum_regular_time_score_home'] / df[f'cum_{home_vs_away}_Games']
    df['avg_reg_away_goals_per_game'] = df['cum_regular_time_score_away'] / df[f'cum_{home_vs_away}_Games']

    # isolate cum, prop, and avg cols
    sel_cum_cols = [x for x in df.columns if x.startswith('prop_') or x.startswith('avg_')]

    # trim to final cols
    final_cols = sort_grain + sel_cum_cols
    df = df[final_cols]

    # shift feature cols down that are not in sort_grain
    df[sel_cum_cols] = df.groupby(home_plus_year)[sel_cum_cols].shift(1)

    # rename the sel_cum_cols to indicate home vs away
    if home_vs_away == 'Home_Team':
        home_vs_away = 'Home'
    else:
        home_vs_away = 'Away'
    
    rename_dict = {col: f"{col}_{home_vs_away}" for col in sel_cum_cols}
    df = df.rename(columns=rename_dict)

    return df

# test func
all_home_games = grain_sort_cumsum_props('Home_Team', master_data)
all_home_games.info()
all_home_games.head()


<class 'pandas.core.frame.DataFrame'>
Index: 3381 entries, 1496 to 3095
Data columns (total 22 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   Home_Team                         3381 non-null   object        
 1   Season                            3381 non-null   int64         
 2   Date                              3381 non-null   datetime64[ns]
 3   prop_Reg_Home_Win_Home            3285 non-null   float64       
 4   prop_Reg_Away_Win_Home            3285 non-null   float64       
 5   prop_Reg_Tie_Home                 3285 non-null   float64       
 6   prop_P1_Home_Score_Home           3284 non-null   float64       
 7   prop_P2_Home_Score_Home           3284 non-null   float64       
 8   prop_P3_Home_Score_Home           3284 non-null   float64       
 9   prop_P1_Away_Score_Home           3272 non-null   float64       
 10  prop_P2_Away_Score_Home           3272 non-null   

Unnamed: 0,Home_Team,Season,Date,prop_Reg_Home_Win_Home,prop_Reg_Away_Win_Home,prop_Reg_Tie_Home,prop_P1_Home_Score_Home,prop_P2_Home_Score_Home,prop_P3_Home_Score_Home,prop_P1_Away_Score_Home,...,prop_P1_Home_Goal_Diff_Home,prop_P2_Home_Goal_Diff_Home,prop_P3_Home_Goal_Diff_Home,prop_P1_Away_Goal_Diff_Home,prop_P2_Away_Goal_Diff_Home,prop_P3_Away_Goal_Diff_Home,prop_reg_home_goal_diff_Home,prop_reg_away_goal_diff_Home,avg_reg_home_goals_per_game_Home,avg_reg_away_goals_per_game_Home
1496,Anaheim Ducks,2023,2023-09-24,,,,,,,,...,,,,,,,,,,
1471,Anaheim Ducks,2023,2023-09-27,0.0,0.0,1.0,0.5,0.0,0.5,0.0,...,inf,0.0,1.0,0.0,inf,1.0,0.5,0.5,2.0,2.0
1457,Anaheim Ducks,2023,2023-09-29,0.5,0.0,0.5,0.5,0.166667,0.333333,0.25,...,3.0,1.0,1.0,0.333333,1.0,1.0,0.6,0.4,3.0,2.0
1413,Anaheim Ducks,2023,2023-10-05,0.333333,0.333333,0.333333,0.444444,0.222222,0.333333,0.5,...,1.0,1.0,1.5,1.0,1.0,0.666667,0.529412,0.470588,3.0,2.666667
1366,Anaheim Ducks,2023,2023-10-15,0.25,0.5,0.25,0.363636,0.181818,0.454545,0.333333,...,1.0,0.5,1.25,1.0,2.0,0.8,0.478261,0.521739,2.75,3.0


In [7]:
# repeat for away teams
all_away_games = grain_sort_cumsum_props('Away_Team', master_data)
all_away_games.info()
all_away_games.head()

<class 'pandas.core.frame.DataFrame'>
Index: 3381 entries, 1480 to 3014
Data columns (total 22 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   Away_Team                         3381 non-null   object        
 1   Season                            3381 non-null   int64         
 2   Date                              3381 non-null   datetime64[ns]
 3   prop_Reg_Home_Win_Away            3285 non-null   float64       
 4   prop_Reg_Away_Win_Away            3285 non-null   float64       
 5   prop_Reg_Tie_Away                 3285 non-null   float64       
 6   prop_P1_Home_Score_Away           3283 non-null   float64       
 7   prop_P2_Home_Score_Away           3283 non-null   float64       
 8   prop_P3_Home_Score_Away           3283 non-null   float64       
 9   prop_P1_Away_Score_Away           3275 non-null   float64       
 10  prop_P2_Away_Score_Away           3275 non-null   

Unnamed: 0,Away_Team,Season,Date,prop_Reg_Home_Win_Away,prop_Reg_Away_Win_Away,prop_Reg_Tie_Away,prop_P1_Home_Score_Away,prop_P2_Home_Score_Away,prop_P3_Home_Score_Away,prop_P1_Away_Score_Away,...,prop_P1_Home_Goal_Diff_Away,prop_P2_Home_Goal_Diff_Away,prop_P3_Home_Goal_Diff_Away,prop_P1_Away_Goal_Diff_Away,prop_P2_Away_Goal_Diff_Away,prop_P3_Away_Goal_Diff_Away,prop_reg_home_goal_diff_Away,prop_reg_away_goal_diff_Away,avg_reg_home_goals_per_game_Away,avg_reg_away_goals_per_game_Away
1480,Anaheim Ducks,2023,2023-09-26,,,,,,,,...,,,,,,,,,,
1445,Anaheim Ducks,2023,2023-10-01,0.0,1.0,0.0,0.0,0.5,0.5,0.75,...,0.0,inf,1.0,inf,0.0,1.0,0.333333,0.666667,2.0,4.0
1430,Anaheim Ducks,2023,2023-10-03,0.0,1.0,0.0,0.5,0.333333,0.166667,0.555556,...,0.6,2.0,0.333333,1.666667,0.5,3.0,0.4,0.6,3.0,4.5
1404,Anaheim Ducks,2023,2023-10-07,0.333333,0.666667,0.0,0.6,0.2,0.2,0.5,...,1.2,1.0,0.666667,0.833333,1.0,1.5,0.5,0.5,3.333333,3.333333
1371,Anaheim Ducks,2023,2023-10-14,0.5,0.5,0.0,0.411765,0.352941,0.235294,0.454545,...,1.4,3.0,1.0,0.714286,0.333333,1.0,0.607143,0.392857,4.25,2.75


In [8]:
# get key cols from master data
base_cols = ['Date', 'Season', 'Game_ID', 'Home_Team', 'Away_Team',
    'Reg_Home_Win', 'Reg_Away_Win', 'Reg_Tie', 'Month', 'Day_of_Week', 
    'Conf_Matchup', 'Div_Matchup', 'Conf_Pair', 'Div_Pair', 'Team_Pair'
]

modeling_data_base = master_data[base_cols]

# ensure data sorted correctly
modeling_data = modeling_data_base.sort_values(by=['Home_Team', 'Season', 'Date'])

# join home and away dfs to modeling data
modeling_data = modeling_data.merge(all_home_games, how='left', on=['Home_Team', 'Season', 'Date'])
modeling_data = modeling_data.merge(all_away_games, how='left', on=['Away_Team', 'Season', 'Date'])

modeling_data.info()
modeling_data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3405 entries, 0 to 3404
Data columns (total 53 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   Date                              3405 non-null   datetime64[ns]
 1   Season                            3405 non-null   int64         
 2   Game_ID                           3405 non-null   object        
 3   Home_Team                         3405 non-null   object        
 4   Away_Team                         3405 non-null   object        
 5   Reg_Home_Win                      3405 non-null   bool          
 6   Reg_Away_Win                      3405 non-null   bool          
 7   Reg_Tie                           3405 non-null   bool          
 8   Month                             3405 non-null   object        
 9   Day_of_Week                       3405 non-null   object        
 10  Conf_Matchup                      3405 non-null 

Unnamed: 0,Date,Season,Game_ID,Home_Team,Away_Team,Reg_Home_Win,Reg_Away_Win,Reg_Tie,Month,Day_of_Week,...,prop_P1_Home_Goal_Diff_Away,prop_P2_Home_Goal_Diff_Away,prop_P3_Home_Goal_Diff_Away,prop_P1_Away_Goal_Diff_Away,prop_P2_Away_Goal_Diff_Away,prop_P3_Away_Goal_Diff_Away,prop_reg_home_goal_diff_Away,prop_reg_away_goal_diff_Away,avg_reg_home_goals_per_game_Away,avg_reg_away_goals_per_game_Away
0,2023-09-24,2023,2023-09-24-20:00|Anaheim Ducks vs Los Angeles ...,Anaheim Ducks,Los Angeles Kings,False,False,True,September,Sunday,...,,1.333333,inf,,0.75,0.0,0.625,0.375,5.0,3.0
1,2023-09-27,2023,2023-09-27-22:00|Anaheim Ducks vs San Jose Sharks,Anaheim Ducks,San Jose Sharks,True,False,False,September,Wednesday,...,,,,,,,,,,
2,2023-09-29,2023,2023-09-29-22:00|Anaheim Ducks vs Los Angeles ...,Anaheim Ducks,Los Angeles Kings,False,True,False,September,Friday,...,inf,1.0,0.666667,0.0,1.0,1.5,0.555556,0.444444,3.333333,2.666667
3,2023-10-05,2023,2023-10-05-22:00|Anaheim Ducks vs Arizona Coyotes,Anaheim Ducks,Arizona Coyotes,False,True,False,October,Thursday,...,2.0,8.0,2.0,0.5,0.125,0.5,0.761905,0.238095,4.0,1.25
4,2023-10-15,2023,2023-10-15-20:30|Anaheim Ducks vs Carolina Hur...,Anaheim Ducks,Carolina Hurricanes,True,False,False,October,Sunday,...,1.25,3.0,3.5,0.8,0.333333,0.285714,0.692308,0.307692,4.5,2.0


In [9]:
# write final modeling data to excel
modeling_data.to_excel(r'data/modeling_data.xlsx', index=False)

In [10]:
# Note that most NHL teams play 3-5 games per week