In [1]:
# predict probability that any given game will result in a regulation tie

In [2]:
import pandas as pd


In [3]:
# write final modeling data to excel
modeling_data = pd.read_excel(r'data/modeling_data.xlsx', header=0)

# inspect
modeling_data.info()
modeling_data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3405 entries, 0 to 3404
Data columns (total 57 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   Date                              3405 non-null   datetime64[ns]
 1   Season                            3405 non-null   int64         
 2   Game_ID                           3405 non-null   object        
 3   Home_Team                         3405 non-null   object        
 4   Away_Team                         3405 non-null   object        
 5   Odds_1                            3256 non-null   float64       
 6   Odds_X                            3256 non-null   float64       
 7   Odds_2                            3256 non-null   float64       
 8   Reg_Home_Win                      3405 non-null   bool          
 9   Reg_Away_Win                      3405 non-null   bool          
 10  Reg_Tie                           3405 non-null 

Unnamed: 0,Date,Season,Game_ID,Home_Team,Away_Team,Odds_1,Odds_X,Odds_2,Reg_Home_Win,Reg_Away_Win,...,prop_P1_Home_Goal_Diff_Away,prop_P2_Home_Goal_Diff_Away,prop_P3_Home_Goal_Diff_Away,prop_P1_Away_Goal_Diff_Away,prop_P2_Away_Goal_Diff_Away,prop_P3_Away_Goal_Diff_Away,prop_reg_home_goal_diff_Away,prop_reg_away_goal_diff_Away,avg_reg_home_goals_per_game_Away,avg_reg_away_goals_per_game_Away
0,2023-09-24,2023,2023-09-24-20:00|Anaheim Ducks vs Los Angeles ...,Anaheim Ducks,Los Angeles Kings,,,,False,False,...,inf,1.0,2.0,0.0,1.0,0.5,0.583333,0.416667,3.5,2.5
1,2023-09-27,2023,2023-09-27-22:00|Anaheim Ducks vs San Jose Sharks,Anaheim Ducks,San Jose Sharks,,,,True,False,...,2.0,inf,1.0,0.5,0.0,1.0,0.666667,0.333333,4.0,2.0
2,2023-09-29,2023,2023-09-29-22:00|Anaheim Ducks vs Los Angeles ...,Anaheim Ducks,Los Angeles Kings,,,,False,True,...,1.333333,1.0,1.0,0.75,1.0,1.0,0.52,0.48,3.25,3.0
3,2023-10-05,2023,2023-10-05-22:00|Anaheim Ducks vs Arizona Coyotes,Anaheim Ducks,Arizona Coyotes,,,,False,True,...,2.0,2.666667,1.6,0.5,0.375,0.625,0.666667,0.333333,3.6,1.8
4,2023-10-15,2023,2023-10-15-20:30|Anaheim Ducks vs Carolina Hur...,Anaheim Ducks,Carolina Hurricanes,4.75,4.75,1.56,True,False,...,2.0,2.333333,2.25,0.5,0.428571,0.444444,0.685714,0.314286,4.8,2.2


In [4]:
# set season as categorical
modeling_data['Season'] = modeling_data['Season'].astype('str')
modeling_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3405 entries, 0 to 3404
Data columns (total 57 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   Date                              3405 non-null   datetime64[ns]
 1   Season                            3405 non-null   object        
 2   Game_ID                           3405 non-null   object        
 3   Home_Team                         3405 non-null   object        
 4   Away_Team                         3405 non-null   object        
 5   Odds_1                            3256 non-null   float64       
 6   Odds_X                            3256 non-null   float64       
 7   Odds_2                            3256 non-null   float64       
 8   Reg_Home_Win                      3405 non-null   bool          
 9   Reg_Away_Win                      3405 non-null   bool          
 10  Reg_Tie                           3405 non-null 

In [5]:
# drop na to make life easier
modeling_data = modeling_data.dropna()
modeling_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3134 entries, 4 to 3404
Data columns (total 57 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   Date                              3134 non-null   datetime64[ns]
 1   Season                            3134 non-null   object        
 2   Game_ID                           3134 non-null   object        
 3   Home_Team                         3134 non-null   object        
 4   Away_Team                         3134 non-null   object        
 5   Odds_1                            3134 non-null   float64       
 6   Odds_X                            3134 non-null   float64       
 7   Odds_2                            3134 non-null   float64       
 8   Reg_Home_Win                      3134 non-null   bool          
 9   Reg_Away_Win                      3134 non-null   bool          
 10  Reg_Tie                           3134 non-null   boo

In [6]:
# designate response variable
response_ = 'Reg_Tie'

# inspect value counts: ties seem to happen 25% of the time roughly
modeling_data[['Season', response_]].value_counts().sort_index()


Season  Reg_Tie
2023    False      1043
        True        274
2024    False      1171
        True        303
2025    False       254
        True         89
Name: count, dtype: int64

In [7]:
# list of drop cols that won't be used in modeling
drop_cols = ['Game_ID', 'Date', 'Reg_Home_Win', 'Reg_Away_Win', 'Home_Team', 'Away_Team']

# get X matrix
X = modeling_data.drop(columns=drop_cols + [response_])
X.info()


<class 'pandas.core.frame.DataFrame'>
Index: 3134 entries, 4 to 3404
Data columns (total 50 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Season                            3134 non-null   object 
 1   Odds_1                            3134 non-null   float64
 2   Odds_X                            3134 non-null   float64
 3   Odds_2                            3134 non-null   float64
 4   Month                             3134 non-null   object 
 5   Day_of_Week                       3134 non-null   object 
 6   Conf_Matchup                      3134 non-null   bool   
 7   Div_Matchup                       3134 non-null   bool   
 8   Conf_Pair                         3134 non-null   object 
 9   Div_Pair                          3134 non-null   object 
 10  Team_Pair                         3134 non-null   object 
 11  Start_Hour_Group                  3134 non-null   object 
 12  prop_Reg_Ho

In [8]:
# isolate binary response
y_binary = modeling_data[response_].astype(bool) # make sure that this is bool

# inspect y
y_binary.info()

# get class weights to deal with imbalanced dataset
from collections import Counter

counts = Counter(y_binary)
total = sum(counts.values())
class_weights = [total / counts[cls] for cls in sorted(counts)]

# print class weights
print(counts)
print('class weights:', class_weights)
print('pos rate:', y_binary.mean()) 

<class 'pandas.core.series.Series'>
Index: 3134 entries, 4 to 3404
Series name: Reg_Tie
Non-Null Count  Dtype
--------------  -----
3134 non-null   bool 
dtypes: bool(1)
memory usage: 27.5 KB
Counter({False: 2468, True: 666})
class weights: [1.2698541329011346, 4.7057057057057055]
pos rate: 0.21250797702616464


In [None]:
modeling_data_trim.info()

In [9]:
# fit a single catboost classifier with defaults
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, accuracy_score

# 1. Train/test split (preserve time order)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_binary, train_size=0.85, shuffle=False
)

# 2. Define categorical features
cat_features = ['Season', 'Month', 'Day_of_Week', 'Start_Hour_Group', 'Conf_Pair', 'Div_Pair', 'Team_Pair']

# 3. Create Pools
train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

# 4. Train model with reasonable defaults
basic_model = CatBoostClassifier(
    iterations=1500,
    learning_rate=0.05,
    class_weights=class_weights, # to fix class imbalance
    boosting_type='Ordered',     # ensure data is ordered
    # bagging_temperature=5,  # default is 1
    # depth=9,
    # l2_leaf_reg=9,
    verbose=100
)

# Fit with early stopping
basic_model.fit(
    train_pool,
    eval_set=test_pool,
    early_stopping_rounds=50,
    use_best_model=True
)

# 5. Predict and evaluate
y_pred = basic_model.predict(test_pool)

# 5. Predict and evaluate proba
y_pred_proba = basic_model.predict_proba(test_pool)[:, 1]

# classification report
print(classification_report(y_test, y_pred))

# calc auc, accuracy, and pos rate
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
pos_rate = y_test.mean()

print('acc:', acc)
print('auc:', auc)
print('pos_rate:', pos_rate)


0:	learn: 0.6854282	test: 0.6855481	best: 0.6855481 (0)	total: 314ms	remaining: 7m 51s
100:	learn: 0.5425700	test: 0.5664323	best: 0.5661901 (97)	total: 9.42s	remaining: 2m 10s
200:	learn: 0.4999687	test: 0.5583868	best: 0.5567972 (195)	total: 18.5s	remaining: 1m 59s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.5567972011
bestIteration = 195

Shrink model to first 196 iterations.
              precision    recall  f1-score   support

       False       0.93      0.65      0.76       378
        True       0.36      0.82      0.50        93

    accuracy                           0.68       471
   macro avg       0.65      0.73      0.63       471
weighted avg       0.82      0.68      0.71       471

acc: 0.6794055201698513
auc: 0.7799112476531831
pos_rate: 0.19745222929936307
