# First model regular season games approach| Sebislaw

## Libraries

In [2]:
from os.path  import join
import random
import itertools
import math

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import train_test_split, TimeSeriesSplit
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import brier_score_loss
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

## Data

In [3]:
data_path = '..\\..\\data'

# The Basics ------------------------------------------------------------------------
# Men
MTeams = pd.read_csv(join(data_path, 'MTeams.csv'))
MSeasons = pd.read_csv(join(data_path, 'MSeasons.csv'))
MNCAATourneySeeds = pd.read_csv(join(data_path, 'MNCAATourneySeeds.csv'))
MRegularSeasonCompactResults = pd.read_csv(join(data_path, 'MRegularSeasonCompactResults.csv'))
MNCAATourneyCompactResults = pd.read_csv(join(data_path, 'MNCAATourneyCompactResults.csv'))
# Women
WTeams = pd.read_csv(join(data_path, 'WTeams.csv'))
WSeasons = pd.read_csv(join(data_path, 'WSeasons.csv'))
WNCAATourneySeeds = pd.read_csv(join(data_path, 'WNCAATourneySeeds.csv'))
WRegularSeasonCompactResults = pd.read_csv(join(data_path, 'WRegularSeasonCompactResults.csv'))
WNCAATourneyCompactResults = pd.read_csv(join(data_path, 'WNCAATourneyCompactResults.csv'))
# Other
SampleSubmissionStage1 = pd.read_csv(join(data_path, 'SampleSubmissionStage1.csv'))
SampleSubmissionStage2 = pd.read_csv(join(data_path, 'SampleSubmissionStage2.csv'))
SeedBenchmarkStage1 = pd.read_csv(join(data_path, 'SeedBenchmarkStage1.csv'))

# Team Box Scores ------------------------------------------------------------------------
# Men
MRegularSeasonDetailedResults = pd.read_csv(join(data_path, 'MRegularSeasonDetailedResults.csv'))
MNCAATourneyDetailedResults = pd.read_csv(join(data_path, 'MNCAATourneyDetailedResults.csv'))
# Women
WRegularSeasonDetailedResults = pd.read_csv(join(data_path, 'WRegularSeasonDetailedResults.csv'))
WNCAATourneyDetailedResults = pd.read_csv(join(data_path, 'WNCAATourneyDetailedResults.csv'))

# Geography ------------------------------------------------------------------------
# All
Cities = pd.read_csv(join(data_path, 'Cities.csv'))
Conferences = pd.read_csv(join(data_path, 'Conferences.csv'))
# Men
MGameCities = pd.read_csv(join(data_path, 'MGameCities.csv'))
# Women
WGameCities = pd.read_csv(join(data_path, 'WGameCities.csv'))

# Public Rankings ------------------------------------------------------------------------
# Men
MMasseyOrdinals = pd.read_csv(join(data_path, 'MMasseyOrdinals.csv')) # men only

# Supplements ------------------------------------------------------------------------
# Men
MTeamCoaches = pd.read_csv(join(data_path, 'MTeamCoaches.csv')) # men only
MTeamConferences = pd.read_csv(join(data_path, 'MTeamConferences.csv'))
MConferenceTourneyGames = pd.read_csv(join(data_path, 'MConferenceTourneyGames.csv'))
MSecondaryTourneyTeams = pd.read_csv(join(data_path, 'MSecondaryTourneyTeams.csv'))
MSecondaryTourneyCompactResults = pd.read_csv(join(data_path, 'MSecondaryTourneyCompactResults.csv'))
MTeamSpellings = pd.read_csv(join(data_path, "MTeamSpellings.csv"), encoding='cp1252')
MNCAATourneySlots = pd.read_csv(join(data_path, 'MNCAATourneySlots.csv'))
MNCAATourneySeedRoundSlots = pd.read_csv(join(data_path, 'MNCAATourneySeedRoundSlots.csv')) # men only
# Women
WTeamConferences = pd.read_csv(join(data_path, 'WTeamConferences.csv'))
WConferenceTourneyGames = pd.read_csv(join(data_path, 'WConferenceTourneyGames.csv'))
WSecondaryTourneyTeams = pd.read_csv(join(data_path, 'WSecondaryTourneyTeams.csv'))
WSecondaryTourneyCompactResults = pd.read_csv(join(data_path, 'WSecondaryTourneyCompactResults.csv'))
WTeamSpellings = pd.read_csv(join(data_path, 'WTeamSpellings.csv'), encoding='cp1252')
WNCAATourneySlots = pd.read_csv(join(data_path, 'WNCAATourneySlots.csv'))

## First models based on data from past games

In [3]:
def prepare_data(GameReslts, Conferences, include_conferences, seasons):
    # Make a copy so that the original GameReslts is not modified.
    df = GameReslts.copy()
    
    if include_conferences:
        # =========== MERGE CONFERENCES =========== #
        df = pd.merge(
            df,
            Conferences[['Season', 'TeamID', 'ConfAbbrev']],
            how='left',
            left_on=['Season', 'WTeamID'],
            right_on=['Season', 'TeamID']
        ).rename(columns={'ConfAbbrev': 'WConf'}).drop(columns='TeamID')
        
        df = pd.merge(
            df,
            Conferences[['Season', 'TeamID', 'ConfAbbrev']],
            how='left',
            left_on=['Season', 'LTeamID'],
            right_on=['Season', 'TeamID']
        ).rename(columns={'ConfAbbrev': 'LConf'}).drop(columns='TeamID')
    
    # =========== DEFINE TEAM1/TEAM2 & LABEL =========== #
    def define_team1_team2(row):
        w, l = row['WTeamID'], row['LTeamID']
        return (w, l, 1) if w < l else (l, w, 0)
    
    df[['Team1', 'Team2', 'label']] = df.apply(define_team1_team2, axis=1, result_type='expand')
    
    # =========== ONE-HOT WLoc FROM TEAM1's PERSPECTIVE =========== #
    def get_team1_loc(row):
        wloc = row['WLoc']
        lbl = row['label']
        if wloc == 'N':
            return 'N'
        if wloc == 'H':
            return 'H' if lbl == 1 else 'A'
        if wloc == 'A':
            return 'A' if lbl == 1 else 'H'
        return 'N'  # fallback
    
    df['Team1WLoc'] = df.apply(get_team1_loc, axis=1)
    df = pd.get_dummies(df, columns=['Team1WLoc'], prefix='Loc', drop_first=False)
    
    # Ensure that all location columns are present, even if not generated by get_dummies
    for col in ['Loc_A', 'Loc_H', 'Loc_N']:
        if col not in df.columns:
            df[col] = 0
    
    if include_conferences:
        # =========== ONE-HOT CONFERENCES =========== #
        def get_team1_conf(row):
            return row['WConf'] if row['label'] == 1 else row['LConf']
        def get_team2_conf(row):
            return row['LConf'] if row['label'] == 1 else row['WConf']
        
        df['Team1Conf'] = df.apply(get_team1_conf, axis=1)
        df['Team2Conf'] = df.apply(get_team2_conf, axis=1)
        df = pd.get_dummies(df, columns=['Team1Conf', 'Team2Conf'], prefix=['T1Conf', 'T2Conf'], drop_first=False)
        
        # Ensure that all possible conferences are present in the one-hot encoding.
        all_confs = sorted(Conferences['ConfAbbrev'].unique())
        for conf in all_confs:
            col_name_1 = f"T1Conf_{conf}"
            col_name_2 = f"T2Conf_{conf}"
            if col_name_1 not in df.columns:
                df[col_name_1] = 0
            if col_name_2 not in df.columns:
                df[col_name_2] = 0
    
    # =========== FEATURES & LABEL =========== #
    # Preserve WScore and LScore along with the one-hot encoded columns.
    feature_cols = [c for c in df.columns if c.startswith('Loc_') 
                    or c.startswith('T1Conf_') or c.startswith('T2Conf_')
                    or c in ['WScore', 'LScore']]
    X = df[feature_cols].copy()
    y = df['label']
    
    # Filter by the specified seasons without modifying the original data further.
    mask = df['Season'].isin(seasons)
    X, y = X[mask], y[mask]
    df = df[mask].copy()

    # Define the desired order for the feature columns
    ordered_feature_columns = ['Loc_A', 'Loc_H', 'Loc_N']
    ordered_feature_columns += sorted([col for col in X.columns if col.startswith('T1Conf_')])
    ordered_feature_columns += sorted([col for col in X.columns if col.startswith('T2Conf_')])
    ordered_feature_columns += ['WScore', 'LScore']
    
    # Reorder the X DataFrame
    X = X[ordered_feature_columns]
    
    # Optionally, for the full dataframe (df), define a complete column order
    ordered_df_columns = ['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'label', 'Team1', 'Team2'] + ordered_feature_columns
    df = df.reindex(columns=ordered_df_columns)
    
    return X, y, df

In [5]:
def logistic_regression(X_train, y_train, X_test, max_iter_num=1000):
    # =========== TRAIN LOGISTIC REGRESSION =========== #
    model = LogisticRegression(max_iter=max_iter_num)
    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    return model, y_pred_proba

### On compact data set

In [56]:
train_seasons_array = [
    [i for i in range(2022, 2023)],
    [i for i in range(2010, 2023)]
]
test_seasons_array = [
    [2023],
    [2023]
]

def print_tmp(train_seasons, test_seasons, y_test, y_pred_proba):
    print(f"Train seasons: {train_seasons}") 
    print(f"Test seasons: {test_seasons}") 
    brier = brier_score_loss(y_test, y_pred_proba)
    print(f"Brier Score: {brier:.10f}")
    y_pred_proba = np.where(
        y_pred_proba > 0.95, 1,
            np.where(
                y_pred_proba < 0.05, 0,
                np.where(
                    (y_pred_proba >= 0.45) & (y_pred_proba <= 0.55),
                    0.5,
                    y_pred_proba
                )
            )
        )
    brier = brier_score_loss(y_test, y_pred_proba)
    print(f"Brier Score after tresholding: {brier:.10f}")
    print()

In [57]:
for train_seasons, test_seasons in zip(train_seasons_array, test_seasons_array):
    
    X_train, y_train, df_train = prepare_data(MRegularSeasonCompactResults, MTeamConferences, True, train_seasons)
    X_test, y_test, df_test = prepare_data(MRegularSeasonCompactResults, MTeamConferences, True, test_seasons)
    model, y_pred_proba = logistic_regression(X_train, y_train, X_test)
    print("Train and test on regular season")
    print_tmp(train_seasons, test_seasons, y_test, y_pred_proba)

    X_train, y_train, df_train = prepare_data(MRegularSeasonCompactResults, MTeamConferences, True, train_seasons)
    X_test, y_test, df_test = prepare_data(MNCAATourneyCompactResults, MTeamConferences, True, test_seasons)
    model, y_pred_proba = logistic_regression(X_train, y_train, X_test)
    print("Train on regular season and test on tournament")
    print_tmp(train_seasons, test_seasons, y_test, y_pred_proba)

    X_train, y_train, df_train = prepare_data(MNCAATourneyCompactResults, MTeamConferences, True, train_seasons)
    X_test, y_test, df_test = prepare_data(MNCAATourneyCompactResults, MTeamConferences, True, test_seasons)
    model, y_pred_proba = logistic_regression(X_train, y_train, X_test)
    print("Train and test on tournament")
    print_tmp(train_seasons, test_seasons, y_test, y_pred_proba)

    X_train, y_train, df_train = prepare_data(MRegularSeasonCompactResults, MTeamConferences, False, train_seasons)
    X_test, y_test, df_test = prepare_data(MNCAATourneyCompactResults, MTeamConferences, False, test_seasons)
    model, y_pred_proba = logistic_regression(X_train, y_train, X_test)
    print("Train on regular season and test on tournament with no conferences")
    print_tmp(train_seasons, test_seasons, y_test, y_pred_proba)

Train and test on regular season
Train seasons: [2022]
Test seasons: [2023]
Brier Score: 0.2220639657
Brier Score after tresholding: 0.2222225503

Train on regular season and test on tournament
Train seasons: [2022]
Test seasons: [2023]
Brier Score: 0.2239554386
Brier Score after tresholding: 0.2220035052

Train and test on tournament
Train seasons: [2022]
Test seasons: [2023]
Brier Score: 0.2597437042
Brier Score after tresholding: 0.2607051521

Train on regular season and test on tournament with no conferences
Train seasons: [2022]
Test seasons: [2023]
Brier Score: 0.2537444326
Brier Score after tresholding: 0.2500000000

Train and test on regular season
Train seasons: [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]
Test seasons: [2023]
Brier Score: 0.2216529982
Brier Score after tresholding: 0.2218423569

Train on regular season and test on tournament
Train seasons: [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]
Test sea

### On detailed data set

In [58]:
for train_seasons, test_seasons in zip(train_seasons_array, test_seasons_array):
    
    X_train, y_train, df_train = prepare_data(MRegularSeasonDetailedResults, MTeamConferences, True, train_seasons)
    X_test, y_test, df_test = prepare_data(MRegularSeasonDetailedResults, MTeamConferences, True, test_seasons)
    model, y_pred_proba = logistic_regression(X_train, y_train, X_test)
    print("Train and test on regular season")
    print_tmp(train_seasons, test_seasons, y_test, y_pred_proba)

    X_train, y_train, df_train = prepare_data(MRegularSeasonDetailedResults, MTeamConferences, True, train_seasons)
    X_test, y_test, df_test = prepare_data(MNCAATourneyDetailedResults, MTeamConferences, True, test_seasons)
    model, y_pred_proba = logistic_regression(X_train, y_train, X_test)
    print("Train on regular season and test on tournament")
    print_tmp(train_seasons, test_seasons, y_test, y_pred_proba)

    X_train, y_train, df_train = prepare_data(MNCAATourneyDetailedResults, MTeamConferences, True, train_seasons)
    X_test, y_test, df_test = prepare_data(MNCAATourneyDetailedResults, MTeamConferences, True, test_seasons)
    model, y_pred_proba = logistic_regression(X_train, y_train, X_test)
    print("Train and test on tournament")
    print_tmp(train_seasons, test_seasons, y_test, y_pred_proba)

    X_train, y_train, df_train = prepare_data(MRegularSeasonDetailedResults, MTeamConferences, False, train_seasons)
    X_test, y_test, df_test = prepare_data(MNCAATourneyDetailedResults, MTeamConferences, False, test_seasons)
    model, y_pred_proba = logistic_regression(X_train, y_train, X_test)
    print("Train on regular season and test on tournament with no conferences")
    print_tmp(train_seasons, test_seasons, y_test, y_pred_proba)

Train and test on regular season
Train seasons: [2022]
Test seasons: [2023]
Brier Score: 0.2220639657
Brier Score after tresholding: 0.2222225503

Train on regular season and test on tournament
Train seasons: [2022]
Test seasons: [2023]
Brier Score: 0.2239554386
Brier Score after tresholding: 0.2220035052

Train and test on tournament
Train seasons: [2022]
Test seasons: [2023]
Brier Score: 0.2597437042
Brier Score after tresholding: 0.2607051521

Train on regular season and test on tournament with no conferences
Train seasons: [2022]
Test seasons: [2023]
Brier Score: 0.2537444326
Brier Score after tresholding: 0.2500000000

Train and test on regular season
Train seasons: [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]
Test seasons: [2023]
Brier Score: 0.2216529982
Brier Score after tresholding: 0.2218423569

Train on regular season and test on tournament
Train seasons: [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]
Test sea

### Selecting features

In [53]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, brier_score_loss
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV

# Assume X is your DataFrame with features and y is your binary target (0 or 1)
# For example:
# X = pd.read_csv("your_features.csv")
# y = pd.read_csv("your_labels.csv").squeeze()  # ensure y is a Series

# Split the dataset into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, y_train, df_train = prepare_data(MRegularSeasonCompactResults, MTeamConferences, True, [2022, 2023])
X_test, y_test, df_test = prepare_data(MNCAATourneyCompactResults, MTeamConferences, True, [2024])
model, y_pred_proba = logistic_regression(X_train, y_train, X_test)

# Initialize logistic regression (using 'liblinear' is often good for smaller datasets and supports L1 penalty if needed)
clf = LogisticRegression(solver='liblinear')

# Create a scorer using the Brier score loss. Since lower is better,
# we set greater_is_better=False so that the RFECV can maximize the (negative) score.
scorer = make_scorer(brier_score_loss, greater_is_better=False)

# Use RFECV to perform recursive feature elimination with cross-validation
rfecv = RFECV(estimator=clf, step=1, cv=5, scoring=scorer)
rfecv.fit(X_train, y_train)

print("Optimal number of features:", rfecv.n_features_)
print("Selected features:", X_train.columns[rfecv.support_].tolist())

# Retrain the logistic regression model using only the selected features
clf.fit(X_train.loc[:, rfecv.support_], y_train)

# Make probability predictions on the test set
y_pred_proba = clf.predict_proba(X_test.loc[:, rfecv.support_])[:, 1]

# Evaluate the model using the Brier score on the test set
brier = brier_score_loss(y_test, y_pred_proba)
print("Test set Brier Score:", brier)


Optimal number of features: 65
Selected features: ['Loc_A', 'Loc_H', 'T1Conf_a_sun', 'T1Conf_a_ten', 'T1Conf_aac', 'T1Conf_acc', 'T1Conf_aec', 'T1Conf_big_east', 'T1Conf_big_sky', 'T1Conf_big_south', 'T1Conf_big_ten', 'T1Conf_big_twelve', 'T1Conf_big_west', 'T1Conf_caa', 'T1Conf_cusa', 'T1Conf_horizon', 'T1Conf_ind', 'T1Conf_ivy', 'T1Conf_maac', 'T1Conf_mac', 'T1Conf_meac', 'T1Conf_mvc', 'T1Conf_mwc', 'T1Conf_nec', 'T1Conf_ovc', 'T1Conf_pac_twelve', 'T1Conf_patriot', 'T1Conf_sec', 'T1Conf_southland', 'T1Conf_summit', 'T1Conf_swac', 'T1Conf_wac', 'T1Conf_wcc', 'T2Conf_a_sun', 'T2Conf_a_ten', 'T2Conf_aac', 'T2Conf_acc', 'T2Conf_aec', 'T2Conf_big_east', 'T2Conf_big_sky', 'T2Conf_big_south', 'T2Conf_big_ten', 'T2Conf_big_twelve', 'T2Conf_big_west', 'T2Conf_caa', 'T2Conf_cusa', 'T2Conf_horizon', 'T2Conf_ind', 'T2Conf_ivy', 'T2Conf_maac', 'T2Conf_mac', 'T2Conf_meac', 'T2Conf_mwc', 'T2Conf_nec', 'T2Conf_ovc', 'T2Conf_pac_twelve', 'T2Conf_patriot', 'T2Conf_sec', 'T2Conf_southern', 'T2Conf_sout

In [59]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, brier_score_loss
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV

# Assume X is your DataFrame with features and y is your binary target (0 or 1)
# For example:
# X = pd.read_csv("your_features.csv")
# y = pd.read_csv("your_labels.csv").squeeze()  # ensure y is a Series

# Split the dataset into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=10, random_state=42)

# Create a scorer using the Brier score loss.
# We use greater_is_better=False because lower Brier score indicates better performance.
scorer = make_scorer(brier_score_loss, greater_is_better=False)

# Use RFECV to perform recursive feature elimination with cross-validation
rfecv_rf = RFECV(estimator=rf_model, step=1, cv=5, scoring=scorer)
rfecv_rf.fit(X_train, y_train)

print("Optimal number of features (Random Forest):", rfecv_rf.n_features_)
print("Selected features (Random Forest):", X_train.columns[rfecv_rf.support_].tolist())

# Retrain the Random Forest model using only the selected features
rf_model.fit(X_train.loc[:, rfecv_rf.support_], y_train)

# Make probability predictions on the test set
y_pred_proba_rf = rf_model.predict_proba(X_test.loc[:, rfecv_rf.support_])[:, 1]

# Evaluate the model using the Brier score on the test set
brier_rf = brier_score_loss(y_test, y_pred_proba_rf)
print("Test set Brier Score (Random Forest):", brier_rf)


Optimal number of features (Random Forest): 1
Selected features (Random Forest): ['Loc_A']
Test set Brier Score (Random Forest): 0.24068708080640028


In [19]:
# --- Assume X_train, y_train, X_test, y_test, and df_train are already defined ---
# df_train should include a 'Season' column. Make sure it’s sorted by Season.
df_train_sorted = df_train.sort_values('Season')

# Use TimeSeriesSplit for temporal (walk-forward) cross-validation.
tscv = TimeSeriesSplit(n_splits=3)

# ------------------------------
# 1. Logistic Regression (Baseline)
# ------------------------------
log_brier_scores = []
for train_index, val_index in tscv.split(df_train_sorted):
    X_train_cv = X_train.loc[df_train_sorted.index[train_index]]
    y_train_cv = y_train.loc[df_train_sorted.index[train_index]]
    X_val_cv = X_train.loc[df_train_sorted.index[val_index]]
    y_val_cv = y_train.loc[df_train_sorted.index[val_index]]
    
    lr = LogisticRegression(max_iter=1000)
    lr.fit(X_train_cv, y_train_cv)
    y_val_pred_proba = lr.predict_proba(X_val_cv)[:, 1]
    # Round extreme probabilities
    y_val_pred_proba = np.where(y_val_pred_proba > 0.95, 1,
                          np.where(y_val_pred_proba < 0.1, 0, y_val_pred_proba))
    score = brier_score_loss(y_val_cv, y_val_pred_proba)
    log_brier_scores.append(score)

print("Logistic Regression CV Brier Scores:", log_brier_scores)
print("Average Logistic Regression CV Brier Score:", np.mean(log_brier_scores))

# Train final logistic regression on all training data
final_lr = LogisticRegression(max_iter=1000)
final_lr.fit(X_train, y_train)
y_test_pred_lr = final_lr.predict_proba(X_test)[:, 1]
y_test_pred_lr = np.where(y_test_pred_lr > 0.95, 1,
                    np.where(y_test_pred_lr < 0.1, 0, y_test_pred_lr))
brier_lr = brier_score_loss(y_test, y_test_pred_lr)
print("Final Logistic Regression Test Brier Score:", brier_lr)

# ------------------------------
# 2. Tree-Based Model (Random Forest) with Calibration
# ------------------------------
rf = RandomForestClassifier(n_estimators=100, random_state=42)
# Calibrate using isotonic regression with the same temporal CV
calibrated_rf = CalibratedClassifierCV(rf, method='isotonic', cv=tscv)
calibrated_rf.fit(X_train, y_train)
y_test_pred_rf = calibrated_rf.predict_proba(X_test)[:, 1]
y_test_pred_rf = np.where(y_test_pred_rf > 0.95, 1,
                    np.where(y_test_pred_rf < 0.1, 0, y_test_pred_rf))
brier_rf = brier_score_loss(y_test, y_test_pred_rf)
print("Final Calibrated Random Forest Test Brier Score:", brier_rf)

# ------------------------------
# 3. Prepare Final Predictions in Submission Format
# ------------------------------
# Assume df_test corresponds to the test set and contains Season, Team1, and Team2.
# Create an ID in the format: Season_Team1_Team2.
df_test['ID'] = df_test.apply(lambda row: f"{row['Season']}_{row['Team1']}_{row['Team2']}", axis=1)

# Add predictions from both models to df_test.
df_test['Pred_lr'] = y_test_pred_lr
df_test['Pred_rf'] = y_test_pred_rf

# Create submission DataFrames (example using logistic regression)
submission_lr = df_test[['ID', 'Pred_lr']]
submission_rf = df_test[['ID', 'Pred_rf']]

print("Submission from Logistic Regression:")
print(submission_lr.head())

print("Submission from Calibrated Random Forest:")
print(submission_rf.head())


Logistic Regression CV Brier Scores: [0.22838221065078593, 0.24019922013470363, 0.2224783155889698]
Average Logistic Regression CV Brier Score: 0.23035324879148644
Final Logistic Regression Test Brier Score: 0.25443847228100314
Final Calibrated Random Forest Test Brier Score: 0.26487586527590296
Submission from Logistic Regression:
                  ID   Pred_lr
1315  2024_1161_1438  0.474249
1316  2024_1224_1447  0.493097
1317  2024_1129_1160  0.485882
1318  2024_1212_1286  0.496201
1319  2024_1112_1253  0.484821
Submission from Calibrated Random Forest:
                  ID   Pred_rf
1315  2024_1161_1438  0.466241
1316  2024_1224_1447  0.525222
1317  2024_1129_1160  0.421288
1318  2024_1212_1286  0.514271
1319  2024_1112_1253  0.576410


## Another idea: make records with tams-vs-team pair scores
Explored in First_model_team_approach notebook.