Data Section 1 file: MTeams.csv and WTeams.csv


These files identify the different college teams present in the dataset.

    TeamID - a 4-digit ID number, uniquely identifying each NCAA® men's or women's team. A school's TeamID does not change from one year to the next, so for instance the Duke men's TeamID is 1181 for all seasons. The men's team IDs range from 1000-1999, whereas all of the women's team IDs range from 3000-3999.
    TeamName - a compact spelling of the team's college name, 16 characters or fewer.
    FirstD1Season - the first season in our dataset that the school was a Division-I school. This column is only present in the men's data, so it is not found in WTeams.csv.
    LastD1Season - the last season in our dataset that the school was a Division-I school. For any teams that are currently Division-I, they will be listed with LastD1Season=2025. Again, this column is only present in the men's data, so it is not found in WTeams.csv.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import ipywidgets as widgets
from IPython.display import display, clear_output
import time

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import brier_score_loss
import xgboost as xgb
import lightgbm as lgb

In [None]:
men_teams = pd.read_csv("../../data/MTeams.csv")

In [None]:
# Call head() so the cell displays the first rows rather than the bound method object.
men_teams.head()

In [None]:
men_scores_compact = pd.read_csv("../../data/MRegularSeasonCompactResults.csv")

In [None]:
men_scores_compact.head()

In [None]:
men_scores_compact.columns

In [None]:
men_plays = men_scores_compact.drop(columns=['Season', 'DayNum','WScore', 'LScore', 'WLoc','NumOT' ])

In [None]:
men_plays.columns

In [None]:
occurrences = men_plays.groupby(['WTeamID', 'LTeamID']).size().reset_index(name='count')

In [None]:
occurrences.sort_values(by='count', ascending=False)

In [None]:
# Scatter of (winner, loser) ID pairs; the marker size of each point is the
# number of times that winner beat that loser.
plt.figure(figsize=(10, 6))
plt.scatter(occurrences['WTeamID'], occurrences['LTeamID'], s=occurrences['count'], alpha=0.5)
plt.xlabel('WTeamID')
plt.ylabel('LTeamID')
plt.title('Occurrences of WTeamID vs LTeamID')
plt.show()

In [None]:
occurrences

In [None]:
# Reshape the compact results into one row per (team, game):
# the winner's row gets Result=1 and the loser's row gets Result=0.
df = pd.read_csv("../../data/MRegularSeasonCompactResults.csv")
wins = df[['Season', 'DayNum', 'WTeamID']].copy()
wins.columns = ['Season', 'DayNum', 'TeamID']
wins['Result'] = 1
losses = df[['Season', 'DayNum', 'LTeamID']].copy()
losses.columns = ['Season', 'DayNum', 'TeamID']
losses['Result'] = 0
games = pd.concat([wins, losses], ignore_index=True)
# Order each team's games chronologically; the streak computation relies on it.
games = games.sort_values(['TeamID', 'Season', 'DayNum']).reset_index(drop=True)

def compute_loss_streak(results):
    """Return, for each game, the number of consecutive losses immediately before it.

    Args:
        results: Iterable of outcomes in chronological order, where 0 is a
            loss and any other value ends the streak.

    Returns:
        A list with one entry per element of ``results``; entry ``i`` is the
        length of the losing run that ended just before game ``i``.
    """
    prior_streaks = []
    running = 0
    for outcome in results:
        # Record the streak *before* accounting for the current game.
        prior_streaks.append(running)
        running = running + 1 if outcome == 0 else 0
    return prior_streaks

# The grouping is by TeamID only, so a streak carries over from one season
# into the next rather than resetting.
games['LossStreak'] = games.groupby('TeamID')['Result'].transform(compute_loss_streak)
# Mean of the 0/1 Result per streak length = empirical win rate after that many straight losses.
streak_summary = games.groupby('LossStreak')['Result'].agg(['mean', 'count']).reset_index()
streak_summary.rename(columns={'mean': 'WinProbability', 'count': 'NumGames'}, inplace=True)
print(streak_summary)
plt.figure(figsize=(8, 6))
plt.plot(streak_summary['LossStreak'], streak_summary['WinProbability'], marker='o')
plt.xlabel("Loss Streak (games lost consecutively before the current game)")
plt.ylabel("Win Probability in the Current Game")
plt.title("Win Probability vs. Prior Loss Streak")
plt.grid(True)
plt.show()


In [None]:
# Home-court analysis: how often the winner was at home vs. away, and by how much.
df = pd.read_csv('../../data/MRegularSeasonCompactResults.csv')

print("First 5 rows of the dataset:")
print(df.head())
# Drop neutral-site games ('N') so only the home/away winner locations remain.
df_non_neutral = df[df['WLoc'] != 'N'].copy()
location_counts = df_non_neutral['WLoc'].value_counts()
print("\nCount of non-neutral games by winning location:")
print(location_counts)
total_non_neutral = location_counts.sum()
win_percentages = (location_counts / total_non_neutral * 100).round(2)
print("\nPercentage of wins by winning location:")
print(win_percentages)
# Falls back to 0 if no game was won at home.
home_win_rate = win_percentages.get('H', 0)
print(f"\nHome team win rate (non-neutral games): {home_win_rate:.2f}%")
df_non_neutral['Margin'] = df_non_neutral['WScore'] - df_non_neutral['LScore']
margin_by_loc = df_non_neutral.groupby('WLoc')['Margin'].mean()
print("\nAverage margin of victory by winning location:")
print(margin_by_loc)
# Three side-by-side bar charts: raw counts, percentages, and average margin.
# NOTE(review): value_counts() orders by frequency while groupby() orders by
# label, so the two fixed bar colours may not refer to the same location in
# every panel — confirm against the rendered figure.
fig, ax = plt.subplots(1, 3, figsize=(18, 5))
location_counts.plot(kind='bar', ax=ax[0], color=['skyblue', 'salmon'])
ax[0].set_title("Count of Non-Neutral Games")
ax[0].set_xlabel("Winning Location\n(H = Home, A = Away)")
ax[0].set_ylabel("Count")
win_percentages.plot(kind='bar', ax=ax[1], color=['skyblue', 'salmon'])
ax[1].set_title("Win Percentages by Location")
ax[1].set_xlabel("Winning Location\n(H = Home, A = Away)")
ax[1].set_ylabel("Percentage (%)")
margin_by_loc.plot(kind='bar', ax=ax[2], color=['skyblue', 'salmon'])
ax[2].set_title("Average Margin of Victory")
ax[2].set_xlabel("Winning Location\n(H = Home, A = Away)")
ax[2].set_ylabel("Average Margin")
plt.tight_layout()
plt.show()


In [None]:
# Compare the shot-attempt volume of winners and losers using the detailed box scores.
df = pd.read_csv('../../data/MRegularSeasonDetailedResults.csv')
print("First 5 rows of the detailed results dataset:")
print(df.head())

# Attempt metric: field-goal attempts plus 0.44 times free-throw attempts.
df['W_Attempts'] = df['WFGA'] + 0.44 * df['WFTA']
df['L_Attempts'] = df['LFGA'] + 0.44 * df['LFTA']
print("\nDescriptive statistics for winners' shot attempts:")
print(df['W_Attempts'].describe())
print("\nDescriptive statistics for losers' shot attempts:")
print(df['L_Attempts'].describe())
# Box plot of the two distributions side by side.
# NOTE(review): recent matplotlib releases deprecate boxplot's `labels`
# keyword in favour of `tick_labels` — confirm against the installed version.
fig, ax = plt.subplots(figsize=(8, 6))
data = [df['W_Attempts'], df['L_Attempts']]
ax.boxplot(data, labels=['Winners', 'Losers'])
ax.set_title("Shot Attempts (Style of Play) for Winners vs. Losers")
ax.set_ylabel("Shot Attempts (FGA + 0.44*FTA)")
plt.show()
# Overlaid histograms of the same two distributions.
plt.figure(figsize=(10, 5))
plt.hist(df['W_Attempts'], bins=50, alpha=0.6, label='Winners')
plt.hist(df['L_Attempts'], bins=50, alpha=0.6, label='Losers')
plt.title("Histogram of Shot Attempts for Winners vs. Losers")
plt.xlabel("Shot Attempts (FGA + 0.44*FTA)")
plt.ylabel("Frequency")
plt.legend()
plt.show()
# Relate the per-game attempt difference to the final margin.
df['Margin'] = df['WScore'] - df['LScore']
df['Attempt_Diff'] = df['W_Attempts'] - df['L_Attempts']
plt.figure(figsize=(8, 6))
plt.scatter(df['Attempt_Diff'], df['Margin'], alpha=0.5)
plt.title("Difference in Shot Attempts vs. Margin of Victory")
plt.xlabel("Shot Attempts Difference (Winner - Loser)")
plt.ylabel("Margin of Victory")
plt.show()
# Series.corr defaults to the Pearson correlation.
correlation = df['Attempt_Diff'].corr(df['Margin'])
print(f"\nCorrelation between shot attempt difference and margin of victory: {correlation:.2f}")


In [None]:
# Coach-level analysis: career win percentage, and first season vs. later seasons.
coaches = pd.read_csv('../../data/MTeamCoaches.csv')
games = pd.read_csv('../../data/MRegularSeasonCompactResults.csv')

# One row per (team, game) with Outcome=1 for the winner...
winners = games[['Season', 'DayNum', 'WTeamID']].copy()
winners.rename(columns={'WTeamID': 'TeamID'}, inplace=True)
winners['Outcome'] = 1  

# ...and Outcome=0 for the loser.
losers = games[['Season', 'DayNum', 'LTeamID']].copy()
losers.rename(columns={'LTeamID': 'TeamID'}, inplace=True)
losers['Outcome'] = 0  

coach_games = pd.concat([winners, losers], ignore_index=True)
# Attach every coach stint of the same team-season, then keep only the stint
# whose [FirstDayNum, LastDayNum] window contains the game day. Rows without a
# coach match have NaN bounds, fail both comparisons, and are dropped here.
merged = pd.merge(coach_games, coaches, on=['Season', 'TeamID'], how='left')
merged = merged[(merged['DayNum'] >= merged['FirstDayNum']) & (merged['DayNum'] <= merged['LastDayNum'])]
coach_performance = merged.groupby('CoachName').agg(
    games_coached=('Outcome', 'count'),
    wins=('Outcome', 'sum')
).reset_index()
coach_performance['win_pct'] = coach_performance['wins'] / coach_performance['games_coached']
# Ignore coaches with too few games for a meaningful percentage.
min_games = 50
coach_performance_filtered = coach_performance[coach_performance['games_coached'] >= min_games]

plt.figure(figsize=(10, 6))
sns.histplot(coach_performance_filtered['win_pct'], bins=20, kde=True)
plt.xlabel('Win Percentage')
plt.title(f'Distribution of Coach Win Percentages (Coaches with >= {min_games} Games)')
plt.show()

# "First season" is the earliest season in which the coach appears in the merged
# data, not necessarily the first season of the coach's career.
first_season = merged.groupby('CoachName')['Season'].min().reset_index().rename(columns={'Season': 'first_season'})
merged = pd.merge(merged, first_season, on='CoachName', how='left')
merged['is_first_season'] = np.where(merged['Season'] == merged['first_season'], 'First Season', 'Subsequent Seasons')
# Pooled win percentage over all games in each season type.
first_season_stats = merged.groupby('is_first_season').agg(
    games=('Outcome', 'count'),
    wins=('Outcome', 'sum')
).reset_index()
first_season_stats['win_pct'] = first_season_stats['wins'] / first_season_stats['games']
print("Win percentages by season type:")
print(first_season_stats)
# Per-coach win percentage within each season type, for the box plot.
coach_season_win = merged.groupby(['CoachName', 'is_first_season']).agg(
    games_coached=('Outcome', 'count'),
    wins=('Outcome', 'sum')
).reset_index()
coach_season_win['win_pct'] = coach_season_win['wins'] / coach_season_win['games_coached']

plt.figure(figsize=(12, 6))
sns.boxplot(x='is_first_season', y='win_pct', data=coach_season_win)
plt.xlabel('Season Type')
plt.ylabel('Win Percentage')
plt.title('Coach Win Percentage: First Season vs. Subsequent Seasons')
plt.show()


In [None]:
# Data and widgets for the interactive head-to-head explorer.
games = pd.read_csv('../../data/MRegularSeasonCompactResults.csv')
teams_df = pd.read_csv('../../data/MTeams.csv')

# TeamID -> TeamName lookup used for labels.
team_names = dict(zip(teams_df['TeamID'], teams_df['TeamName']))

# Dropdown options are (label, value) pairs, sorted alphabetically by team name.
options_teamA = [(team_names[tid], tid) for tid in teams_df['TeamID']]
options_teamA = sorted(options_teamA, key=lambda x: x[0])
dropdown_teamA = widgets.Dropdown(options=options_teamA, description='Team A:')

# Team B starts empty; its options are filled from Team A's actual opponents.
dropdown_teamB = widgets.Dropdown(options=[], description='Team B:')
def update_teamB_options(change):
    """Refresh the Team B dropdown with the opponents Team A has actually played.

    Args:
        change: Widget change notification; only ``change['new']`` (the newly
            selected Team A id) is read.
    """
    selected = change['new']

    # All games in which the selected team took part, as winner or loser.
    took_part = (games['WTeamID'] == selected) | (games['LTeamID'] == selected)
    played = games[took_part]
    opponent_ids = set(played['WTeamID'].tolist() + played['LTeamID'].tolist())
    opponent_ids.discard(selected)

    choices = sorted(
        [(team_names[tid], tid) for tid in opponent_ids],
        key=lambda pair: pair[0],
    )
    dropdown_teamB.options = choices
    # Default to the alphabetically first opponent, or clear the selection.
    dropdown_teamB.value = choices[0][1] if choices else None
# Re-populate Team B whenever Team A's selected value changes.
dropdown_teamA.observe(update_teamB_options, names='value')

# Populate Team B once for the initial Team A selection.
update_teamB_options({'new': dropdown_teamA.value})
def update_plot(teamA, teamB):
    """Plot Team A's net scoring margin against Team B, season by season.

    The margin of each game is signed from Team A's point of view: positive
    when Team A won, negative when Team A lost. The per-season sum of those
    margins is drawn as a line chart.

    Args:
        teamA: TeamID of the reference team.
        teamB: TeamID of the opponent (may be None when no opponent is selected).
    """
    # NOTE(review): this re-displays the enclosing interactive widget from
    # inside its own callback — kept as in the original; confirm it is needed.
    clear_output(wait=True)
    display(interactive_plot)

    # Every meeting of the two teams, regardless of who won.
    subset = games[
        ((games['WTeamID'] == teamA) & (games['LTeamID'] == teamB)) |
        ((games['WTeamID'] == teamB) & (games['LTeamID'] == teamA))
    ]

    if subset.empty:
        # .get() keeps the message printable even when a selection is None/unknown.
        print("No games found between {} and {}.".format(
            team_names.get(teamA, teamA), team_names.get(teamB, teamB)))
        return

    progress = widgets.IntProgress(value=0, min=0, max=len(subset), description='Processing:')
    display(progress)

    # Winner-minus-loser margin is always positive; flip the sign for games
    # Team A lost so that losses reduce Team A's net margin.
    winner_margin = subset['WScore'] - subset['LScore']
    subset = subset.copy()
    subset['margin'] = winner_margin.where(subset['WTeamID'] == teamA, -winner_margin)
    progress.value = len(subset)

    season_net = subset.groupby('Season')['margin'].sum().reset_index().sort_values('Season')

    plt.figure(figsize=(10, 6))
    plt.plot(season_net['Season'], season_net['margin'], marker='o', linestyle='-')
    plt.xlabel('Season')
    plt.ylabel('Net Margin for {}'.format(team_names[teamA]))
    plt.title('Net Margin Over Time: {} vs {}'.format(team_names[teamA], team_names[teamB]))
    plt.grid(True)
    plt.show()

    progress.layout.visibility = 'hidden'
# Bind the two dropdowns to update_plot's arguments and show the control.
interactive_plot = widgets.interactive(update_plot, teamA=dropdown_teamA, teamB=dropdown_teamB)
display(interactive_plot)


In [None]:
import pandas as pd
# Find the ten team pairs with the most regular-season meetings.
games = pd.read_csv('../../data/MRegularSeasonCompactResults.csv')
teams = pd.read_csv('../../data/MTeams.csv')
team_names = dict(zip(teams['TeamID'], teams['TeamName']))

# Order-independent pair key: TeamA is the lower ID, TeamB the higher ID,
# so A-beat-B and B-beat-A fall into the same group.
games['TeamA'] = games[['WTeamID', 'LTeamID']].min(axis=1)
games['TeamB'] = games[['WTeamID', 'LTeamID']].max(axis=1)

pair_counts = games.groupby(['TeamA', 'TeamB']).size().reset_index(name='games_played')
top_pairs = pair_counts.sort_values('games_played', ascending=False).head(10)
top_pairs['TeamAName'] = top_pairs['TeamA'].map(team_names)
top_pairs['TeamBName'] = top_pairs['TeamB'].map(team_names)
print("Top 10 pairs of teams that have played the most games together:")
print(top_pairs[['TeamAName', 'TeamBName', 'games_played']])


In [None]:
import pandas as pd
# Reload the results and add the order-independent pair key
# (TeamA = lower ID, TeamB = higher ID).
games = pd.read_csv('../../data/MRegularSeasonCompactResults.csv')
games['TeamA'] = games[['WTeamID', 'LTeamID']].min(axis=1)
games['TeamB'] = games[['WTeamID', 'LTeamID']].max(axis=1)
def split_train_test_by_pair(games_df, n=1):
    """Hold out the last ``n`` meetings of every team pair as the test set.

    Games are grouped by the ('TeamA', 'TeamB') pair and ordered by
    ('Season', 'DayNum') within each pair. The most recent ``n`` games of each
    pair go to the test set and the earlier ones to the training set. A pair
    with ``n`` or fewer games contributes all of its games to the test set.

    Args:
        games_df: DataFrame with 'TeamA', 'TeamB', 'Season' and 'DayNum'
            columns (assumed to contain no missing pair keys).
        n: Number of most recent games per pair to hold out; ``0`` holds out
            nothing.

    Returns:
        ``(train_set, test_set)``: two DataFrames with a fresh 0..N-1 index,
        ordered by pair and then chronologically.

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")

    pair_keys = ['TeamA', 'TeamB']
    # One multi-key sort yields the order the per-pair loop used to produce:
    # pairs in key order, each pair's games oldest to newest.
    ordered = games_df.sort_values(pair_keys + ['Season', 'DayNum'])
    # 0 for a pair's most recent game, 1 for the one before it, and so on.
    games_from_end = ordered.groupby(pair_keys).cumcount(ascending=False)
    held_out = games_from_end < n

    train_set = ordered[~held_out].reset_index(drop=True)
    test_set = ordered[held_out].reset_index(drop=True)
    return train_set, test_set

# Hold out the single most recent meeting of every pair.
n = 1

train_set, test_set = split_train_test_by_pair(games, n=n)

print("Train set shape:", train_set.shape)
print("Test set shape:", test_set.shape)
print(test_set.head())


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import ipywidgets as widgets
from IPython.display import display, clear_output
import time

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import brier_score_loss
import xgboost as xgb
import lightgbm as lgb

In [None]:
# Number of most recent games per pair held out for testing.
n = 1  
train_set, test_set = split_train_test_by_pair(games, n=n)

print("Train set shape:", train_set.shape)
print("Test set shape:", test_set.shape)

def create_features(df):
    """Build one-hot matchup features and the binary target for a set of games.

    Args:
        df: DataFrame with 'WTeamID', 'TeamA' and 'TeamB' columns. It is not
            modified.

    Returns:
        ``(X_encoded, y)``: ``X_encoded`` holds one indicator column per
        distinct TeamA value and per distinct TeamB value; ``y`` is a Series
        named 'Outcome' that is 1 when the winner is TeamA and 0 otherwise.
    """
    # Cast the IDs to strings so they are encoded as categories, not numbers.
    matchup = df[['TeamA', 'TeamB']].astype(str)
    target = (df['WTeamID'] == df['TeamA']).astype(int).rename('Outcome')
    return pd.get_dummies(matchup, columns=['TeamA', 'TeamB']), target

# Train several classifiers on the pair-split data and compare their Brier scores.
X_train, y_train = create_features(train_set)
X_test, y_test = create_features(test_set)

# Give the test matrix exactly the training columns: indicator columns that
# appear only in the test set are dropped, missing ones are filled with 0.
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# NOTE(review): `tree_method='gpu_hist'` and `use_label_encoder` are reported
# as deprecated in recent XGBoost releases — confirm against the installed version.
models = {
    'XGBoost (GPU)': xgb.XGBClassifier(tree_method='gpu_hist', use_label_encoder=False, eval_metric='logloss'),
    'LightGBM (GPU)': lgb.LGBMClassifier(device='gpu'),
    'Logistic Regression (CPU)': LogisticRegression(max_iter=1000),
}

# Training sets smaller than the threshold are resampled with replacement
# to bootstrap_factor times their size.
bootstrap_threshold = 200 
bootstrap_factor = 3

progress = widgets.IntProgress(value=0, min=0, max=len(models), description='Models:')
display(progress)

# name -> fitted model, training time, Brier score and predicted probabilities.
results = {}

for name, model in models.items():
    if len(X_train) < bootstrap_threshold:
        # Fixed seed: every model sees the same bootstrap sample.
        X_train_boot = X_train.sample(n=bootstrap_factor * len(X_train), replace=True, random_state=42)
        # Pick the labels that belong to the sampled rows (index labels repeat).
        y_train_boot = y_train.loc[X_train_boot.index]
        X_to_train = X_train_boot
        y_to_train = y_train_boot
    else:
        X_to_train = X_train
        y_to_train = y_train

    start_time = time.time()
    model.fit(X_to_train, y_to_train)
    training_time = time.time() - start_time

    if hasattr(model, 'predict_proba'):
        # Column 1 holds the probability of class 1 (TeamA wins).
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    else:
        # No probability output: squash the decision score through a sigmoid.
        y_pred_proba = model.decision_function(X_test)
        y_pred_proba = 1 / (1 + np.exp(-y_pred_proba))
        
    brier = brier_score_loss(y_test, y_pred_proba)
    
    results[name] = {
         'model': model,
         'training_time': training_time,
         'brier_score': brier,
         'y_pred_proba': y_pred_proba
    }
    
    progress.value += 1

progress.layout.visibility = 'hidden'

print("Trained models and their performance on the test set:")
for name, res in results.items():
    print(f"{name} - Training time: {res['training_time']:.2f} sec, Brier Score: {res['brier_score']:.4f}")


## Basic Hyperparameters

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for logistic regression:
# 5 values of C x 2 penalties = 10 candidate settings.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['saga'],  # saga supports both l1 and l2
    'max_iter': [1000]
}

# Initialize the logistic regression model (CPU-based)
logreg = LogisticRegression()

# Set up GridSearchCV with 5-fold cross-validation and parallel processing
grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,            # Use all CPU cores
    scoring='neg_log_loss'  # or another scoring metric like 'accuracy'
)

# Fit grid search on the training data only; the test set is not touched here
grid_search.fit(X_train, y_train)

# Best hyperparameters and the corresponding best estimator
print("Best hyperparameters:", grid_search.best_params_)
best_logreg = grid_search.best_estimator_


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import ipywidgets as widgets
from IPython.display import display


# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)



# Move the feature matrices and targets onto the device as float32 tensors;
# targets are reshaped to a column so they match the model's one-column output.
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.values.reshape(-1, 1), dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test.values.reshape(-1, 1), dtype=torch.float32).to(device)


class LogisticRegressionModel(nn.Module):
    """Logistic regression as a single linear layer followed by a sigmoid."""

    def __init__(self, input_dim):
        """Create the model.

        Args:
            input_dim: Number of input features per sample.
        """
        super().__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return the predicted probability for each row of ``x``."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

# Grid search over learning rate and weight decay for the PyTorch logistic regression.
input_dim = X_train_tensor.shape[1]

learning_rates = [0.01, 0.001, 0.0001]
weight_decays = [0, 1e-4, 1e-3, 1e-2]
# Full Cartesian product: 3 x 4 = 12 configurations.
param_grid = [(lr, wd) for lr in learning_rates for wd in weight_decays]
n_total = len(param_grid)

progress = widgets.IntProgress(value=0, min=0, max=n_total, description='GPU Tuning:')
display(progress)

best_brier = float('inf')
best_params = None
best_model = None

num_epochs = 50  
batch_size = 64

# The same shuffled mini-batch loader is reused for every configuration.
dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Binary cross-entropy on probabilities (the model already applies the sigmoid).
criterion = nn.BCELoss()  


for i, (lr, wd) in enumerate(param_grid):
    
    # Fresh model and optimizer per configuration.
    model = LogisticRegressionModel(input_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    
    
    model.train()
    for epoch in range(num_epochs):
        for batch_X, batch_y in dataloader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
    
    
    # Evaluate on the test tensors without tracking gradients.
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test_tensor)
        
        # Brier score = mean squared difference between probability and outcome.
        brier_score = torch.mean((y_pred - y_test_tensor) ** 2).item()
    
    
    print(f"LR: {lr}, Weight Decay: {wd}, Brier Score: {brier_score:.4f}")
    
    
    # The best configuration is chosen by its test-set score, so the reported
    # best Brier score is an optimistic estimate.
    if brier_score < best_brier:
        best_brier = brier_score
        best_params = {'learning_rate': lr, 'weight_decay': wd}
        best_model = model
    
    
    progress.value = i + 1

print("Best GPU (PyTorch) parameters:", best_params)
print("Best GPU (PyTorch) Brier score:", best_brier)


Best GPU (PyTorch) parameters: {'learning_rate': 0.001, 'weight_decay': 0}
Best GPU (PyTorch) Brier score: 0.17488974332809448

In [None]:
import time
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import brier_score_loss
import xgboost as xgb
import lightgbm as lgb

# Assumes split_train_test_by_pair(games_df, n) is already defined by an
# earlier cell, and that `games` carries the TeamA/TeamB pair columns.

# Hold out the most recent game of every pair (n = 1).
n = 1  
train_set, test_set = split_train_test_by_pair(games, n=n)
print("Train set shape:", train_set.shape)
print("Test set shape:", test_set.shape)

def create_features(df):
    """Encode each game's team pair as indicator columns and derive the label.

    Args:
        df: Games with 'WTeamID', 'TeamA' and 'TeamB' columns; left unmodified.

    Returns:
        A tuple ``(X_encoded, y)`` where ``X_encoded`` contains the one-hot
        columns for TeamA and TeamB and ``y`` (named 'Outcome') is 1 when
        TeamA is the winner, else 0.
    """
    pair_columns = ['TeamA', 'TeamB']
    team_a_won = df['WTeamID'] == df['TeamA']
    y = team_a_won.astype(int).rename('Outcome')
    # IDs are stringified so get_dummies treats them as categories.
    ids_as_text = df[pair_columns].astype(str)
    X_encoded = pd.get_dummies(ids_as_text, columns=pair_columns)
    return X_encoded, y

X_train, y_train = create_features(train_set)
X_test, y_test = create_features(test_set)
# Restrict the test matrix to the training columns; missing ones are filled with 0.
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# If the training set is small, force the models to run on CPU.
# NOTE(review): `tree_method='gpu_hist'` and `use_label_encoder` are reported
# as deprecated in recent XGBoost releases — confirm against the installed version.
if X_train.shape[0] < 1000:
    print("Dataset is small; using CPU for tree-based models.")
    models = {
         'XGBoost (CPU)': xgb.XGBClassifier(tree_method='hist', 
                                            use_label_encoder=False, 
                                            eval_metric='logloss'),
         'LightGBM (CPU)': lgb.LGBMClassifier(device='cpu'),
         'Logistic Regression (CPU)': LogisticRegression(max_iter=1000),
    }
else:
    models = {
         'XGBoost (GPU)': xgb.XGBClassifier(tree_method='gpu_hist', 
                                            use_label_encoder=False, 
                                            eval_metric='logloss'),
         'LightGBM (GPU)': lgb.LGBMClassifier(device='gpu'),
         'Logistic Regression (CPU)': LogisticRegression(max_iter=1000),
    }

# Training sets below the threshold are resampled with replacement to
# bootstrap_factor times their size.
bootstrap_threshold = 200 
bootstrap_factor = 3

progress = widgets.IntProgress(value=0, min=0, max=len(models), description='Models:')
display(progress)

# name -> fitted model, training time, Brier score and predicted probabilities.
results = {}

for name, model in models.items():
    # Optionally use bootstrap sampling if training set is very small.
    if len(X_train) < bootstrap_threshold:
        X_train_boot = X_train.sample(n=bootstrap_factor * len(X_train), replace=True, random_state=42)
        # Labels for the sampled rows, looked up by their (repeating) index labels.
        y_train_boot = y_train.loc[X_train_boot.index]
        X_to_train = X_train_boot
        y_to_train = y_train_boot
    else:
        X_to_train = X_train
        y_to_train = y_train

    start_time = time.time()
    model.fit(X_to_train, y_to_train)
    training_time = time.time() - start_time

    if hasattr(model, 'predict_proba'):
        # Column 1 is the probability of class 1 (TeamA wins).
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    else:
        # No probability output: map the decision score through a sigmoid.
        y_pred_proba = model.decision_function(X_test)
        y_pred_proba = 1 / (1 + np.exp(-y_pred_proba))
        
    brier = brier_score_loss(y_test, y_pred_proba)
    
    results[name] = {
         'model': model,
         'training_time': training_time,
         'brier_score': brier,
         'y_pred_proba': y_pred_proba
    }
    
    progress.value += 1

progress.layout.visibility = 'hidden'

print("Trained models and their performance on the test set:")
for name, res in results.items():
    print(f"{name} - Training time: {res['training_time']:.2f} sec, Brier Score: {res['brier_score']:.4f}")


# More advanced models

In [None]:
pip install joblib

In [None]:
import os
import json
import time
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import brier_score_loss
import joblib

In [None]:
def split_by_year(games_df, predict_year):
    """Split games into seasons before ``predict_year`` and the season itself.

    Useful for predicting a future season from all earlier ones.

    Args:
        games_df: DataFrame with a 'Season' column.
        predict_year: Season to hold out as the test set.

    Returns:
        ``(train_set, test_set)``: rows with Season < ``predict_year`` and rows
        with Season == ``predict_year``. Later seasons appear in neither.
    """
    season = games_df['Season']
    earlier_seasons = games_df[season < predict_year]
    target_season = games_df[season == predict_year]
    return earlier_seasons, target_season

def create_features(df):
    """Turn a table of games into one-hot team-pair features and a win label.

    Args:
        df: DataFrame containing 'WTeamID', 'TeamA' and 'TeamB'. The input is
            not modified.

    Returns:
        ``(X_encoded, y)``: the one-hot encoding of the stringified TeamA and
        TeamB IDs, and an integer Series named 'Outcome' equal to 1 where the
        winner is TeamA and 0 elsewhere.
    """
    label = (df['WTeamID'] == df['TeamA']).astype(int)
    label.name = 'Outcome'
    # String IDs make get_dummies emit one indicator column per team.
    team_ids = df.loc[:, ['TeamA', 'TeamB']].astype(str)
    features = pd.get_dummies(team_ids, columns=['TeamA', 'TeamB'])
    return features, label

In [None]:
# Load the regular-season results and the team table.
games = pd.read_csv('../../data/MRegularSeasonCompactResults.csv')
teams = pd.read_csv('../../data/MTeams.csv')

# TeamID -> TeamName lookup.
team_names = dict(zip(teams['TeamID'], teams['TeamName']))

# Order-independent pair key: TeamA is the lower ID, TeamB the higher ID.
games['TeamA'] = games[['WTeamID', 'LTeamID']].min(axis=1)
games['TeamB'] = games[['WTeamID', 'LTeamID']].max(axis=1)

In [None]:
# Train on every season before predict_year and test on predict_year itself.
predict_year = 2024
train_set, test_set = split_by_year(games, predict_year)
print("Train set shape:", train_set.shape)
print("Test set shape:", test_set.shape)

X_train, y_train = create_features(train_set)
X_test, y_test = create_features(test_set)

In [None]:
# Give the test matrix exactly the training columns (missing ones filled with 0).
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Only the inverse regularisation strength C is tuned.
hyperparameter_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
}

param_combinations = [{'C': c} for c in hyperparameter_grid['C']]
n_total = len(param_combinations)
print("Total hyperparameter combinations:", n_total)


progress = widgets.IntProgress(value=0, min=0, max=n_total, description='Tuning:')
display(progress)

best_brier = float('inf')
best_params = None
best_model = None

for i, params in enumerate(param_combinations):
    model = LogisticRegression(C=params['C'], max_iter=10000)
    
    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time
    
    # Predict probabilities on the test set.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    brier = brier_score_loss(y_test, y_pred_proba)
    
    print(f"Params: {params}, Training time: {training_time:.2f} sec, Brier Score: {brier:.4f}")
    
    # The winner is chosen by its test-set score, so the saved Brier score is
    # an optimistic estimate of out-of-sample performance.
    if brier < best_brier:
        best_brier = brier
        best_params = params
        best_model = model
    
    progress.value = i + 1

progress.layout.visibility = 'hidden'

print("\nBest parameters:", best_params)
print("Best Brier Score:", best_brier)

# -------------------------------
# Saving the Best Model & Description
# -------------------------------
# Both files share one stem that ends with the predicted year.
os.makedirs("saved_models", exist_ok=True)
model_filename = os.path.join("saved_models", "best_logistic_regression_model" + str(predict_year) + ".pkl")
desc_filename = os.path.join("saved_models", "best_logistic_regression_model" + str(predict_year) + ".json")

# Save the model using joblib.
joblib.dump(best_model, model_filename)

# Save the description (the shape tuples are written as JSON arrays).
description = {
    "model_type": "Logistic Regression",
    "hyperparameters": best_params,
    "brier_score": best_brier,
    "predict_year": predict_year,
    "train_set_shape": train_set.shape,
    "test_set_shape": test_set.shape
}
with open(desc_filename, "w") as f:
    json.dump(description, f, indent=4)

print(f"Best model saved to {model_filename}")
print(f"Model description saved to {desc_filename}")

In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import brier_score_loss
import ipywidgets as widgets
from IPython.display import display

# -------------------------------
# Data Loading and Preprocessing
# -------------------------------

# Load game results (path is relative to the notebook: ../../data/).
games = pd.read_csv('../../data/MRegularSeasonCompactResults.csv')

# Create unique matchup identifiers: TeamA is the lower ID, TeamB is the higher ID,
# so both orderings of the same two teams share one key.
games['TeamA'] = games[['WTeamID', 'LTeamID']].min(axis=1)
games['TeamB'] = games[['WTeamID', 'LTeamID']].max(axis=1)

def create_features(df):
    """
    Build one-hot encoded matchup features and the target for a DataFrame of games.

    The target 'Outcome' is 1 when the lower-ID team (TeamA) is the winner,
    i.e. when WTeamID equals TeamA, and 0 otherwise. The input frame is left
    untouched.

    Returns:
      X_encoded: One indicator column per distinct TeamA and TeamB value.
      y: Integer Series named 'Outcome'.
    """
    encoded_columns = ['TeamA', 'TeamB']
    # String IDs are encoded as categories rather than treated as numbers.
    X_encoded = pd.get_dummies(df[encoded_columns].astype(str), columns=encoded_columns)
    y = df['WTeamID'].eq(df['TeamA']).astype(int).rename('Outcome')
    return X_encoded, y

def train_and_predict(test_year, min_train_year=2013):
    """
    Fit a Logistic Regression on past seasons and score it on one target season.

    Training uses the module-level ``games`` rows with
    min_train_year <= Season < test_year; evaluation uses the rows with
    Season == test_year.

    Parameters:
      test_year: The season to predict.
      min_train_year: The earliest season included in training.

    Returns:
      model: Trained LogisticRegression model.
      brier: Brier score on the test season.
      X_test: Test features, aligned to the training columns.
      y_test: True outcomes for the test season.
      y_pred: Predicted probability of class 1 for each test game.
    """
    season = games['Season']
    in_training_window = (season >= min_train_year) & (season < test_year)
    past_games = games[in_training_window]
    target_games = games[season == test_year]

    train_X, train_y = create_features(past_games)
    test_X, test_y = create_features(target_games)

    # Keep exactly the training columns; indicator columns seen only in the
    # test season are dropped and missing ones are filled with 0.
    train_X, test_X = train_X.align(test_X, join='left', axis=1, fill_value=0)

    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(train_X, train_y)

    # Column 1 of predict_proba is the probability of class 1 (TeamA wins).
    win_probability = classifier.predict_proba(test_X)[:, 1]
    score = brier_score_loss(test_y, win_probability)

    return classifier, score, test_X, test_y, win_probability

# -------------------------------
# Rolling Forecast: Loop Over Years
# -------------------------------

# We'll make predictions for each year from 2014 to 2024 (range end is exclusive).
years = list(range(2014, 2025))
# year -> fitted model, Brier score, test features, true outcomes and predictions.
results = {}

# Create a progress bar.
progress = widgets.IntProgress(value=0, min=0, max=len(years), description='Rolling Forecast:')
display(progress)

# Each year is predicted from the seasons 2013 .. year-1, so the training
# window grows by one season per iteration.
for i, yr in enumerate(years):
    model, brier, X_test, y_test, y_pred = train_and_predict(test_year=yr, min_train_year=2013)
    results[yr] = {
        'model': model,
        'brier': brier,
        'X_test': X_test,
        'y_test': y_test,
        'y_pred': y_pred
    }
    progress.value = i + 1

progress.layout.visibility = 'hidden'

# -------------------------------
# Reporting the Results
# -------------------------------
print("Rolling Forecast Performance (Brier Scores):")
for yr in years:
    print(f"Year: {yr}, Brier Score: {results[yr]['brier']:.4f}")

print("\nFinal evaluation on 2024:")
print(f"Brier Score for 2024: {results[2024]['brier']:.4f}")
