In [8]:
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import ipywidgets as widgets
from IPython.display import display, clear_output
import time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import brier_score_loss
import xgboost as xgb
# import lightgbm as lgb

In [11]:
# Load regular-season and NCAA-tournament compact results.
games = pd.read_csv('../../data/MRegularSeasonCompactResults.csv')
gamesT = pd.read_csv('../../data/MNCAATourneyCompactResults.csv')

# Give each game an order-independent pairing key: TeamA is the smaller of the
# two team IDs and TeamB the larger, regardless of which side won.
for frame in (games, gamesT):
    id_pair = frame[['WTeamID', 'LTeamID']]
    frame['TeamA'] = id_pair.min(axis=1)
    frame['TeamB'] = id_pair.max(axis=1)
def split_train_test_by_pair(games_df, n=1):
    """Hold out the most recent ``n`` games of every (TeamA, TeamB) pairing.

    Games are ordered chronologically by ``Season`` then ``DayNum`` within each
    pairing. The last ``n`` games of a pairing go to the test set and all
    earlier ones to the train set. A pairing with ``n`` or fewer games
    contributes all of its games to the test set and none to the train set.

    Parameters
    ----------
    games_df : pandas.DataFrame
        Must contain the columns ``TeamA``, ``TeamB``, ``Season`` and ``DayNum``.
        The frame is not modified.
    n : int, default 1
        Number of most recent games per pairing to hold out. ``0`` holds out
        nothing (every game goes to the train set).

    Returns
    -------
    (train_set, test_set) : tuple of pandas.DataFrame
        Both sorted by pairing and then chronologically, with a fresh
        0..k-1 index. An empty input yields two empty frames.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be a non-negative integer")

    pair_cols = ['TeamA', 'TeamB']

    # Rows without a complete pairing key belong to no pairing (groupby would
    # skip them as well), so they are left out of both sets.
    ordered = games_df.dropna(subset=pair_cols).sort_values(
        pair_cols + ['Season', 'DayNum'], kind='mergesort'
    )

    # Position of each game counted from the END of its pairing's history:
    # 0 is the most recent game, 1 the one before it, and so on.
    games_from_end = ordered.groupby(pair_cols).cumcount(ascending=False)
    is_test = games_from_end < n

    train_set = ordered[~is_test].reset_index(drop=True)
    test_set = ordered[is_test].reset_index(drop=True)

    return train_set, test_set

In [19]:
# Number of most recent games held out per team pairing by each split below.
n = 1

# Each call returns (earlier games, last-n games) for ONE competition:
#   train_set / train_set2 -> regular season
#   test_set  / test_set2  -> NCAA tournament
train_set, train_set2 = split_train_test_by_pair(games, n=n)
test_set, test_set2 = split_train_test_by_pair(gamesT, n=n)

# These shapes describe only the "earlier games" halves, before the halves are
# recombined below.
print("Train set shape:", train_set.shape)
print("Test set shape:", test_set.shape)
print(test_set2.head())

# Recombine both halves of each competition, so the final train_set is built
# from regular-season games and the final test_set from tournament games.
# ignore_index=True matters: both halves are indexed from 0, and keeping those
# labels would leave duplicate index labels that break label-based (.loc)
# lookups on the combined frames.
train_set = pd.concat([train_set, train_set2], ignore_index=True)
test_set = pd.concat([test_set, test_set2], ignore_index=True)

Train set shape: (162629, 10)
Test set shape: (355, 10)
   Season  DayNum  WTeamID  WScore  LTeamID  LScore WLoc  NumOT  TeamA  TeamB
0    2019     136     1246      79     1101      44    N      0   1101   1246
1    2021     138     1101      53     1400      52    N      0   1101   1400
2    2021     140     1417      67     1101      47    N      0   1101   1417
3    2006     136     1228      78     1102      69    N      0   1102   1228
4    2004     136     1314      63     1102      52    N      0   1102   1314


In [22]:
# Report the sizes of the frames the features are built from.
print(f"Train set shape: {train_set.shape}")
print(f"Test set shape: {test_set.shape}")

def create_features(df):
    """Build a one-hot team-identity design matrix and a binary target.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``WTeamID``, ``TeamA`` and ``TeamB``. Not modified.

    Returns
    -------
    X_encoded : pandas.DataFrame
        One indicator column per distinct ``TeamA`` value (``TeamA_<id>``)
        followed by one per distinct ``TeamB`` value (``TeamB_<id>``).
    y : pandas.Series
        Named ``Outcome``; 1 where the winner (``WTeamID``) is ``TeamA``,
        otherwise 0.
    """
    # Target: did the TeamA side of the pairing win?
    y = df['WTeamID'].eq(df['TeamA']).astype(int).rename('Outcome')

    # Team IDs are identities, not quantities: cast to str and one-hot encode.
    team_ids = df[['TeamA', 'TeamB']].astype(str)
    X_encoded = pd.get_dummies(team_ids, columns=['TeamA', 'TeamB'])

    return X_encoded, y

# Encode both splits independently ...
(X_train, y_train), (X_test, y_test) = (
    create_features(split) for split in (train_set, test_set)
)

# ... then force the test matrix onto the training columns: indicator columns
# that exist only in the test split are dropped, and training columns missing
# from the test split are added and filled with 0.
X_train, X_test = X_train.align(X_test, axis=1, join='left', fill_value=0)

# Candidate classifiers, keyed by the label shown in the results report.
models = {
    # 'XGBoost (GPU)': xgb.XGBClassifier(tree_method='gpu_hist', use_label_encoder=False, eval_metric='logloss'),
    # 'LightGBM (GPU)': lgb.LGBMClassifier(device='gpu'),
    'Logistic Regression (CPU)': LogisticRegression(max_iter=1000),
}

# Training sets with fewer than `bootstrap_threshold` rows are resampled with
# replacement to `bootstrap_factor` times their original size.
bootstrap_threshold = 200
bootstrap_factor = 3

progress = widgets.IntProgress(value=0, min=0, max=len(models), description='Models:')
display(progress)

results = {}

# Every model is fitted on the same data, so the (optional) bootstrap is done
# once here instead of being repeated inside the loop.
if 0 < len(X_train) < bootstrap_threshold:
    # Sample row POSITIONS rather than index labels. Looking the sampled rows
    # up by label (y.loc[X_sample.index]) returns extra rows whenever the index
    # contains duplicate labels, which would leave X and y misaligned.
    boot_positions = np.random.default_rng(42).integers(
        0, len(X_train), size=bootstrap_factor * len(X_train)
    )
    X_train_boot = X_train.iloc[boot_positions]
    y_train_boot = y_train.iloc[boot_positions]
    X_to_train = X_train_boot
    y_to_train = y_train_boot
else:
    X_to_train = X_train
    y_to_train = y_train

for name, model in models.items():
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()
    model.fit(X_to_train, y_to_train)
    training_time = time.perf_counter() - start_time

    if hasattr(model, 'predict_proba'):
        # Column 1 is taken as the probability of the positive outcome.
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    else:
        # No probability output: squash the decision scores through a logistic
        # sigmoid to obtain values in (0, 1). These are not calibrated.
        y_pred_proba = model.decision_function(X_test)
        y_pred_proba = 1 / (1 + np.exp(-y_pred_proba))

    brier = brier_score_loss(y_test, y_pred_proba)

    results[name] = {
        'model': model,
        'training_time': training_time,
        'brier_score': brier,
        'y_pred_proba': y_pred_proba,
    }

    progress.value += 1

# Training is finished; hide the progress bar.
progress.layout.visibility = 'hidden'

# One summary line per fitted model: wall-clock fit time and test-set Brier score.
print("Trained models and their performance on the test set:")
for model_name, metrics in results.items():
    fit_seconds = metrics['training_time']
    brier_value = metrics['brier_score']
    print(f"{model_name} - Training time: {fit_seconds:.2f} sec, Brier Score: {brier_value:.4f}")

Train set shape: (191796, 10)
Test set shape: (2518, 10)


IntProgress(value=0, description='Models:', max=1)

Trained models and their performance on the test set:
Logistic Regression (CPU) - Training time: 3.75 sec, Brier Score: 0.2012
