In [1]:
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.callbacks import LearningRateScheduler

# Set seeds for the model to ensure reproducibility of results
seed = 10
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
tf.experimental.numpy.random.seed(seed)

2024-03-20 15:58:04.920570: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-20 15:58:04.920712: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-20 15:58:05.090468: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Data Preprocessing

In [2]:
# Initialize runtime configuration
curr_year = 2023 # Using 2023 as 2024 data is not available

# Load in the compactresults csv and subset to the specified year
game_history_df = pd.read_csv('/kaggle/input/ncaa-seeds-2024/MRegularSeasonCompactResults.csv')
curr_history_df = game_history_df[game_history_df['Season'] == 2023]

def convert_wl_to_ab(row):
    # 50/50 if the winning team becomes TeamA or TeamB
    if np.random.uniform() > 0.5:
        new_row = {
            'team_a': row['WTeamID'],
            'team_a_score': row['WScore'],
            'team_b': row['LTeamID'],
            'team_b_score': row['LScore']
        }
    else:
        new_row = {
            'team_b': row['WTeamID'],
            'team_b_score': row['WScore'],
            'team_a': row['LTeamID'],
            'team_a_score': row['LScore']
        }
    return new_row

# Convert the dataframe from win/loss team to a/b team.
ab_history = curr_history_df.apply(convert_wl_to_ab, axis=1).tolist()
ab_history_df = pd.DataFrame(ab_history)
# Define the score_diff column as the difference between team a and team b (can be negative).
ab_history_df['score_diff'] = ab_history_df['team_a_score'] - ab_history_df['team_b_score']

# Team names need to be label encoded so they can be used as embeddings in the model.
label_encoder = LabelEncoder()
all_teams = list(set(ab_history_df['team_a'].tolist() + ab_history_df['team_b'].tolist()))
label_encoder.fit(all_teams)
ab_history_df['team_a_encoded'] = label_encoder.transform(ab_history_df['team_a'])
ab_history_df['team_b_encoded'] = label_encoder.transform(ab_history_df['team_b'])

# Define our features and target values. This model only uses the team names and score_diff.
features = ['team_a_encoded', 'team_b_encoded']
feature_df = ab_history_df[features]
target_df = ab_history_df['score_diff']

# Break our data into a train/test split to evaluate performance. Finalized models should use all data.
X_train, X_test, y_train, y_test = train_test_split(feature_df, target_df, test_size=0.33, random_state=42)

## Model Definition and Training

In [3]:
def get_model():
    # Define hyperparameters
    embedding_dim = 32
    num_teams = len(all_teams)
    dropout_rate = 0.2

    # Input layers
    team_a = Input(shape=(1,), name='team_a')
    team_b = Input(shape=(1,), name='team_b')

    # Embedding layers
    team_a_embedding = Embedding(num_teams, embedding_dim, name='team_a_embedding')(team_a)
    team_b_embedding = Embedding(num_teams, embedding_dim, name='team_b_embedding')(team_b)

    # Flatten and concatenate the layers
    team_a_flat = Flatten()(team_a_embedding)
    team_b_flat = Flatten()(team_b_embedding)
    team_a_b = Concatenate()([team_a_flat, team_b_flat])

    # Dense layers with dropout
    dense_1 = Dense(128, activation='relu')(team_a_b)
    dropout_1 = Dropout(dropout_rate)(dense_1)
    dense_2 = Dense(64, activation='relu')(dropout_1)
    dropout_2 = Dropout(dropout_rate)(dense_2)
    dense_3 = Dense(32, activation='relu')(dropout_2)
    dropout_3 = Dropout(dropout_rate)(dense_3)

    # Output layer
    output_layer = Dense(1, activation='linear', name='output')(dropout_3)

    # Create and compile the model
    model = Model(inputs=[team_a, team_b], outputs=output_layer)

    model.compile(optimizer=Adam(learning_rate=1e-3), loss='mse', metrics=['mae'])

    return model

def scheduler(epoch, lr):
    if epoch < 5:
        return lr
    else:
        return lr * np.exp(-0.1)

# Get and train the model
bracket_model = get_model()
bracket_model.fit(
    [X_train['team_a_encoded'], X_train['team_b_encoded']],
    y_train,
    batch_size=32,
    epochs=20,
    validation_data=([X_test['team_a_encoded'], X_test['team_b_encoded']], y_test),
    callbacks=[LearningRateScheduler(scheduler)]
)

# Check accuracy of predictions
preds = bracket_model.predict([X_test['team_a_encoded'], X_test['team_b_encoded']])
preds = preds.reshape(-1).tolist()
correct = 0
for y_pred, y_true in zip(preds, y_test):
    if y_pred > 0 and y_true > 0:
        correct += 1
    elif y_pred < 0 and y_true < 0:
        correct += 1

print(f"Model Accuracy: {correct/len(preds)}")

Epoch 1/20
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 230.2617 - mae: 11.9153 - val_loss: 207.1004 - val_mae: 11.5599 - learning_rate: 0.0010
Epoch 2/20
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 209.4049 - mae: 11.2662 - val_loss: 173.7991 - val_mae: 10.4344 - learning_rate: 0.0010
Epoch 3/20
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 149.6972 - mae: 9.4509 - val_loss: 165.1171 - val_mae: 10.1666 - learning_rate: 0.0010
Epoch 4/20
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 134.2123 - mae: 9.0597 - val_loss: 162.8261 - val_mae: 10.1251 - learning_rate: 0.0010
Epoch 5/20
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 120.5455 - mae: 8.5980 - val_loss: 164.7469 - val_mae: 10.2152 - learning_rate: 0.0010
Epoch 6/20
[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss:

## Applying the model to generate a bracket

In [4]:
# Load in the tournament seeds and subset to the selected year
seed_file = pd.read_csv('/kaggle/input/ncaa-seeds-2024/2024_tourney_seeds.csv')
curr_seeds = seed_file

# Create all team pairings
teams = curr_seeds['TeamID']
team_pairs = []

# For each team, we pair them up with every other team
for ix, team_a in enumerate(teams):
    for team_b in teams[ix:]:
        # Provide a consistent ordering to avoid duplicates
        if team_a > team_b:
            team_pairs.append({
                'team_a': team_a,
                'team_b': team_b
            })
        else:
            team_pairs.append({
                'team_a': team_b,
                'team_b': team_a
            })

# Create a dataframe based on the generated pairings
team_pairs_df = pd.DataFrame(team_pairs)
print(f"Number of pairings are - {len(team_pairs)}")

# Find any new teams that the encoder hasn't seen
new_teams = set(team_pairs_df['team_a']).union(set(team_pairs_df['team_b'])) - set(label_encoder.classes_)

# Add the new teams to the encoder's classes
label_encoder.classes_ = np.append(label_encoder.classes_, list(new_teams))

# Encode the teams from the pairings and generate predictions for each pairing
team_pairs_df['team_a_enc'] = label_encoder.transform(team_pairs_df['team_a'])
team_pairs_df['team_b_enc'] = label_encoder.transform(team_pairs_df['team_b'])
team_pairs_df['score_diff'] = bracket_model.predict([
    team_pairs_df['team_a_enc'],
    team_pairs_df['team_b_enc']
])

# Use np.where to identify where team a is predicted to win or lose
team_pairs_df['is_team_a_win'] = np.where(team_pairs_df['score_diff'] > 0, 1, 0)

# Format the outputs and save the results
output_cols = ['team_a', 'team_b', 'is_team_a_win']
output_df = team_pairs_df[output_cols]
output_df.to_csv('predictions.csv', index=False)

Number of pairings are - 8256
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


## Saving out the trained model

In [5]:
bracket_model.save('bracket_model.keras')

In [6]:
# Read the predictions.csv file
predictions_df = pd.read_csv('/kaggle/working/predictions.csv')

# Display the contents of the DataFrame
print(predictions_df)

      team_a  team_b  is_team_a_win
0       1163    1163              1
1       1235    1163              0
2       1228    1163              0
3       1163    1120              1
4       1361    1163              0
...      ...     ...            ...
8251    3263    3238              1
8252    3394    3238              1
8253    3263    3263              1
8254    3394    3263              1
8255    3394    3394              1

[8256 rows x 3 columns]


In [7]:
# Load the team mapping data
team_mapping_df = pd.read_csv('/kaggle/input/ncaa-seeds-2024/MTeams.csv')

# Merge the team mapping with the team pairs dataframe
team_pairs_df = team_pairs_df.merge(team_mapping_df, left_on='team_a', right_on='TeamID', how='left', suffixes=('', '_y'))
team_pairs_df = team_pairs_df.merge(team_mapping_df, left_on='team_b', right_on='TeamID', how='left', suffixes=('', '_z'))

# Rename the columns to match the desired output
team_pairs_df = team_pairs_df.rename(columns={'TeamName': 'team_a_name', 'TeamName_z': 'team_b_name'})

# Use np.where to identify where team a is predicted to win or lose
team_pairs_df['is_team_a_win'] = np.where(team_pairs_df['score_diff'] > 0, 1, 0)

# Remove rows where either team_a_name or team_b_name is blank
team_pairs_df = team_pairs_df.dropna(subset=['team_a_name', 'team_b_name'])

# Format the outputs and save the results
output_cols = ['team_a', 'team_a_name', 'team_b', 'team_b_name', 'is_team_a_win']
output_df = team_pairs_df[output_cols]
output_df.to_csv('predictions(name).csv', index=False)

In [8]:
# Read the predictions.csv file
predictions_name_df = pd.read_csv('/kaggle/working/predictions(name).csv')

# Display the contents of the DataFrame
print(predictions_name_df)

      team_a   team_a_name  team_b  team_b_name  is_team_a_win
0       1163   Connecticut    1163  Connecticut              1
1       1235       Iowa St    1163  Connecticut              0
2       1228      Illinois    1163  Connecticut              0
3       1163   Connecticut    1120       Auburn              1
4       1361  San Diego St    1163  Connecticut              0
...      ...           ...     ...          ...            ...
2075    1443           WKU    1324      Oakland              1
2076    1324       Oakland    1255     Longwood              0
2077    1443           WKU    1443          WKU              1
2078    1443           WKU    1255     Longwood              1
2079    1255      Longwood    1255     Longwood              1

[2080 rows x 5 columns]


## End