In [1]:
import numpy as np
import pandas as pd
from sklearn import *
import glob
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, regularizers
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from sklearn.utils import class_weight
import matplotlib.pyplot as plt


import torch

In [2]:
teams = pd.read_csv('../../data/MTeams.csv')  
MTeam_spellings = pd.read_csv('../../data/MTeamspellings.csv', encoding='ISO-8859-1')  
WTeam_spellings = pd.read_csv('../../data/WTeamspellings.csv', encoding='ISO-8859-1')  

In [3]:
teams_spelling = pd.concat([MTeam_spellings, WTeam_spellings])
teams_spelling = teams_spelling.groupby(by='TeamID', as_index=False)['TeamNameSpelling'].count()
teams_spelling.columns = ['TeamID', 'TeamNameCount']
teams = pd.merge(teams, teams_spelling, how='left', on=['TeamID'])
del teams_spelling

In [4]:
MRegularSeasonCompactResults = pd.read_csv('../../data/MRegularSeasonCompactResults.csv')
WRegularSeasonCompactResults = pd.read_csv('../../data/WRegularSeasonCompactResults.csv')
MRegularSeasonDetailedResults = pd.read_csv('../../data/MRegularSeasonDetailedResults.csv')
WRegularSeasonDetailedResults = pd.read_csv('../../data/WRegularSeasonDetailedResults.csv')
MNCAATourneyCompactResults = pd.read_csv('../../data/MNCAATourneyCompactResults.csv')
WNCAATourneyCompactResults = pd.read_csv('../../data/WNCAATourneyCompactResults.csv')
MNCAATourneyDetailedResults = pd.read_csv('../../data/MNCAATourneyDetailedResults.csv') 
WNCAATourneyDetailedResults = pd.read_csv('../../data/WNCAATourneyDetailedResults.csv')
MNCAATourneySeeds = pd.read_csv('../../data/MNCAATourneySeeds.csv')
WNCAATourneySeeds = pd.read_csv('../../data/WNCAATourneySeeds.csv')
MGameCities = pd.read_csv('../../data/MGameCities.csv') 
WGameCities = pd.read_csv('../../data/WGameCities.csv')
MSeasons = pd.read_csv('../../data/MSeasons.csv') 
WSeasons = pd.read_csv('../../data/WSeasons.csv')

In [5]:
season_cresults = pd.concat([MRegularSeasonCompactResults, WRegularSeasonCompactResults])
season_dresults = pd.concat([MRegularSeasonDetailedResults, WRegularSeasonDetailedResults])
tourney_cresults = pd.concat([MNCAATourneyCompactResults, WNCAATourneyCompactResults])
tourney_dresults = pd.concat([MNCAATourneyDetailedResults, WNCAATourneyDetailedResults])
seeds = pd.concat([MNCAATourneySeeds, WNCAATourneySeeds])
gcities = pd.concat([MGameCities, WGameCities])
seasons = pd.concat([MSeasons, WSeasons])

seeds = {'_'.join(map(str,[int(k1),k2])):int(v[1:3]) for k1, v, k2 in seeds[['Season', 'Seed', 'TeamID']].values}
cities = pd.read_csv('../../data/cities.csv') 
sub = pd.read_csv('../../data/SampleSubmissionStage2.csv')

In [6]:
season_cresults['ST'] = 'S'
season_dresults['ST'] = 'S'
tourney_cresults['ST'] = 'T'
tourney_dresults['ST'] = 'T'
games = pd.concat((season_cresults, tourney_cresults), axis=0, ignore_index=True)
games = pd.concat((season_dresults, tourney_dresults), axis=0, ignore_index=True)
games.reset_index(drop=True, inplace=True)
games['WLoc'] = games['WLoc'].map({'A': 1, 'H': 2, 'N': 3})

games['ID'] = games.apply(lambda r: '_'.join(map(str, [r['Season']]+sorted([r['WTeamID'],r['LTeamID']]))), axis=1)
games['IDTeams'] = games.apply(lambda r: '_'.join(map(str, sorted([r['WTeamID'],r['LTeamID']]))), axis=1)
games['Team1'] = games.apply(lambda r: sorted([r['WTeamID'],r['LTeamID']])[0], axis=1)
games['Team2'] = games.apply(lambda r: sorted([r['WTeamID'],r['LTeamID']])[1], axis=1)
games['IDTeam1'] = games.apply(lambda r: '_'.join(map(str, [r['Season'], r['Team1']])), axis=1)
games['IDTeam2'] = games.apply(lambda r: '_'.join(map(str, [r['Season'], r['Team2']])), axis=1)

games['Team1Seed'] = games['IDTeam1'].map(seeds).fillna(0)
games['Team2Seed'] = games['IDTeam2'].map(seeds).fillna(0)

games['ScoreDiff'] = games['WScore'] - games['LScore']
games['Pred'] = games.apply(lambda r: 1. if sorted([r['WTeamID'],r['LTeamID']])[0]==r['WTeamID'] else 0., axis=1)
games['ScoreDiffNorm'] = games.apply(lambda r: r['ScoreDiff'] * -1 if r['Pred'] == 0. else r['ScoreDiff'], axis=1)
games['SeedDiff'] = games['Team1Seed'] - games['Team2Seed'] 
games = games.fillna(-1)

c_score_col = ['NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl',
 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl',
 'LBlk', 'LPF']
c_score_agg = ['sum', 'mean', 'median', 'max', 'min', 'std', 'skew', 'nunique']
gb = games.groupby(by=['IDTeams']).agg({k: c_score_agg for k in c_score_col}).reset_index()
gb.columns = [''.join(c) + '_c_score' for c in gb.columns]

games = games[games['ST']=='T']

sub['WLoc'] = 3
sub['Season'] = sub['ID'].map(lambda x: x.split('_')[0])
sub['Season'] = sub['ID'].map(lambda x: x.split('_')[0])
sub['Season'] = sub['Season'].astype(int)
sub['Team1'] = sub['ID'].map(lambda x: x.split('_')[1])
sub['Team2'] = sub['ID'].map(lambda x: x.split('_')[2])
sub['IDTeams'] = sub.apply(lambda r: '_'.join(map(str, [r['Team1'], r['Team2']])), axis=1)
sub['IDTeam1'] = sub.apply(lambda r: '_'.join(map(str, [r['Season'], r['Team1']])), axis=1)
sub['IDTeam2'] = sub.apply(lambda r: '_'.join(map(str, [r['Season'], r['Team2']])), axis=1)
sub['Team1Seed'] = sub['IDTeam1'].map(seeds).fillna(0)
sub['Team2Seed'] = sub['IDTeam2'].map(seeds).fillna(0)
sub['SeedDiff'] = sub['Team1Seed'] - sub['Team2Seed'] 
sub = sub.fillna(-1)

games = pd.merge(games, gb, how='left', left_on='IDTeams', right_on='IDTeams_c_score')
sub = pd.merge(sub, gb, how='left', left_on='IDTeams', right_on='IDTeams_c_score')

col = [c for c in games.columns if c not in ['ID', 'DayNum', 'ST', 'Team1', 'Team2', 'IDTeams', 'IDTeam1', 'IDTeam2', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'NumOT', 'Pred', 'ScoreDiff', 'ScoreDiffNorm', 'WLoc'] + c_score_col]

In [8]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Data Preprocessing
imputer = SimpleImputer(strategy='mean')  
scaler = StandardScaler()

X = games[col].fillna(-1)
X_imputed = imputer.fit_transform(X)
X_scaled = scaler.fit_transform(X_imputed)

y = games['Pred']

# Convert to TensorFlow format
X_train_tensor = tf.convert_to_tensor(X_scaled, dtype=tf.float32)
y_train_tensor = tf.convert_to_tensor(y, dtype=tf.float32)

# Train, valid split
train_size = int(0.8 * len(X_train_tensor))  # 80% for training
val_size = len(X_train_tensor) - train_size  # Remaining 20% for validation

X_train, X_val = X_train_tensor[:train_size], X_train_tensor[train_size:]
y_train, y_val = y_train_tensor[:train_size], y_train_tensor[train_size:]

# Create the TensorFlow dataset
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))

# Set batch size
batch_size = 32
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)
val_dataset = val_dataset.batch(batch_size)

# Neural Network Model
class NeuralNetwork(keras.Model):
    def __init__(self, d_in, d_out, d_hidden, n_layers=2):
        super().__init__()
        self.d_in = d_in
        self.d_out = d_out
        self.d_hidden = d_hidden
        self.n_layers = n_layers
        
        self.dense1 = layers.Dense(d_hidden, activation='relu', input_dim=d_in)
        self.batch_norm1 = layers.BatchNormalization()
        self.dropout = layers.Dropout(0.3)
        
        self.hidden_layers = [layers.Dense(d_hidden, activation='relu') for _ in range(n_layers)]
        self.batch_norms = [layers.BatchNormalization() for _ in range(n_layers)]
        
        self.output_layer = layers.Dense(d_out)

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.batch_norm1(x)
        
        for dense, batch_norm in zip(self.hidden_layers, self.batch_norms):
            x = dense(x)
            x = batch_norm(x)
            x = self.dropout(x)
        
        return self.output_layer(x)

# Model creation
model = NeuralNetwork(X_scaled.shape[1], 1, 100)

# Compile model
model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.01), loss='mse')

# Training function
def train(model, train_dataset, val_dataset, epochs):
    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}:")
        # Training loop
        for batch_X, batch_y in train_dataset:
            with tf.GradientTape() as tape:
                predictions = model(batch_X)
                loss = tf.reduce_mean(tf.square(batch_y - predictions))
                
            gradients = tape.gradient(loss, model.trainable_variables)
            model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))
            
        # Validation loop
        val_loss = 0
        for batch_X, batch_y in val_dataset:
            predictions = model(batch_X)
            val_loss += tf.reduce_mean(tf.square(batch_y - predictions))
        
        print(f"  Train Loss (MSE): {loss.numpy():.4f}")
        print(f"  Validation Loss (MSE): {val_loss.numpy()/len(val_dataset):.4f}")

# Train model
train(model, train_dataset, val_dataset, epochs=25)

# Prepare for submission
X_submit = sub[col].fillna(-1)
X_submit_imputed = imputer.transform(X_submit)
X_submit_scaled = scaler.transform(X_submit_imputed)

X_submit_tensor = tf.convert_to_tensor(X_submit_scaled, dtype=tf.float32)

# Predicting on submission data
y_preds = model(X_submit_tensor).numpy()

sub['Pred'] = y_preds


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/25:


2025-03-10 13:48:06.839130: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-03-10 13:48:06.880745: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


  Train Loss (MSE): 0.2567
  Validation Loss (MSE): 0.2699
Epoch 2/25:


2025-03-10 13:48:08.178836: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


  Train Loss (MSE): 0.2580
  Validation Loss (MSE): 0.2572
Epoch 3/25:
  Train Loss (MSE): 0.2510
  Validation Loss (MSE): 0.2524
Epoch 4/25:


2025-03-10 13:48:10.847954: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


  Train Loss (MSE): 0.2457
  Validation Loss (MSE): 0.2512
Epoch 5/25:
  Train Loss (MSE): 0.2506
  Validation Loss (MSE): 0.2510
Epoch 6/25:
  Train Loss (MSE): 0.2712
  Validation Loss (MSE): 0.2557
Epoch 7/25:
  Train Loss (MSE): 0.2535
  Validation Loss (MSE): 0.2526
Epoch 8/25:


2025-03-10 13:48:16.149672: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


  Train Loss (MSE): 0.2601
  Validation Loss (MSE): 0.2517
Epoch 9/25:
  Train Loss (MSE): 0.2602
  Validation Loss (MSE): 0.2499
Epoch 10/25:
  Train Loss (MSE): 0.2523
  Validation Loss (MSE): 0.2498
Epoch 11/25:
  Train Loss (MSE): 0.2510
  Validation Loss (MSE): 0.2503
Epoch 12/25:
  Train Loss (MSE): 0.2526
  Validation Loss (MSE): 0.2516
Epoch 13/25:
  Train Loss (MSE): 0.2496
  Validation Loss (MSE): 0.2497
Epoch 14/25:
  Train Loss (MSE): 0.2679
  Validation Loss (MSE): 0.2639
Epoch 15/25:
  Train Loss (MSE): 0.2489
  Validation Loss (MSE): 0.2507
Epoch 16/25:


2025-03-10 13:48:26.882686: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


  Train Loss (MSE): 0.2236
  Validation Loss (MSE): 0.2580
Epoch 17/25:
  Train Loss (MSE): 0.2493
  Validation Loss (MSE): 0.2498
Epoch 18/25:
  Train Loss (MSE): 0.2487
  Validation Loss (MSE): 0.2498
Epoch 19/25:
  Train Loss (MSE): 0.2706
  Validation Loss (MSE): 0.2506
Epoch 20/25:
  Train Loss (MSE): 0.2473
  Validation Loss (MSE): 0.2506
Epoch 21/25:
  Train Loss (MSE): 0.2491
  Validation Loss (MSE): 0.2511
Epoch 22/25:
  Train Loss (MSE): 0.2540
  Validation Loss (MSE): 0.2497
Epoch 23/25:
  Train Loss (MSE): 0.2454
  Validation Loss (MSE): 0.2551
Epoch 24/25:
  Train Loss (MSE): 0.2498
  Validation Loss (MSE): 0.2499
Epoch 25/25:
  Train Loss (MSE): 0.2475
  Validation Loss (MSE): 0.2514


In [9]:
sub[['ID', 'Pred']].sample(30)

Unnamed: 0,ID,Pred
114630,2025_3288_3351,0.538426
6915,2025_1123_1320,0.538426
26418,2025_1188_1454,0.524479
20227,2025_1167_1392,0.538426
5409,2025_1117_1192,0.538426
121104,2025_3330_3474,0.538426
13084,2025_1143_1474,0.538426
129183,2025_3409_3468,0.538426
55366,2025_1329_1361,0.538426
12439,2025_1142_1144,0.538426
