# Second Chance Tournament Predictions
Author: Glen Joy (c) 2024

Re-training a new model to predict the 2024 March Madness tournament from the Sweet-16 onward.

In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

## Importing Data
We will use the same training data that we used for the first model (predict.ipynb).

In [2]:
# Load the training data (TEAM_1 / TEAM_2 matchups with a WINNER label) and preview the first rows.
traindf = pd.read_csv('./data/training_data.csv')
traindf.head()

Unnamed: 0.1,Unnamed: 0,YEAR,TEAM_1,TEAM_2,WINNER,TEAM_ONE,CONF_ONE,ADJOE_ONE,ADJDE_ONE,BARTHAG_x_ONE,...,EFF HGT_TWO,EXP_TWO,TALENT_TWO,FT%_TWO,OP FT%_TWO,PPPO_TWO,PPPD_TWO,ELITE SOS_TWO,WAB_y_TWO,WIN_PCT_CONF_TWO
0,0,2013,LIU Brooklyn,James Madison,2,LIU Brooklyn,NEC,108.1,111.2,0.4195,...,79.88,1.595,19.579,69.9,70.3,1.005,1.029,12.956,-11.2,0.44868
1,1,2013,La Salle,Boise St,1,La Salle,A10,112.0,96.2,0.8516,...,80.035,1.826,24.94,70.8,68.8,1.031,0.98,25.212,-1.4,0.611722
2,2,2013,Arizona,Belmont,1,Arizona,P12,114.4,92.2,0.9229,...,78.906,1.956,4.688,70.7,71.2,1.018,1.038,12.869,-10.0,0.474138
3,3,2013,Butler,Bucknell,1,Butler,A10,109.2,93.0,0.8624,...,79.883,1.667,3.239,72.4,68.8,1.008,1.016,14.044,-9.8,0.48583
4,4,2013,UNLV,California,2,UNLV,MWC,104.7,88.4,0.8749,...,81.109,1.664,55.458,70.3,69.5,1.027,0.977,26.463,-1.8,0.587629


In [3]:
# Target vector: the WINNER column, copied out before it is removed from the frame.
labels = traindf['WINNER'].to_numpy(copy=True)

# Drop the target plus the team / conference / postseason identifier columns so that
# only the remaining columns are passed to the model as features.
traindf = traindf.drop(
    columns=[
        'WINNER',
        'Unnamed: 0',
        'TEAM_1', 'TEAM_2',
        'TEAM_ONE', 'TEAM_TWO',
        'CONF_ONE', 'CONF_TWO',
        'POSTSEASON_ONE', 'POSTSEASON_TWO',
        'YEAR.1',
    ]
)

In [4]:
# Preview the frame after the drop: these columns are what the model is trained on.
traindf.head()

Unnamed: 0,YEAR,ADJOE_ONE,ADJDE_ONE,BARTHAG_x_ONE,EFG_O_ONE,EFG_D_ONE,TOR_ONE,TORD_ONE,ORB_ONE,DRB_ONE,...,EFF HGT_TWO,EXP_TWO,TALENT_TWO,FT%_TWO,OP FT%_TWO,PPPO_TWO,PPPD_TWO,ELITE SOS_TWO,WAB_y_TWO,WIN_PCT_CONF_TWO
0,2013,108.1,111.2,0.4195,54.4,52.6,20.3,18.4,35.2,33.5,...,79.88,1.595,19.579,69.9,70.3,1.005,1.029,12.956,-11.2,0.44868
1,2013,112.0,96.2,0.8516,51.9,49.3,17.1,21.3,29.0,34.2,...,80.035,1.826,24.94,70.8,68.8,1.031,0.98,25.212,-1.4,0.611722
2,2013,114.4,92.2,0.9229,52.5,46.6,19.5,19.8,35.0,26.7,...,78.906,1.956,4.688,70.7,71.2,1.018,1.038,12.869,-10.0,0.474138
3,2013,109.2,93.0,0.8624,50.7,46.7,20.2,17.4,35.9,26.3,...,79.883,1.667,3.239,72.4,68.8,1.008,1.016,14.044,-9.8,0.48583
4,2013,104.7,88.4,0.8749,48.9,44.1,20.2,18.5,33.3,27.1,...,81.109,1.664,55.458,70.3,69.5,1.027,0.977,26.463,-1.8,0.587629


## Training Model
We will train a new model, this time using XGBoost.

In [5]:
import xgboost as xgb

In [6]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
(train_features, test_features,
 train_labels, test_labels) = train_test_split(
    traindf,
    labels,
    test_size=0.20,
    random_state=42,
)

In [7]:
# Unused alternative, kept for reference only: nothing below reads these DMatrix objects,
# because the XGBClassifier cell is fitted directly on the feature arrays.
# # Create classification matrices
# dtrain_clf = xgb.DMatrix(train_features, train_labels)
# dtest_clf = xgb.DMatrix(test_features, test_labels)

In [8]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer-code mapping from the training labels, then apply it.
# (predict() below relies on the encoded classes starting at 0.)
le = LabelEncoder().fit(train_labels)
train_labels = le.transform(train_labels)

In [9]:
# Encode the held-out labels with the mapping already learned from the training labels.
# Re-fitting the encoder here (fit_transform) would build a fresh mapping from the test
# split alone; if that split were missing a class, the integer codes would no longer
# line up with the ones the model was trained on and the accuracy score would be wrong.
test_labels = le.transform(test_labels)

In [10]:
# XGBoost classifier with max_depth=5; all other hyperparameters are left at library defaults.
clf = xgb.XGBClassifier(max_depth=5)
# Fit on the training split only. No eval_set is passed, so the held-out split is NOT
# used for early stopping here; it is only used for scoring in the next cell.
# `.values` passes a plain NumPy array, so later inputs must use the same column order.
clf.fit(train_features.values, train_labels)

In [11]:
# Score the fitted classifier on the held-out split.
y_pred = clf.predict(test_features.values)
# Round each predicted value before comparing it with the encoded test labels.
predictions = list(map(round, y_pred))
accuracy = metrics.accuracy_score(test_labels, predictions)
print(f"Accuracy: {accuracy * 100.0:.2f}%")

Accuracy: 71.30%


## Running Predictions for 2024 (Sweet Sixteen Onward)

In [12]:
# Team lookup table exported from predict.ipynb; used to construct the model input for a matchup.
# The 'Unnamed: 0' column in the CSV is discarded right after loading.
pdf = pd.read_csv('./data/lookup_table.csv').drop(columns=['Unnamed: 0'])
pdf.head()

Unnamed: 0,TEAM,CONF,ADJOE,ADJDE,BARTHAG_x,EFG_O,EFG_D,TOR,TORD,ORB,...,EFF HGT,EXP,TALENT,FT%,OP FT%,PPPO,PPPD,ELITE SOS,WAB_y,WIN_PCT_CONF
0,Dayton,A10,117.5,100.8,0.8535,56.6,47.9,15.8,15.7,26.9,...,80.244,1.872,18.179,72.2,71.7,1.073,1.033,17.561,-4.2,0.579345
1,Duquesne,A10,106.4,96.2,0.7619,50.1,48.0,17.5,19.5,29.9,...,80.244,1.872,18.179,72.2,71.7,1.073,1.033,17.561,-4.2,0.579345
2,North Carolina,ACC,116.8,93.4,0.9291,51.3,46.4,14.4,14.9,32.8,...,81.051,1.648,61.688,73.6,72.8,1.081,1.026,26.024,-1.3,0.587805
3,Duke,ACC,120.8,97.0,0.9259,55.2,49.0,14.3,16.9,31.8,...,81.051,1.648,61.688,73.6,72.8,1.081,1.026,26.024,-1.3,0.587805
4,Clemson,ACC,116.7,99.4,0.863,53.9,48.7,14.9,13.6,27.7,...,81.051,1.648,61.688,73.6,72.8,1.081,1.026,26.024,-1.3,0.587805


In [13]:
# function taken from predict.ipynb
def construct_matchup(team1, team2):
    """Build the side-by-side feature row(s) for a ``team1`` vs ``team2`` matchup.

    Rows of the module-level lookup table ``pdf`` whose ``TEAM`` equals each name
    are joined on ``YEAR``; columns coming from ``team1`` get the ``_ONE`` suffix
    and columns coming from ``team2`` get ``_TWO``. The TEAM, CONF, 3PR and 3PRD
    columns of both sides are removed from the result.

    Returns a new DataFrame; ``pdf`` itself is not modified. If either name has
    no row in ``pdf`` the result has zero rows.
    """
    removed = [
        f'{stat}_{side}'
        for side in ('ONE', 'TWO')
        for stat in ('TEAM', 'CONF', '3PR', '3PRD')
    ]
    first_rows = pdf.loc[pdf['TEAM'] == team1]
    second_rows = pdf.loc[pdf['TEAM'] == team2]
    joined = first_rows.merge(second_rows, on='YEAR', suffixes=['_ONE', '_TWO'])
    return joined.drop(columns=removed)

In [14]:
# Example: build the model input row for one matchup and display it as a sanity check.
test = construct_matchup('Connecticut', 'Purdue') # example
test

Unnamed: 0,ADJOE_ONE,ADJDE_ONE,BARTHAG_x_ONE,EFG_O_ONE,EFG_D_ONE,TOR_ONE,TORD_ONE,ORB_ONE,DRB_ONE,FTR_x_ONE,...,EFF HGT_TWO,EXP_TWO,TALENT_TWO,FT%_TWO,OP FT%_TWO,PPPO_TWO,PPPD_TWO,ELITE SOS_TWO,WAB_y_TWO,WIN_PCT_CONF_TWO
0,126.9,93.8,0.97,57.1,45.1,14.9,16.2,36.5,26.8,33.3,...,81.153,1.987,57.308,72.4,72.3,1.099,1.028,29.652,0.1,0.603627


### Modified code from simulator.ipynb with model replaced

In [15]:
# given two teams, this function will determine who the winner will be using our ML model
def predict(team1, team2):
    """Predict the winner of ``team1`` vs ``team2`` with the trained classifier ``clf``.

    Parameters
    ----------
    team1, team2 : str
        Team names, looked up through ``construct_matchup``.

    Returns
    -------
    str or None
        The predicted winner's name. ``None`` is returned (after printing a
        notice) when the lookup produces no row, i.e. at least one of the
        team names is not in the lookup table.

    Raises
    ------
    ValueError
        If the lookup produces more than one row for the matchup, or the
        model returns a class other than 0 or 1.
    """
    input_data = construct_matchup(team1, team2)

    # Check the lookup explicitly instead of relying on the model to raise:
    # previously the except-branch printed the notice and then fell through to
    # code that read the never-assigned `result`, crashing with UnboundLocalError.
    if len(input_data) == 0:
        print(f'Either {team1} or {team2} not in list')
        return None
    if len(input_data) > 1:
        raise ValueError(
            f'Lookup for {team1} vs {team2} returned {len(input_data)} rows; expected exactly 1'
        )

    input_data = input_data.assign(YEAR=2024)  # hard coding
    # The model was fitted on a bare array, so column order must match the training data.
    input_data = input_data[traindf.columns]

    result = clf.predict(np.array(input_data).reshape(1, -1))
    label = int(result[0])

    if label == 0:  # Xgboost needs classes to start from 0, not 1
        winner = team1
    elif label == 1:
        winner = team2
    else:
        raise ValueError(f'Unexpected class label from model: {label}')

    print(f"Between {team1} and {team2} | {winner} won!")
    return winner

In [25]:
# East Region - Sweet 16
# Both results are printed by predict(); the notebook echoes only the last call's return value.
predict('Connecticut', 'San Diego St.')
predict('Illinois', 'Iowa St.')

Between Connecticut and San Diego St. | Connecticut won!
Between Illinois and Iowa St. | Iowa St. won!


'Iowa St.'

In [26]:
predict('Connecticut', 'Iowa St.') # East Region final: the winner advances to the Final Four

Between Connecticut and Iowa St. | Connecticut won!


'Connecticut'

In [27]:
# West Region - Sweet 16
predict('North Carolina', 'Alabama')
predict('Clemson', 'Arizona')

Between North Carolina and Alabama | North Carolina won!
Between Clemson and Arizona | Arizona won!


'Arizona'

In [28]:
predict('North Carolina', 'Arizona') # West Region final: the winner advances to the Final Four

Between North Carolina and Arizona | Arizona won!


'Arizona'

In [29]:
# South Region - Sweet 16
predict('Houston', 'Duke')
predict('North Carolina St.', 'Marquette')

Between Houston and Duke | Houston won!
Between North Carolina St. and Marquette | Marquette won!


'Marquette'

In [30]:
predict('Houston', 'Marquette') # South Region final: the winner advances to the Final Four

Between Houston and Marquette | Houston won!


'Houston'

In [31]:
# Midwest Region - Sweet 16
predict('Purdue', 'Gonzaga')
predict('Creighton', 'Tennessee')

Between Purdue and Gonzaga | Purdue won!
Between Creighton and Tennessee | Tennessee won!


'Tennessee'

In [32]:
predict('Purdue', 'Tennessee') # Midwest Region final: the winner advances to the Final Four

Between Purdue and Tennessee | Purdue won!


'Purdue'

### Final Four and Finals!

In [33]:
# Final Four: East winner vs West winner
predict('Connecticut', 'Arizona')

Between Connecticut and Arizona | Connecticut won!


'Connecticut'

In [34]:
predict('Houston', 'Purdue') # Final Four: South winner vs Midwest winner

Between Houston and Purdue | Houston won!


'Houston'

In [35]:
# CHAMPIONSHIP: the two teams predicted to win the Final Four games
predict('Connecticut', 'Houston')

Between Connecticut and Houston | Connecticut won!


'Connecticut'