# March Madness 2024 | ML Bracket Prediction
Author Glen Joy (c) 2024

This notebook trains a random-forest model on historical NCAA college basketball statistics to formulate a predicted bracket for the 2024 season.

In [1]:
import pandas as pd
import numpy as np

## 1. Reading in Training Data
We will incorporate data from a variety of sources. This includes 'traditional' regular season team stats such as winning percentage, 3-pt percentage, free throw percentage, etc. We will also use a team's historic tournament performance. 

Since these are yearly stats, they will be merged on (Team Name, Year) where (Team Name, Year) is set as the multi-index.

Additionally, for funzies, we'll incorporate external 'nontraditional' factors such as public rankings, school spirit, academic ranking, etc.

### Data Source 1: Historic Team Stats

In [2]:
# Load the per-team season statistics (the file covers the 2013-2023 seasons).
# NOTE(review): this file is read from ./data/archive1 while the other sources
# come from ./data/archive — confirm the directory is intentional.
df = pd.read_csv(filepath_or_buffer='./data/archive1/cbb.csv')
df.head(5)

Unnamed: 0,TEAM,CONF,G,W,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,...,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,WAB,POSTSEASON,SEED,YEAR
0,North Carolina,ACC,40,33,123.3,94.9,0.9531,52.6,48.1,15.4,...,30.4,53.9,44.6,32.7,36.2,71.7,8.6,2ND,1.0,2016
1,Wisconsin,B10,40,36,129.1,93.6,0.9758,54.8,47.7,12.4,...,22.4,54.8,44.7,36.5,37.5,59.3,11.3,2ND,1.0,2015
2,Michigan,B10,40,33,114.4,90.4,0.9375,53.9,47.7,14.0,...,30.0,54.7,46.8,35.2,33.2,65.9,6.9,2ND,3.0,2018
3,Texas Tech,B12,38,31,115.2,85.2,0.9696,53.5,43.0,17.7,...,36.6,52.8,41.9,36.5,29.7,67.5,7.0,2ND,3.0,2019
4,Gonzaga,WCC,39,37,117.8,86.3,0.9728,56.6,41.1,16.2,...,26.9,56.3,40.0,38.2,29.0,71.5,7.7,2ND,1.0,2017


### Data Source 2: Historic Tournament Performance

In [3]:
# Load the historic tournament matchup records.
df2 = pd.read_csv(filepath_or_buffer='./data/archive/Tournament Matchups.csv')
# Drop the 2024 rows: they are projected matchups, not played games.
df2 = df2.loc[df2['YEAR'].ne(2024)]
df2.head(5)

Unnamed: 0,YEAR,BY YEAR NO,BY ROUND NO,TEAM NO,TEAM,SEED,ROUND,CURRENT ROUND,SCORE
148,2023,1888,1888,1011,Alabama,1,16,64,96.0
149,2023,1887,1887,955,Texas A&M Corpus Chris,16,64,64,75.0
150,2023,1886,1886,979,Maryland,8,32,64,67.0
151,2023,1885,1885,945,West Virginia,9,64,64,65.0
152,2023,1884,1884,961,San Diego St.,5,2,64,63.0


### Data Source 3: Conference Stats

In [4]:
# Load the conference-level statistics (regular season only).
df3 = pd.read_csv(filepath_or_buffer='./data/archive/Conference Stats.csv')
# Drop the 2024 rows: that season's data is incomplete.
df3 = df3.loc[df3['YEAR'].ne(2024)]
df3.head(5)

Unnamed: 0,YEAR,CONF ID,CONF,BADJ EM,BADJ O,BADJ D,BARTHAG,G,W,L,...,AVG HGT,EFF HGT,EXP,TALENT,FT%,OP FT%,PPPO,PPPD,ELITE SOS,WAB
33,2023,7,B12,18.0,111.8,93.8,0.883,327,206,121,...,77.238,80.522,2.14,55.266,72.5,71.7,1.059,0.978,34.92,2.9
34,2023,6,B10,14.2,110.4,96.2,0.83,456,267,189,...,78.053,80.919,1.845,55.026,71.1,70.7,1.061,1.001,31.036,0.2
35,2023,8,BE,13.0,110.8,97.8,0.808,360,202,158,...,77.609,80.679,1.935,54.668,73.5,72.7,1.065,1.019,30.79,-1.0
36,2023,28,SEC,12.5,109.2,96.7,0.802,460,272,188,...,77.488,80.909,1.968,47.67,71.1,71.5,1.053,0.998,28.681,-0.8
37,2023,24,P12,11.4,107.9,96.5,0.783,393,214,179,...,78.397,81.439,1.855,55.558,71.7,71.0,1.03,0.991,28.629,-2.2


### Data Source 4: ABC 538 Power Rating

In [5]:
# Load the 538 power ratings for tournament teams.
df4 = pd.read_csv(filepath_or_buffer='./data/archive/538 Ratings.csv')
df4.head(5)

Unnamed: 0,YEAR,TEAM NO,TEAM,SEED,ROUND,POWER RATING,POWER RATING RANK
0,2023,993,Houston,1,16,93.2,13
1,2023,1011,Alabama,1,16,92.6,16
2,2023,986,Kansas,1,32,89.6,49
3,2023,963,Purdue,1,64,89.5,50
4,2023,957,Texas,2,8,90.1,43


### Data Source 5: Tournament Team Resume Stats

In [6]:
# Load the tournament-team resume statistics.
df5 = pd.read_csv(filepath_or_buffer='./data/archive/Resumes.csv')
df5.head(5)

Unnamed: 0,YEAR,TEAM NO,TEAM,SEED,ROUND,NET RPI,RESUME,WAB RANK,ELO,B POWER,Q1 W,Q2 W,Q1 PLUS Q2 W,Q3 Q4 L,PLUS 500,R SCORE,BID TYPE
0,2023,1011,Alabama,1,16,2,3,1,1,2.7,13,6,19,0,24,99.9,Auto
1,2023,1010,Arizona,2,64,10,7,7,10,9.3,9,7,16,0,22,99.8,Auto
2,2023,1009,Arizona St.,11,64,66,32,39,60,70.0,5,4,9,1,10,43.9,At-Large
3,2023,1008,Arkansas,8,16,21,37,48,61,18.7,4,4,8,1,7,88.1,At-Large
4,2023,1007,Auburn,9,32,32,49,42,59,27.3,3,6,9,1,8,84.9,At-Large


## 2. Merging, Fusing, and Deconflicting Datasets
We will combine all of these datasets into one large dataset to train on.

In [7]:
# Index every team-level dataset by (TEAM, YEAR) so they share a common key.
# df3 is not included because it is conference-level and has no TEAM column.
dfs = [df, df2, df4, df5]
for d in dfs:
    # Guard so that re-running this cell is a no-op: once TEAM/YEAR have been
    # moved into the index they are no longer columns, and calling set_index
    # on them again would raise a KeyError.
    if list(d.index.names) != ['TEAM', 'YEAR']:
        d.set_index(['TEAM', 'YEAR'], inplace=True)

In [8]:
# Stack the indexed datasets into one frame (row-wise, outer union of columns).
# NOTE(review): this appends rows rather than joining on (TEAM, YEAR), so a
# column that exists in only one source is NaN on rows coming from the others
# (see the NaN columns in the preview). Confirm a row-wise stack is intended
# rather than a key-based join.
cdf = pd.concat(objs=dfs, axis=0, join='outer')
cdf.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,CONF,G,W,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,TORD,...,WAB RANK,ELO,B POWER,Q1 W,Q2 W,Q1 PLUS Q2 W,Q3 Q4 L,PLUS 500,R SCORE,BID TYPE
TEAM,YEAR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
North Carolina,2016,ACC,40.0,33.0,123.3,94.9,0.9531,52.6,48.1,15.4,18.2,...,,,,,,,,,,
Wisconsin,2015,B10,40.0,36.0,129.1,93.6,0.9758,54.8,47.7,12.4,15.8,...,,,,,,,,,,
Michigan,2018,B10,40.0,33.0,114.4,90.4,0.9375,53.9,47.7,14.0,19.5,...,,,,,,,,,,
Texas Tech,2019,B12,38.0,31.0,115.2,85.2,0.9696,53.5,43.0,17.7,22.8,...,,,,,,,,,,
Gonzaga,2017,WCC,39.0,37.0,117.8,86.3,0.9728,56.6,41.1,16.2,17.1,...,,,,,,,,,,


In [9]:
# Keep only the 2016 season onward, the span where all the datasets overlap.
year_level = cdf.index.get_level_values('YEAR')
cdf = cdf.loc[year_level >= 2016]

In [10]:
# Attach the conference-level stats to each row, matching on (CONF, YEAR).
# NOTE(review): 'YEAR' is an index level of cdf at this point; the displayed
# result no longer carries TEAM, so team identity appears to be lost in this
# merge — confirm, and reset the index first if TEAM must be kept.
# NOTE(review): the default inner join keeps only rows whose (CONF, YEAR) pair
# exists in df3 — verify that no wanted rows are dropped.
cdf = pd.merge(cdf, df3, how='inner', on=['CONF', 'YEAR'])

In [14]:
# Show the merged frame; the notebook renders a truncated view (first and last rows).
cdf

Unnamed: 0,CONF,YEAR,G_x,W_x,ADJOE,ADJDE,BARTHAG_x,EFG_O,EFG_D,TOR,...,AVG HGT,EFF HGT,EXP,TALENT,FT%,OP FT%,PPPO,PPPD,ELITE SOS,WAB_y
0,ACC,2016,40.0,33.0,123.3,94.9,0.9531,52.6,48.1,15.4,...,77.971,81.006,1.650,68.794,71.2,69.6,1.096,1.024,31.563,0.7
1,B10,2018,40.0,33.0,114.4,90.4,0.9375,53.9,47.7,14.0,...,77.572,80.791,1.590,61.112,71.0,71.1,1.083,1.005,29.954,-0.5
2,B12,2019,38.0,31.0,115.2,85.2,0.9696,53.5,43.0,17.7,...,77.329,80.223,1.570,57.441,69.5,70.1,1.049,0.987,34.332,2.2
3,WCC,2017,39.0,37.0,117.8,86.3,0.9728,56.6,41.1,16.2,...,77.218,80.671,1.642,21.039,70.0,68.5,1.041,1.015,20.152,-5.8
4,ACC,2019,38.0,35.0,123.0,89.9,0.9736,55.2,44.7,14.7,...,77.665,80.702,1.566,60.430,72.3,70.6,1.062,0.981,31.784,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2469,MAC,2023,34.0,27.0,119.9,109.6,0.7369,56.3,52.9,13.6,...,76.702,79.672,1.945,11.717,72.6,71.5,1.060,1.073,13.306,-10.0
2470,ASun,2023,33.0,27.0,111.4,97.3,0.8246,55.5,49.3,16.0,...,76.536,79.582,1.960,9.599,72.4,72.6,1.041,1.067,14.623,-8.9
2471,WAC,2023,34.0,28.0,107.1,94.6,0.8065,51.7,44.0,19.3,...,77.075,79.663,2.117,6.719,70.5,71.5,1.023,1.024,16.685,-7.0
2472,CUSA,2023,38.0,29.0,112.4,97.0,0.8453,50.3,47.3,17.3,...,76.989,80.325,2.036,16.529,71.7,72.1,1.042,1.023,18.116,-5.9


## 3. Training

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics