# March Madness 2024 | ML Bracket Prediction
Author Glen Joy (c) 2024

This notebook trains a random-forest model on historical NCAA college basketball statistics to make formulate a predicted bracket for the 2024 season.

In [1]:
import pandas as pd
import numpy as np

## 1. Reading in Training Data
We will incorporate data from a variety of sources. This includes 'traditional' regular season team stats such as winning percentage, 3-pt percentage, free throw percentage, etc. We will also use a team's historic tournament performance. 

Since these are yearly stats, they will be merged on (Team Name, Year) where (Team Name, Year) is set as the multi-index.

Additionally, for funzies, we'll incorporate external 'nontraditional' factors such as public rankings, school spirit, academic ranking, etc.

### Data Source 1: Historic Team Stats

In [2]:
# Reading in historic team season stats data. Has data from 2013 to 2023
df = pd.read_csv('./data/archive1/cbb.csv')
df.head()

Unnamed: 0,TEAM,CONF,G,W,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,...,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,WAB,POSTSEASON,SEED,YEAR
0,North Carolina,ACC,40,33,123.3,94.9,0.9531,52.6,48.1,15.4,...,30.4,53.9,44.6,32.7,36.2,71.7,8.6,2ND,1.0,2016
1,Wisconsin,B10,40,36,129.1,93.6,0.9758,54.8,47.7,12.4,...,22.4,54.8,44.7,36.5,37.5,59.3,11.3,2ND,1.0,2015
2,Michigan,B10,40,33,114.4,90.4,0.9375,53.9,47.7,14.0,...,30.0,54.7,46.8,35.2,33.2,65.9,6.9,2ND,3.0,2018
3,Texas Tech,B12,38,31,115.2,85.2,0.9696,53.5,43.0,17.7,...,36.6,52.8,41.9,36.5,29.7,67.5,7.0,2ND,3.0,2019
4,Gonzaga,WCC,39,37,117.8,86.3,0.9728,56.6,41.1,16.2,...,26.9,56.3,40.0,38.2,29.0,71.5,7.7,2ND,1.0,2017


### Data Source 2: Historic Tournament Performance

In [3]:
# Reading in historic tournament matchups
df2 = pd.read_csv('./data/archive/Tournament Matchups.csv')
df2 = df2[df2['YEAR'] != 2024] # removing 2024 projected matchups
df2.head()

Unnamed: 0,YEAR,BY YEAR NO,BY ROUND NO,TEAM NO,TEAM,SEED,ROUND,CURRENT ROUND,SCORE
148,2023,1888,1888,1011,Alabama,1,16,64,96.0
149,2023,1887,1887,955,Texas A&M Corpus Chris,16,64,64,75.0
150,2023,1886,1886,979,Maryland,8,32,64,67.0
151,2023,1885,1885,945,West Virginia,9,64,64,65.0
152,2023,1884,1884,961,San Diego St.,5,2,64,63.0


### Data Source 3: Conference Stats

In [4]:
df3 = pd.read_csv('./data/archive/Conference Stats.csv') # this is regular season only!
df3.head()

Unnamed: 0,YEAR,CONF ID,CONF,BADJ EM,BADJ O,BADJ D,BARTHAG,G,W,L,...,AVG HGT,EFF HGT,EXP,TALENT,FT%,OP FT%,PPPO,PPPD,ELITE SOS,WAB
0,2024,7,B12,16.8,113.2,96.4,0.864,375,243,132,...,77.615,80.941,2.092,48.691,72.0,71.2,1.099,0.986,29.01,1.0
1,2024,6,B10,15.1,113.6,98.5,0.838,386,233,153,...,77.906,81.153,1.987,57.308,72.4,72.3,1.099,1.028,29.652,0.1
2,2024,28,SEC,14.2,113.8,99.6,0.822,378,232,146,...,77.608,81.094,2.083,48.953,73.0,72.6,1.107,1.036,28.276,-0.1
3,2024,8,BE,13.6,112.6,99.0,0.815,300,173,127,...,77.727,80.761,2.107,50.457,74.5,73.2,1.089,1.032,30.625,-0.4
4,2024,2,ACC,11.5,110.9,99.4,0.779,410,241,169,...,77.902,81.051,1.648,61.688,73.6,72.8,1.081,1.026,26.024,-1.3


### Data Source 4: ABC 538 Power Rating

In [5]:
df4 = pd.read_csv('./data/archive/538 Ratings.csv')
df4.head()

Unnamed: 0,YEAR,TEAM NO,TEAM,SEED,ROUND,POWER RATING,POWER RATING RANK
0,2023,993,Houston,1,16,93.2,13
1,2023,1011,Alabama,1,16,92.6,16
2,2023,986,Kansas,1,32,89.6,49
3,2023,963,Purdue,1,64,89.5,50
4,2023,957,Texas,2,8,90.1,43


### Data Source 5: Tournament Team Resume Stats

In [6]:
df5 = pd.read_csv('./data/archive/Resumes.csv')
df5.head()

Unnamed: 0,YEAR,TEAM NO,TEAM,SEED,ROUND,NET RPI,RESUME,WAB RANK,ELO,B POWER,Q1 W,Q2 W,Q1 PLUS Q2 W,Q3 Q4 L,PLUS 500,R SCORE,BID TYPE
0,2023,1011,Alabama,1,16,2,3,1,1,2.7,13,6,19,0,24,99.9,Auto
1,2023,1010,Arizona,2,64,10,7,7,10,9.3,9,7,16,0,22,99.8,Auto
2,2023,1009,Arizona St.,11,64,66,32,39,60,70.0,5,4,9,1,10,43.9,At-Large
3,2023,1008,Arkansas,8,16,21,37,48,61,18.7,4,4,8,1,7,88.1,At-Large
4,2023,1007,Auburn,9,32,32,49,42,59,27.3,3,6,9,1,8,84.9,At-Large


## 2. Merging, Fusing, and Deconflicting Datasets
We will combine all of these datasets into a large one to train using.

In [7]:
dfs = [df, df2, df4, df5] # we didnt include df3 since it doesnt have a Team column to merge on
for d in dfs:
    d.set_index(['TEAM', 'YEAR'], inplace=True)

In [8]:
cdf = pd.concat(dfs) # combining all the datasets
cdf.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,CONF,G,W,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,TORD,...,WAB RANK,ELO,B POWER,Q1 W,Q2 W,Q1 PLUS Q2 W,Q3 Q4 L,PLUS 500,R SCORE,BID TYPE
TEAM,YEAR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
North Carolina,2016,ACC,40.0,33.0,123.3,94.9,0.9531,52.6,48.1,15.4,18.2,...,,,,,,,,,,
Wisconsin,2015,B10,40.0,36.0,129.1,93.6,0.9758,54.8,47.7,12.4,15.8,...,,,,,,,,,,
Michigan,2018,B10,40.0,33.0,114.4,90.4,0.9375,53.9,47.7,14.0,19.5,...,,,,,,,,,,
Texas Tech,2019,B12,38.0,31.0,115.2,85.2,0.9696,53.5,43.0,17.7,22.8,...,,,,,,,,,,
Gonzaga,2017,WCC,39.0,37.0,117.8,86.3,0.9728,56.6,41.1,16.2,17.1,...,,,,,,,,,,


In [15]:
cdf = cdf[cdf.index.get_level_values('YEAR') >= 2016] # using only 2016 and up since that is where all the datasets overlap

In [14]:
cdf.merge(df3, how='inner', on=)

False

In [16]:
df3.head()

Unnamed: 0,YEAR,CONF ID,CONF,BADJ EM,BADJ O,BADJ D,BARTHAG,G,W,L,...,AVG HGT,EFF HGT,EXP,TALENT,FT%,OP FT%,PPPO,PPPD,ELITE SOS,WAB
0,2024,7,B12,16.8,113.2,96.4,0.864,375,243,132,...,77.615,80.941,2.092,48.691,72.0,71.2,1.099,0.986,29.01,1.0
1,2024,6,B10,15.1,113.6,98.5,0.838,386,233,153,...,77.906,81.153,1.987,57.308,72.4,72.3,1.099,1.028,29.652,0.1
2,2024,28,SEC,14.2,113.8,99.6,0.822,378,232,146,...,77.608,81.094,2.083,48.953,73.0,72.6,1.107,1.036,28.276,-0.1
3,2024,8,BE,13.6,112.6,99.0,0.815,300,173,127,...,77.727,80.761,2.107,50.457,74.5,73.2,1.089,1.032,30.625,-0.4
4,2024,2,ACC,11.5,110.9,99.4,0.779,410,241,169,...,77.902,81.051,1.648,61.688,73.6,72.8,1.081,1.026,26.024,-1.3


In [17]:
df.index.get_level_values('YEAR').max()

2023