<a href="https://colab.research.google.com/github/emilyginn/march_madness_bracket/blob/main/march_madness_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Set Up

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.api._v2.keras import regularizers

# Lift the column display limit so wide DataFrames are shown without truncated columns.
pd.options.display.max_columns = None

## Collect Data

In [None]:
# Seasons to load. 2020 is not in the list (presumably because no tournament
# was played that year -- confirm).
years = [2023, 2022, 2021, 2019, 2018, 2017, 2016, 2015, 2014]

# Folder holding one file per season, each named with the bare year.
season_dir = '/content/drive/MyDrive/march_madness_predictor_dir/'
# Unnamed columns that are removed from every season file.
spacer_columns = ['Unnamed: 8', 'Unnamed: 11', 'Unnamed: 14', 'Unnamed: 17', 'Unnamed: 20']

dataframes = {}

for year in years:
    season_stats = pd.read_csv(season_dir + str(year))
    season_stats = season_stats.drop(columns=spacer_columns)
    # The season goes in at position 2; later cells select columns by position.
    season_stats.insert(2, 'year', year)
    dataframes[year] = season_stats

# Stack every season into one table with a fresh 0..n-1 index.
data = pd.concat(dataframes.values(), ignore_index=True)

## Preprocessing

In [None]:
# Replace the suffixed W/L headers with descriptive names for the
# conference, home and away records.
record_column_names = {
    'W.1': 'conference_wins',
    'L.1': 'conference_losses',
    'W.2': 'home_wins',
    'L.2': 'home_losses',
    'W.3': 'away_wins',
    'L.3': 'away_losses',
}
data = data.rename(columns=record_column_names)

In [None]:
# Force every school name to text, then trim leading/trailing whitespace.
school_as_text = data['School'].astype(str)
data['School'] = school_as_text.str.strip()

In [None]:
## Account for 2023 teams, as they aren't specified in the data sets yet

# Hand-typed 2023 tournament field (68 entries). An entry only counts if it is
# exactly equal to the `School` text of a 2023 row.
# Fixed: "Iowa State", "Iowa" and "Iona" were typed with a lowercase "l" in
# place of the capital "I", so those three teams could never be flagged.
# NOTE(review): shortened names such as "Miami", "Saint Mary's", "TCU" and
# "A&M-Corpus Christi" should be checked against the 2023 file -- TODO confirm
# that all 68 entries match a row.
teams_2023 = ["Alabama","Houston","Kansas","Purdue","UCLA","Texas","Arizona","Marquette","Baylor","Gonzaga","Kansas State","Xavier","Connecticut","Tennessee","Indiana","Virginia","San Diego State",
              "Duke","Saint Mary's","Miami","Iowa State","Creighton","Kentucky","TCU","Texas A&M","Michigan State","Missouri","Northwestern","Memphis","Arkansas","Maryland","Iowa","Florida Atlantic",
              "West Virginia","Auburn","Illinois","Boise State","Penn State","Southern California","Utah State","NC State","Providence","Mississippi State","Pittsburgh","Arizona State","Nevada",
              "College of Charleston","Oral Roberts","Drake","Virginia Commonwealth","Kent State","Iona","Furman","Louisiana","Kennesaw State","UC Santa Barbara","Grand Canyon","Montana State","Vermont",
              "Colgate","Princeton","UNC Asheville","Northern Kentucky","Howard","A&M-Corpus Christi","Texas Southern","Southeast Missouri State","Fairleigh Dickinson"]

# Turn non-breaking spaces (U+00A0) into ordinary spaces so that both the
# " NCAA" marker test and the name lookup compare against plain text.
data['School'] = data['School'].str.replace('\xa0', ' ', regex=False)

is_2023 = data['year'] == 2023

# 2023 rows are flagged by membership in the hand-typed field.
in_2023_field = is_2023 & data['School'].isin(teams_2023)

# Every other season is flagged when the school name contains " NCAA".
has_ncaa_marker = ~is_2023 & data['School'].str.contains(' NCAA', regex=False, na=False)

# 1 = tournament team, 0 = everyone else.
data['NCAA'] = (in_2023_field | has_ncaa_marker).astype('int64')

In [None]:
# Keep only the rows flagged as tournament teams. The explicit .copy() makes
# this an independent DataFrame: later cells assign into `tournament_teams`,
# and doing that on a bare boolean-filtered slice raises pandas'
# SettingWithCopyWarning and leaves it unclear which object is modified.
tournament_teams = data[data["NCAA"] == 1].copy()

In [None]:
# Remove apostrophes from school names (the lookup tables loaded later have
# theirs removed the same way before the merge on `School`).
# .assign() builds a new DataFrame instead of writing into a possible slice of
# `data`, so no SettingWithCopyWarning is raised; regex=False makes the
# replacement a plain literal substitution.
tournament_teams = tournament_teams.assign(
    School=tournament_teams['School'].str.replace("'", "", regex=False)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tournament_teams['School'] = tournament_teams['School'].str.replace("'", "")


### Feature Normalization

In [None]:
# Min-max scale the columns at positions 17..29 separately for each season:
# (value - season minimum) / (season maximum - season minimum).
for year in tournament_teams['year'].unique():
    in_season = tournament_teams['year'] == year
    cols_to_normalize = tournament_teams.columns[17:30]

    season_values = tournament_teams.loc[in_season, cols_to_normalize]
    season_min = season_values.min()
    season_range = season_values.max() - season_min

    normalized_cols = (season_values - season_min) / season_range
    tournament_teams.loc[in_season, cols_to_normalize] = normalized_cols


In [None]:
# Same per-season min-max scaling, applied to the columns at positions 6..8.
for year in tournament_teams['year'].unique():
    in_season = tournament_teams['year'] == year
    cols_to_normalize = tournament_teams.columns[6:9]

    season_values = tournament_teams.loc[in_season, cols_to_normalize]
    season_min = season_values.min()
    season_range = season_values.max() - season_min

    normalized_cols = (season_values - season_min) / season_range
    tournament_teams.loc[in_season, cols_to_normalize] = normalized_cols

### Lookup Tables

In [None]:
# Seasons that have a "<year>_WAS.csv" lookup file; 2023 is not in this list.
years = [2022, 2021, 2019, 2018, 2017, 2016, 2015, 2014]

# Rebuilt here as {season: lookup table}; the merge cell below reads it.
dataframes = {}

for year in years:
  # Use a dedicated name for the lookup table. Binding it to `data` (as before)
  # overwrote the concatenated season-stats DataFrame, so re-running an earlier
  # preprocessing cell after this one would operate on a two-column table.
  was_table = pd.read_csv('/content/drive/MyDrive/march_madness_predictor_dir/'+str(year)+'_WAS.csv', header=None)
  # The files have no header row; name the two columns explicitly.
  was_table.columns = ["School", "WAS"]
  # Remove apostrophes so names line up with the cleaned tournament table.
  was_table['School'] = was_table['School'].str.replace("'", "", regex=False)
  dataframes[year] = was_table


In [None]:
# For each season with a lookup table, outer-join that season's tournament rows
# to the lookup on the school name. Rows present on only one side are kept,
# with NaN in the columns coming from the other side.
groups = {
    season: tournament_teams[tournament_teams["year"] == season].merge(
        was_table, how="outer", on="School"
    )
    for season, was_table in dataframes.items()
}

# Show the 2022 result for index labels 0 through 68.
groups[2022].loc[:68]

In [None]:
# Drop the 2022 rows that exist only in the WAS lookup. `year` is a column of
# the tournament table alone, so it is NaN exactly on lookup-only rows.
# This replaces the positional cut `.loc[:67, :]`, which assumed that unmatched
# lookup rows sit at the end of the outer merge; pandas >= 2.2 sorts outer-join
# keys, so a fixed cut-off could discard real tournament rows. Filtering by
# content is also safe to re-run.
season_2022 = groups[2022]
groups[2022] = season_2022[season_2022['year'].notna()]

In [None]:
# Stack the per-season merged tables under a fresh index, then replace every
# missing value with 0.
all_seasons = pd.concat(groups.values(), ignore_index=True)
model_data = all_seasons.fillna(0)

In [None]:
# Remove the `Rk` and `NCAA` columns from the modelling table.
# Rebinding (rather than inplace=True) together with errors="ignore" lets this
# cell be re-run: a second run finds the columns already gone and does nothing
# instead of raising KeyError.
model_data = model_data.drop(columns=['Rk', 'NCAA'], errors='ignore')

In [None]:
# Sanity check: display (row count, column count) of the assembled table.
model_data.shape

(544, 30)

## Neural Network

In [None]:
# Start from an empty sequential model; the following cells append its layers.
model = keras.models.Sequential()

In [None]:
# First hidden layer: 256 ReLU units with an L2 penalty of 1e-5 on the kernel,
# expecting 30 input values per sample.
# NOTE(review): 30 equals the column count of `model_data` displayed earlier,
# and that table still contains the School and WAS columns -- confirm the width
# of the feature matrix that is actually passed to training.
model.add(
    layers.Dense(
        units=256,
        activation='relu',
        kernel_regularizer=regularizers.l2(1e-5),
        input_shape=(30,),
    )
)

In [None]:
# Dropout with rate 0.1 after the 256-unit layer.
model.add(layers.Dropout(rate=0.1))

In [None]:
# Second hidden layer: 128 ReLU units, same L2 kernel penalty (1e-5).
model.add(
    layers.Dense(
        units=128,
        activation='relu',
        kernel_regularizer=regularizers.l2(1e-5),
    )
)

In [None]:
# Dropout with rate 0.1 after the 128-unit layer.
model.add(layers.Dropout(rate=0.1))

In [None]:
# Third hidden layer: 64 ReLU units, same L2 kernel penalty (1e-5).
model.add(
    layers.Dense(
        units=64,
        activation='relu',
        kernel_regularizer=regularizers.l2(1e-5),
    )
)

In [None]:
# Dropout with rate 0.1 after the 64-unit layer.
model.add(layers.Dropout(rate=0.1))

In [None]:
# Fourth hidden layer: 32 ReLU units, same L2 kernel penalty (1e-5).
model.add(
    layers.Dense(
        units=32,
        activation='relu',
        kernel_regularizer=regularizers.l2(1e-5),
    )
)

In [None]:
# Dropout with rate 0.1 after the 32-unit layer.
model.add(layers.Dropout(rate=0.1))

In [None]:
# Output layer: softmax over 11 classes (no regularizer on this layer).
model.add(layers.Dense(units=11, activation='softmax'))

In [None]:
# Configure training: RMSprop optimizer, categorical cross-entropy loss, and
# accuracy reported as a metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

## Sources

Data collected from: https://www.sports-reference.com/cbb/