**1. Getting Started**

Loading packages and reading all of the .csv files into data frames with the same names.

This notebook contains R code to make a "chalk" bracket where the favorites win every game. It works by finding all of the teams that could win each game in the tournament and giving the win to the highest rated team amongst them.

In [1]:
library(data.table)
library(tidyverse)
library(stringr)

# Directory that holds the competition's .csv files. The same location is used
# both for listing and for reading, so the two steps cannot point at
# different places.
data_dir <- "../input/march-machine-learning-mania-2025"

# Only pick up .csv files. The pattern is anchored and the dot is escaped so
# that just a trailing ".csv" extension matches.
files <- list.files(path = data_dir, pattern = "\\.csv$")
df_names <- sub("\\.csv$", "", files)

# Read every file into a global object named after the file (minus ".csv").
for (i in df_names) {
  filepath <- file.path(data_dir, paste0(i, ".csv"))
  assign(i, fread(filepath))
}


# to see list of available data frames
#ls()

SyntaxError: invalid syntax. Perhaps you forgot a comma? (1194816113.py, line 9)

**2. Reading in Team Ratings and matching them with team IDs**

In [None]:
# Men's ratings file: keep column 1 (team name) and column 14 (rating).
# Source note carried over from the original notebook:
#   from https://masseyratings.com/cbw/ncaa-d1/ratings and selecting more/export
msilver <- read.csv("/kaggle/input/msilver/msilver.csv")[, c(1, 14)]
names(msilver) <- c("TeamName", "Power")

# Women's ratings file: keep column 1 (team name) and column 11 (rating).
# Source note carried over from the original notebook:
#   from copying and pasting https://kenpom.com/ into a spreadsheet
wsilver <- read.csv("/kaggle/input/wsilver/wsilver.csv")[, c(1, 11)]
names(wsilver) <- c("TeamName", "Power")

# Quick look at the first rows of each table.
head(msilver)
head(wsilver)

**3. Matching Team Ratings with Team IDs and Tournament Slots**

In [None]:
# Seed table used for the rest of the notebook.
# head(`2024_tourney_seeds`)
tourney_seeds <- `2024_tourney_seeds`

# head(WTeamSpellings)

# Extra lower-case spellings that are appended to the spelling lookup tables
# so that the corresponding rating rows can be matched to a TeamID.
M_new_spellings <- data.frame(
  TeamNameSpelling = c("southeast missouri st.", "texas a&m corpus chris"),
  TeamID = c(1369, 1394)
)

W_new_spellings <- data.frame(
  TeamNameSpelling = "tamu-corpus christi",
  TeamID = 3394
)

MTeamSpellings2 <- rbind(MTeamSpellings, M_new_spellings)
WTeamSpellings2 <- rbind(WTeamSpellings, W_new_spellings)

# Women's ratings: lower-case the name, then look up the TeamID by spelling.
wsilver_with_teamIDs <- wsilver %>%
  mutate(TeamName = tolower(TeamName)) %>%
  left_join(WTeamSpellings2, by = c("TeamName" = "TeamNameSpelling"))

# Men's ratings: strip digits from the name, trim whitespace and lower-case
# it, drop rows whose cleaned name is "team" or "ncsos" (presumably repeated
# header rows from the pasted table -- verify against the file), then look up
# the TeamID by spelling.
# A few team names aren't matched but they likely aren't in the tournament.
msilver_with_teamIDs <- msilver %>%
  mutate(
    TeamName = TeamName %>%
      str_remove_all("[:digit:]") %>%
      str_trim() %>%
      tolower()
  ) %>%
  filter(!(TeamName %in% c("team", "ncsos"))) %>%
  left_join(MTeamSpellings2, by = c("TeamName" = "TeamNameSpelling"))

# Stack women's and men's ratings and force Power to numeric.
full_ratings <- rbind(wsilver_with_teamIDs, msilver_with_teamIDs) %>%
  mutate(Power = as.numeric(Power))

# Attach a rating to every seeded team.
tourney_seeds_with_ratings <- tourney_seeds %>%
  left_join(full_ratings, by = "TeamID")

# Are there any teams without ratings?
tourney_seeds_with_ratings %>%
  filter(is.na(Power))

**POSSIBLE ALTERATION #1:**

Manually change one or more teams' ratings

In [None]:
# Step 1: look up the TeamID and current Power rating of the teams you may
# want to alter. The IDs noted below are the ones found by these lookups.

# team 3376
tourney_seeds_with_ratings %>%
  filter(Tournament == "W" & TeamName %like% "carolina")

# team 1222
tourney_seeds_with_ratings %>%
  filter(Tournament == "M" & TeamName %like% "houston")

# team 1163
tourney_seeds_with_ratings %>%
  filter(Tournament == "M" & TeamName %like% "conn")

# Step 2: overwrite the rating of the chosen teams and leave everyone else
# unchanged. As written, teams 3376 and 1163 are set to 200; team 1222 is
# looked up above but NOT altered.
boosted_team_ids <- c(3376, 1163)

tourney_seeds_with_ratings <- tourney_seeds_with_ratings %>%
  mutate(
    Power = case_when(
      TeamID %in% boosted_team_ids ~ 200,
      TRUE ~ Power
    )
  )

**4. Function to predict winner from two team ratings and HFA**

In [None]:
# kenpom adjEM is per 100 (multiply by 0.7) then use sd of 11
# 

# Win probability for team 1 under the men's model.
#
# pwr1, pwr2: numeric ratings of team 1 and team 2 (vectors are allowed).
# Returns the standard-normal CDF of the rating gap divided by 11.
msilver_wpct <- function(pwr1, pwr2) {
  rating_gap <- pwr1 - pwr2
  pnorm(rating_gap / 11)
}

# Win probability for team 1 under the women's model.
#
# pwr1, pwr2: numeric ratings of team 1 and team 2 (vectors are allowed).
# home: 1 if team 1 is at home, 0 for a neutral site, -1 if team 1 is away.
# The home flag shifts the rating gap by 2.73 before it is scaled by 11.5
# and passed through the standard-normal CDF.
wsilver_wpct <- function(pwr1, pwr2, home = 0) {
  home_edge <- 2.73 * home
  pnorm((pwr1 - pwr2 + home_edge) / 11.5)
}

# Dispatch to the men's or women's win-probability model, element by element.
#
# bracket: "M" selects msilver_wpct(); any other value selects wsilver_wpct().
# pwr1, pwr2: ratings of team 1 and team 2.
# home: home flag forwarded to wsilver_wpct() only (ignored for "M").
# Returns team 1's win probability.
wpct_function <- function(bracket, pwr1, pwr2, home = 0) {
  is_mens <- bracket == "M"
  ifelse(
    is_mens,
    msilver_wpct(pwr1, pwr2),
    wsilver_wpct(pwr1, pwr2, home)
  )
}

**POSSIBLE ALTERATION #2:**

Make the projections more aggressive by reducing the standard error in the projected winning margins:

**5. Determining the team slots that are candidates to win each tournament game**

In [None]:
# Split each Seed into its first character (region) and characters 2-3
# converted to a number (seed line).
tourney_seeds <- tourney_seeds %>%
  mutate(
    team_region = substr(Seed, 1, 1),
    team_rank = as.numeric(substr(Seed, 2, 3))
  )

# One row per slot in the sample submission, with the slot name sliced up by
# position: character 2 is the round, character 3 (or 3-4 after round 4) is
# the region code, character 4 is stored as Highest_Rank.
# NOTE: Round and Highest_Rank stay character; the comparisons against numbers
# below rely on R coercing the number to a string (single digits only).
games_to_predict <- sample_submission %>%
  mutate(
    Round = substr(Slot, 2, 2),
    Region = ifelse(Round <= 4, substr(Slot, 3, 3), substr(Slot, 3, 4)),
    Highest_Rank = substr(Slot, 4, 4)
  )

# Pair every slot with every seeded team of the same tournament; the filters
# below then remove teams that can't possibly win a given slot.
games_to_predict_with_teams <- games_to_predict %>%
  full_join(
    tourney_seeds,
    by = "Tournament",
    relationship = "many-to-many"
  )

# Filter by region: rounds up to 4 need the team's own region, "WX" and "YZ"
# slots accept either of their two regions, and "CH" accepts everyone.
games_to_predict_filtered_by_region <- games_to_predict_with_teams %>%
  filter(
    (Round <= 4 & Region == team_region) |
      (Region == "WX" & team_region %in% c("W", "X")) |
      (Region == "YZ" & team_region %in% c("Y", "Z")) |
      Region == "CH"
  )

# Filter by slot: fold the seed line onto the game index it feeds in rounds
# 1-3 and keep a team only where that index equals the slot's Highest_Rank.
# From round 4 on every remaining team is kept.
games_to_predict_filtered_by_region_slot <-
  games_to_predict_filtered_by_region %>%
  mutate(
    first_round_game = pmin(team_rank, 17 - team_rank),
    second_round_game = pmin(first_round_game, 9 - first_round_game),
    third_round_game = pmin(second_round_game, 5 - second_round_game)
  ) %>%
  filter(
    Round >= 4 |
      (Round == 1 & Highest_Rank == first_round_game) |
      (Round == 2 & Highest_Rank == second_round_game) |
      (Round == 3 & Highest_Rank == third_round_game)
  )

# Home-court flag: "yes" for women's teams with seed line <= 4 in rounds 1-2,
# "no" otherwise.
games_to_predict_with_possible_winners <-
  games_to_predict_filtered_by_region_slot %>%
  mutate(
    at_home = ifelse(
      Round <= 2 & Tournament == "W" & team_rank <= 4,
      "yes",
      "no"
    )
  )

# Attach each candidate team's Power rating.
games_to_predict_with_possible_winners <-
  games_to_predict_with_possible_winners %>%
  left_join(
    tourney_seeds_with_ratings %>% select(TeamID, Power),
    by = "TeamID"
  )

**6. Simulating the Tournament**

In [None]:
# Simulate `number_of_brackets` complete tournaments and stack the results in
# `sim_brackets` (columns: Tournament, Slot, Team, Bracket).
#
# Winners are drawn at random, so call set.seed() beforehand if the brackets
# need to be reproducible.
start_time <- Sys.time()

number_of_brackets <- 5000

# Every team that is a candidate for a round-1 slot starts each bracket
# "alive". This does not depend on the bracket, so it is computed once.
first_round_teams <-
  games_to_predict_with_possible_winners %>%
  filter(Round == 1) %>%
  pull(TeamID)

# Simulate one bracket round by round.
#
# bracket_num: identifier stored in the Bracket column of the result.
# Returns one row per played game with the Seed of the simulated winner in
# `Team`.
simulate_bracket <- function(bracket_num) {
  possible_winners <- first_round_teams
  round_results <- vector("list", 6)

  for (rnd in 1:6) {
    # Candidate rows for this round, restricted to teams that are still alive.
    round_possible_winners <-
      games_to_predict_with_possible_winners %>%
      filter(Round == rnd, TeamID %in% possible_winners) %>%
      select(Tournament, Slot, Seed, TeamID, at_home, Power)

    # Self-join on the slot to pair up the teams competing for it; keeping
    # only TeamID.x < TeamID.y leaves each pairing exactly once.
    round_games <-
      left_join(
        round_possible_winners,
        round_possible_winners,
        by = c("Tournament", "Slot"),
        relationship = "many-to-many"
      ) %>%
      filter(TeamID.x < TeamID.y) %>%
      mutate(
        home = case_when(
          at_home.x == "yes" ~ 1,
          at_home.y == "yes" ~ -1,
          TRUE ~ 0
        ),
        wpct = wpct_function(
          bracket = Tournament,
          pwr1 = Power.x,
          pwr2 = Power.y,
          home = home
        )
      )

    # A missing rating would otherwise turn into an NA winner silently.
    if (anyNA(round_games$wpct)) {
      stop(
        "Missing win probability in round ", rnd,
        "; check that every tournament team has a Power rating.",
        call. = FALSE
      )
    }

    # Draw all games of the round at once: team x wins with probability wpct.
    round_games <- round_games %>%
      mutate(
        x_wins = runif(n()) < wpct,
        winner_id = ifelse(x_wins, TeamID.x, TeamID.y),
        Team = ifelse(x_wins, Seed.x, Seed.y)
      )

    possible_winners <- round_games$winner_id
    round_results[[rnd]] <- round_games %>%
      select(Tournament, Slot, Team)
  }

  predicted_games <- bind_rows(round_results)
  predicted_games$Bracket <- bracket_num
  predicted_games
}

# Collect the brackets in a list and bind once instead of growing a data
# frame with rbind() on every iteration.
sim_brackets <- bind_rows(
  lapply(seq_len(number_of_brackets), simulate_bracket)
)

Sys.time() - start_time

**7. Creating the Submission Files for the Competition**

In [None]:
# Number the rows and write the submission file with the columns in the
# order RowId, Tournament, Bracket, Slot, Team.
# seq_len() keeps this well-defined even if sim_brackets has no rows
# (1:nrow() would produce c(1, 0) there).
sim_brackets$RowId <- seq_len(nrow(sim_brackets))
write.csv(
  sim_brackets %>%
    select(RowId, Tournament, Bracket, Slot, Team),
  "submission.csv",
  row.names = FALSE
)
#submission.csv

**8. Which teams were most likely to win the Championship??**

In [None]:
# Count how often each seed wins slot "R6CH" in the simulated brackets of one
# tournament, and show the six most frequent.
champion_counts <- function(tourney_code) {
  sim_brackets %>%
    filter(Tournament == tourney_code, Slot == "R6CH") %>%
    group_by(Team) %>%
    summarize(n = n()) %>%
    arrange(desc(n)) %>%
    head()
}

# Show the six highest-rated seeded teams of one tournament.
top_rated <- function(tourney_code) {
  tourney_seeds_with_ratings %>%
    filter(Tournament == tourney_code) %>%
    arrange(desc(Power)) %>%
    head()
}

# Women's tournament: simulated champions, then ratings for comparison.
champion_counts("W")
top_rated("W")

# Men's tournament: simulated champions, then ratings for comparison.
champion_counts("M")
top_rated("M")