In [49]:
# Load required packages
library(dplyr,warn.conflicts=F)
library(stringr,warn.conflicts=F)
library(psych,warn.conflicts=F)
library(lplyr,warn.conflicts=F)

In [50]:
# Read the tournament seeds file, then keep only the TeamIDs seeded in 2019.
# Season is re-attached as a constant so later joins can key on
# (Season, TeamID).
data.madnessteams <- read.csv(
  file = '../2019/NCAATourneySeeds.csv',
  stringsAsFactors = FALSE
)
df.tourneyteams <- data.madnessteams %>%
  filter(Season == 2019) %>%
  select(TeamID) %>%
  mutate(Season = 2019)

In [51]:
# Sagarin Rating 2019
# https://ux.kitsapsun.com/sports/ncaab/sagarin/201?/team/
# Load the ratings table from disk and preview its first rows.
data.sagarin <- read.csv(
  file = '../2019//Sagarin.csv',
  stringsAsFactors = FALSE
)
head(data.sagarin)

Rank,Team,Rating,Sched,Predictor,Golden,Recent
1,Duke,96.48,82.47,96.45,96.11,95.65
2,Virginia,95.31,80.8,95.37,95.79,92.82
3,North Carolina,94.55,82.58,94.39,93.87,95.9
4,Michigan State,94.06,82.45,93.88,93.75,94.47
5,Gonzaga,93.68,74.88,95.89,94.06,86.65
6,Kentucky,91.29,81.36,91.04,92.05,90.35


In [52]:
# Load the two reference tables used for name matching:
#   data.teams         - one row per team (TeamID, TeamName, ...)
#   data.teamspellings - known spelling variants of team names
data.teams <- readRDS(file = '../datafiles_rds/Teams.rds')
data.teamspellings <- readRDS(file = '../datafiles_rds/TeamSpellings.rds')

In [53]:
# Look up each Sagarin team in the spellings table via its lower-cased name.
# A left join keeps every Sagarin row, so names with no known spelling stay
# in the table with the joined columns set to NA.
df.sagarin <- data.sagarin %>%
  mutate(Name = str_to_lower(Team)) %>%
  left_join(data.teamspellings, by = c('Name' = 'TeamNameSpelling')) %>%
  mutate(Season = 2019)

In [54]:
# List tourney teams that have no usable Sagarin row (Rank is NA after the
# join), typically because the Sagarin name matched no known spelling.
# Team details are pulled from data.teams for readability.
df.tourneyteams %>%
  left_join(df.sagarin, by = 'TeamID') %>%
  filter(is.na(Rank)) %>%
  select(TeamID) %>%
  inner_join(data.teams, by = 'TeamID') %>%
  distinct() %>%
  arrange(TeamID)

TeamID,TeamName,FirstD1Season,LastD1Season
1233,Iona,1985,2019
1388,St Mary's CA,1985,2019
1433,VA Commonwealth,1985,2019


In [55]:
# Manually assign the TeamID for the three tourney teams whose Sagarin name
# was not found in the spellings table:
#   1433 VA Commonwealth, 1388 St Mary's CA, 1233 Iona.
# The patches are applied one after another, in this order.
df.sagarin <- df.sagarin %>%
  mutate_which(str_detect(Name, 'commonwealth'), TeamID = 1433) %>%
  mutate_which(str_detect(Name, 'saint mary'), TeamID = 1388) %>%
  mutate_which(str_detect(Name, '^iona'), TeamID = 1233)

In [56]:
# Re-run the missing-team check after the manual TeamID patches; an empty
# result means every tourney team now has a Sagarin row.
df.tourneyteams %>%
  left_join(df.sagarin, by = 'TeamID') %>%
  filter(is.na(Rank)) %>%
  select(TeamID) %>%
  inner_join(data.teams, by = 'TeamID') %>%
  distinct() %>%
  arrange(TeamID)

TeamID,TeamName,FirstD1Season,LastD1Season


In [57]:
# Keep only the Sagarin rows that belong to tourney teams and reduce the
# table to the three columns used downstream.
df.sagarin <- df.sagarin %>%
  inner_join(df.tourneyteams, by = c('TeamID', 'Season')) %>%
  select(Season, TeamID, Rating)

In [58]:
# Sanity check: each Season should contain 68 rated teams.
df.sagarin %>%
  group_by(Season) %>%
  count()

Season,n
2019,68


In [59]:
# Sanity check: tourney teams with no rating row for their Season.
# An empty result means nothing is missing.
anti_join(df.tourneyteams, df.sagarin, by = c('Season', 'TeamID'))

TeamID,Season


#### Predictions

In [60]:
# Build every unordered pair of 2019 tourney teams.
# Joining on Season alone pairs each 2019 team (TeamID.x) with each seeded
# team of the same Season (TeamID.y); keeping TeamID.x < TeamID.y retains one
# row per pair and drops self-pairs.
# An inner join is used rather than a full join: a full join would also emit
# one NA-padded row for every seed of every other season in
# data.madnessteams, all of which the filter below would discard anyway.
# The Seed column (belonging to TeamID.y) is carried through unchanged.
data.matchups <- df.tourneyteams %>%
  inner_join(data.madnessteams, by = 'Season') %>%
  filter(TeamID.x < TeamID.y) %>%
  select(-Season)

In [67]:
# Slope of the logistic curve used for the predictions: chosen so that the
# rating gap between the best team and the Qtl-quantile team maps to a
# logistic argument of 3.
Qtl <- 0.35
Slope <- 3 / (
  max(df.sagarin$Rating) -
    quantile(df.sagarin$Rating, probs = Qtl, names = FALSE)
)
Slope

In [68]:
# Preview the first rows of the tourney-only ratings table.
head(df.sagarin)

Season,TeamID,Rating
2019,1181,96.48
2019,1438,95.31
2019,1314,94.55
2019,1277,94.06
2019,1211,93.68
2019,1246,91.29


In [74]:
# Attach the Sagarin rating of both sides of every matchup.
# The first join brings in the rating of TeamID.x; its Season column and the
# leftover Seed column are dropped so that the second join (rating of
# TeamID.y) yields Rating.x / Rating.y and a single Season column.
df.matchups <- data.matchups %>%
  inner_join(df.sagarin, by = c('TeamID.x' = 'TeamID')) %>%
  select(-c(Season, Seed)) %>%
  inner_join(df.sagarin, by = c('TeamID.y' = 'TeamID'))

In [77]:
# Preview the first rows of the rated matchups.
head(df.matchups)

TeamID.x,TeamID.y,Rating.x,Season,Rating.y
1181,1277,96.48,2019,94.06
1181,1261,96.48,2019,86.98
1181,1439,96.48,2019,88.64
1181,1280,96.48,2019,86.57
1181,1268,96.48,2019,85.49
1181,1257,96.48,2019,86.35


In [79]:
# Matchup predictions: compute the winning probability of TeamID.x from the
# rating difference, then clamp it into [0.05, 0.95] to limit the log-loss
# penalty when a confident prediction turns out wrong.
#   ID   - "<Season>_<TeamID.x>_<TeamID.y>"
#   Pred - 1 when the unrounded, unclamped probability exceeds 0.5, else 0
#   Prob - probability rounded to 3 decimals, then clamped
df.submit <- df.matchups %>%
  mutate(
    ID = str_c(Season, '_', TeamID.x, '_', TeamID.y),
    Prob = logistic(Rating.x - Rating.y, a = Slope),
    Pred = ifelse(Prob > 0.5, 1, 0),
    Prob = pmin(pmax(round(Prob, 3), 0.05), 0.95)
  ) %>%
  select(ID, Pred, Prob)
sample_n(df.submit, 5)

ID,Pred,Prob
2019_1439_1459,1,0.695
2019_1138_1305,0,0.483
2019_1234_1439,0,0.256
2019_1308_1403,0,0.144
2019_1318_1416,0,0.235


In [80]:
# Write the Kaggle submission file: one row per matchup ID, with the clamped
# probability exported under the column name Pred.
write.csv(
  select(df.submit, ID, Pred = Prob),
  file = '../2019/SagarinRating.csv',
  quote = FALSE,
  row.names = FALSE
)

In [82]:
# Persist the full prediction table (ID, Pred, Prob) so it can be blended
# with other models later.
saveRDS(df.submit, file = '../2019/SagarinRating.rds')