In [80]:
# Load required packages.
#   dplyr   - data-frame verbs and joins
#   stringr - string helpers (str_to_lower, str_detect, str_c)
#   psych   - logistic()
#   lplyr   - mutate_which()
# Name-conflict warnings are silenced on attach. TRUE/FALSE are spelled out
# because T/F are ordinary variables that can be reassigned.
library(dplyr, warn.conflicts = FALSE)
library(stringr, warn.conflicts = FALSE)
library(psych, warn.conflicts = FALSE)
library(lplyr, warn.conflicts = FALSE)

In [81]:
# Read the tournament seeds file; it is kept whole in data.madnessteams
# because a later cell joins against it again.
data.madnessteams <- read.csv(
    '../2019/NCAATourneySeeds.csv',
    stringsAsFactors = FALSE
)

# TeamIDs of the 2019 tournament field, one row per seeded team.
df.tourneyteams <- data.madnessteams %>%
    filter(Season == 2019) %>%
    select(TeamID) %>%
    # re-attach Season as a constant column so later joins can key on it
    mutate(Season = 2019)

In [84]:
# Dunkel Index ratings (2014+)
# http://dunkelindex.com/rankings/basketball/ncaa/
data.dunkel <- read.csv('../2019/DunkelIndex.csv', stringsAsFactors = FALSE)

# Preview the first rows of the raw ratings table.
head(data.dunkel)

Rank,Team,WLRating,SEARating,RECRating
1,Gonzaga,86.58,79.287,82.039
2,Duke,86.438,79.955,76.182
3,Virginia,83.927,76.904,80.205
4,Tennessee,82.079,75.596,75.97
5,Michigan State,82.013,76.071,75.614
6,North Carolina,81.639,75.967,80.693


In [86]:
# Switch to the generic column names used in the rest of the notebook:
# Team becomes Name and WLRating becomes Rating.
data.dunkel <- rename(data.dunkel, Name = Team, Rating = WLRating)

In [87]:
# Load the team reference tables:
#   data.teams         - team list keyed by TeamID
#   data.teamspellings - known spelling variants of team names
data.teams <- readRDS('../datafiles_rds/Teams.rds')
data.teamspellings <- readRDS('../datafiles_rds/TeamSpellings.rds')

In [88]:
# Attach a TeamID to every Dunkel row by matching the lower-cased team name
# against the spelling-variants table. A left join keeps rows whose name has
# no known spelling (their TeamID stays NA); they are patched by hand later.
df.dunkel <- data.dunkel %>%
    mutate(LowName = str_to_lower(Name)) %>%
    left_join(
        data.teamspellings,
        by = c('LowName' = 'TeamNameSpelling')
    ) %>%
    mutate(Season = 2019)

In [89]:
# Show how the Dunkel file spells names starting with "St. Mary".
filter(data.dunkel, str_detect(Name, '^St. Mary'))

Rank,Name,Rating,SEARating,RECRating
49,St. Mary’s,67.303,64.332,65.961


In [90]:
# Tourney teams with no Dunkel row (typically a name-spelling mismatch).
# Starting from the tourney field, an NA Rank after the join marks a team
# that found no Dunkel rating; the team table then supplies readable names.
df.tourneyteams %>%
    left_join(df.dunkel, by = 'TeamID') %>%
    filter(is.na(Rank)) %>%
    select(TeamID) %>%
    inner_join(data.teams, by = 'TeamID') %>%
    distinct() %>%
    arrange(TeamID)

TeamID,TeamName,FirstD1Season,LastD1Season
1192,F Dickinson,1985,2019
1385,St John's,1985,2019
1387,St Louis,1985,2019
1388,St Mary's CA,1985,2019


In [91]:
# Fill in by hand the TeamIDs that the spellings join could not resolve in
# the Dunkel table. Each pattern is matched against the Dunkel team name:
#   'Dickinson' -> 1192, 'John' -> 1385, 'St. Louis' -> 1387,
#   '^St. Mary' -> 1388
df.dunkel <- df.dunkel %>%
    mutate_which(str_detect(Name, 'Dickinson'), TeamID = 1192) %>%
    mutate_which(str_detect(Name, 'John'), TeamID = 1385) %>%
    mutate_which(str_detect(Name, 'St. Louis'), TeamID = 1387) %>%
    mutate_which(str_detect(Name, '^St. Mary'), TeamID = 1388)

In [92]:
# Check whether any tourney team is still without a Dunkel row.
# An empty result means every team in the field now has a rating.
df.tourneyteams %>%
    left_join(df.dunkel, by = 'TeamID') %>%
    filter(is.na(Rank)) %>%
    select(TeamID) %>%
    inner_join(data.teams, by = 'TeamID') %>%
    distinct() %>%
    arrange(TeamID)

TeamID,TeamName,FirstD1Season,LastD1Season


In [93]:
# Keep only the Dunkel rows that belong to tourney teams and reduce the
# table to the three columns used downstream.
df.dunkel <- df.dunkel %>%
    inner_join(df.tourneyteams, by = c('TeamID', 'Season')) %>%
    select(Season, TeamID, Rating)

In [94]:
# Sanity check: each Season should contain 68 rated teams.
count(group_by(df.dunkel, Season))

Season,n
2019,68


In [95]:
# List tourney teams that have no rating row for their Season.
# An empty result means nothing is missing.
anti_join(df.tourneyteams, df.dunkel, by = c('Season', 'TeamID'))

TeamID,Season


#### Predictions

In [96]:
# Build every possible 2019 matchup as an unordered pair of tourney teams.
# Joining the 2019 field to the seeds table on Season pairs each team
# (TeamID.x) with every seeded team of that season (TeamID.y); keeping
# TeamID.x < TeamID.y leaves each pair once, lower ID first.
#
# inner_join is used instead of full_join: the seeds table is unfiltered, so
# a full join would also emit an NA-padded row for every seed of a season
# absent from df.tourneyteams, and those rows were only being discarded
# because the NA comparison in filter() drops them.
data.matchups <- df.tourneyteams %>%
    inner_join(data.madnessteams, by = 'Season') %>%
    filter(TeamID.x < TeamID.y) %>%
    select(-Season)

In [98]:
# Quantile of the rating distribution used as the reference point.
Qtl <- 0.65

# Slope = 3 / (highest rating - rating at the Qtl quantile).
# as.numeric() strips the name that quantile() attaches to its result.
Slope <- with(
    df.dunkel,
    3 / as.numeric(max(Rating) - quantile(Rating, Qtl))
)
Slope

In [100]:
# Preview the per-team ratings table.
head(df.dunkel)

Season,TeamID,Rating
2019,1211,86.58
2019,1181,86.438
2019,1438,83.927
2019,1397,82.079
2019,1277,82.013
2019,1314,81.639


In [101]:
# Attach both teams' ratings to every matchup.
# The first join adds Season and Rating for TeamID.x; Season and Seed are
# then dropped so that the second join (for TeamID.y) contributes the single
# Season column and the two ratings end up as Rating.x / Rating.y.
df.matchups <- data.matchups %>%
    inner_join(df.dunkel, by = c('TeamID.x' = 'TeamID')) %>%
    select(-Season, -Seed) %>%
    inner_join(df.dunkel, by = c('TeamID.y' = 'TeamID'))

In [102]:
# Preview the matchup table with both ratings attached.
head(df.matchups)

TeamID.x,TeamID.y,Rating.x,Season,Rating.y
1181,1277,86.438,2019,82.013
1181,1261,86.438,2019,74.161
1181,1439,86.438,2019,75.82
1181,1280,86.438,2019,73.124
1181,1268,86.438,2019,71.99
1181,1257,86.438,2019,72.018


In [103]:
# Matchup predictions: compute the winning probability of TeamID.x from the
# rating difference and cap it to [0.05, 0.95], to avoid too much log loss
# in case of a false positive or false negative.
df.submit <- df.matchups %>%
    mutate(
        ID = str_c(Season, '_', TeamID.x, '_', TeamID.y),
        Prob = logistic(Rating.x - Rating.y, a = Slope),
        # the hard pick is taken from the raw probability, before rounding
        Pred = ifelse(Prob > 0.5, 1, 0),
        # round to 3 decimals first, then clamp into [0.05, 0.95]
        Prob = pmin(pmax(round(Prob, 3), 0.05), 0.95)
    ) %>%
    select(ID, Pred, Prob)

# Show a random sample of five predictions.
sample_n(df.submit, 5)

ID,Pred,Prob
2019_1266_1387,1,0.927
2019_1196_1233,1,0.95
2019_1280_1305,0,0.353
2019_1113_1300,1,0.95
2019_1196_1266,0,0.358


In [106]:
# Save the results for the Kaggle submission: the ID plus the capped win
# probability, which is exported under the column name Pred.
# Values are written unquoted and without row names.
write.csv(
    select(df.submit, ID, Pred = Prob),
    '../predictions_2019/DunkelIndex.csv',
    quote = FALSE,
    row.names = FALSE
)

In [107]:
# Save the full prediction table (ID, Pred, Prob) for later model mixing.
saveRDS(df.submit, '../predictions_2019/DunkelIndex.rds')