In [14]:
# Load required packages
library(dplyr,warn.conflicts=F)
library(stringr,warn.conflicts=F)
library(psych,warn.conflicts=F)
library(lplyr,warn.conflicts=F)

In [15]:
# Read the detailed NCAA tournament game results ...
data.tourney <- readRDS('datafiles_rds/NCAATourneyDetailedResults.rds')
# ... and keep only the seasons after 2013 (i.e. 2014 onwards)
data.tourney <- filter(data.tourney, Season > 2013)

In [16]:
# List every distinct (Season, TeamID) pair that played a tournament game,
# taking the team from both the winner and the loser side of each game
df.tourneyteams <- bind_rows(
    select(data.tourney, Season, TeamID = WTeamID),
    select(data.tourney, Season, TeamID = LTeamID)
) %>%
    distinct(Season, TeamID)

In [106]:
# SevenOvertimes ratings, seasons 2014+
# http://sevenovertimes.com/teamstats.php?season=201?
# Strings are kept as character columns. FALSE is spelled out because `F` is
# an ordinary variable that can be reassigned, whereas FALSE is a reserved word.
data.sevenot <- read.csv('datafiles_csv/SevenOvertimes.csv', stringsAsFactors = FALSE)
# Preview the first rows
head(data.sevenot)

Season,Team.Rank,Team.Name,Team.Record,Conference.Name,Rating,AvgWinProb,Strength.of.Schedule,SOS.Rank,Strength.of.Schedule..1st.Order.,SOS..1st.Order..Rank,Pace,Pace.Rank,Luck,Luck.Rank
2014,1,Arizona,(33-5),Pac-12,0.654,0.752,0.537,52,,1,65.71,310,4.397,32
2014,2,Duke,(26-9),ACC,0.648,0.707,0.544,21,,1,72.91,89,1.241,134
2014,3,Florida,(36-3),SEC,0.644,0.723,0.548,10,,1,63.84,329,7.782,7
2014,4,Wisconsin,(30-8),Big Ten,0.641,0.679,0.544,20,,1,68.75,238,4.175,39
2014,5,Virginia,(30-7),ACC,0.64,0.692,0.549,6,,1,60.93,350,4.385,33
2014,6,Kansas,(25-10),Big XII,0.634,0.739,0.549,9,,1,74.54,56,-0.88,219


In [107]:
# Shorten two column names: Team.Name becomes Name, Team.Rank becomes Rank
data.sevenot <- rename(data.sevenot, Name = Team.Name, Rank = Team.Rank)

In [108]:
# Load the lookup tables used to attach a TeamID to the rating rows.
# Team name spelling variations: joined below on its TeamNameSpelling column
# to obtain the TeamID.
data.teamspellings <- readRDS('datafiles_rds/TeamSpellings.rds')
# Team reference table: joined below on TeamID to show the team names
# when listing unmatched tournament teams.
data.teams <- readRDS('datafiles_rds/Teams.rds')
# Team conferences table (not referenced by the cells visible in this notebook export).
data.conferences <- readRDS('datafiles_rds/TeamConferences.rds')

In [109]:
# Attach a TeamID to each rating row: the team name is lower-cased into LowName
# and looked up among the known spellings. Names without a matching spelling
# keep all their rating columns and get a missing TeamID.
df.ratingsystem <- left_join(
    mutate(data.sevenot, LowName = str_to_lower(Name)),
    data.teamspellings,
    by = c('LowName' = 'TeamNameSpelling')
)

In [110]:
# Spot check: show the rating rows whose name contains 'E Tenn St'
filter(df.ratingsystem, str_detect(Name, 'E Tenn St'))
# df.ratingsystem %>% filter(str_detect(Conference.Name,'Southern'))

Season,Rank,Name,Team.Record,Conference.Name,Rating,AvgWinProb,Strength.of.Schedule,SOS.Rank,Strength.of.Schedule..1st.Order.,SOS..1st.Order..Rank,Pace,Pace.Rank,Luck,Luck.Rank,LowName,TeamID
2014,236,E Tenn St,(16-14),Atlantic Sun,0.457,0.496,0.476,297,,1,76.45,28,1.115,137,e tenn st,
2015,175,E Tenn St,(14-14),Southern Conference,0.477,0.461,0.471,283,,1,72.35,34,1.09,107,e tenn st,
2016,152,E Tenn St,(22-12),Southern Conference,0.492,0.473,0.474,262,,1,76.38,56,5.897,1,e tenn st,
2017,71,E Tenn St,(25-8),Southern Conference,0.55,0.673,0.479,247,0.489,190,74.43,108,2.503,47,e tenn st,
2018,289,E Tenn St,(23-9),Southern Conference,0.458,0.586,0.471,300,0.488,176,70.4,257,4.234,60,e tenn st,


In [112]:
# Tournament teams whose TeamID does not appear in the SevenOvertimes ratings
# (typically because the name spelling was not found in the spellings table).
# Note: the match is on TeamID only, not on Season.
df.tourneyteams %>%
    left_join(df.ratingsystem, by = 'TeamID') %>%
    filter(is.na(Rank)) %>%
    select(TeamID) %>%
    inner_join(data.teams, by = 'TeamID') %>%
    distinct() %>%
    arrange(TeamID)

TeamID,TeamName,FirstD1Season,LastD1Season
1110,American Univ,1985,2019
1114,Ark Little Rock,1985,2019
1157,Coastal Car,1987,2019
1190,ETSU,1985,2019
1192,F Dickinson,1985,2019
1195,FL Gulf Coast,2008,2019
1203,G Washington,1985,2019
1245,Kent,1985,2019
1292,MTSU,1985,2019
1300,NC Central,2008,2019


In [113]:
# Manually set the TeamID for SevenOvertimes team names that the spelling
# lookup could not resolve.
# Each pattern is wrapped in fixed() so it is matched as a literal substring:
# without it, the '.' in names such as 'G. Washington' or 'St. Johns' is a
# regex wildcard that matches any character.
df.ratingsystem <- df.ratingsystem %>%
    mutate_which(str_detect(Name, fixed('American U')), TeamID = 1110) %>%
    mutate_which(str_detect(Name, fixed('AR-Little Rock')), TeamID = 1114) %>%
    mutate_which(str_detect(Name, fixed('Coast Carolina')), TeamID = 1157) %>%
    mutate_which(str_detect(Name, fixed('E Tenn St')), TeamID = 1190) %>%        # East Tennessee State University (Buccaneers)
    mutate_which(str_detect(Name, fixed('Fair Dickinson')), TeamID = 1192) %>%
    mutate_which(str_detect(Name, fixed('FLA Gulf Coast')), TeamID = 1195) %>%
    mutate_which(str_detect(Name, fixed('G. Washington')), TeamID = 1203) %>%
    mutate_which(str_detect(Name, fixed('Kent St')), TeamID = 1245) %>%
    mutate_which(str_detect(Name, fixed('Mid Tennessee')), TeamID = 1292) %>%    # Middle Tennessee State University (Blue Raiders)
    mutate_which(str_detect(Name, fixed('N Carolina Cent')), TeamID = 1300) %>%  # NC Central = North Carolina Central
    mutate_which(str_detect(Name, fixed('N Carolina St')), TeamID = 1301) %>%    # NC State = North Carolina State
    mutate_which(str_detect(Name, fixed('S Methodist')), TeamID = 1374) %>%      # SMU = Southern Methodist University Mustangs
    mutate_which(str_detect(Name, fixed('St. Johns')), TeamID = 1385) %>%
    mutate_which(str_detect(Name, fixed('Saint Josephs')), TeamID = 1386) %>%
    mutate_which(str_detect(Name, fixed('Saint Marys')), TeamID = 1388) %>%
    mutate_which(str_detect(Name, fixed('LA-Lafayette')), TeamID = 1418) %>%     # ULL = University of Louisiana at Lafayette
    mutate_which(str_detect(Name, fixed('UNC-Wilmington')), TeamID = 1423)

In [114]:
# Re-run the check after the manual fixes: list any tournament team whose
# TeamID is still absent from the ratings (an empty result means all are matched)
df.tourneyteams %>%
    left_join(df.ratingsystem, by = 'TeamID') %>%
    filter(is.na(Rank)) %>%
    select(TeamID) %>%
    inner_join(data.teams, by = 'TeamID') %>%
    distinct() %>%
    arrange(TeamID)

TeamID,TeamName,FirstD1Season,LastD1Season


In [115]:
# Keep only the SevenOvertimes rating of the teams that played the tournament
# in the given season, reduced to the three columns used from here on
df.ratingsystem <- df.ratingsystem %>%
    select(Season, TeamID, Rating) %>%
    inner_join(df.tourneyteams, by = c('TeamID', 'Season'))

In [116]:
# Sanity check: number of rated tournament teams per Season (68 expected)
count(group_by(df.ratingsystem, Season))

Season,n
2014,68
2015,68
2016,68
2017,68
2018,68


In [117]:
# Tournament (Season, TeamID) pairs that have no rating row; should be empty
anti_join(df.tourneyteams, df.ratingsystem, by = c('Season', 'TeamID'))

Season,TeamID


In [118]:
# Load the true tournament results.
# The later cells join this table on its ID column and score predictions
# against its Target column.
data.truth <- readRDS('datafiles_rds/TourneyTrueResults.rds')

In [126]:
# Slope hyper-parameter grid: one candidate slope per Season and per quantile
# level qtl in 0.01, 0.02, ..., 0.75 (75 levels). The best level is chosen
# later as the one with the smallest log loss.
# For each (qtl, Season): thres = max(Rating) - quantile(Rating, qtl) and
# Slope = 3 / thres, so that Slope * thres = 3.
# The dummy column is only there to build the cross product ratings x grid.
df.slopes <- df.ratingsystem %>%
    mutate(dummy = 1) %>%
    inner_join(data.frame(qtl = seq(0.01, 0.75, 0.01), dummy = 1), by = 'dummy') %>%
    select(-dummy) %>%
    group_by(qtl, Season) %>%
    mutate(Slope = 3 / (max(Rating) - quantile(Rating, max(qtl)))) %>%
    ungroup() %>%
    select(Season, qtl, Slope) %>%
    distinct()
# Show a few random rows
sample_n(df.slopes, 5)

Season,qtl,Slope
2017,0.42,53.57143
2016,0.52,58.63956
2018,0.24,21.92341
2015,0.31,28.11885
2017,0.2,26.83363


In [127]:
# All possible matchups between the tournament teams of a Season: self-join
# on Season, then keep each pair once with the lower TeamID on the .x side
df.matchup <- inner_join(df.ratingsystem, df.ratingsystem, by = 'Season') %>%
    filter(TeamID.x < TeamID.y)
# Show a few random rows
sample_n(df.matchup, 5)

Season,TeamID.x,Rating.x,TeamID.y,Rating.y
2016,1124,0.601,1386,0.616
2016,1438,0.644,1452,0.616
2016,1276,0.55,1396,0.537
2015,1316,0.475,1329,0.585
2015,1257,0.623,1277,0.62


In [128]:
# Repeat every matchup once per candidate slope of its Season
# (one row per matchup and per qtl level of the grid)
df.matchup <- inner_join(df.matchup, df.slopes, by = 'Season')
# Show a few random rows
sample_n(df.matchup, 5)

Season,TeamID.x,Rating.x,TeamID.y,Rating.y,qtl,Slope
2017,1181,0.627,1374,0.619,0.51,60.0
2017,1345,0.617,1423,0.59,0.17,25.0
2018,1139,0.542,1344,0.553,0.17,20.29358
2015,1314,0.65,1329,0.585,0.26,25.08781
2017,1411,0.462,1435,0.611,0.41,52.57624


In [129]:
# Matchup predictions: Prob is the winning probability of TeamID.x, obtained
# by passing the rating difference to logistic() with a = Slope.
# Pred is the hard 0/1 call taken from the unrounded probability.
# Prob is then rounded to 3 decimals and capped into [0.05, 0.95] to limit
# the log loss paid on confident but wrong predictions.
df.submit <- df.matchup %>%
    mutate(
        ID = str_c(Season, '_', TeamID.x, '_', TeamID.y),
        Prob = logistic(Rating.x - Rating.y, a = Slope),
        Pred = ifelse(Prob > 0.5, 1, 0)
    ) %>%
    mutate(Prob = round(Prob, 3)) %>%
    mutate_which(Prob > 0.95, Prob = 0.95) %>%
    mutate_which(Prob < 0.05, Prob = 0.05) %>%
    select(qtl, ID, Pred, Prob)
# Show a few random rows
sample_n(df.submit, 5)

qtl,ID,Pred,Prob
0.1,2017_1166_1411,1,0.92
0.23,2016_1122_1396,0,0.103
0.69,2014_1372_1393,0,0.05
0.07,2015_1277_1428,1,0.512
0.45,2016_1114_1437,0,0.05


In [130]:
# Pair each true result with the predictions made for the same matchup ID
# (one row per played game and per qtl level)
df.results <- inner_join(data.truth, df.submit, by = 'ID')

In [131]:
# Performance per qtl level, used to pick the best one:
#   TC = correct calls, FC = wrong calls, N = games, Acc = TC / N,
#   LogLoss = mean negative log-likelihood of the capped probabilities
df.results.perf <- df.results %>%
    mutate(
        OK = ifelse(Pred == Target, 1, 0),
        LogLoss = Target * log(Prob) + (1 - Target) * log(1 - Prob)
    ) %>%
    group_by(qtl) %>%
    summarise(TC = sum(OK), N = n(), FC = N - TC, Acc = TC / N, LogLoss = -mean(LogLoss)) %>%
    select(qtl, TC, FC, N, Acc, LogLoss)
# Five best levels, smallest log loss first
head(arrange(df.results.perf, LogLoss), 5)

qtl,TC,FC,N,Acc,LogLoss
0.23,249,86,335,0.7432836,0.5377802
0.16,249,86,335,0.7432836,0.5379097
0.22,249,86,335,0.7432836,0.5379678
0.24,249,86,335,0.7432836,0.5379723
0.21,249,86,335,0.7432836,0.5380687


In [132]:
# Remember the qtl level that achieved the smallest log loss
bestqtl <- pull(head(arrange(df.results.perf, LogLoss), 1), qtl)

In [133]:
# Save the predictions of the best qtl level as a Kaggle submission file:
# two columns, ID and Pred, where Pred holds the capped probability.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
df.submit %>%
    filter(qtl == bestqtl) %>%
    select(ID, Pred = Prob) %>%
    write.csv('predictions/Pred_SevenOvertimes_bestqtl.csv', quote = FALSE, row.names = FALSE)

In [134]:
# Keep the full prediction rows (qtl, ID, Pred, Prob) of the best qtl level
# in an RDS file so they can be mixed with other models later
saveRDS(filter(df.submit, qtl == bestqtl), 'predictions/SevenOvertimes_bestqtl.rds')