In [1]:
# Load required packages
library(dplyr,warn.conflicts=F)
library(stringr,warn.conflicts=F)
library(psych,warn.conflicts=F)
library(lplyr,warn.conflicts=F)

In [2]:
# load the stats
data.tourney <- readRDS('datafiles_rds/NCAATourneyDetailedResults.rds')
# keep 2014+
data.tourney %>% filter(Season>2013) -> data.tourney

In [3]:
# store 68 tourney teams 
data.tourney %>% 
    select(Season,TeamID=WTeamID) %>% 
    bind_rows(data.tourney %>% 
              select(Season,TeamID=LTeamID)) %>% 
    distinct(Season,TeamID) -> df.tourneyteams

In [4]:
# Sagarin Rating 2014+
# https://ux.kitsapsun.com/sports/ncaab/sagarin/201?/team/
data.sagarin <- read.csv('datafiles_csv/Sagarin20142018.csv',stringsAsFactors=F)
data.sagarin %>% head

Season,Rank,Name,Rate,W,L,Sched,RankSched,W25,L25,W50,L50,GoldenMean,RankGM,Predictor,RankPred,Pure,RankPure
2014,1,Louisville,93.92,31,6,76.14,95,4,3,7,6,95.09,1,94.53,1,91.49,7
2014,2,Arizona,93.65,33,5,79.8,21,6,2,13,4,94.19,2,93.84,2,92.06,5
2014,3,Florida,93.27,36,3,80.07,16,9,3,13,3,92.45,3,92.57,3,96.41,2
2014,4,Wisconsin,91.99,30,8,81.91,2,6,4,12,5,91.35,4,91.99,4,92.34,4
2014,5,Michigan State,90.66,29,9,81.09,5,5,5,11,6,90.46,6,89.84,6,91.95,6
2014,6,Connecticut,90.61,32,8,80.24,15,8,4,14,7,89.87,10,89.42,10,96.48,1


In [5]:
# load team spellings variations
data.teamspellings <- readRDS('datafiles_rds/TeamSpellings.rds')
data.teams <- readRDS('datafiles_rds/Teams.rds')

In [6]:
# add TeamID based on team name
data.sagarin %>% 
    mutate(Name=str_to_lower(Name)) %>% 
    left_join(data.teamspellings,by=c('Name'='TeamNameSpelling')) -> df.sagarin

In [7]:
# team's name which does not match
df.sagarin %>% 
    filter(is.na(TeamID)) %>%
    select(Name) %>%
    distinct() %>%
    arrange(Name)

Name
ark.-little rock
ark.-little rock ualr
binghamton-ny
cal poly-slo
central connecticut st.
central florida ucf
central floridaucf
east tennessee state ets
fla. international
fort wayne ipfw


In [8]:
# missing tourney team in Sagarin stats (because name misspelling)
df.sagarin %>% 
    right_join(df.tourneyteams,by='TeamID') %>%
    filter(is.na(Rank)) %>%
    select(TeamID) %>%
    inner_join(data.teams,by='TeamID') %>%
    distinct() %>%
    arrange(TeamID)

TeamID,TeamName,FirstD1Season,LastD1Season
1142,Cal Poly SLO,1995,2019
1254,Long Island,1985,2019
1316,North Florida,2006,2019
1386,St Joseph's PA,1985,2019
1388,St Mary's CA,1985,2019
1392,Stony Brook,2000,2019
1433,VA Commonwealth,1985,2019
1462,Xavier,1985,2019


In [9]:
# add missing TeamID from Sagarin stats
df.sagarin %>%
    mutate_which(str_detect(Name,'xavier-ohio'),TeamID=1462) %>%
    mutate_which(str_detect(Name,'commonwealth'),TeamID=1433) %>%
    mutate_which(str_detect(Name,'stony brook'),TeamID=1392) %>%
    mutate_which(str_detect(Name,'saint mary'),TeamID=1388) %>%
    mutate_which(str_detect(Name,'saint joseph'),TeamID=1386) %>%
    mutate_which(str_detect(Name,'north florida'),TeamID=1316) %>%
    mutate_which(str_detect(Name,'long island'),TeamID=1254) %>%
    mutate_which(str_detect(Name,'cal poly'),TeamID=1142) %>%
    mutate_which(str_detect(Name,'^iona'),TeamID=1233) %>%    # Iona renamed in Iona College in 2017
    mutate_which(str_detect(Name,'east tennessee'),TeamID=1190) -> df.sagarin    # ETSU = East Tennessee State University

In [10]:
# check if ssomething is still wrong
df.sagarin %>% 
    right_join(df.tourneyteams,by='TeamID') %>%
    filter(is.na(Rank)) %>%
    select(TeamID) %>%
    inner_join(data.teams,by='TeamID') %>%
    distinct() %>%
    arrange(TeamID)

TeamID,TeamName,FirstD1Season,LastD1Season


In [11]:
# restrict Sagarin stats to tourney teams
df.sagarin %>% 
    inner_join(df.tourneyteams,by=c('TeamID','Season')) %>%
    select(Season,TeamID,Predictor) -> df.sagarin

In [12]:
# Check if we have 68 teams per Season
df.sagarin %>% 
    group_by(Season) %>% 
    count

Season,n
2014,68
2015,68
2016,68
2017,68
2018,68


In [13]:
# check missing TeamID per Season 
df.tourneyteams %>% 
    anti_join(df.sagarin,by=c('Season','TeamID'))

Season,TeamID


In [14]:
# compute the Truth 
data.tourney %>% 
    select(Season,WTeamID,LTeamID) %>% 
    mutate(TeamID.1=ifelse(WTeamID<LTeamID,WTeamID,LTeamID),
           TeamID.2=ifelse(WTeamID>LTeamID,WTeamID,LTeamID)) %>%
    mutate(ID=str_c(Season,'_',TeamID.1,'_',TeamID.2)) %>% 
    mutate(Target=ifelse(WTeamID==TeamID.1,1,0)) %>% 
    select(ID,Target) -> df.truth
df.truth %>% sample_n(5)

ID,Target
2018_1260_1305,1
2016_1320_1400,1
2017_1124_1308,1
2015_1181_1352,1
2017_1268_1462,0


In [15]:
# slope hyper parameter grid : we will compute 51 models and chose the best one (min logloss)
df.sagarin %>% 
    mutate(dummy=1) %>% 
    inner_join(data.frame(qtl=seq(0.25,0.75,0.01),dummy=1),by='dummy') %>% 
    select(-dummy) %>% 
    group_by(qtl,Season) %>% 
    mutate(thres=max(Predictor)-quantile(Predictor,max(qtl)),Slope=3/thres) %>%
    ungroup %>% 
    select(Season,qtl,Slope) %>% 
    distinct() -> df.sagarin.slopes
df.sagarin.slopes %>% sample_n(5)

Season,qtl,Slope
2016,0.5,0.2967359
2015,0.5,0.1882648
2014,0.37,0.233102
2014,0.41,0.2424752
2016,0.55,0.319098


In [16]:
# matchups dataframe : all the possible matches between the 68 teams per Season 
df.sagarin %>% 
    inner_join(df.sagarin,by='Season') %>% 
    filter(TeamID.x < TeamID.y) -> df.matchup
df.matchup %>% sample_n(5)

Season,TeamID.x,Predictor.x,TeamID.y,Predictor.y
2016,1214,67.04,1437,94.0
2014,1153,85.84,1387,83.48
2018,1222,85.57,1411,66.08
2016,1211,87.67,1425,82.25
2014,1140,82.12,1326,88.42


In [17]:
# all matchs with all 51 hyper parameter setting (qtl=[0.25,0.75]) 
df.matchup %>% inner_join(df.sagarin.slopes,by=c('Season')) -> df.matchup
df.matchup %>% sample_n(5)

Season,TeamID.x,Predictor.x,TeamID.y,Predictor.y,qtl,Slope
2015,1268,83.71,1316,73.21,0.47,0.1844996
2016,1276,83.57,1332,88.47,0.59,0.338761
2017,1233,73.54,1438,89.61,0.75,0.5597015
2018,1267,75.8,1345,91.66,0.33,0.2051114
2015,1207,85.86,1459,76.9,0.42,0.1774791


In [18]:
# machups predictions : compute the winning probability of TeamID.x and cap it into [0.05,0.95] to avoid too much logloss is case of FP or FN 
df.matchup %>% 
    mutate(ID=str_c(Season,'_',TeamID.x,'_',TeamID.y)) %>% 
    mutate(Prob=logistic(Predictor.x-Predictor.y,a=Slope)) %>% 
    mutate(Pred=ifelse(Prob>0.5,1,0)) %>% 
    mutate(Prob=round(Prob,3)) %>%
    mutate_which(Prob>0.95,Prob=0.95) %>%
    mutate_which(Prob<0.05,Prob=0.05) %>%
    select(qtl,ID,Pred,Prob) -> df.submit
df.submit %>% sample_n(5)

qtl,ID,Pred,Prob
0.44,2018_1166_1437,0,0.073
0.71,2016_1163_1355,1,0.95
0.3,2014_1300_1444,1,0.727
0.28,2014_1142_1459,1,0.54
0.42,2016_1234_1425,1,0.792


In [19]:
# merge prediction and truth
df.truth %>% inner_join(df.submit,by='ID') -> df.results

In [20]:
# compute accuracy & Logloss per qtl to find the best one
df.results %>% 
    mutate(OK=ifelse(Pred==Target,1,0)) %>%
    mutate(LogLoss=Target*log(Prob) + (1-Target)*log(1-Prob)) %>%
    group_by(qtl) %>%
    summarise(TC=sum(OK),N=n(),FC=N-TC,Acc=TC/N,LogLoss=-mean(LogLoss)) %>%
    select(qtl,TC,FC,N,Acc,LogLoss) -> df.results.perf
df.results.perf %>% arrange(LogLoss) %>% head(5)

qtl,TC,FC,N,Acc,LogLoss
0.31,253,82,335,0.7552239,0.4925257
0.32,253,82,335,0.7552239,0.4926244
0.33,253,82,335,0.7552239,0.492786
0.3,253,82,335,0.7552239,0.492797
0.34,253,82,335,0.7552239,0.4930503


In [21]:
# store the best qtl
df.results.perf %>% 
    arrange(LogLoss) %>% 
    head(1) %>% pull(qtl) -> bestqtl

In [22]:
# save the best results for kaggle submission
df.submit %>% 
    filter(qtl==bestqtl) %>% 
    select(ID,Pred=Prob) %>% 
    write.csv('predictions/Pred_Sagarin_bestqtl.csv',quote=F,row.names=F)

In [24]:
# save best model for mix
df.submit %>%
    filter(qtl==bestqtl) %>% 
    saveRDS('predictions/Sagarin_bestqtl.rds')    