In [1]:
# Load required packages
library(dplyr,warn.conflicts=F)
library(stringr,warn.conflicts=F)
library(psych,warn.conflicts=F)
library(lplyr,warn.conflicts=F)

In [2]:
# Read the detailed NCAA tournament results and keep only the seasons
# after 2013 (i.e. 2014 onwards).
data.tourney <- readRDS('datafiles_rds/NCAATourneyDetailedResults.rds') %>%
  filter(Season > 2013)

In [3]:
# Tournament participants: one row per (Season, TeamID), gathered from both
# the winning side and the losing side of every tournament game.
df.tourneyteams <- bind_rows(
  select(data.tourney, Season, TeamID = WTeamID),
  select(data.tourney, Season, TeamID = LTeamID)
) %>%
  distinct(Season, TeamID)

In [4]:
# Pomeroy ratings, seasons 2014 and later
# Source: https://kenpom.com/index.php?y=201?
data.pomeroy <- read.csv('datafiles_csv/Pomeroy.csv', stringsAsFactors = FALSE)
# Preview the first rows
head(data.pomeroy)

Season,Rk,Team,Conf,W.L,AdjEM,AdjO,AdjORank,AdjD,AdjDRank,⋯,Luck,LuckRank,SOSAdjEM,SOSAdjEMRank,SOSOppO,SOSOppORank,SOSOppD,SOSOppDRank,NCSOSAdjEM,NCSOSAdjEMRank
2014,1,Louisville,Amer,31-6,30.41,118.9,7,88.5,5,⋯,-0.045,278,4.31,95,107.0,100,102.7,93,-4.65,295
2014,2,Arizona,P12,33-5,30.11,116.5,20,86.4,1,⋯,0.014,151,9.3,17,109.9,20,100.6,18,1.62,113
2014,3,Florida,SEC,36-3,28.57,116.5,19,88.0,3,⋯,0.053,68,9.02,22,109.4,32,100.4,14,2.39,94
2014,4,Virginia,ACC,30-7,26.46,114.8,27,88.3,4,⋯,0.012,154,8.67,28,109.4,31,100.8,27,1.57,114
2014,5,Wisconsin,B10,30-8,25.89,122.0,4,96.1,35,⋯,0.019,133,11.44,3,110.2,13,98.8,2,3.33,68
2014,6,Wichita St.,MVC,35-1,25.36,117.8,17,92.4,11,⋯,-0.009,205,1.04,125,105.8,119,104.8,144,2.89,79


In [5]:
# Give the Pomeroy columns generic names: Team -> Name, Rk -> Rank,
# AdjEM -> Rating.
data.pomeroy <- rename(data.pomeroy, Name = Team, Rank = Rk, Rating = AdjEM)

In [6]:
# Reference tables: the team list and the known spelling variants of the
# team names.
data.teams <- readRDS('datafiles_rds/Teams.rds')
data.teamspellings <- readRDS('datafiles_rds/TeamSpellings.rds')

In [7]:
# Attach a TeamID to every Pomeroy row by matching the lower-cased team name
# against the spelling table. This is a left join, so a name without a
# matching spelling keeps its row and gets an NA TeamID.
df.ratingsystem <- left_join(
  mutate(data.pomeroy, LowName = str_to_lower(Name)),
  data.teamspellings,
  by = c('LowName' = 'TeamNameSpelling')
)

In [8]:
# Spot check of the name matching: rows whose team name contains 'Albany'
filter(df.ratingsystem, str_detect(Name, 'Albany'))

Season,Rank,Name,Conf,W.L,Rating,AdjO,AdjORank,AdjD,AdjDRank,⋯,SOSAdjEM,SOSAdjEMRank,SOSOppO,SOSOppORank,SOSOppD,SOSOppDRank,NCSOSAdjEM,NCSOSAdjEMRank,LowName,TeamID
2014,181,Albany,AE,19-15,-1.04,102.9,217,103.9,148,⋯,-5.12,270,101.5,311,106.6,224,0.91,135,albany,1107
2015,131,Albany,AE,24-9,3.29,107.2,116,103.9,162,⋯,-5.5,297,100.4,325,105.9,229,0.18,154,albany,1107
2016,121,Albany,AE,24-9,4.25,106.9,141,102.7,131,⋯,-8.12,328,100.8,313,108.9,330,-4.63,293,albany,1107
2017,129,Albany,AE,21-14,2.66,107.5,116,104.9,180,⋯,-5.16,283,101.8,286,106.9,263,-2.0,216,albany,1107
2018,151,Albany,AE,22-10,1.71,107.1,134,105.4,180,⋯,-6.49,306,101.4,307,107.9,300,-5.2,307,albany,1107


In [9]:
# Tournament teams without any rating row (typically because the Pomeroy
# spelling of the team name is absent from the spelling table).
# NOTE: the join key is TeamID alone, so a team that was matched in some
# seasons but not in the season it played the tournament is not listed here.
right_join(df.ratingsystem, df.tourneyteams, by = 'TeamID') %>%
  filter(is.na(Rank)) %>%          # NA Rank = no rating row was joined
  select(TeamID) %>%
  inner_join(data.teams, by = 'TeamID') %>%   # add the readable team name
  distinct() %>%
  arrange(TeamID)

TeamID,TeamName,FirstD1Season,LastD1Season
1167,CS Bakersfield,2008,2019
1418,ULL,1985,2019


In [10]:
# Set the TeamID by hand for Pomeroy team names that the spelling table does
# not resolve. mutate_which() only touches the rows matching the condition.
df.ratingsystem <- df.ratingsystem %>%
  mutate_which(str_detect(Name, 'Cal St. Bakersfield'), TeamID = 1167) %>%  # CS Bakersfield
  mutate_which(str_detect(Name, 'Louisiana Lafayette'), TeamID = 1418) %>%  # ULL, University of Louisiana at Lafayette
  mutate_which(str_detect(Name, 'Little Rock'), TeamID = 1114)

In [11]:
# Check whether any tournament team is still missing a rating row.
# The join uses both Season and TeamID: a tournament entry only counts as
# covered when a rating exists for that very season. Joining on TeamID alone
# would hide a team whose name fails to match in a single season (e.g. after
# a rename) as long as it matched in any other season.
df.ratingsystem %>%
  right_join(df.tourneyteams, by = c('Season', 'TeamID')) %>%
  filter(is.na(Rank)) %>%          # NA Rank = no rating row for that season
  select(TeamID) %>%
  inner_join(data.teams, by = 'TeamID') %>%   # add the readable team name
  distinct() %>%
  arrange(TeamID)

TeamID,TeamName,FirstD1Season,LastD1Season


In [12]:
# Keep only the ratings of tournament teams, matched season by season, and
# drop every column except Season, TeamID and Rating.
df.ratingsystem <- select(
  inner_join(df.ratingsystem, df.tourneyteams, by = c('TeamID', 'Season')),
  Season, TeamID, Rating
)

In [13]:
# Sanity check: number of rated tournament teams per season (68 expected)
count(group_by(df.ratingsystem, Season))

Season,n
2014,68
2015,68
2016,68
2017,68
2018,68


In [14]:
# Tournament entries (Season, TeamID) that have no rating row, shown with
# their team name; an empty result means nothing is missing.
anti_join(df.tourneyteams, df.ratingsystem, by = c('Season', 'TeamID')) %>%
  inner_join(data.teams, by = 'TeamID') %>%
  select(TeamID, TeamName) %>%
  distinct()

TeamID,TeamName


In [15]:
# Actual tournament outcomes, used to score the predictions
data.truth <- readRDS(file = 'datafiles_rds/TourneyTrueResults.rds')

In [16]:
# Slope hyper-parameter grid: one candidate logistic slope per season for
# each quantile level qtl in 0.01, 0.02, ..., 0.75 (75 candidate models; the
# one with the lowest log loss is selected further down).
# For a given season and qtl:
#   thres = max(Rating) - quantile(Rating, qtl)
#   Slope = 3 / thres
df.slopes <- df.ratingsystem %>%
  mutate(dummy = 1) %>%
  # joining on a constant column pairs every rating row with every qtl level
  inner_join(data.frame(qtl = seq(0.01, 0.75, 0.01), dummy = 1), by = 'dummy') %>%
  select(-dummy) %>%
  group_by(qtl, Season) %>%
  # qtl is constant inside a (qtl, Season) group, so its first value is used
  mutate(thres = max(Rating) - quantile(Rating, first(qtl)),
         Slope = 3 / thres) %>%
  ungroup() %>%
  select(Season, qtl, Slope) %>%
  distinct()
# Random preview of the grid
sample_n(df.slopes, 5)

Season,qtl,Slope
2014,0.5,0.20080321
2017,0.12,0.09881163
2018,0.02,0.07232541
2017,0.74,0.34014377
2018,0.46,0.15738283


In [17]:
# Every possible pairing of two rated tournament teams within a season.
# The self-join on Season produces all ordered pairs; keeping only
# TeamID.x < TeamID.y lists each pair once, with the lower id first.
df.matchup <- inner_join(df.ratingsystem, df.ratingsystem, by = 'Season') %>%
  filter(TeamID.x < TeamID.y)
# Random preview
sample_n(df.matchup, 5)

Season,TeamID.x,Rating.x,TeamID.y,Rating.y
2014,1307,17.15,1455,25.36
2018,1112,17.7,1277,25.41
2017,1139,20.76,1374,24.73
2018,1168,0.62,1462,21.51
2015,1361,15.95,1437,30.65


In [18]:
# Replicate every matchup once per candidate slope of its season, i.e. one
# row per matchup and per qtl level present in df.slopes.
df.matchup <- inner_join(df.matchup, df.slopes, by = c('Season'))
# Random preview
sample_n(df.matchup, 5)

Season,TeamID.x,Rating.x,TeamID.y,Rating.y,qtl,Slope
2018,1277,25.41,1411,-6.65,0.39,0.1525569
2014,1300,7.98,1372,10.83,0.24,0.1487092
2014,1308,10.54,1387,15.72,0.67,0.2983175
2017,1268,14.34,1276,23.05,0.39,0.1701577
2017,1308,7.89,1413,-4.41,0.49,0.1893641


In [19]:
# Matchup predictions: winning probability of TeamID.x for every matchup and
# every qtl level.
#   ID   : '<Season>_<TeamID.x>_<TeamID.y>'
#   Pred : 1 when the unrounded probability exceeds 0.5, else 0
#   Prob : logistic() of the rating difference with slope a = Slope, rounded
#          to 3 decimals and then capped into [0.05, 0.95] to limit the
#          log-loss penalty of confident but wrong predictions
df.submit <- df.matchup %>%
  mutate(
    ID = str_c(Season, '_', TeamID.x, '_', TeamID.y),
    RawProb = logistic(Rating.x - Rating.y, a = Slope),
    Pred = ifelse(RawProb > 0.5, 1, 0),
    Prob = pmin(pmax(round(RawProb, 3), 0.05), 0.95)
  ) %>%
  select(qtl, ID, Pred, Prob)
# Random preview
sample_n(df.submit, 5)

qtl,ID,Pred,Prob
0.69,2017_1233_1407,1,0.665
0.68,2015_1279_1417,0,0.468
0.58,2018_1116_1254,1,0.95
0.73,2017_1196_1376,1,0.905
0.28,2015_1246_1320,1,0.863


In [20]:
# Pair each true result with its predictions (one per qtl level), by ID
df.results <- inner_join(data.truth, df.submit, by = 'ID')

In [21]:
# Score every qtl level against the true results.
#   TC / FC : number of correct / incorrect predicted winners
#   N       : number of scored games
#   Acc     : TC / N
#   LogLoss : -mean(Target*log(Prob) + (1-Target)*log(1-Prob))
df.results.perf <- df.results %>%
  mutate(
    OK = as.numeric(Pred == Target),
    RowLL = Target * log(Prob) + (1 - Target) * log(1 - Prob)
  ) %>%
  group_by(qtl) %>%
  summarise(
    TC = sum(OK),
    N = n(),
    FC = N - TC,
    Acc = TC / N,
    LogLoss = -mean(RowLL)
  ) %>%
  select(qtl, TC, FC, N, Acc, LogLoss)
# Five best qtl levels (lowest log loss first)
head(arrange(df.results.perf, LogLoss), 5)

qtl,TC,FC,N,Acc,LogLoss
0.31,248,87,335,0.7402985,0.4989922
0.32,248,87,335,0.7402985,0.4990531
0.3,248,87,335,0.7402985,0.4992647
0.35,248,87,335,0.7402985,0.4993081
0.34,248,87,335,0.7402985,0.4993364


In [22]:
# Remember the qtl level with the lowest log loss
bestqtl <- df.results.perf %>%
  arrange(LogLoss) %>%
  slice(1) %>%
  pull(qtl)

In [23]:
# Export the predictions of the best qtl level for the Kaggle submission:
# two columns, ID and Pred, where Pred holds the capped probability (Prob).
write.csv(
  df.submit %>% filter(qtl == bestqtl) %>% select(ID, Pred = Prob),
  file = 'predictions/Pred_Pomeroy_bestqtl.csv',
  quote = FALSE,
  row.names = FALSE
)

In [24]:
# Keep the full prediction table of the best qtl level (qtl, ID, Pred, Prob)
# as an RDS file so it can be mixed with other models.
saveRDS(filter(df.submit, qtl == bestqtl), file = 'predictions/Pomeroy_bestqtl.rds')