In [1]:
library(dplyr,warn.conflicts=F)
library(stringr,warn.conflicts=F)
library(psych,warn.conflicts=F)

In [2]:
# Read the detailed NCAA tournament results from disk, then keep only the
# games of the 2014 season; every later step works on that subset.
data.tourney <- readRDS('datafiles_rds/NCAATourneyDetailedResults.rds')
data.tou2014 <- filter(data.tourney, Season == 2014)

In [3]:
# Gather every team that appears in a 2014 tournament game, whether as the
# winner or as the loser, into a single-column table of unique TeamID values
# (the original note expects 68 teams).
df.teams2014 <- bind_rows(
    select(data.tou2014, TeamID = WTeamID),
    select(data.tou2014, TeamID = LTeamID)
) %>%
    distinct(TeamID)

In [4]:
# Read the Massey ordinals table and keep the rows published on the last
# ranking day of the 2014 season.
data.ranking <- readRDS('datafiles_rds/MasseyOrdinals.rds')
# The two filters are deliberately sequential: max(RankingDayNum) must be
# evaluated on the 2014 rows only, not on every season in the file.
data.lastranking <- data.ranking %>%
    filter(Season == 2014) %>%
    filter(RankingDayNum == max(RankingDayNum))

In [5]:
# Build the ground truth: one row per 2014 tournament game, keyed by
# "<Season>_<smaller TeamID>_<larger TeamID>". Target is 1 when the team with
# the smaller TeamID won the game and 0 when it lost.
df.truth <- data.tou2014 %>%
    select(Season, WTeamID, LTeamID) %>%
    mutate(
        TeamID.1 = pmin(WTeamID, LTeamID),
        TeamID.2 = pmax(WTeamID, LTeamID),
        ID = str_c(Season, '_', TeamID.1, '_', TeamID.2),
        Target = as.numeric(WTeamID == TeamID.1)
    ) %>%
    select(ID, Target)
# Show five randomly drawn rows as a sanity check.
sample_n(df.truth, 5)

ID,Target
2014_1242_1390,0
2014_1372_1433,1
2014_1276_1397,1
2014_1393_1444,1
2014_1107_1196,0


In [6]:
# One row per ranking system present in the retained 2014 rankings.
df.systems <- distinct(data.lastranking, SystemName)

In [7]:
# Steepness of the sigmoid applied to rank differences when predicting
# matchups: 3 / 180 scales a rank gap of 180 places to a sigmoid input of 3.
# NOTE(review): the original note here read "~.50 percentile"; its intended
# meaning is not evident from this file - confirm the calibration target.
slope <- 3 / 180
slope

In [8]:
# Look up, for every tournament team, its OrdinalRank in each ranking system
# of the retained 2014 rankings.
tmp01 <- df.teams2014 %>%
    inner_join(data.lastranking, by = 'TeamID') %>%
    select(Season, SystemName, TeamID, OrdinalRank)
# Pair the teams with each other inside every (Season, SystemName) by joining
# the table to itself. Requiring TeamID.x < TeamID.y drops self-pairs and keeps
# each unordered pair exactly once, with the smaller TeamID on the .x side.
df.matchup <- inner_join(tmp01, tmp01, by = c('Season', 'SystemName')) %>%
    filter(TeamID.x < TeamID.y)
# Show five randomly drawn matchups as a sanity check.
sample_n(df.matchup, 5)

Season,SystemName,TeamID.x,OrdinalRank.x,TeamID.y,OrdinalRank.y
2014,DOL,1166,16,1314,27
2014,BUR,1107,182,1326,18
2014,MAS,1166,17,1397,42
2014,KPK,1295,48,1409,79
2014,STF,1272,38,1386,51


In [9]:
# Discard ranking systems that do not cover all the teams: count the matchup
# rows of each system (N) and keep only the systems whose count equals the
# largest count observed. The helper column N is dropped afterwards.
df.matchup <- df.matchup %>%
    group_by(SystemName) %>%
    mutate(N = n()) %>%
    ungroup() %>%
    filter(N == max(N)) %>%
    select(-N)

In [10]:
# Matchup predictions, one row per (matchup, ranking system).
#   ID   - "<Season>_<TeamID.x>_<TeamID.y>", the same key layout as df.truth.
#   Prob - sigmoid of the rank difference (OrdinalRank.y - OrdinalRank.x) with
#          steepness `slope`; it exceeds 0.5 when TeamID.x holds the smaller
#          OrdinalRank of the two teams.
#   Pred - hard call derived from Prob: 1 when Prob > 0.5, otherwise 0.
df.submit <- df.matchup %>%
    mutate(
        ID = str_c(Season, '_', TeamID.x, '_', TeamID.y),
        Prob = logistic(OrdinalRank.y - OrdinalRank.x, a = slope),
        Pred = as.numeric(Prob > 0.5)
    ) %>%
    select(ID, SystemName, Pred, Prob)
# Show five randomly drawn predictions as a sanity check.
sample_n(df.submit, 5)

ID,SystemName,Pred,Prob
2014_1174_1390,MOR,0,0.214165
2014_1211_1411,DOL,1,0.979501
2014_1437_1444,BBT,1,0.8273079
2014_1314_1455,CJB,0,0.450166
2014_1338_1458,7OT,0,0.4296368


In [11]:
# Line up predictions with outcomes: each played game in df.truth is matched
# to every system's prediction carrying the same ID; matchups that were never
# played have no truth row and therefore drop out of the inner join.
df.results <- inner_join(df.truth, df.submit, by = 'ID')

In [12]:
# Score every ranking system on the played games.
#   TC / FC - number of correct / incorrect hard predictions
#   N       - number of scored games
#   Acc     - TC / N
#   LogLoss - negated mean of the per-game log-likelihood of the outcome
df.results.perf <- df.results %>%
    mutate(
        OK = as.numeric(Pred == Target),
        # Per-game log-likelihood of the observed outcome under Prob.
        LogLik = Target * log(Prob) + (1 - Target) * log(1 - Prob)
    ) %>%
    group_by(SystemName) %>%
    summarise(
        TC = sum(OK),
        N = n(),
        FC = N - TC,
        Acc = TC / N,
        LogLoss = -mean(LogLik)
    ) %>%
    select(SystemName, TC, FC, N, Acc, LogLoss)
# Display the five systems with the highest accuracy.
df.results.perf %>%
    arrange(desc(Acc)) %>%
    head(5)

SystemName,TC,FC,N,Acc,LogLoss
UPS,48,19,67,0.7164179,0.5685834
DUN,47,20,67,0.7014925,0.5653627
7OT,46,21,67,0.6865672,0.5635918
RTP,46,21,67,0.6865672,0.5838192
SE,46,21,67,0.6865672,0.5855446
