In [21]:
library(dplyr,warn.conflicts=F)
library(stringr,warn.conflicts=F)
library(psych,warn.conflicts=F)

In [2]:
# load the stats
data.tourney <- readRDS('datafiles_rds/NCAATourneyDetailedResults.rds')
# keep 2014
data.tourney %>% filter(Season==2014) -> data.tou2014

In [3]:
# store 68 tourney teams 
data.tou2014 %>% 
    select(TeamID=WTeamID) %>% 
    bind_rows(data.tou2014 %>% 
              select(TeamID=LTeamID)) %>% 
    distinct(TeamID) -> df.teams2014

In [4]:
# load ranking db
data.ranking <- readRDS('datafiles_rds/MasseyOrdinals.rds')
# keep 2014, last day
data.ranking %>% 
    filter(Season==2014) %>% 
    filter(RankingDayNum==max(RankingDayNum)) -> data.lastranking

In [5]:
# compute the Truth 
data.tou2014 %>% 
    select(Season,WTeamID,LTeamID) %>% 
    mutate(TeamID.1=ifelse(WTeamID<LTeamID,WTeamID,LTeamID),
           TeamID.2=ifelse(WTeamID>LTeamID,WTeamID,LTeamID)) %>%
    mutate(ID=str_c(Season,'_',TeamID.1,'_',TeamID.2)) %>% 
    mutate(Target=ifelse(WTeamID==TeamID.1,1,0)) %>% 
    select(ID,Target) -> df.truth
df.truth %>% sample_n(5)

ID,Target
2014_1235_1314,1
2014_1112_1211,1
2014_1277_1438,1
2014_1112_1451,1
2014_1163_1246,1


In [6]:
data.lastranking %>% distinct(SystemName) -> df.systems

In [7]:
# compute slope of sigmoid
slope = 3 / 180    # ~.50 percentile
slope

In [9]:
# matchups dataframe for each SystemName
df.teams2014 %>%
    inner_join(data.lastranking,by='TeamID') %>%
    select(Season,SystemName,TeamID,OrdinalRank) -> tmp01
tmp01 %>% 
    inner_join(tmp01,by=c('Season','SystemName')) %>% 
    filter(TeamID.x < TeamID.y) -> df.matchup
df.matchup %>% sample_n(5)

Season,SystemName,TeamID.x,OrdinalRank.x,TeamID.y,OrdinalRank.y
2014,TW,1387,23,1462,38
2014,TW,1273,185,1314,19
2014,WIL,1184,130,1300,125
2014,LMC,1301,68,1314,27
2014,PGH,1372,58,1400,24


In [18]:
# remove ranking with too few teams
df.matchup %>% 
    group_by(SystemName) %>% 
    mutate(N=n()) %>%
    ungroup() %>%
    filter(N==max(N)) %>%
    select(-N) -> df.matchup

In [22]:
# machups predictions
df.matchup %>% 
    mutate(ID=str_c(Season,'_',TeamID.x,'_',TeamID.y)) %>% 
    mutate(Prob=logistic(OrdinalRank.x-OrdinalRank.y,a=slope)) %>% 
    mutate(Pred=ifelse(Prob>0.5,1,0)) %>% 
    select(ID,SystemName,Pred,Prob) -> df.submit
df.submit %>% sample_n(5)

ID,SystemName,Pred,Prob
2014_1257_1328,DCI,0,0.42963676
2014_1328_1329,KPK,1,0.5166605
2014_1437_1462,STS,0,0.27888482
2014_1397_1409,WLK,0,0.2755454
2014_1307_1451,ADE,0,0.06199703


In [25]:
# merge prediction and truth
df.truth %>% inner_join(df.submit,by='ID') -> df.results

In [31]:
# Accuracy & Logloss per SystemName
df.results %>% 
    mutate(OK=ifelse(Pred==Target,1,0)) %>%
    mutate(LogLoss=Target*log(Prob) + (1-Target)*log(1-Prob)) %>%
    group_by(SystemName) %>%
    summarise(TC=sum(OK),N=n(),FC=N-TC,Acc=TC/N,LogLoss=-mean(LogLoss)) %>%
    select(SystemName,TC,FC,N,Acc,LogLoss) -> df.results.perf
df.results.perf %>% arrange(desc(Acc)) %>% head(5)

SystemName,TC,FC,N,Acc,LogLoss
KBM,27,40,67,0.4029851,1.058206
CPR,26,41,67,0.3880597,1.115242
SEL,26,41,67,0.3880597,1.116321
ADE,25,42,67,0.3731343,1.084068
KPK,25,42,67,0.3731343,1.04139
