In [1]:
# Attach the packages used below. warn.conflicts is spelled out as FALSE
# because the shorthand F is an ordinary variable that can be reassigned.
library(dplyr, warn.conflicts = FALSE)    # data-frame verbs and %>%
library(stringr, warn.conflicts = FALSE)  # str_c
library(psych, warn.conflicts = FALSE)    # logistic

In [2]:
# Read the detailed tournament results from disk, then keep only the
# games whose Season is 2014.
data.tourney <- readRDS('datafiles_rds/NCAATourneyDetailedResults.rds')
data.tou2014 <- filter(data.tourney, Season == 2014)

In [3]:
# Collect every team that appears in a 2014 tournament game, either as the
# winner or as the loser, into a one-column table of unique TeamID values
# (the original note expects 68 teams).
df.teams2014 <- distinct(
    bind_rows(
        select(data.tou2014, TeamID = WTeamID),
        select(data.tou2014, TeamID = LTeamID)
    ),
    TeamID
)

In [4]:
# Read the ordinal rankings and keep, within the 2014 season, only the rows
# stamped with the latest RankingDayNum found in that season.
data.ranking <- readRDS('datafiles_rds/MasseyOrdinals.rds')
# The two filters stay separate on purpose: max() must be evaluated on the
# 2014 rows only, not on every season in the file.
data.lastranking <- data.ranking %>%
    filter(Season == 2014) %>%
    filter(RankingDayNum == max(RankingDayNum))

In [5]:
# Build the ground truth, one row per 2014 tournament game.
#   ID     : "<Season>_<lower TeamID>_<higher TeamID>"
#   Target : 1 when the lower-numbered team won the game, 0 otherwise
df.truth <- data.tou2014 %>%
    transmute(
        TeamID.1 = pmin(WTeamID, LTeamID),
        TeamID.2 = pmax(WTeamID, LTeamID),
        ID = str_c(Season, '_', TeamID.1, '_', TeamID.2),
        Target = as.numeric(WTeamID == TeamID.1)
    ) %>%
    select(ID, Target)
# preview five randomly chosen rows
sample_n(df.truth, 5)

ID,Target
2014_1107_1196,0
2014_1308_1361,0
2014_1124_1166,1
2014_1163_1196,1
2014_1124_1458,0


In [6]:
# list each ranking system present in the last-day 2014 rankings once
df.systems <- distinct(data.lastranking, SystemName)

In [7]:
# Slope of the sigmoid applied to rank differences further down
# (original note: "~.50 percentile").
slope <- 3 / 180
# echo the value
slope

In [8]:
# Attach each system's last-day rank to every tournament team, then pair
# each team with every other team inside the same Season / SystemName.
# Only pairs with TeamID.x < TeamID.y are kept, so a matchup appears once
# per system with the lower TeamID on the .x side.
tmp01 <- inner_join(df.teams2014, data.lastranking, by = 'TeamID') %>%
    select(Season, SystemName, TeamID, OrdinalRank)
df.matchup <- inner_join(tmp01, tmp01, by = c('Season', 'SystemName')) %>%
    filter(TeamID.x < TeamID.y)
# preview five randomly chosen rows
sample_n(df.matchup, 5)

Season,SystemName,TeamID.x,OrdinalRank.x,TeamID.y,OrdinalRank.y
2014,BUR,1211,28,1264,73
2014,TMR,1277,14,1386,42
2014,CJB,1332,31,1455,11
2014,GRS,1387,34,1411,274
2014,SE,1124,26,1418,107


In [9]:
# Drop ranking systems with too few teams: count the matchup rows of each
# SystemName and keep only the systems whose count equals the largest
# count observed. The helper column N is removed again afterwards.
df.matchup <- df.matchup %>%
    group_by(SystemName) %>%
    mutate(N = n()) %>%
    ungroup() %>%
    filter(N == max(N)) %>%
    select(-N)

In [10]:
# Matchup predictions for every ranking system.
#   ID   : same "<Season>_<TeamID.x>_<TeamID.y>" key used by the truth table
#   Prob : logistic() of the rank gap OrdinalRank.y - OrdinalRank.x, a = slope
#   Pred : 1 when Prob is above 0.5, otherwise 0
df.submit <- df.matchup %>%
    mutate(
        ID = str_c(Season, '_', TeamID.x, '_', TeamID.y),
        Prob = logistic(OrdinalRank.y - OrdinalRank.x, a = slope),
        Pred = ifelse(Prob > 0.5, 1, 0)
    ) %>%
    select(ID, SystemName, Pred, Prob)
# preview five randomly chosen rows
sample_n(df.submit, 5)

ID,SystemName,Pred,Prob
2014_1301_1462,DUN,0,0.3933306
2014_1211_1397,COL,1,0.6145945
2014_1372_1390,BBT,1,0.5166605
2014_1272_1386,STH,0,0.4625702
2014_1291_1308,TMR,0,0.1245534


In [11]:
# join the outcome of each played game to every system's prediction for it
df.results <- inner_join(df.truth, df.submit, by = 'ID')

In [None]:
# Accuracy & log loss per SystemName.
#   TC / FC : number of games predicted correctly / wrongly
#   N       : number of games scored
#   Acc     : TC / N
#   LogLoss : minus the mean per-game log-likelihood of Target under Prob
# Prob is clamped to [1e-15, 1 - 1e-15] before taking logs, so a saturated
# prediction (exactly 0 or 1) gives a large finite penalty instead of Inf,
# or NaN from 0 * log(0). Probabilities inside that range are unaffected.
df.results.perf <- df.results %>%
    mutate(
        OK = ifelse(Pred == Target, 1, 0),
        ProbClip = pmin(pmax(Prob, 1e-15), 1 - 1e-15),
        GameLL = Target * log(ProbClip) + (1 - Target) * log(1 - ProbClip)
    ) %>%
    group_by(SystemName) %>%
    summarise(
        TC = sum(OK),
        N = n(),
        FC = N - TC,
        Acc = TC / N,
        LogLoss = -mean(GameLL)
    ) %>%
    select(SystemName, TC, FC, N, Acc, LogLoss)
# ten most accurate systems
df.results.perf %>%
    arrange(desc(Acc)) %>%
    head(10)

In [14]:
# show the performance row of the RTB ranking system
filter(df.results.perf, SystemName == 'RTB')

SystemName,TC,FC,N,Acc,LogLoss
RTB,43,24,67,0.641791,0.579824
