In [109]:
# Load required packages.
# warn.conflicts is spelled out as FALSE: the alias F is an ordinary variable
# that can be reassigned, whereas FALSE is a reserved constant.
library(dplyr, warn.conflicts = FALSE)
library(stringr, warn.conflicts = FALSE)
library(psych, warn.conflicts = FALSE)
library(lplyr, warn.conflicts = FALSE)

In [110]:
# Read the detailed tournament results and keep only seasons after 2013
# (i.e. 2014 onwards).
data.tourney <- readRDS('datafiles_rds/NCAATourneyDetailedResults.rds')
data.tourney <- filter(data.tourney, Season > 2013)

In [111]:
# One row per (Season, TeamID) for every team that appears in a tournament
# game, whether as winner (WTeamID) or loser (LTeamID).
df.tourneyteams <- bind_rows(
    select(data.tourney, Season, TeamID = WTeamID),
    select(data.tourney, Season, TeamID = LTeamID)
) %>%
    distinct(Season, TeamID)

In [112]:
# Read the ranking database (kept in full as data.ranking).
data.ranking <- readRDS('datafiles_rds/MasseyOrdinals.rds')
# For seasons after 2013, keep only the rows whose RankingDayNum equals the
# maximum RankingDayNum observed in that season, then drop the day column.
data.lastranking <- data.ranking %>%
    filter(Season > 2013) %>%
    group_by(Season) %>%
    filter(RankingDayNum == max(RankingDayNum)) %>%
    ungroup() %>%
    select(-RankingDayNum)

In [113]:
# Number of distinct ranking systems in the full ranking database.
data.ranking %>%
    distinct(SystemName) %>%
    count()

n
166


In [114]:
# Number of distinct teams ranked in each season (last ranking day only).
data.lastranking %>%
    distinct(Season, TeamID) %>%
    count(Season)

Season,n
2014,351
2015,351
2016,351
2017,351
2018,351


In [115]:
# Keep the systems that have a last-day ranking for every team in every season.
# A system qualifies when its row count equals the number of distinct
# (Season, TeamID) pairs in data.lastranking. That count is derived from the
# data instead of being hard-coded (it is 351 teams x 5 seasons = 1755 for the
# 2014-2018 data), so the filter keeps working if seasons or teams change.
df.systems <- data.lastranking %>%
    select(Season, SystemName, TeamID) %>%
    group_by(SystemName) %>%
    summarise(n = n()) %>%
    filter(n == nrow(distinct(data.lastranking, Season, TeamID))) %>%
    select(-n)

In [116]:
# Restrict the last-day rankings to the systems retained in df.systems.
# df.systems carries only the SystemName key, so a filtering join keeps the
# same rows and columns as joining on that key.
data.lastranking <- semi_join(data.lastranking, df.systems, by = 'SystemName')

In [117]:
# Add the mean rank across the retained systems as an extra pseudo-system
# named 'AVG', one row per (Season, TeamID).
# summarise() only drops the innermost grouping level, so the result would
# stay grouped by Season; ungroup() makes df.sysavg a plain table.
df.sysavg <- data.lastranking %>%
    group_by(Season, TeamID) %>%
    summarise(OrdinalRank = mean(OrdinalRank)) %>%
    ungroup() %>%
    mutate(SystemName = 'AVG')
# Append the AVG rows and rename OrdinalRank to Rank for the rest of the notebook.
data.lastranking <- data.lastranking %>%
    bind_rows(df.sysavg) %>%
    rename(Rank = OrdinalRank)

In [118]:
# Preview the first rows of the ranking table.
head(data.lastranking)

Season,SystemName,TeamID,Rank
2014,7OT,1101,343
2014,7OT,1102,290
2014,7OT,1103,103
2014,7OT,1104,82
2014,7OT,1105,288
2014,7OT,1106,317


In [129]:
# Keep rankings only for (Season, TeamID) pairs that played in the tournament.
df.ratingsystem <- inner_join(
    data.lastranking,
    df.tourneyteams,
    by = c('Season', 'TeamID')
)

In [213]:
# Matchups: pair every tournament team with every other tournament team of the
# same season, within each ranking system. The self-join yields both
# orderings plus self-pairs; keeping TeamID.x < TeamID.y leaves each unordered
# pair exactly once, with the lower TeamID on the .x side.
df.matchups <- inner_join(
    df.ratingsystem,
    df.ratingsystem,
    by = c('Season', 'SystemName')
) %>%
    filter(TeamID.x < TeamID.y)
# Show a random sample of five matchups.
sample_n(df.matchups, 5)

Season,SystemName,TeamID.x,Rank.x,TeamID.y,Rank.y
2017,CNG,1423,69,1448,33
2014,BUR,1203,45,1444,106
2015,KPK,1172,28,1344,26
2015,WOL,1301,40,1452,21
2015,SAG,1173,48,1235,11


In [214]:
# Average number of matchup rows per ranking system (expected: 11390).
df.matchups %>%
    count(SystemName) %>%
    summarise(mean = mean(n))

mean
11390


In [215]:
# Grid of logistic slopes to evaluate: one row per (Season, qtl) with
# Slope = 3 / (360 * (1 - qtl)).
# The repetition counts are derived from the two grids themselves, so changing
# the quantile step or the season range cannot desynchronise the columns
# (previously the literal 17 had to match the length of the quantile sequence).
# NOTE(review): the constants 3 and 360 are not explained in the notebook;
# 360 is close to the 351 ranked teams per season - confirm the intent.
slope.qtl <- seq(0.1, 0.9, 0.05)
slope.seasons <- 2014:2018
df.slopes <- data.frame(
    Season = rep(slope.seasons, each = length(slope.qtl)),
    qtl = rep(slope.qtl, times = length(slope.seasons))
)
df.slopes$Slope <- 3 / (360 * (1 - df.slopes$qtl))

In [216]:
# Preview the slope grid.
head(df.slopes)

Season,qtl,Slope
2014,0.1,0.009259259
2014,0.15,0.009803922
2014,0.2,0.010416667
2014,0.25,0.011111111
2014,0.3,0.011904762
2014,0.35,0.012820513


In [217]:
# Attach every candidate slope of the season to every matchup row
# (each matchup is repeated once per qtl value).
df.matchups <- inner_join(df.matchups, df.slopes, by = 'Season')

In [218]:
# Preview the matchups with their slopes.
head(df.matchups)

Season,SystemName,TeamID.x,Rank.x,TeamID.y,Rank.y,qtl,Slope
2014,7OT,1107,177,1110,118,0.1,0.009259259
2014,7OT,1107,177,1110,118,0.15,0.009803922
2014,7OT,1107,177,1110,118,0.2,0.010416667
2014,7OT,1107,177,1110,118,0.25,0.011111111
2014,7OT,1107,177,1110,118,0.3,0.011904762
2014,7OT,1107,177,1110,118,0.35,0.012820513


In [219]:
# Matchup predictions.
#   ID   : "<Season>_<TeamID.x>_<TeamID.y>"
#   Prob : logistic() of the rank difference Rank.y - Rank.x with a = Slope,
#          so a better (lower) rank for team x gives a probability above 0.5
#   Pred : 1 when Prob is strictly greater than 0.5, otherwise 0
df.submit <- df.matchups %>%
    mutate(
        ID = str_c(Season, TeamID.x, TeamID.y, sep = '_'),
        Prob = logistic(Rank.y - Rank.x, a = Slope),
        Pred = as.numeric(Prob > 0.5)
    ) %>%
    select(qtl, ID, SystemName, Pred, Prob)
# Show a random sample of five predictions.
sample_n(df.submit, 5)

qtl,ID,SystemName,Pred,Prob
0.4,2014_1243_1301,STH,1,0.51041516
0.6,2016_1320_1438,BUR,0,0.19518468
0.45,2015_1173_1207,BBT,0,0.46219351
0.7,2014_1203_1433,LMC,0,0.3208213
0.5,2016_1214_1234,SPR,0,0.01657194


In [220]:
# Read the table of actual tournament outcomes.
data.truth <- readRDS(file = 'datafiles_rds/TourneyTrueResults.rds')

In [221]:
# Keep only the predictions whose ID appears in the true results, i.e. the
# matchups that were actually played, and attach the truth columns.
df.results <- inner_join(df.submit, data.truth, by = 'ID')

In [222]:
# Preview the predictions joined with the true outcomes.
head(df.results)

qtl,ID,SystemName,Pred,Prob,Target
0.1,2014_1107_1196,7OT,0,0.16643441,0
0.15,2014_1107_1196,7OT,0,0.15369856,0
0.2,2014_1107_1196,7OT,0,0.14033625,0
0.25,2014_1107_1196,7OT,0,0.12638209,0
0.3,2014_1107_1196,7OT,0,0.11190499,0
0.35,2014_1107_1196,7OT,0,0.09702123,0


In [223]:
# Accuracy and log loss per (qtl, SystemName).
#   TC      : number of rows where Pred equals Target
#   FC      : N - TC
#   N       : number of rows in the group
#   Acc     : TC / N
#   LogLoss : negative mean of Target*log(p) + (1-Target)*log(1-p)
# The probability is clipped to [1e-15, 1 - 1e-15] before taking logs. Without
# clipping, a probability of exactly 0 or 1 makes one term 0 * log(0) =
# 0 * -Inf = NaN, which turns the whole group mean into NaN. Probabilities
# strictly inside that interval are left unchanged.
df.results.perf <- df.results %>%
    mutate(OK = ifelse(Pred == Target, 1, 0)) %>%
    mutate(ProbClipped = pmin(pmax(Prob, 1e-15), 1 - 1e-15)) %>%
    mutate(LogLoss = Target * log(ProbClipped) +
               (1 - Target) * log(1 - ProbClipped)) %>%
    group_by(qtl, SystemName) %>%
    summarise(TC = sum(OK), N = n(), FC = N - TC, Acc = TC / N,
              LogLoss = -mean(LogLoss)) %>%
    select(qtl, SystemName, TC, FC, N, Acc, LogLoss) %>%
    ungroup()
# Five best (qtl, system) combinations by accuracy.
df.results.perf %>% arrange(desc(Acc)) %>% head(5)

qtl,SystemName,TC,FC,N,Acc,LogLoss
0.1,7OT,242,93,335,0.7223881,0.5935596
0.15,7OT,242,93,335,0.7223881,0.5901226
0.2,7OT,242,93,335,0.7223881,0.5865048
0.25,7OT,242,93,335,0.7223881,0.5827074
0.3,7OT,242,93,335,0.7223881,0.57874


In [224]:
# Five best (qtl, system) combinations by log loss (lowest first).
head(arrange(df.results.perf, LogLoss), 5)

qtl,SystemName,TC,FC,N,Acc,LogLoss
0.6,MOR,233,102,335,0.6955224,0.5504743
0.65,MOR,233,102,335,0.6955224,0.5511963
0.55,MOR,233,102,335,0.6955224,0.5519371
0.65,7OT,242,93,335,0.7223881,0.5536787
0.5,MOR,233,102,335,0.6955224,0.5546405
