In [115]:
library(dplyr,warn.conflicts=F)
library(stringr,warn.conflicts=F)
library(psych,warn.conflicts=F)
library(lplyr,warn.conflicts=F)

In [116]:
# Read the detailed NCAA tournament results and keep only the 2014 season.
data.tourney <- readRDS('datafiles_rds/NCAATourneyDetailedResults.rds')
data.tou2014 <- filter(data.tourney, Season == 2014)

In [117]:
# Distinct IDs of every team that played a 2014 tournament game, gathered from
# both the winner and the loser columns (68 teams expected).
df.teams2014 <- bind_rows(
  select(data.tou2014, TeamID = WTeamID),
  select(data.tou2014, TeamID = LTeamID)
) %>%
  distinct(TeamID)

In [118]:
# Sagarin ratings for the 2013-2014 season.
# Source: https://ux.kitsapsun.com/sports/ncaab/sagarin/2014/team/
data.sagarin <- read.csv(
  'datafiles_csv/Sagarin2014.csv',
  stringsAsFactors = FALSE
)
head(data.sagarin)

Rank,Name,Rate,W,L,Sched,RankSched,W25,L25,W50,L50,GoldenMean,RankGM,Predictor,RankPred,Pure,RankPure
1,Louisville,93.92,31,6,76.14,95,4,3,7,6,95.09,1,94.53,1,91.49,7
2,Arizona,93.65,33,5,79.8,21,6,2,13,4,94.19,2,93.84,2,92.06,5
3,Florida,93.27,36,3,80.07,16,9,3,13,3,92.45,3,92.57,3,96.41,2
4,Wisconsin,91.99,30,8,81.91,2,6,4,12,5,91.35,4,91.99,4,92.34,4
5,Michigan State,90.66,29,9,81.09,5,5,5,11,6,90.46,6,89.84,6,91.95,6
6,Connecticut,90.61,32,8,80.24,15,8,4,14,7,89.87,10,89.42,10,96.48,1


In [119]:
# Gap between the best Predictor rating and the 75th percentile of all
# Predictor ratings; used further down to scale the sigmoid slope.
threshold <- with(
  data.sagarin,
  max(Predictor) - as.numeric(quantile(Predictor, probs = 0.75))
)
threshold

In [120]:
# Lookup tables: alternative team-name spellings (TeamNameSpelling -> TeamID)
# and the master team list (TeamID, TeamName, first/last D1 season).
data.teamspellings <- readRDS(file = 'datafiles_rds/TeamSpellings.rds')
data.teams <- readRDS(file = 'datafiles_rds/Teams.rds')

In [121]:
# Attach a TeamID to each Sagarin row by matching its lower-cased name against
# the known spellings; rows with no matching spelling keep TeamID = NA.
df.sagarin <- data.sagarin %>%
  mutate(Name = str_to_lower(Name)) %>%
  left_join(data.teamspellings, by = c('Name' = 'TeamNameSpelling'))

In [122]:
# Diagnostic: tournament teams that received no Sagarin row through the name
# join (Rank is NA for them), shown with their details from the team list.
df.teams2014 %>%
  left_join(df.sagarin, by = 'TeamID') %>%
  filter(is.na(Rank)) %>%
  select(TeamID) %>%
  inner_join(data.teams, by = 'TeamID')

TeamID,TeamName,FirstD1Season,LastD1Season
1142,Cal Poly SLO,1995,2019
1462,Xavier,1985,2019
1386,St Joseph's PA,1985,2019
1433,VA Commonwealth,1985,2019


In [123]:
# Diagnostic: Sagarin rows whose name matched no known spelling, i.e. rows
# that still have a missing TeamID.
filter(df.sagarin, is.na(TeamID))

Rank,Name,Rate,W,L,Sched,RankSched,W25,L25,W50,L50,GoldenMean,RankGM,Predictor,RankPred,Pure,RankPure,TeamID
28,vcu(va. commonwealth),85.81,26,9,77.01,81,1,0,5,6,86.15,26,86.18,27,84.08,37,
47,saint joseph's-pa.,82.39,24,10,77.83,66,0,3,6,6,81.96,51,81.51,56,84.22,36,
53,xavier-ohio,81.74,21,13,78.88,44,3,5,4,7,82.54,48,81.34,59,80.49,64,
66,saint mary's-cal.,80.51,22,12,75.89,97,0,3,2,4,80.03,71,80.38,64,80.76,60,
159,stony brook-ny,72.89,22,11,67.09,324,0,0,0,1,73.16,157,71.98,170,73.28,156,
180,omaha(neb.-omaha),71.56,14,15,71.11,201,0,0,0,2,72.39,162,71.35,182,69.64,215,
192,cal poly-slo,70.93,12,20,74.06,119,0,3,0,5,70.55,197,71.05,183,70.52,198,
198,oakland-mich.,70.32,11,20,74.9,110,0,3,0,4,69.73,205,70.53,191,70.09,208,
218,se missouri state(semo),69.14,16,14,66.88,329,0,0,0,2,68.2,233,68.32,222,71.34,182,
223,ark.-little rock,68.8,13,17,72.11,168,0,1,0,3,68.38,228,68.23,224,69.71,214,


In [124]:
# Manually assign TeamIDs to the four tournament teams whose Sagarin name has
# no entry in the spellings table.
# The patch is keyed on the lower-cased Sagarin name instead of on Rank: ranks
# are positional and shift whenever the ratings file is refreshed, which would
# silently attach these IDs to the wrong teams. Only TeamIDs that are still
# missing are filled, so IDs resolved by the join are never overwritten.
manual.teamids <- c(
  'vcu(va. commonwealth)' = 1433L,
  "saint joseph's-pa." = 1386L,
  'xavier-ohio' = 1462L,
  'cal poly-slo' = 1142L
)

# Surface a patch entry that no longer matches any row instead of leaving the
# team silently unrated.
not.found <- setdiff(names(manual.teamids), df.sagarin$Name)
if (length(not.found) > 0) {
  warning(
    'Manual TeamID patch found no Sagarin row for: ',
    paste(not.found, collapse = ', '),
    call. = FALSE
  )
}

df.sagarin <- df.sagarin %>%
  mutate(
    # Names absent from the lookup index to NA, leaving TeamID missing.
    TeamID = ifelse(is.na(TeamID), unname(manual.teamids[Name]), TeamID)
  )

In [125]:
# Restrict the ratings to tournament teams and keep only the two columns the
# model needs: the team identifier and its Predictor rating.
df.sagarin <- df.sagarin %>%
  inner_join(df.teams2014, by = 'TeamID') %>%
  select(TeamID, Predictor)

In [126]:
# Ground truth for the 2014 games actually played.
# ID is Season_lowerTeamID_higherTeamID; Target is 1 when the team with the
# lower ID won the game and 0 otherwise.
df.truth <- data.tou2014 %>%
  transmute(
    TeamID.1 = pmin(WTeamID, LTeamID),
    TeamID.2 = pmax(WTeamID, LTeamID),
    ID = str_c(Season, '_', TeamID.1, '_', TeamID.2),
    Target = as.numeric(WTeamID == TeamID.1)
  ) %>%
  select(ID, Target)
# Random preview of five rows
df.truth %>% sample_n(5)

ID,Target
2014_1174_1277,0
2014_1372_1433,1
2014_1163_1277,1
2014_1332_1458,0
2014_1173_1196,0


In [127]:
# Slope of the sigmoid, scaled so that slope * threshold equals 3.
slope <- 3 / threshold
slope

In [128]:
# Every possible pairing of tournament teams.
# A constant Season column is added so that joining the table to itself on
# Season pairs each team with every other; keeping TeamID.x < TeamID.y leaves
# each unordered pair exactly once, with the lower ID on the .x side.
tmp01 <- df.sagarin %>%
  semi_join(df.teams2014, by = 'TeamID') %>%
  mutate(Season = 2014)
df.matchup <- inner_join(tmp01, tmp01, by = 'Season') %>%
  filter(TeamID.x < TeamID.y)
# Random preview of five rows
df.matchup %>% sample_n(5)

TeamID.x,Predictor.x,Season,TeamID.y,Predictor.y
1264,79.96,2014,1328,85.0
1314,85.85,2014,1437,89.02
1332,86.21,2014,1344,82.2
1243,83.06,2014,1276,89.76
1142,71.05,2014,1269,81.7


In [129]:
# Matchup predictions.
# Prob is the logistic transform (with the slope computed above) of the rating
# difference between the lower-ID and the higher-ID team; Pred is the hard
# 0/1 call obtained by thresholding Prob at 0.5.
df.submit <- df.matchup %>%
  transmute(
    ID = str_c(Season, '_', TeamID.x, '_', TeamID.y),
    Prob = logistic(Predictor.x - Predictor.y, a = slope),
    Pred = as.numeric(Prob > 0.5)
  ) %>%
  select(ID, Pred, Prob)
# Random preview of five rows
df.submit %>% sample_n(5)

ID,Pred,Prob
2014_1160_1372,1,0.54794105
2014_1160_1257,0,0.06144126
2014_1174_1438,0,0.06573487
2014_1153_1397,0,0.3303649
2014_1140_1217,0,0.3983239


In [130]:
# Pair each played game's outcome with the prediction for the same matchup ID;
# the inner join keeps only IDs present in both tables.
df.results <- inner_join(df.truth, df.submit, by = 'ID')

In [136]:
# Confusion matrix: rows are the actual Target, columns the predicted Pred.
# Both vectors are converted to factors with explicit 0/1 levels so the table
# is always 2x2. Without this, a class that never occurs (e.g. the model
# predicts 0 for every game) drops a row or column and indexing the missing
# cell fails with "subscript out of bounds".
cm <- table(
  factor(df.results$Target, levels = c(0, 1)),
  factor(df.results$Pred, levels = c(0, 1))
)
cat('Confusion Matrix')
cm
# Accuracy: correctly classified games (the diagonal) over all scored games.
perf <- sum(diag(cm)) / nrow(df.results)
cat(paste('Accuracy: ', perf))

Confusion Matrix

   
     0  1
  0 24  5
  1 13 25

Accuracy:  0.73134328358209

In [140]:
# Reference point: the log loss of predicting 0.5 for every game.
cat(paste('Baseline logloss:', -log(0.5)))
# logloss (kaggle metric)
# Probabilities are clipped away from exactly 0 and 1 before taking logs.
# Unclipped, a saturated prediction gives log(0) = -Inf, and the 0 * -Inf term
# for the other class turns the whole mean into NaN.
df.results %>%
  mutate(
    ProbClipped = pmin(pmax(Prob, 1e-15), 1 - 1e-15),
    LogLoss = Target * log(ProbClipped) + (1 - Target) * log(1 - ProbClipped)
  ) %>%
  summarise(ModelLogLoss = -mean(LogLoss))

Baseline logloss: 0.693147180559945

ModelLogLoss
0.4949369
