In [1]:
library(dplyr,warn.conflicts=F)
library(stringr,warn.conflicts=F)
library(psych,warn.conflicts=F)
library(lplyr,warn.conflicts=F)

In [2]:
# load the stats
data.tourney <- readRDS('datafiles_rds/NCAATourneyDetailedResults.rds')
# keep 2014+
data.tourney %>% filter(Season>2013) -> data.tourney

In [3]:
# store 68 tourney teams 
data.tourney %>% 
    select(Season,TeamID=WTeamID) %>% 
    bind_rows(data.tourney %>% 
              select(Season,TeamID=LTeamID)) %>% 
    distinct(Season,TeamID) -> df.tourneyteams

In [6]:
# Sagarin Rating 2014+
# https://ux.kitsapsun.com/sports/ncaab/sagarin/201?/team/
data.sagarin <- read.csv('datafiles_csv/Sagarin20142018.csv',stringsAsFactors=F)
data.sagarin %>% head

Season,Rank,Name,Rate,W,L,Sched,RankSched,W25,L25,W50,L50,GoldenMean,RankGM,Predictor,RankPred,Pure,RankPure
2014,1,Louisville,93.92,31,6,76.14,95,4,3,7,6,95.09,1,94.53,1,91.49,7
2014,2,Arizona,93.65,33,5,79.8,21,6,2,13,4,94.19,2,93.84,2,92.06,5
2014,3,Florida,93.27,36,3,80.07,16,9,3,13,3,92.45,3,92.57,3,96.41,2
2014,4,Wisconsin,91.99,30,8,81.91,2,6,4,12,5,91.35,4,91.99,4,92.34,4
2014,5,Michigan State,90.66,29,9,81.09,5,5,5,11,6,90.46,6,89.84,6,91.95,6
2014,6,Connecticut,90.61,32,8,80.24,15,8,4,14,7,89.87,10,89.42,10,96.48,1


In [8]:
data.teamspellings <- readRDS('datafiles_rds/TeamSpellings.rds')
data.teams <- readRDS('datafiles_rds/Teams.rds')

In [119]:
data.sagarin %>% 
    mutate(Name=str_to_lower(Name)) %>% 
    left_join(data.teamspellings,by=c('Name'='TeamNameSpelling')) -> df.sagarin

In [120]:
df.sagarin %>% 
    filter(is.na(TeamID)) %>%
    select(Name) %>%
    distinct() %>%
    arrange(Name)

Name
ark.-little rock
ark.-little rock ualr
binghamton-ny
cal poly-slo
central connecticut st.
central florida ucf
central floridaucf
east tennessee state ets
fla. international
fort wayne ipfw


In [121]:
df.sagarin %>% 
    right_join(df.tourneyteams,by='TeamID') %>%
    filter(is.na(Rank)) %>%
    select(TeamID) %>%
    inner_join(data.teams,by='TeamID') %>%
    distinct() %>%
    arrange(TeamID)

TeamID,TeamName,FirstD1Season,LastD1Season
1142,Cal Poly SLO,1995,2019
1254,Long Island,1985,2019
1316,North Florida,2006,2019
1386,St Joseph's PA,1985,2019
1388,St Mary's CA,1985,2019
1392,Stony Brook,2000,2019
1433,VA Commonwealth,1985,2019
1462,Xavier,1985,2019


In [122]:
df.sagarin %>%
    mutate_which(str_detect(Name,'xavier-ohio'),TeamID=1462) %>%
    mutate_which(str_detect(Name,'commonwealth'),TeamID=1433) %>%
    mutate_which(str_detect(Name,'stony brook'),TeamID=1392) %>%
    mutate_which(str_detect(Name,'saint mary'),TeamID=1388) %>%
    mutate_which(str_detect(Name,'saint joseph'),TeamID=1386) %>%
    mutate_which(str_detect(Name,'north florida'),TeamID=1316) %>%
    mutate_which(str_detect(Name,'long island'),TeamID=1254) %>%
    mutate_which(str_detect(Name,'cal poly'),TeamID=1142) %>%
    mutate_which(str_detect(Name,'^iona'),TeamID=1233) %>%    # Iona renamed in Iona College in 2017
    mutate_which(str_detect(Name,'east tennessee'),TeamID=1190) -> df.sagarin    # ETSU = East Tennessee State University

In [123]:
df.sagarin %>% 
    right_join(df.tourneyteams,by='TeamID') %>%
    filter(is.na(Rank)) %>%
    select(TeamID) %>%
    inner_join(data.teams,by='TeamID') %>%
    distinct() %>%
    arrange(TeamID)

TeamID,TeamName,FirstD1Season,LastD1Season


In [124]:
df.sagarin %>% 
    inner_join(df.tourneyteams,by=c('TeamID','Season')) %>%
    select(Season,TeamID,Predictor) -> df.sagarin

In [125]:
df.sagarin %>% 
    group_by(Season) %>% 
    count

Season,n
2014,68
2015,68
2016,68
2017,68
2018,68


In [126]:
df.tourneyteams %>% 
    anti_join(df.sagarin,by=c('Season','TeamID'))

Season,TeamID


In [127]:
# compute the Truth 
data.tourney %>% 
    select(Season,WTeamID,LTeamID) %>% 
    mutate(TeamID.1=ifelse(WTeamID<LTeamID,WTeamID,LTeamID),
           TeamID.2=ifelse(WTeamID>LTeamID,WTeamID,LTeamID)) %>%
    mutate(ID=str_c(Season,'_',TeamID.1,'_',TeamID.2)) %>% 
    mutate(Target=ifelse(WTeamID==TeamID.1,1,0)) %>% 
    select(ID,Target) -> df.truth
df.truth %>% sample_n(5)

ID,Target
2014_1235_1300,1
2016_1242_1268,1
2016_1181_1332,0
2017_1388_1433,1
2016_1195_1314,0


In [128]:
df.sagarin %>%
    group_by(Season) %>%
    mutate(thres=max(Predictor)-quantile(Predictor,0.75),Slope=3/thres) %>%
    ungroup() %>%
    select(-thres) -> df.sagarin

In [129]:
# matchups dataframe
df.sagarin %>% 
    inner_join(df.sagarin %>% select(-Slope),by='Season') %>% 
    filter(TeamID.x < TeamID.y) -> df.matchup
df.matchup %>% sample_n(5)

Season,TeamID.x,Predictor.x,Slope,TeamID.y,Predictor.y
2018,1137,77.42,0.322841,1314,90.48
2015,1124,88.19,0.2608129,1400,86.68
2014,1153,85.84,0.4924087,1393,87.14
2017,1278,84.25,0.5597015,1452,92.52
2017,1245,72.75,0.5597015,1452,92.52


In [181]:
# machups predictions
df.matchup %>% 
    mutate(ID=str_c(Season,'_',TeamID.x,'_',TeamID.y)) %>% 
    mutate(Prob=logistic(Predictor.x-Predictor.y,a=Slope)) %>% 
    mutate(Pred=ifelse(Prob>0.5,1,0)) %>% 
    mutate(Prob=round(Prob,3)) %>%
    mutate_which(Prob>0.99,Prob=0.95) %>%
    mutate_which(Prob<0.01,Prob=0.05) %>%
    select(ID,Pred,Prob) -> df.submit
df.submit %>% sample_n(5)

ID,Pred,Prob
2018_1112_1196,1,0.529
2014_1211_1409,1,0.978
2018_1308_1314,0,0.033
2017_1376_1455,0,0.149
2016_1114_1338,0,0.132


In [182]:
# merge prediction and truth
df.truth %>% inner_join(df.submit,by='ID') -> df.results

In [183]:
# confusion matrix
cm <- table(df.results$Target,df.results$Pred)
cat('Confusion Matrix')
cm
perf <- (cm[1,1]+cm[2,2]) / nrow(df.results)
cat(paste('Accuracy: ',perf))

Confusion Matrix

   
      0   1
  0 130  43
  1  39 123

Accuracy:  0.755223880597015

In [184]:
cat(paste('Baseline logloss:',-log(0.5)))
# logloss (kaggle metric)
df.results %>% 
    mutate(LogLoss=Target*log(Prob) + (1-Target)*log(1-Prob)) %>% 
    summarise(ModelLogLoss = -mean(LogLoss))

Baseline logloss: 0.693147180559945

ModelLogLoss
0.5524146


In [191]:
df.submit %>% 
    select(ID,Pred=Prob) %>% 
    write.csv('predictions/Pred_Sagarin.csv',quote=F,row.names=F)