In [1]:
# Load required packages
library(dplyr,warn.conflicts=F)
library(stringr,warn.conflicts=F)
library(psych,warn.conflicts=F)
library(lplyr,warn.conflicts=F)
library(tidyr,warn.conflicts=F)

In [2]:
library(tictoc,warn.conflicts=F)

#### From Matchups stats to Teams stats

In [3]:
# Read the regular-season detailed results and keep only rows whose Season
# is greater than 2013 (i.e. 2014 onwards)
data.regulars <- filter(
    readRDS('datafiles_rds/RegularSeasonDetailedResults.rds'),
    Season>2013
)

In [4]:
# Number of game rows per season
summarise(group_by(data.regulars,Season),n=n())

Season,n
2014,5362
2015,5354
2016,5369
2017,5395
2018,5405


In [5]:
# Turn game-level rows (one row per game, W*/L* prefixed columns) into
# team-level rows (one row per team per game) with un-prefixed stat columns.

# Winning teams stats: flag W=1/L=0, expose the loser's score as OppScore,
# then strip the leading 'W' from every selected column name
df.wteams <- data.regulars %>%
    mutate(WW=1,WL=0) %>%
    select_at(vars(Season,DayNum,starts_with('W'),WOppScore=LScore)) %>%
    rename_at(vars(starts_with('W')),list(~str_remove(.,'^W')))

# Losing teams stats: the same reshaping from the loser's side (W=0/L=1,
# OppScore taken from WScore, leading 'L' stripped)
df.lteams <- data.regulars %>%
    mutate(LW=0,LL=1) %>%
    select_at(vars(Season,DayNum,starts_with('L'),LOppScore=WScore)) %>%
    rename_at(vars(starts_with('L')),list(~str_remove(.,'^L')))

# Merge stats: stack both views and order by day, then team.
# Loc is removed from the result, so its missing values are not imputed
# beforehand -- filling a column that is dropped right after has no effect.
teams.stats <- df.wteams %>%
    bind_rows(df.lteams) %>%
    arrange(DayNum,TeamID) %>%
    select(-Loc)

In [6]:
# Number of team-game rows per season (two per game)
summarise(group_by(teams.stats,Season),n=n())

Season,n
2014,10724
2015,10708
2016,10738
2017,10790
2018,10810


#### Compute regular new features

In [7]:
# Running totals (cumsum) and running means (cummean) of every per-game stat,
# accumulated within each (Season, TeamID) in the current row order
teams.stats.cumul <- teams.stats %>%
    group_by(Season,TeamID) %>%
    mutate_at(vars(Score,OppScore,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,W,L),list(~cumsum,~cummean)) %>%
    ungroup() %>%
    # keep only the accumulated columns, not the per-game values
    select(-c(Score,OppScore,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,W,L)) %>%
    # e.g. Score_cumsum -> ScoreSum, Score_cummean -> ScoreAvg
    rename_all(list(~str_replace(str_replace(.,'_cumsum','Sum'),'_cummean','Avg')))

In [8]:
# Spot-check: first rows of team 1102 in seasons 2014 and 2015
head(filter(teams.stats.cumul,Season==2014,TeamID==1102))
head(filter(teams.stats.cumul,Season==2015,TeamID==1102))

Season,DayNum,TeamID,ScoreSum,OppScoreSum,FGMSum,FGASum,FGM3Sum,FGA3Sum,FTMSum,⋯,FTAAvg,ORAvg,DRAvg,AstAvg,TOAvg,StlAvg,BlkAvg,PFAvg,WAvg,LAvg
2014,4,1102,79,68,26,49,10,15,17,⋯,26.0,10.0,30.0,11.0,18.0,8.0,4.0,18.0,1.0,0.0
2014,5,1102,142,139,49,102,15,37,29,⋯,22.5,8.5,28.0,10.0,17.5,8.0,2.0,19.0,0.5,0.5
2014,10,1102,224,223,76,159,27,63,45,⋯,23.66667,9.0,25.0,11.66667,15.66667,6.666667,1.666667,21.0,0.3333333,0.6666667
2014,13,1102,291,287,95,204,36,84,65,⋯,24.25,9.5,23.75,12.75,15.5,6.25,2.75,19.5,0.5,0.5
2014,23,1102,351,370,115,247,40,97,81,⋯,24.6,8.6,22.8,12.0,15.4,5.6,3.0,19.4,0.4,0.6
2014,26,1102,408,451,134,293,46,119,94,⋯,23.66667,7.833333,22.5,12.0,15.83333,5.333333,3.166667,19.16667,0.3333333,0.6666667


Season,DayNum,TeamID,ScoreSum,OppScoreSum,FGMSum,FGASum,FGM3Sum,FGA3Sum,FTMSum,⋯,FTAAvg,ORAvg,DRAvg,AstAvg,TOAvg,StlAvg,BlkAvg,PFAvg,WAvg,LAvg
2015,11,1102,78,84,31,62,6,19,10,⋯,15.0,15.0,17.0,14.0,18.0,8.0,6.0,21.0,0.0,1.0
2015,12,1102,146,139,55,105,15,39,21,⋯,14.5,13.5,19.5,15.5,18.0,5.5,4.0,21.5,0.5,0.5
2015,22,1102,199,207,73,160,19,59,34,⋯,16.33333,12.33333,18.66667,14.0,15.33333,7.0,4.0,21.0,0.3333333,0.6666667
2015,27,1102,261,270,96,207,27,77,42,⋯,14.5,11.25,18.5,12.75,16.0,8.25,4.0,20.5,0.25,0.75
2015,30,1102,320,304,116,260,32,101,56,⋯,16.6,13.6,19.6,12.8,14.6,8.0,3.8,19.6,0.4,0.6
2015,35,1102,397,365,143,310,41,126,70,⋯,16.66667,12.83333,20.0,14.16667,14.33333,7.666667,3.833333,19.0,0.5,0.5


In [9]:
# create missing rows: one row per Season 2014..2018 x DayNum 1..140 x TeamID,
# then carry each team's latest cumulative stats downwards within the season
# (rows before a team's first game stay NA)
teams.stats.cumul <- teams.stats.cumul %>%
    complete(Season=2014:2018,DayNum=1:140,TeamID) %>%
    group_by(Season,TeamID) %>%
    fill(everything(),.direction='down') %>%
    ungroup()

In [10]:
# Spot-check the filled rows: days 1-10 of 2014 and days 10-19 of 2015 for team 1102
head(filter(teams.stats.cumul,Season==2014,TeamID==1102),10)
tail(head(filter(teams.stats.cumul,Season==2015,TeamID==1102),19),10)

Season,DayNum,TeamID,ScoreSum,OppScoreSum,FGMSum,FGASum,FGM3Sum,FGA3Sum,FTMSum,⋯,FTAAvg,ORAvg,DRAvg,AstAvg,TOAvg,StlAvg,BlkAvg,PFAvg,WAvg,LAvg
2014,1,1102,,,,,,,,⋯,,,,,,,,,,
2014,2,1102,,,,,,,,⋯,,,,,,,,,,
2014,3,1102,,,,,,,,⋯,,,,,,,,,,
2014,4,1102,79.0,68.0,26.0,49.0,10.0,15.0,17.0,⋯,26.0,10.0,30.0,11.0,18.0,8.0,4.0,18.0,1.0,0.0
2014,5,1102,142.0,139.0,49.0,102.0,15.0,37.0,29.0,⋯,22.5,8.5,28.0,10.0,17.5,8.0,2.0,19.0,0.5,0.5
2014,6,1102,142.0,139.0,49.0,102.0,15.0,37.0,29.0,⋯,22.5,8.5,28.0,10.0,17.5,8.0,2.0,19.0,0.5,0.5
2014,7,1102,142.0,139.0,49.0,102.0,15.0,37.0,29.0,⋯,22.5,8.5,28.0,10.0,17.5,8.0,2.0,19.0,0.5,0.5
2014,8,1102,142.0,139.0,49.0,102.0,15.0,37.0,29.0,⋯,22.5,8.5,28.0,10.0,17.5,8.0,2.0,19.0,0.5,0.5
2014,9,1102,142.0,139.0,49.0,102.0,15.0,37.0,29.0,⋯,22.5,8.5,28.0,10.0,17.5,8.0,2.0,19.0,0.5,0.5
2014,10,1102,224.0,223.0,76.0,159.0,27.0,63.0,45.0,⋯,23.66667,9.0,25.0,11.66667,15.66667,6.666667,1.666667,21.0,0.3333333,0.6666667


Season,DayNum,TeamID,ScoreSum,OppScoreSum,FGMSum,FGASum,FGM3Sum,FGA3Sum,FTMSum,⋯,FTAAvg,ORAvg,DRAvg,AstAvg,TOAvg,StlAvg,BlkAvg,PFAvg,WAvg,LAvg
2015,10,1102,,,,,,,,⋯,,,,,,,,,,
2015,11,1102,78.0,84.0,31.0,62.0,6.0,19.0,10.0,⋯,15.0,15.0,17.0,14.0,18.0,8.0,6.0,21.0,0.0,1.0
2015,12,1102,146.0,139.0,55.0,105.0,15.0,39.0,21.0,⋯,14.5,13.5,19.5,15.5,18.0,5.5,4.0,21.5,0.5,0.5
2015,13,1102,146.0,139.0,55.0,105.0,15.0,39.0,21.0,⋯,14.5,13.5,19.5,15.5,18.0,5.5,4.0,21.5,0.5,0.5
2015,14,1102,146.0,139.0,55.0,105.0,15.0,39.0,21.0,⋯,14.5,13.5,19.5,15.5,18.0,5.5,4.0,21.5,0.5,0.5
2015,15,1102,146.0,139.0,55.0,105.0,15.0,39.0,21.0,⋯,14.5,13.5,19.5,15.5,18.0,5.5,4.0,21.5,0.5,0.5
2015,16,1102,146.0,139.0,55.0,105.0,15.0,39.0,21.0,⋯,14.5,13.5,19.5,15.5,18.0,5.5,4.0,21.5,0.5,0.5
2015,17,1102,146.0,139.0,55.0,105.0,15.0,39.0,21.0,⋯,14.5,13.5,19.5,15.5,18.0,5.5,4.0,21.5,0.5,0.5
2015,18,1102,146.0,139.0,55.0,105.0,15.0,39.0,21.0,⋯,14.5,13.5,19.5,15.5,18.0,5.5,4.0,21.5,0.5,0.5
2015,19,1102,146.0,139.0,55.0,105.0,15.0,39.0,21.0,⋯,14.5,13.5,19.5,15.5,18.0,5.5,4.0,21.5,0.5,0.5


#### Regular matchups

In [11]:
# regular season matchups: TeamID.x is the lower team id, TeamID.y the higher,
# Target is 1 when the lower-id team won and 0 otherwise.
# DayNum is moved back by one day before the columns are selected.
train.matchups <- data.regulars %>%
    mutate(TeamID.x=pmin(WTeamID,LTeamID),
           TeamID.y=pmax(WTeamID,LTeamID),
           Target=as.numeric(WTeamID<LTeamID),
           DayNum=DayNum-1) %>%
    select(Season,DayNum,Target,TeamID.x,TeamID.y)
sample_n(train.matchups,5)

Season,DayNum,Target,TeamID.x,TeamID.y
2014,24,0,1168,1274
2016,109,0,1304,1326
2018,19,1,1229,1409
2018,47,1,1174,1175
2018,66,1,1310,1373


In [12]:
# Number of regular-season matchups per season
summarise(group_by(train.matchups,Season),n=n())

Season,n
2014,5362
2015,5354
2016,5369
2017,5395
2018,5405


In [13]:
# regular matchups features: join the cumulative stats of TeamID.x and then
# TeamID.y on (Season, DayNum), tagging the joined Sum/Avg columns with .x / .y,
# and drop every row that still contains a missing value
train.matchups.features <- train.matchups %>%
    left_join(teams.stats.cumul,by=c('Season','DayNum','TeamID.x'='TeamID')) %>%
    rename_all(list(~str_replace(str_replace(.,'Sum$','Sum.x'),'Avg$','Avg.x'))) %>%
    left_join(teams.stats.cumul,by=c('Season','DayNum','TeamID.y'='TeamID')) %>%
    rename_all(list(~str_replace(str_replace(.,'Sum$','Sum.y'),'Avg$','Avg.y'))) %>%
    na.omit()
sample_n(train.matchups.features,5)

Season,DayNum,Target,TeamID.x,TeamID.y,ScoreSum.x,OppScoreSum.x,FGMSum.x,FGASum.x,FGM3Sum.x,⋯,FTAAvg.y,ORAvg.y,DRAvg.y,AstAvg.y,TOAvg.y,StlAvg.y,BlkAvg.y,PFAvg.y,WAvg.y,LAvg.y
2017,19,0,1239,1375,317,299,108,211,32,⋯,21.33333,13.0,28.0,15.666667,11.33333,6.333333,4.666667,22.0,1.0,0.0
2017,64,1,1197,1224,745,949,270,681,63,⋯,24.58333,10.5,20.66667,7.416667,14.66667,5.75,2.916667,17.91667,0.08333333,0.9166667
2016,118,1,1126,1299,1888,2033,667,1624,213,⋯,20.92857,10.678571,24.39286,10.107143,13.82143,5.071429,2.392857,22.46429,0.32142857,0.6785714
2017,65,0,1163,1272,758,785,269,668,75,⋯,25.5,12.714286,26.5,18.142857,11.35714,6.785714,5.714286,17.0,0.71428571,0.2857143
2017,122,0,1180,1241,2243,2384,808,1854,237,⋯,18.7,9.366667,24.83333,11.533333,14.36667,5.7,2.766667,20.36667,0.26666667,0.7333333


In [14]:
# Matchups per season before feature joining (train.matchups itself is unchanged)
summarise(group_by(train.matchups,Season),n=n())

Season,n
2014,5362
2015,5354
2016,5369
2017,5395
2018,5405


In [15]:
# Print the dimensions and the column names of the training feature table
cat(dim(train.matchups.features),'\n')
cat(colnames(train.matchups.features),'\n')

25844 73 
Season DayNum Target TeamID.x TeamID.y ScoreSum.x OppScoreSum.x FGMSum.x FGASum.x FGM3Sum.x FGA3Sum.x FTMSum.x FTASum.x ORSum.x DRSum.x AstSum.x TOSum.x StlSum.x BlkSum.x PFSum.x WSum.x LSum.x ScoreAvg.x OppScoreAvg.x FGMAvg.x FGAAvg.x FGM3Avg.x FGA3Avg.x FTMAvg.x FTAAvg.x ORAvg.x DRAvg.x AstAvg.x TOAvg.x StlAvg.x BlkAvg.x PFAvg.x WAvg.x LAvg.x ScoreSum.y OppScoreSum.y FGMSum.y FGASum.y FGM3Sum.y FGA3Sum.y FTMSum.y FTASum.y ORSum.y DRSum.y AstSum.y TOSum.y StlSum.y BlkSum.y PFSum.y WSum.y LSum.y ScoreAvg.y OppScoreAvg.y FGMAvg.y FGAAvg.y FGM3Avg.y FGA3Avg.y FTMAvg.y FTAAvg.y ORAvg.y DRAvg.y AstAvg.y TOAvg.y StlAvg.y BlkAvg.y PFAvg.y WAvg.y LAvg.y 


#### Tourney matchups

In [16]:
# Read the tournament detailed results and keep only rows whose Season
# is greater than 2013 (i.e. 2014 onwards)
data.tourney <- filter(
    readRDS('datafiles_rds/NCAATourneyDetailedResults.rds'),
    Season>2013
)

In [17]:
# Number of tournament game rows per season
summarise(group_by(data.tourney,Season),n=n())

Season,n
2014,67
2015,67
2016,67
2017,67
2018,67


In [18]:
# Candidate tournament matchups; used below through its Season, TeamID.x
# and TeamID.y columns
data.test.matchups <- readRDS(file='datafiles_rds/TourneyMatchups.rds')

In [19]:
# tourney season matchups: build the 'Season_TeamID.x_TeamID.y' identifier and
# pin every matchup to DayNum 140
test.matchups <- data.test.matchups %>%
    mutate(ID=str_c(Season,'_',TeamID.x,'_',TeamID.y),
           DayNum=140) %>%
    select(ID,Season,DayNum,TeamID.x,TeamID.y)
sample_n(test.matchups,5)

ID,Season,DayNum,TeamID.x,TeamID.y
2016_1221_1462,2016,140,1221,1462
2014_1160_1181,2014,140,1160,1181
2018_1166_1172,2018,140,1166,1172
2015_1257_1352,2015,140,1257,1352
2017_1278_1345,2017,140,1278,1345


In [20]:
# tourney season truth: one row per played tournament game, keyed by the same
# 'Season_lowerId_higherId' identifier; Target is 1 when the lower-id team won,
# Tourney is 1 for every row of this table
test.truth <- data.tourney %>%
    mutate(TeamID.x=pmin(WTeamID,LTeamID),
           TeamID.y=pmax(WTeamID,LTeamID),
           Target=as.numeric(WTeamID<LTeamID),
           ID=str_c(Season,'_',TeamID.x,'_',TeamID.y),
           Tourney=1) %>%
    select(ID,Tourney,Target)
sample_n(test.truth,5)

ID,Tourney,Target
2018_1305_1400,1,1
2018_1199_1211,1,1
2014_1393_1444,1,1
2017_1242_1332,1,0
2018_1181_1393,1,1


In [21]:
# Attach the known outcomes by ID. Rows without a match (Target is NA after
# the join) get Target=0 and Tourney=0.
# Tourney is updated first so that both replacements see the original NA mask.
test.matchups <- test.matchups %>%
    left_join(test.truth,by='ID') %>%
    mutate(Tourney=replace(Tourney,is.na(Target),0),
           Target=replace(Target,is.na(Target),0))

In [22]:
# Number of candidate tournament matchups per season
summarise(group_by(test.matchups,Season),n=n())

Season,n
2014,2278
2015,2278
2016,2278
2017,2278
2018,2278


In [23]:
# tourney matchups features: join the cumulative stats of TeamID.x and then
# TeamID.y on (Season, DayNum), tagging the joined Sum/Avg columns with .x / .y,
# and drop every row that still contains a missing value
test.matchups.features <- test.matchups %>%
    left_join(teams.stats.cumul,by=c('Season','DayNum','TeamID.x'='TeamID')) %>%
    rename_all(list(~str_replace(str_replace(.,'Sum$','Sum.x'),'Avg$','Avg.x'))) %>%
    left_join(teams.stats.cumul,by=c('Season','DayNum','TeamID.y'='TeamID')) %>%
    rename_all(list(~str_replace(str_replace(.,'Sum$','Sum.y'),'Avg$','Avg.y'))) %>%
    na.omit()
sample_n(test.matchups.features,5)

ID,Season,DayNum,TeamID.x,TeamID.y,Tourney,Target,ScoreSum.x,OppScoreSum.x,FGMSum.x,⋯,FTAAvg.y,ORAvg.y,DRAvg.y,AstAvg.y,TOAvg.y,StlAvg.y,BlkAvg.y,PFAvg.y,WAvg.y,LAvg.y
2014_1304_1386,2014,140,1304,1386,0,0,2077,2011,696,⋯,22.81818,9.606061,26.30303,14.06061,12.33333,4.484848,4.606061,15.81818,0.7272727,0.2727273
2014_1112_1459,2014,140,1112,1459,0,0,2484,1977,892,⋯,18.31034,10.551724,21.96552,12.13793,10.27586,6.068966,1.551724,17.62069,0.5862069,0.4137931
2015_1217_1279,2015,140,1217,1279,0,0,1704,1570,595,⋯,21.71875,12.40625,25.15625,12.8125,11.34375,6.59375,3.6875,19.5625,0.625,0.375
2015_1295_1452,2015,140,1295,1452,0,0,1918,1849,660,⋯,24.9375,16.84375,19.9375,14.53125,13.0625,10.9375,2.84375,23.34375,0.71875,0.28125
2014_1272_1417,2014,140,1272,1417,0,0,2382,2175,869,⋯,23.11765,10.088235,25.55882,17.20588,10.52941,9.382353,2.882353,17.73529,0.7647059,0.2352941


In [24]:
# Matchups per season before feature joining (test.matchups itself is unchanged)
summarise(group_by(test.matchups,Season),n=n())

Season,n
2014,2278
2015,2278
2016,2278
2017,2278
2018,2278


In [25]:
# Print the dimensions and the column names of the tournament feature table
cat(dim(test.matchups.features),'\n')
cat(colnames(test.matchups.features),'\n')

11390 75 
ID Season DayNum TeamID.x TeamID.y Tourney Target ScoreSum.x OppScoreSum.x FGMSum.x FGASum.x FGM3Sum.x FGA3Sum.x FTMSum.x FTASum.x ORSum.x DRSum.x AstSum.x TOSum.x StlSum.x BlkSum.x PFSum.x WSum.x LSum.x ScoreAvg.x OppScoreAvg.x FGMAvg.x FGAAvg.x FGM3Avg.x FGA3Avg.x FTMAvg.x FTAAvg.x ORAvg.x DRAvg.x AstAvg.x TOAvg.x StlAvg.x BlkAvg.x PFAvg.x WAvg.x LAvg.x ScoreSum.y OppScoreSum.y FGMSum.y FGASum.y FGM3Sum.y FGA3Sum.y FTMSum.y FTASum.y ORSum.y DRSum.y AstSum.y TOSum.y StlSum.y BlkSum.y PFSum.y WSum.y LSum.y ScoreAvg.y OppScoreAvg.y FGMAvg.y FGAAvg.y FGM3Avg.y FGA3Avg.y FTMAvg.y FTAAvg.y ORAvg.y DRAvg.y AstAvg.y TOAvg.y StlAvg.y BlkAvg.y PFAvg.y WAvg.y LAvg.y 


#### Save datasets for Keras 

In [26]:
# Drop the key columns (Season, DayNum, TeamID.x, TeamID.y) and write the
# remaining columns (Target plus the .x/.y features) to CSV without row names.
# FALSE is spelled out because `F` is an ordinary variable that can be reassigned.
train.matchups.features %>%
    select(-Season,-DayNum,-TeamID.x,-TeamID.y) %>%
    write.csv('kerasfiles/train.matchups.stats.csv',row.names=FALSE)

In [27]:
# Drop the key columns (Season, DayNum, TeamID.x, TeamID.y) and write the
# remaining columns (ID, Tourney, Target plus the .x/.y features) to CSV
# without row names.
# FALSE is spelled out because `F` is an ordinary variable that can be reassigned.
test.matchups.features %>%
    select(-Season,-DayNum,-TeamID.x,-TeamID.y) %>%
    write.csv('kerasfiles/test.matchups.stats.csv',row.names=FALSE)