In [3]:
# Load required packages
library(dplyr,warn.conflicts=F)
library(stringr,warn.conflicts=F)
library(psych,warn.conflicts=F)
library(lplyr,warn.conflicts=F)
library(tidyr,warn.conflicts=F)

In [4]:
library(tictoc,warn.conflicts=F)

#### From Matchups stats to Teams stats

In [5]:
# Read the detailed regular-season results and keep only seasons after 2013
data.regulars <- readRDS('datafiles_rds/RegularSeasonDetailedResults.rds') %>%
    filter(Season>2013)

In [6]:
# Number of regular-season games per season
data.regulars %>% group_by(Season) %>% tally()

Season,n
2014,5362
2015,5354
2016,5369
2017,5395
2018,5405


In [7]:
# Winning teams stats: one row per game seen from the winner's side.
# W/L are win/loss indicators (1/0) and OppScore is the loser's score.
# The leading 'W' is stripped so the columns line up with the loser rows.
df.wteams <- data.regulars %>%
    mutate(WW=1,WL=0) %>%
    select_at(vars(Season,DayNum,starts_with('W'),WOppScore=LScore)) %>%
    rename_at(vars(starts_with('W')),list(~str_remove(.,'^W')))

# Losing teams stats: same layout seen from the loser's side
# (W=0, L=1, OppScore is the winner's score), with the leading 'L' stripped.
df.lteams <- data.regulars %>%
    mutate(LW=0,LL=1) %>%
    select_at(vars(Season,DayNum,starts_with('L'),LOppScore=WScore)) %>%
    rename_at(vars(starts_with('L')),list(~str_remove(.,'^L')))

# Merge stats: stack both views (two rows per game), ordered by day then team.
# Loc is dropped outright: it is not used as a feature, so there is no point
# in imputing its missing values first.
# NOTE(review): Loc presumably comes from a winner-only column and is NA on the loser rows - confirm
teams.stats <- df.wteams %>%
    bind_rows(df.lteams) %>%
    arrange(DayNum,TeamID) %>%
    select(-Loc)

In [8]:
# Number of team-game rows per season
teams.stats %>% group_by(Season) %>% tally()

Season,n
2014,10724
2015,10708
2016,10738
2017,10790
2018,10810


#### Compute new regular-season features

In [9]:
# compute cumulative and average stats
# Within each (Season, TeamID) group, in the existing row order of teams.stats
# (sorted by DayNum), every per-game stat is replaced by its running total
# (<stat>Sum) and its running mean (<stat>Avg).
teams.stats.cumul <- teams.stats %>%
    group_by(Season,TeamID) %>%
    # a named list of plain functions yields <col>_cumsum / <col>_cummean columns
    mutate_at(vars(Score,OppScore,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,W,L),list(cumsum=cumsum,cummean=cummean)) %>%
    # drop the raw per-game columns, keeping only the running aggregates
    select(-c(Score,OppScore,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,W,L)) %>%
    rename_all(list(~str_replace(.,'_cumsum','Sum'))) %>%
    rename_all(list(~str_replace(.,'_cummean','Avg'))) %>%
    ungroup()

In [10]:
# Spot-check: first six cumulative rows of team 1102 in 2014 and in 2015
teams.stats.cumul %>% filter(Season==2014 & TeamID==1102) %>% head(6)
teams.stats.cumul %>% filter(Season==2015 & TeamID==1102) %>% head(6)

Season,DayNum,TeamID,ScoreSum,OppScoreSum,FGMSum,FGASum,FGM3Sum,FGA3Sum,FTMSum,⋯,FTAAvg,ORAvg,DRAvg,AstAvg,TOAvg,StlAvg,BlkAvg,PFAvg,WAvg,LAvg
2014,4,1102,79,68,26,49,10,15,17,⋯,26.0,10.0,30.0,11.0,18.0,8.0,4.0,18.0,1.0,0.0
2014,5,1102,142,139,49,102,15,37,29,⋯,22.5,8.5,28.0,10.0,17.5,8.0,2.0,19.0,0.5,0.5
2014,10,1102,224,223,76,159,27,63,45,⋯,23.66667,9.0,25.0,11.66667,15.66667,6.666667,1.666667,21.0,0.3333333,0.6666667
2014,13,1102,291,287,95,204,36,84,65,⋯,24.25,9.5,23.75,12.75,15.5,6.25,2.75,19.5,0.5,0.5
2014,23,1102,351,370,115,247,40,97,81,⋯,24.6,8.6,22.8,12.0,15.4,5.6,3.0,19.4,0.4,0.6
2014,26,1102,408,451,134,293,46,119,94,⋯,23.66667,7.833333,22.5,12.0,15.83333,5.333333,3.166667,19.16667,0.3333333,0.6666667


Season,DayNum,TeamID,ScoreSum,OppScoreSum,FGMSum,FGASum,FGM3Sum,FGA3Sum,FTMSum,⋯,FTAAvg,ORAvg,DRAvg,AstAvg,TOAvg,StlAvg,BlkAvg,PFAvg,WAvg,LAvg
2015,11,1102,78,84,31,62,6,19,10,⋯,15.0,15.0,17.0,14.0,18.0,8.0,6.0,21.0,0.0,1.0
2015,12,1102,146,139,55,105,15,39,21,⋯,14.5,13.5,19.5,15.5,18.0,5.5,4.0,21.5,0.5,0.5
2015,22,1102,199,207,73,160,19,59,34,⋯,16.33333,12.33333,18.66667,14.0,15.33333,7.0,4.0,21.0,0.3333333,0.6666667
2015,27,1102,261,270,96,207,27,77,42,⋯,14.5,11.25,18.5,12.75,16.0,8.25,4.0,20.5,0.25,0.75
2015,30,1102,320,304,116,260,32,101,56,⋯,16.6,13.6,19.6,12.8,14.6,8.0,3.8,19.6,0.4,0.6
2015,35,1102,397,365,143,310,41,126,70,⋯,16.66667,12.83333,20.0,14.16667,14.33333,7.666667,3.833333,19.0,0.5,0.5


In [11]:
# create missing row
# Expand to one row per (Season, DayNum 1..140, TeamID), then carry each team's
# latest cumulative stats forward over the days on which it did not play.
# Days before a team's first game of the season stay NA.
teams.stats.cumul <- teams.stats.cumul %>%
    complete(Season=2014:2018,DayNum=1:140,TeamID) %>%
    # fill() walks rows top to bottom, so sort chronologically within each team;
    # complete() appends any existing row outside the expanded grid at the end
    arrange(Season,TeamID,DayNum) %>%
    group_by(Season,TeamID) %>%
    fill(everything()) %>%
    ungroup()

In [12]:
# Spot-check the filled table: days 1-10 of 2014 and rows 10-19 of 2015 for team 1102
teams.stats.cumul %>% filter(Season==2014 & TeamID==1102) %>% head(n=10)
teams.stats.cumul %>% filter(Season==2015 & TeamID==1102) %>% head(n=19) %>% tail(n=10)

Season,DayNum,TeamID,ScoreSum,OppScoreSum,FGMSum,FGASum,FGM3Sum,FGA3Sum,FTMSum,⋯,FTAAvg,ORAvg,DRAvg,AstAvg,TOAvg,StlAvg,BlkAvg,PFAvg,WAvg,LAvg
2014,1,1102,,,,,,,,⋯,,,,,,,,,,
2014,2,1102,,,,,,,,⋯,,,,,,,,,,
2014,3,1102,,,,,,,,⋯,,,,,,,,,,
2014,4,1102,79.0,68.0,26.0,49.0,10.0,15.0,17.0,⋯,26.0,10.0,30.0,11.0,18.0,8.0,4.0,18.0,1.0,0.0
2014,5,1102,142.0,139.0,49.0,102.0,15.0,37.0,29.0,⋯,22.5,8.5,28.0,10.0,17.5,8.0,2.0,19.0,0.5,0.5
2014,6,1102,142.0,139.0,49.0,102.0,15.0,37.0,29.0,⋯,22.5,8.5,28.0,10.0,17.5,8.0,2.0,19.0,0.5,0.5
2014,7,1102,142.0,139.0,49.0,102.0,15.0,37.0,29.0,⋯,22.5,8.5,28.0,10.0,17.5,8.0,2.0,19.0,0.5,0.5
2014,8,1102,142.0,139.0,49.0,102.0,15.0,37.0,29.0,⋯,22.5,8.5,28.0,10.0,17.5,8.0,2.0,19.0,0.5,0.5
2014,9,1102,142.0,139.0,49.0,102.0,15.0,37.0,29.0,⋯,22.5,8.5,28.0,10.0,17.5,8.0,2.0,19.0,0.5,0.5
2014,10,1102,224.0,223.0,76.0,159.0,27.0,63.0,45.0,⋯,23.66667,9.0,25.0,11.66667,15.66667,6.666667,1.666667,21.0,0.3333333,0.6666667


Season,DayNum,TeamID,ScoreSum,OppScoreSum,FGMSum,FGASum,FGM3Sum,FGA3Sum,FTMSum,⋯,FTAAvg,ORAvg,DRAvg,AstAvg,TOAvg,StlAvg,BlkAvg,PFAvg,WAvg,LAvg
2015,10,1102,,,,,,,,⋯,,,,,,,,,,
2015,11,1102,78.0,84.0,31.0,62.0,6.0,19.0,10.0,⋯,15.0,15.0,17.0,14.0,18.0,8.0,6.0,21.0,0.0,1.0
2015,12,1102,146.0,139.0,55.0,105.0,15.0,39.0,21.0,⋯,14.5,13.5,19.5,15.5,18.0,5.5,4.0,21.5,0.5,0.5
2015,13,1102,146.0,139.0,55.0,105.0,15.0,39.0,21.0,⋯,14.5,13.5,19.5,15.5,18.0,5.5,4.0,21.5,0.5,0.5
2015,14,1102,146.0,139.0,55.0,105.0,15.0,39.0,21.0,⋯,14.5,13.5,19.5,15.5,18.0,5.5,4.0,21.5,0.5,0.5
2015,15,1102,146.0,139.0,55.0,105.0,15.0,39.0,21.0,⋯,14.5,13.5,19.5,15.5,18.0,5.5,4.0,21.5,0.5,0.5
2015,16,1102,146.0,139.0,55.0,105.0,15.0,39.0,21.0,⋯,14.5,13.5,19.5,15.5,18.0,5.5,4.0,21.5,0.5,0.5
2015,17,1102,146.0,139.0,55.0,105.0,15.0,39.0,21.0,⋯,14.5,13.5,19.5,15.5,18.0,5.5,4.0,21.5,0.5,0.5
2015,18,1102,146.0,139.0,55.0,105.0,15.0,39.0,21.0,⋯,14.5,13.5,19.5,15.5,18.0,5.5,4.0,21.5,0.5,0.5
2015,19,1102,146.0,139.0,55.0,105.0,15.0,39.0,21.0,⋯,14.5,13.5,19.5,15.5,18.0,5.5,4.0,21.5,0.5,0.5


#### Regular matchups

In [13]:
# Regular-season matchups used for training.
# TeamID.x is the lower team id and TeamID.y the higher one; Target is 1 when
# the lower-id team won. DayNum is shifted back by one day so that the later
# join picks up stats accumulated before the game itself.
train.matchups <- data.regulars %>%
    transmute(Season,
              DayNum=DayNum-1,
              Target=as.numeric(WTeamID<LTeamID),
              TeamID.x=pmin(WTeamID,LTeamID),
              TeamID.y=pmax(WTeamID,LTeamID))
sample_n(train.matchups,5)

Season,DayNum,Target,TeamID.x,TeamID.y
2018,123,1,1225,1381
2015,74,1,1361,1424
2017,95,0,1402,1426
2017,78,1,1179,1356
2017,65,0,1119,1248


In [14]:
# Number of training matchups per season
train.matchups %>% group_by(Season) %>% tally()

Season,n
2014,5362
2015,5354
2016,5369
2017,5395
2018,5405


In [15]:
# Regular-season matchup features.
# Attach the cumulative stats of TeamID.x (suffix .x) and then of TeamID.y
# (suffix .y) for the same Season/DayNum; rows where either team has no stats
# yet are dropped by na.omit().
train.matchups.features <- train.matchups %>%
    left_join(teams.stats.cumul,by=c('Season','DayNum','TeamID.x'='TeamID')) %>%
    rename_all(list(~str_replace(.,'(Sum|Avg)$','\\1.x'))) %>%
    left_join(teams.stats.cumul,by=c('Season','DayNum','TeamID.y'='TeamID')) %>%
    rename_all(list(~str_replace(.,'(Sum|Avg)$','\\1.y'))) %>%
    na.omit()
sample_n(train.matchups.features,5)

Season,DayNum,Target,TeamID.x,TeamID.y,ScoreSum.x,OppScoreSum.x,FGMSum.x,FGASum.x,FGM3Sum.x,⋯,FTAAvg.y,ORAvg.y,DRAvg.y,AstAvg.y,TOAvg.y,StlAvg.y,BlkAvg.y,PFAvg.y,WAvg.y,LAvg.y
2017,93,1,1324,1464,1616,1486,556,1264,139,⋯,17.95238,10.190476,26.09524,14.52381,12.61905,5.571429,3.190476,17.47619,0.3809524,0.6190476
2017,52,1,1221,1265,730,777,252,586,95,⋯,18.75,8.25,23.08333,12.083333,13.0,5.25,2.583333,20.0,0.3333333,0.6666667
2018,39,1,1127,1259,644,622,231,482,77,⋯,20.71429,10.571429,25.57143,9.285714,15.0,6.428571,2.285714,22.28571,0.0,1.0
2015,129,0,1364,1414,1948,1858,689,1576,197,⋯,16.27586,9.793103,25.31034,13.758621,11.82759,5.689655,4.655172,17.41379,0.5862069,0.4137931
2017,19,0,1126,1308,128,172,43,118,12,⋯,17.0,12.666667,23.33333,11.333333,15.0,6.333333,5.0,23.66667,0.3333333,0.6666667


In [16]:
# NOTE(review): this counts train.matchups (before na.omit), not train.matchups.features - confirm intent
train.matchups %>% group_by(Season) %>% tally()

Season,n
2014,5362
2015,5354
2016,5369
2017,5395
2018,5405


In [17]:
# Shape and column names of the training feature table
dim(train.matchups.features)
colnames(train.matchups.features)

#### Tourney matchups

In [55]:
# Read the detailed tourney results and keep only seasons after 2013
data.tourney <- readRDS('datafiles_rds/NCAATourneyDetailedResults.rds') %>%
    filter(Season>2013)

In [56]:
# Number of tourney games per season
data.tourney %>% group_by(Season) %>% tally()

Season,n
2014,67
2015,67
2016,67
2017,67
2018,67


In [57]:
# Load the candidate tourney matchups; the next cell reads Season, TeamID.x and TeamID.y from this table
data.test.matchups <- readRDS('datafiles_rds/TourneyMatchups.rds')

In [58]:
# Tourney matchups to score.
# ID is "<Season>_<TeamID.x>_<TeamID.y>"; DayNum is pinned to 140, the last day
# of the filled cumulative-stats grid, so the join picks up full-season stats.
test.matchups <- data.test.matchups %>%
    transmute(ID=str_c(Season,TeamID.x,TeamID.y,sep='_'),
              Season,
              DayNum=140,
              TeamID.x,
              TeamID.y)
sample_n(test.matchups,5)

ID,Season,DayNum,TeamID.x,TeamID.y
2018_1120_1305,2018,140,1120,1305
2017_1243_1266,2017,140,1243,1266
2018_1246_1300,2018,140,1246,1300
2014_1246_1459,2014,140,1246,1459
2016_1345_1421,2016,140,1345,1421


In [59]:
# Tourney ground truth.
# One row per tourney game actually played: ID is built from the season and the
# two team ids in ascending order, Tourney flags the game as played, and Target
# is 1 when the lower-id team won.
test.truth <- data.tourney %>%
    transmute(ID=str_c(Season,pmin(WTeamID,LTeamID),pmax(WTeamID,LTeamID),sep='_'),
              Tourney=1,
              Target=as.numeric(WTeamID<LTeamID))
sample_n(test.truth,5)

ID,Tourney,Target
2017_1321_1435,1,1
2016_1242_1437,1,0
2014_1173_1390,1,1
2017_1246_1417,1,1
2018_1242_1335,1,1


In [60]:
# Attach the ground truth to every candidate matchup; matchups with no Target
# after the join (no played game found) get Tourney=0 and a placeholder Target=0.
test.matchups <- test.matchups %>%
    left_join(test.truth,by='ID') %>%
    # Tourney is patched first because its condition reads the still-missing Target
    mutate(Tourney=ifelse(is.na(Target),0,Tourney),
           Target=ifelse(is.na(Target),0,Target))

In [61]:
# Number of candidate tourney matchups per season
test.matchups %>% group_by(Season) %>% tally()

Season,n
2014,2278
2015,2278
2016,2278
2017,2278
2018,2278


In [62]:
# Tourney matchup features.
# Attach the cumulative stats of TeamID.x (suffix .x) and then of TeamID.y
# (suffix .y) at the matchup's Season/DayNum; rows with any missing value are
# dropped by na.omit().
test.matchups.features <- test.matchups %>%
    left_join(teams.stats.cumul,by=c('Season','DayNum','TeamID.x'='TeamID')) %>%
    rename_all(list(~str_replace(.,'(Sum|Avg)$','\\1.x'))) %>%
    left_join(teams.stats.cumul,by=c('Season','DayNum','TeamID.y'='TeamID')) %>%
    rename_all(list(~str_replace(.,'(Sum|Avg)$','\\1.y'))) %>%
    na.omit()
sample_n(test.matchups.features,5)

ID,Season,DayNum,TeamID.x,TeamID.y,Tourney,Target,ScoreSum.x,OppScoreSum.x,FGMSum.x,⋯,FTAAvg.y,ORAvg.y,DRAvg.y,AstAvg.y,TOAvg.y,StlAvg.y,BlkAvg.y,PFAvg.y,WAvg.y,LAvg.y
2017_1425_1462,2017,140,1425,1462,0,0,2597,2415,897,⋯,23.35294,12.205882,25.38235,14.85294,12.79412,6.147059,2.676471,18.76471,0.6176471,0.3823529
2014_1140_1277,2014,140,1140,1277,0,0,2779,2559,966,⋯,19.26471,11.058824,26.0,17.0,11.44118,6.941176,4.823529,18.88235,0.7647059,0.2352941
2014_1295_1437,2014,140,1295,1437,0,0,2093,1889,744,⋯,25.25,11.28125,25.8125,15.625,11.84375,6.9375,4.03125,19.71875,0.875,0.125
2017_1181_1439,2017,140,1181,1439,0,0,2823,2444,968,⋯,22.34375,7.3125,25.21875,15.09375,12.125,4.0,2.78125,16.40625,0.6875,0.3125
2018_1113_1242,2018,140,1113,1242,0,0,2589,2335,867,⋯,15.88235,9.558824,25.64706,17.0,11.67647,6.617647,4.176471,15.29412,0.7941176,0.2058824


In [63]:
# NOTE(review): this counts test.matchups (before na.omit), not test.matchups.features - confirm intent
test.matchups %>% group_by(Season) %>% tally()

Season,n
2014,2278
2015,2278
2016,2278
2017,2278
2018,2278


In [64]:
# Shape and column names of the tourney feature table
dim(test.matchups.features)
colnames(test.matchups.features)

#### Save datasets for Keras 

In [65]:
# Export the training features as CSV, without the key columns
# (Season, DayNum, TeamID.x, TeamID.y) and without row names.
train.matchups.features %>%
    select(-Season,-DayNum,-TeamID.x,-TeamID.y) %>%
    # FALSE rather than F: F is an ordinary variable that can be reassigned
    write.csv('kerasfiles/train.matchups.csv',row.names=FALSE)

In [66]:
# Export the tourney features as CSV, without the key columns
# (Season, DayNum, TeamID.x, TeamID.y) and without row names.
# ID, Tourney and Target are kept in the output.
test.matchups.features %>%
    select(-Season,-DayNum,-TeamID.x,-TeamID.y) %>%
    # FALSE rather than F: F is an ordinary variable that can be reassigned
    write.csv('kerasfiles/test.matchups.csv',row.names=FALSE)