In [1]:
library(rjson)
library(data.table)
library(dplyr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:data.table’:

    between, first, last


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




### Data repo: **STATSBOMB** data

In [2]:
repo <- '/Users/christian/Desktop/University/Birkbeck MSc Applied Statistics/Football/Data/Statsbomb/open-data-master/data'

### Loading in the **competitions** data

In [3]:
# reading in the data in JSON
competitions <- fromJSON(file = file.path(repo, 'competitions.json', fsep=.Platform$file.sep))

# loading data into data frame
competitions.df <- data.frame(do.call(rbind, competitions), stringsAsFactors = FALSE)

head(competitions.df)

Unnamed: 0_level_0,competition_id,season_id,country_name,competition_name,competition_gender,season_name,match_updated,match_available
Unnamed: 0_level_1,<list>,<list>,<list>,<list>,<list>,<list>,<list>,<list>
1,16,4,Europe,Champions League,male,2018/2019,2020-02-27T12:19:39.458017,2020-02-27T12:19:39.458017
2,16,1,Europe,Champions League,male,2017/2018,2020-06-11T01:24:40.306618,2020-06-11T01:24:40.306618
3,16,2,Europe,Champions League,male,2016/2017,2020-06-10T22:06:56.555602,2020-06-10T22:06:56.555602
4,16,27,Europe,Champions League,male,2015/2016,2020-06-10T20:02:56.222690,2020-06-10T20:02:56.222690
5,16,26,Europe,Champions League,male,2014/2015,2020-06-10T17:04:18.637515,2020-06-10T17:04:18.637515
6,16,25,Europe,Champions League,male,2013/2014,2020-06-11T15:06:38.832473,2020-06-11T15:06:38.832473


### Loading in the **match** data

* `match.files`: First you need to span the tree to get all end files and the paths to those files
    * Here the `recursive=TRUE` argument works wonders

In [4]:
match.files <- list.files(path= file.path(repo, 'matches', fsep='/'), recursive = TRUE, full.names = TRUE)

matches.list <- list()

# iterating through the files
for (i in 1:length(match.files)){
    # creating a temporary variable that holds the raw JSON per file
    # this is a list of dictionaries
    match.temp <- fromJSON(file=match.files[i])
    
    # using lapply (list apply) whereby for each json object, we unlist it (flattening it), and then transpose it
    # we do that for all elements, and must use the function x as lapply expects a function
    matches <- lapply(match.temp, function(x) data.frame(t(unlist(x)), stringsAsFactors = FALSE))
    
    # we now stitch together all single row transposes into a dataframe of matches per JSON file                  
    matches.df <- rbindlist(matches, fill=TRUE)
                      
    # and then we stick this in a list, so it'll be a list of data frames
    matches.list[[i]] <- matches.df
}
                      
# and now we stitch all of those data frames together in a master dataframe
all.matches.df <- data.frame(rbindlist(matches.list, fill=TRUE))

In [5]:
head(all.matches.df)

Unnamed: 0_level_0,match_id,match_date,kick_off,competition.competition_id,competition.country_name,competition.competition_name,season.season_id,season.season_name,home_team.home_team_id,home_team.home_team_name,⋯,stadium.country.name,referee.id,referee.name,referee.country.id,referee.country.name,home_team.managers.dob,away_team.managers.dob,metadata.xy_fidelity_version,home_team.home_team_group,away_team.away_team_group
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,9827,2018-03-01,21:00:00.000,11,Spain,La Liga,1,2017/2018,208,Las Palmas,⋯,Spain,180,Antonio Mateu,214,Spain,,,,,
2,9575,2017-08-20,20:15:00.000,11,Spain,La Liga,1,2017/2018,217,Barcelona,⋯,Spain,210,Daniel Trujillo,112,Italy,,,,,
3,9642,2017-10-14,20:45:00.000,11,Spain,La Liga,1,2017/2018,212,Atlético Madrid,⋯,Spain,180,Antonio Mateu,214,Spain,,,,,
4,9870,2018-04-07,20:45:00.000,11,Spain,La Liga,1,2017/2018,217,Barcelona,⋯,Spain,215,Ricardo De Burgos,112,Italy,,,,,
5,9661,2017-10-28,20:45:00.000,11,Spain,La Liga,1,2017/2018,215,Athletic Bilbao,⋯,Spain,207,Juan Martínez,112,Italy,1966-10-01,,,,
6,9700,2017-12-02,13:00:00.000,11,Spain,La Liga,1,2017/2018,217,Barcelona,⋯,Spain,209,Mario Melero,112,Italy,,,,,


In [7]:
all.matches.df

match_id,match_date,kick_off,competition.competition_id,competition.country_name,competition.competition_name,season.season_id,season.season_name,home_team.home_team_id,home_team.home_team_name,⋯,stadium.country.name,referee.id,referee.name,referee.country.id,referee.country.name,home_team.managers.dob,away_team.managers.dob,metadata.xy_fidelity_version,home_team.home_team_group,away_team.away_team_group
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
9827,2018-03-01,21:00:00.000,11,Spain,La Liga,1,2017/2018,208,Las Palmas,⋯,Spain,180,Antonio Mateu,214,Spain,,,,,
9575,2017-08-20,20:15:00.000,11,Spain,La Liga,1,2017/2018,217,Barcelona,⋯,Spain,210,Daniel Trujillo,112,Italy,,,,,
9642,2017-10-14,20:45:00.000,11,Spain,La Liga,1,2017/2018,212,Atlético Madrid,⋯,Spain,180,Antonio Mateu,214,Spain,,,,,
9870,2018-04-07,20:45:00.000,11,Spain,La Liga,1,2017/2018,217,Barcelona,⋯,Spain,215,Ricardo De Burgos,112,Italy,,,,,
9661,2017-10-28,20:45:00.000,11,Spain,La Liga,1,2017/2018,215,Athletic Bilbao,⋯,Spain,207,Juan Martínez,112,Italy,1966-10-01,,,,
9700,2017-12-02,13:00:00.000,11,Spain,La Liga,1,2017/2018,217,Barcelona,⋯,Spain,209,Mario Melero,112,Italy,,,,,
9742,2018-01-07,16:15:00.000,11,Spain,La Liga,1,2017/2018,217,Barcelona,⋯,Spain,223,Carlos Del Cerro,112,Italy,,1968-11-02,,,
9682,2017-11-18,16:15:00.000,11,Spain,La Liga,1,2017/2018,205,Leganés,⋯,Spain,221,Alberto Undiano,112,Italy,,,,,
9799,2018-02-17,16:15:00.000,11,Spain,La Liga,1,2017/2018,322,Eibar,⋯,Spain,208,Alejandro Hernández,112,Italy,,,,,
9717,2017-12-10,20:45:00.000,11,Spain,La Liga,1,2017/2018,222,Villarreal,⋯,Spain,215,Ricardo De Burgos,112,Italy,,,,,


### Cleaning up **matches**

> Want all columns that don't contain an N/A

> `which` provides index value of a list back when condition is true

> here we're counting cases **per row** where there's an N/A

> only interested in cases where columns have N/A zero times

In [41]:
columns.to.keep <- names(which(unlist(lapply(all.matches.df, function(x) length(which(is.na(x))))) == 0))
                                             
all.matches.clean <- all.matches.df[,columns.to.keep]

### And now transforming some columns from factors to numbers

> can call `str(all.matches.clean)` to learn the data type of the variables

In [38]:
str(all.matches.clean)

'data.frame':	844 obs. of  25 variables:
 $ match_id                    : chr  "9827" "9575" "9642" "9870" ...
 $ match_date                  : chr  "2018-03-01" "2017-08-20" "2017-10-14" "2018-04-07" ...
 $ competition.competition_id  : chr  "11" "11" "11" "11" ...
 $ competition.country_name    : chr  "Spain" "Spain" "Spain" "Spain" ...
 $ competition.competition_name: chr  "La Liga" "La Liga" "La Liga" "La Liga" ...
 $ season.season_id            : chr  "1" "1" "1" "1" ...
 $ season.season_name          : chr  "2017/2018" "2017/2018" "2017/2018" "2017/2018" ...
 $ home_team.home_team_id      : chr  "208" "217" "212" "217" ...
 $ home_team.home_team_name    : chr  "Las Palmas" "Barcelona" "Atlético Madrid" "Barcelona" ...
 $ home_team.home_team_gender  : chr  "male" "male" "male" "male" ...
 $ home_team.country.id        : chr  "214" "214" "214" "214" ...
 $ home_team.country.name      : chr  "Spain" "Spain" "Spain" "Spain" ...
 $ away_team.away_team_id      : chr  "217" "218" "217" 

In [43]:
all.matches.clean$match_week <- as.numeric(all.matches.clean$match_week)
all.matches.clean$home_score <- as.numeric(all.matches.clean$home_score)
all.matches.clean$away_score <- as.numeric(all.matches.clean$away_score)

In [44]:
head(all.matches.clean)

Unnamed: 0_level_0,match_id,match_date,competition.competition_id,competition.country_name,competition.competition_name,season.season_id,season.season_name,home_team.home_team_id,home_team.home_team_name,home_team.home_team_gender,⋯,away_team.country.id,away_team.country.name,home_score,away_score,match_status,last_updated,metadata.data_version,match_week,competition_stage.id,competition_stage.name
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>
1,9827,2018-03-01,11,Spain,La Liga,1,2017/2018,208,Las Palmas,male,⋯,214,Spain,1,1,available,2020-02-25T17:31:26.330,1.0.2,26,1,Regular Season
2,9575,2017-08-20,11,Spain,La Liga,1,2017/2018,217,Barcelona,male,⋯,214,Spain,2,0,available,2020-02-26T13:31:37.949,1.1.0,1,1,Regular Season
3,9642,2017-10-14,11,Spain,La Liga,1,2017/2018,212,Atlético Madrid,male,⋯,214,Spain,1,1,available,2019-12-16T23:09:16.168756,1.0.2,8,1,Regular Season
4,9870,2018-04-07,11,Spain,La Liga,1,2017/2018,217,Barcelona,male,⋯,214,Spain,3,1,available,2019-12-16T23:09:16.168756,1.0.2,31,1,Regular Season
5,9661,2017-10-28,11,Spain,La Liga,1,2017/2018,215,Athletic Bilbao,male,⋯,214,Spain,0,2,available,2019-12-16T23:09:16.168756,1.0.2,10,1,Regular Season
6,9700,2017-12-02,11,Spain,La Liga,1,2017/2018,217,Barcelona,male,⋯,214,Spain,2,2,available,2019-12-16T23:09:16.168756,1.0.2,14,1,Regular Season


### Loading in the **events** data

> There are 855 event files

In [47]:
event.files <- list.files(path= file.path(repo, 'events', fsep='/'), full.names = TRUE, recursive = TRUE)

In [167]:
event.list <- list()
l <- length(event.files)

# looping through all of the event files...
## takes a while
for (i in 1:l){
    event.temp <- fromJSON(file = event.files[i])
    
    team.id.clist <- c()
    
    starting.x11.index <- which(unlist(lapply(event.temp, function(x) x$type$name)) == 'Starting XI')
    starting.x11.list <- list()
    
    # looping through the two indices for the two teams, populating the above list with two dataframes, one per team
    for (s in starting.x11.index){

        starting.x11.team1 <- data.frame(matrix(t(unlist(event.temp[[s]]$tactics$lineup)), ncol=5, byrow=TRUE), stringsAsFactors = FALSE)
        
        # unlisting the event.temp element, getting the names of the tactics$lineup
        colnames(starting.x11.team1) <- names(unlist(event.temp[[s]]$tactics$lineup))[1:5]
                                              
        # adding three extra columns, one for formation, team_id, team_name
        starting.x11.team1$formation <- event.temp[[s]]$tactics$formation
        starting.x11.team1$team_id <- event.temp[[s]]$team$id
        starting.x11.team1$team_name <- event.temp[[s]]$team$name
        
        # update our clist of teamId's
        team.id.clist <- c(team.id.clist, event.temp[[s]]$team$id)
        
        # appending the starting.x11 data to the starting x11 list
        ## this will produce a list of two dataframes, one per team
        starting.x11.list[[s]] <- starting.x11.team1      
    }
                                              
    # now looking at passes
    pass.index <- which(unlist(lapply(event.temp, function(x) x$type$name)) == 'Pass')
    
    # and now filtering to get the pass indices for team 1, using the team 1 identifier from team.id.clist[1]
    pass.team1 <- pass.index[which(unlist(lapply(pass.index, function(x) event.temp[[x]]$team$id)) == team.id.clist[1])]
                                                 
    pass.team1.df <- data.frame(matrix(NA, nrow=1, ncol=13))
    colnames(pass.team1.df) <- c('Possession','Passer','X.Pass','Y.Pass','Pass.Type','Receiver','X.Receive','Y.Receive',
                                 'Pass.Length','Pass.Angle','Body.Part','Pass.Pressure','Pass.Outcome')
                                                 
    for (p in 1:length(pass.team1)){
        pass.temp <- event.temp[[pass.team1[p]]]
        possession <- pass.temp$possession
        passer <- pass.temp$player$id
        pass.location <- pass.temp$location
        pass.type <- pass.temp$pass$height$name
        receiver <- pass.temp$pass$recipient$id
        receive.location <- pass.temp$pass$end_location
        pass.length <- pass.temp$pass$length
        pass.angle <- pass.temp$pass$angle
        body.part <- pass.temp$pass$body_part$name
        pass.pressure <- pass.temp$under_pressure
        pass.outcome <- pass.temp$pass$outcome$name
        
        row.toadd <- c(possession, passer, pass.location, pass.type, receiver, receive.location
                      ,pass.length, pass.angle, body.part, pass.pressure, pass.outcome)
        
        pass.team1.df <- rbind(pass.team1.df, row.toadd)
        
    }
    
    # getting rid of the first empty row (this is minging)
    pass.team1.df <- pass.team1.df[-1,]
    pass.team1.df[,c(1:4,6:10)] <- lapply(pass.team1.df[,c(1:4,6:10)], as.numeric)
                                                 
    # this is basically a row number (partition by...)
    # providing 
    pass.team1.df <- pass.team1.df %>% group_by(Possession) %>% mutate(seq = row_number())
    pass.team1.df$team_id <- team.id.clist[1]
                                                 
                                                 
                                                 
    ## AND NOW TO DO THE EXACT SAME THING FOR TEAM 2!

    # and now filtering to get the pass indices for team 2, using the team 2 identifier from team.id.clist[2]
    pass.team2 <- pass.index[which(unlist(lapply(pass.index, function(x) event.temp[[x]]$team$id)) == team.id.clist[2])]
                                                 
    pass.team2.df <- data.frame(matrix(NA, nrow=1, ncol=13))
    colnames(pass.team2.df) <- c('Possession','Passer','X.Pass','Y.Pass','Pass.Type','Receiver','X.Receive','Y.Receive',
                                 'Pass.Length','Pass.Angle','Body.Part','Pass.Pressure','Pass.Outcome')
                                                 
    for (p in 1:length(pass.team2)){
        pass.temp <- event.temp[[pass.team2[p]]]
        possession <- pass.temp$possession
        passer <- pass.temp$player$id
        pass.location <- pass.temp$location
        pass.type <- pass.temp$pass$height$name
        receiver <- pass.temp$pass$recipient$id
        receive.location <- pass.temp$pass$end_location
        pass.length <- pass.temp$pass$length
        pass.angle <- pass.temp$pass$angle
        body.part <- pass.temp$pass$body_part$name
        pass.pressure <- pass.temp$under_pressure
        pass.outcome <- pass.temp$pass$outcome$name
        
        row.toadd <- c(possession, passer, pass.location, pass.type, receiver, receive.location
                      ,pass.length, pass.angle, body.part, pass.pressure, pass.outcome)
        
        pass.team2.df <- rbind(pass.team2.df, row.toadd)
        
    }
    
    # getting rid of the first empty row (this is minging)
    pass.team2.df <- pass.team2.df[-1,]
    pass.team2.df[,c(1:4,6:10)] <- lapply(pass.team2.df[,c(1:4,6:10)], as.numeric)
                                                 
    # this is basically a row number (partition by...)
    # providing 
    pass.team2.df <- pass.team2.df %>% group_by(Possession) %>% mutate(seq = row_number())
    pass.team2.df$team_id <- team.id.clist[2]
                                                 
    
    ## AND NOW PUTTING IT ALL TOGETHER
    pass.list <- list(pass.team1.df, pass.team2.df)
                                                 
    match.id <- strsplit(basename(event.files[i]), '[.]')[[1]][1]
                                                 
    event.list[[match.id]] <- list(starting.x11.list, pass.list)
                                                 
}

“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduced by coercion”
“NAs introduce

In [166]:
event.list[[1]]

player.id,player.name,position.id,position.name,jersey_number,formation,team_id,team_name
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>
20055,Marc-André ter Stegen,1,Goalkeeper,1,442,217,Barcelona
6374,Nélson Cabral Semedo,2,Right Back,2,442,217,Barcelona
5213,Gerard Piqué Bernabéu,3,Right Center Back,3,442,217,Barcelona
5492,Samuel Yves Umtiti,5,Left Center Back,23,442,217,Barcelona
5211,Jordi Alba Ramos,6,Left Back,18,442,217,Barcelona
6379,Sergi Roberto Carnicer,12,Right Midfield,20,442,217,Barcelona
5203,Sergio Busquets i Burgos,13,Right Center Midfield,5,442,217,Barcelona
5470,Ivan Rakitić,15,Left Center Midfield,4,442,217,Barcelona
5477,Ousmane Dembélé,16,Left Midfield,11,442,217,Barcelona
5246,Luis Alberto Suárez Díaz,22,Right Center Forward,9,442,217,Barcelona

player.id,player.name,position.id,position.name,jersey_number,formation,team_id,team_name
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>
6629,Fernando Pacheco Flores,1,Goalkeeper,1,4141,206,Deportivo Alavés
6618,Martín Aguirregabiria Padilla,2,Right Back,21,4141,206,Deportivo Alavés
6615,Víctor Laguardia Cisneros,3,Right Center Back,5,4141,206,Deportivo Alavés
6855,Guillermo Alfonso Maripán Loaysa,5,Left Center Back,6,4141,206,Deportivo Alavés
6612,Rubén Duarte Sánchez,6,Left Back,3,4141,206,Deportivo Alavés
6839,Daniel Alejandro Torres Rojas,10,Center Defensive Midfield,16,4141,206,Deportivo Alavés
6617,Ibai Gómez Pérez,12,Right Midfield,11,4141,206,Deportivo Alavés
6626,Mubarak Wakaso,13,Right Center Midfield,22,4141,206,Deportivo Alavés
6632,Manuel Alejandro García Sánchez,15,Left Center Midfield,19,4141,206,Deportivo Alavés
6581,Jonathan Rodríguez Menéndez,16,Left Midfield,23,4141,206,Deportivo Alavés

Possession,Passer,X.Pass,Y.Pass,Pass.Type,Receiver,X.Receive,Y.Receive,Pass.Length,Pass.Angle,Body.Part,Pass.Pressure,Pass.Outcome,seq,team_id
<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<int>,<dbl>
3,5203,35,8,High Pass,5470,35,21,13.000000,1.5707964,Head,3,5203,1,217
3,5470,36,21,Low Pass,5477,36,2,19.000000,-1.5707964,Head,3,5470,2,217
3,5477,34,3,Ground Pass,5211,26,2,8.062258,-3.0172377,Right Foot,TRUE,3,3,217
3,5211,25,2,High Pass,5246,94,20,71.309190,0.2551824,Left Foot,Incomplete,3,4,217
3,20055,14,28,Ground Pass,5213,23,44,18.357560,1.0584068,Keeper Arm,3,20055,5,217
3,5213,35,54,Ground Pass,6374,44,72,20.124610,1.1071488,Right Foot,3,5213,6,217
3,6374,50,72,Ground Pass,6379,57,63,11.401754,-0.9097531,Right Foot,3,6374,7,217
3,6379,56,63,Ground Pass,6374,47,69,10.816654,2.5535900,Right Foot,3,6379,8,217
3,6374,48,69,Ground Pass,5492,39,34,36.138622,-1.8224863,Right Foot,3,6374,9,217
3,5492,43,25,Ground Pass,5211,55,2,25.942244,-1.0899091,Right Foot,3,5492,10,217

Possession,Passer,X.Pass,Y.Pass,Pass.Type,Receiver,X.Receive,Y.Receive,Pass.Length,Pass.Angle,Body.Part,Pass.Pressure,Pass.Outcome,seq,team_id
<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<int>,<dbl>
2,6581,61,41,Ground Pass,6855,36,26.000000,29.1547600,-2.6011732,Left Foot,2,6581,1,206
2,6855,36,29,High Pass,6613,86,73.000000,66.6033000,0.7216548,Right Foot,Incomplete,2,2,206
3,6615,36,69,High Pass,6613,108,55.000000,73.3484800,-0.1920480,Right Foot,Incomplete,3,1,206
4,6855,22,14,Ground Pass,6612,22,11.000000,3.0000000,-1.5707964,Left Foot,4,6855,1,206
4,6612,2,4,Low Pass,6581,25,3.000000,23.0217290,-0.0434509,Left Foot,TRUE,4,2,206
6,6617,51,61,Ground Pass,6626,54,49.000000,12.3693170,-1.3258177,Right Foot,6,6617,1,206
6,6626,55,50,Ground Pass,6617,83,73.000000,36.2353400,0.6876712,Left Foot,TRUE,Incomplete,2,206
8,6632,36,17,Ground Pass,6613,47,20.000000,11.4017540,0.2662520,Left Foot,8,6632,1,206
8,6613,46,19,Ground Pass,6581,53,9.000000,12.2065550,-0.9600704,Right Foot,8,6613,2,206
8,6581,103,12,Ground Pass,6613,115,40.000000,30.4630930,1.1659045,Left Foot,Incomplete,8,3,206
