/
processing.R
278 lines (257 loc) · 8.89 KB
/
processing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
#' Add extra columns filled with NA values to a dataset.
#'
#' @description Adds extra columns filled with NAs to a dataset.
#' This ensures that all datasets from the covidregionaldata package return
#' datasets of the same underlying structure (i.e. same columns).
#' @param data A data frame
#' @return A tibble with relevant NA columns added
#' @family compulsory_processing
#' @concept compulsory_processing
#' @importFrom dplyr mutate
#' @importFrom rlang !!!
add_extra_na_cols <- function(data) {
expected_col_names <- c(
"cases_new", "cases_total", "deaths_new", "deaths_total", "recovered_new",
"recovered_total", "tested_new", "tested_total", "hosp_new", "hosp_total"
)
new_cols <- rep(list(NA_real_), length(expected_col_names))
names(new_cols) <- expected_col_names
data <- mutate(
data,
!!!new_cols[!(names(new_cols) %in% names(data))]
)
return(data)
}
#' Set negative data to 0
#'
#' @description Set data values to 0 if they are negative in a dataset. Data in
#' the datasets should always be > 0.
#' @param data A data frame
#' @return A data frame with all relevant data > 0.
#' @family optional_processing
#' @concept optional_processing
#' @importFrom dplyr mutate_if
set_negative_values_to_zero <- function(data) {
data <- suppressMessages(
mutate_if(data, is.numeric, ~ replace(., . < 0, 0))
)
return(data)
}
#' Add rows of NAs for dates where a region does not have any data
#'
#' @description There are points, particularly early during data collection,
#' where data was not collected for all regions. This function finds dates
#' which have data for some regions, but not all, and adds rows of NAs for the
#' missing regions. This is mainly for reasons of completeness.
#' @param data A data frame
#' @return A tibble with rows of NAs added.
#' @family compulsory_processing
#' @concept compulsory_processing
#' @importFrom tidyr complete full_seq nesting
#' @importFrom dplyr starts_with group_by ungroup group_vars
#' @importFrom rlang !!! syms
fill_empty_dates_with_na <- function(data) {
regions <- select(data, starts_with("level_")) %>%
names()
groups <- group_vars(data)
data <- data %>%
ungroup() %>%
complete(
date = full_seq(data$date, period = 1),
nesting(!!!syms(regions))
) %>%
group_by(across(.cols = all_of(groups)))
return(data)
}
#' Completes cumulative columns if rows were added with NAs.
#'
#' @description If a dataset had a row of NAs added to it (using
#' fill_empty_dates_with_na) then cumulative data columns will have NAs which
#' can cause issues later. This function fills these values with the previous
#' non-NA value.
#' @param data A data frame
#' @return A data tibble with NAs filled in for cumulative data columns.
#' @family compulsory_processing
#' @concept compulsory_processing
#' @importFrom tidyr fill
#' @importFrom dplyr all_of
complete_cumulative_columns <- function(data) {
cumulative_col_names <- c(
"deaths_total", "cases_total", "recovered_total",
"hosp_total", "tested_total"
)
idx <- cumulative_col_names %in% colnames(data)
targets <- cumulative_col_names[idx]
data <- fill(data, all_of(targets))
return(data)
}
#' Cumulative counts from daily counts or daily counts from cumulative,
#' dependent on which columns already exist
#'
#' @description Checks which columns are missing (cumulative/daily counts) and
#' if one is present and the other not then calculates the second from the
#' first.
#' @param data A data frame
#' @return A data frame with extra columns if required
#' @family compulsory_processing
#' @concept compulsory_processing
#' @importFrom dplyr mutate group_by_at arrange vars starts_with lag ends_with
#' @importFrom purrr walk2
#' @importFrom tidyr replace_na
#' @importFrom rlang !! :=
calculate_columns_from_existing_data <- function(data) {
possible_counts <- c("cases", "deaths", "hosp", "recovered", "tested")
count_today_name <- paste0(possible_counts, "_new")
cumulative_count_name <- paste0(possible_counts, "_total")
idx_1 <- (count_today_name %in% colnames(data) &
!(cumulative_count_name %in% colnames(data)))
idx_2 <- (!(count_today_name %in% colnames(data)) &
cumulative_count_name %in% colnames(data))
data <- data %>%
group_by_at(vars(ends_with("_region"))) %>%
arrange(date, .by_group = TRUE)
# if new is avaliable but total is not, calculate totals
walk2(
count_today_name[idx_1], cumulative_count_name[idx_1],
~ {
data <<- data %>%
mutate(
!!.y := cumsum(replace_na(!!as.name(.x), 0))
)
}
)
# if total is avaliable but new is not, calculate new
walk2(
count_today_name[idx_2], cumulative_count_name[idx_2],
~ {
data <<- data %>%
fill(!!.y) %>%
mutate(
!!.x :=
(!!as.name(.y)) -
lag(!!as.name(.y), default = 0)
)
}
)
return(data)
}
#' Get totals data given the time series data.
#'
#' @description Get totals data given the time series data.
#' @param data A data table
#' @return A data table, totalled up
#' @importFrom dplyr left_join group_by summarise select arrange
#' @family optional_processing
#' @concept optional_processing
totalise_data <- function(data) {
data <- data %>%
summarise(
cases_total = sum(.data$cases_new, na.rm = TRUE),
deaths_total = sum(.data$deaths_new, na.rm = TRUE),
recovered_total = sum(.data$recovered_new, na.rm = TRUE),
hosp_total = sum(.data$hosp_new, na.rm = TRUE),
tested_total = sum(.data$tested_new, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(-.data$cases_total)
return(data)
}
#' Default processing steps to run
#' @description The default processing steps to which are always run. Runs on
#' clean data
#' @param data A data table
#' @importFrom dplyr do
#' @concept utility
#' @family processing
run_default_processing_fns <- function(data) {
. <- NULL
data <- data %>%
do(calculate_columns_from_existing_data(.)) %>%
add_extra_na_cols()
return(data)
}
#' Optional processing steps to run
#' @description user supplied processing steps which are run after default steps
#' @param data A data table
#' @inheritParams process_internal
#' @concept utility
#' @family processing
run_optional_processing_fns <- function(data, process_fns) {
if (!missing(process_fns)) {
if (!is.null(process_fns)) {
if (!is.na(process_fns)) {
for (i in seq_along(process_fns)) {
data <- process_fns[[i]](data)
}
}
}
}
return(data)
}
#' Internal Shared Regional Dataset Processing
#'
#' @description Internal shared regional data cleaning designed to be called
#' by `process`.
#' @param clean_data The clean data for a class, e.g. `Italy$data$clean`
#' @param level The level of the data, e.g. 'level_1_region'
#' @param group_vars Grouping variables, used to
#' for grouping and to localise names. It is assumed that the first entry
#' indicates the main region variable and the second indicates the geocode for
#' this variable.
#' @param totals Logical, defaults to `FALSE`. If `TRUE``, returns totalled
#' data per region up to today's date. If FALSE, returns the full dataset
#' stratified by date and region.
#' @param localise Logical, defaults to `TRUE`. Should region names be
#' localised.
#' @param verbose Logical, defaults to `TRUE`. Should verbose processing
#' messages and warnings be returned.
#' @param process_fns array, additional functions to be called after default
#' processing steps
#' @concept utility
#' @family processing
#' @importFrom dplyr do group_by_at across ungroup select everything arrange
#' @importFrom dplyr rename all_of
#' @importFrom tidyr drop_na
#' @importFrom rlang !!!
process_internal <- function(clean_data, level, group_vars,
totals = FALSE, localise = TRUE,
verbose = TRUE,
process_fns) {
if (!any(class(clean_data) %in% "data.frame")) {
stop("No regional data found to process")
}
group_vars_standard <- names(group_vars)
dat <- group_by(clean_data, across(.cols = all_of(group_vars_standard)))
dat <- run_default_processing_fns(dat)
dat <- run_optional_processing_fns(dat, process_fns)
if (totals) {
dat <- totalise_data(dat)
dat <- dat %>%
select(all_of(
c(
group_vars_standard, "cases_total", "deaths_total", "recovered_total",
"hosp_total", "tested_total"
)
))
} else {
dat <- dat %>%
drop_na(.data$date) %>%
fill_empty_dates_with_na() %>%
complete_cumulative_columns() %>%
select(
all_of(c(
"date", group_vars_standard, "cases_new", "cases_total",
"deaths_new", "deaths_total", "recovered_new", "recovered_total",
"hosp_new", "hosp_total", "tested_new", "tested_total"
)),
everything()
)
}
dat <- ungroup(dat)
if (localise) {
old <- names(group_vars)
names(old) <- group_vars
dat <- rename(dat, !!!old)
}
return(dat)
}