/
utils.R
286 lines (256 loc) · 11.7 KB
/
utils.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
#' Add extra columns filled with NA values to a dataset.
#' @description Adds extra columns filled with NAs to a dataset. This ensures that all datasets from the covidregionaldata package return datasets
#' of the same underlying structure (i.e. same columns).
#' @param data a data table
#' @return a tibble with relevant NA columns added
#' @importFrom tibble tibble
add_extra_na_cols <- function(data) {
expected_col_names <- c(
"cases_new", "cases_total", "deaths_new", "deaths_total",
"recovered_new", "recovered_total", "tested_new", "tested_total", "hosp_new",
"hosp_total"
)
for (colname in expected_col_names) {
if (!(colname %in% colnames(data))) {
original_col_names <- colnames(data)
data$newCol <- rep(NA_integer_, dim(data)[1])
colnames(data) <- c(original_col_names, colname)
}
}
return(tibble::tibble(data))
}
#' Set negative data to 0
#' @description Set data values to 0 if they are negative in a dataset. Data in the datasets should always be > 0.
#' @param data a data table
#' @return a tibble with all relevant data > 0.
#' @importFrom dplyr mutate
#' @importFrom tibble tibble
set_negative_values_to_zero <- function(data) {
numeric_col_names <- c(
"deaths_total", "cases_total", "recovered_total", "hosp_total", "tested_total",
"cases_new", "deaths_new", "recovered_new", "hosp_new", "tested_new"
)
for (numeric_col_name in numeric_col_names) {
if (numeric_col_name %in% colnames(data)) {
data[which(data[, numeric_col_name] < 0), numeric_col_name] <- 0
}
}
return(tibble::tibble(data))
}
#' Add rows of NAs for dates where a region does not have any data
#' @description There are points, particularly early during data collection, where data was not collected for all regions.
#' This function finds dates which have data for some regions, but not all, and adds rows of NAs for the missing regions.
#' This is mainly for reasons of completeness.
#' @param data a data table
#' @return a tibble with rows of NAs added.
#' @importFrom tibble tibble
#' @importFrom tidyr complete full_seq
fill_empty_dates_with_na <- function(data) {
if ("region_level_2" %in% colnames(data)) {
data <- data %>%
tidyr::complete(date = tidyr::full_seq(data$date, period = 1), tidyr::nesting(
region_level_2, level_2_region_code,
region_level_1, level_1_region_code
))
} else {
data <- data %>%
tidyr::complete(date = tidyr::full_seq(data$date, period = 1), tidyr::nesting(region_level_1, level_1_region_code))
}
return(tibble::tibble(data))
}
#' Completes cumulative columns if rows were added with NAs.
#' @description If a dataset had a row of NAs added to it (using fill_empty_dates_with_na) then cumulative data columns will have NAs which can cause
#' issues later. This function fills these values with the previous non-NA value.
#' @param data a data table
#' @return a tibble with NAs filled in for cumulative data columns.
#' @importFrom dplyr group_by
#' @importFrom tidyr fill
complete_cumulative_columns <- function(data) {
cumulative_col_names <- c("deaths_total", "cases_total", "recovered_total", "hosp_total", "tested_total")
for (cumulative_col_name in cumulative_col_names) {
if (cumulative_col_name %in% colnames(data)) {
if ("region_level_2" %in% colnames(data)) {
data <- data %>%
dplyr::group_by(region_level_1, level_1_region_code, region_level_2, level_2_region_code) %>%
tidyr::fill(all_of(cumulative_col_name)) %>%
dplyr::ungroup()
} else {
data <- data %>%
dplyr::group_by(region_level_1, level_1_region_code, ) %>%
tidyr::fill(all_of(cumulative_col_name)) %>%
dplyr::ungroup()
}
}
}
return(tibble::tibble(data))
}
#' Cumulative counts from daily counts or daily counts from cumulative, dependent on which columns already exist
#' @description Checks which columns are missing (cumulative/daily counts) and if one is present and the other not
#' then calculates the second from the first.
#' @param data A data frame
#' @return A data frame with extra columns if required
#' @importFrom dplyr mutate group_by_at arrange vars starts_with lag
#' @importFrom tidyr replace_na
#' @importFrom tibble tibble
calculate_columns_from_existing_data <- function(data) {
possible_counts <- c("cases", "deaths", "hosp", "recovered", "tested")
for (count in possible_counts) {
count_today_name <- paste0(count, "_new")
cumulative_count_name <- paste0(count, "_total")
if (count_today_name %in% colnames(data) & !(cumulative_count_name %in% colnames(data))) {
# in this case the daily count is there but there are no cumulative counts
data <- data %>%
dplyr::group_by_at(dplyr::vars(dplyr::starts_with("region_level"))) %>%
dplyr::arrange(date, .by_group = TRUE) %>%
dplyr::mutate(!!cumulative_count_name := cumsum(tidyr::replace_na(!!as.name(count_today_name), 0)))
} else if (!(count_today_name %in% colnames(data)) & cumulative_count_name %in% colnames(data)) {
# in this case the cumulative counts are there but no daily counts
data <- data %>%
dplyr::group_by_at(dplyr::vars(dplyr::starts_with("region_level"))) %>%
dplyr::arrange(date, .by_group = TRUE) %>%
tidyr::fill(!!cumulative_count_name) %>% # Fill LOCF for cumulative data
dplyr::mutate(!!count_today_name := (!!as.name(cumulative_count_name)) - dplyr::lag(!!as.name(cumulative_count_name), default = 0))
}
}
return(tibble::tibble(data))
}
#' Custom CSV reading function
#' @description Checks for use of memoise and then uses whichever read_csv function is needed by user
#' @param file A URL or filepath to a CSV
#' @param ... extra parameters to be passed to vroom::vroom
#' @return A data table
#' @importFrom memoise memoise cache_filesystem
#' @importFrom vroom vroom
#' @importFrom tibble tibble
csv_reader <- function(file, ...) {
read_csv_fun <- vroom::vroom
if (!is.null(getOption("useMemoise"))) {
if (getOption("useMemoise")) {
# Set up cache
ch <- memoise::cache_filesystem(".cache")
read_csv_fun <- memoise::memoise(vroom::vroom, cache = ch)
}
}
data <- read_csv_fun(file, ..., guess_max = 500)
return(tibble::tibble(data))
}
#' Custom CSV reading function
#' @description Checks for use of memoise and then uses readr::read_csv, which appears
#' more robust in loading some streams
#' @param file A URL or filepath to a CSV
#' @param ... extra parameters to be passed to readr::read_csv
#' @return A data table
#' @importFrom memoise memoise cache_filesystem
#' @importFrom tibble tibble
#' @importFrom readr read_csv
csv_readr <- function(file, ...) {
read_csv_fun <- readr::read_csv
if (!is.null(getOption("useMemoise"))) {
if (getOption("useMemoise")) {
# Set up cache
ch <- memoise::cache_filesystem(".cache")
read_csv_fun <- memoise::memoise(readr::read_csv, cache = ch)
}
}
data <- read_csv_fun(file, ...)
return(tibble::tibble(data))
}
#' Custom left_join function
#' @description Checks if table that is being added is NULL and then uses left_join
#' @param data a data table
#' @param region_codes_table a table of region codes which will be left_join (optionally NULL)
#' @param by see dplyr::left_join() description of by parameter
#' @param ... optional arguments passed into dplyr::left_join()
#' @return A data table
#' @importFrom dplyr left_join
#' @importFrom tibble tibble
#'
left_join_region_codes <- function(data, region_codes_table, by = NULL, ...) {
if (is.null(region_codes_table)) {
return(data)
}
data <- dplyr::left_join(data, region_codes_table, by = by, ...)
return(tibble::tibble(data))
}
#' Get totals data given the time series data.
#'
#' @description Get totals data given the time series data.
#' @param data a data table
#' @param include_level_2_regions Boolean. Are level 2 regions included in the data
#' @return A data table, totalled up
#' @importFrom dplyr left_join group_by %>% summarise select arrange
#' @importFrom tibble tibble
#'
totalise_data <- function(data, include_level_2_regions) {
# Group the data ------------------------------------------------------
if (include_level_2_regions) {
data <- data %>%
dplyr::group_by(region_level_1, level_1_region_code, region_level_2, level_2_region_code)
} else {
data <- data %>%
dplyr::group_by(region_level_1, level_1_region_code)
}
# Total the data ------------------------------------------------------
data <- data %>%
dplyr::summarise(
cases_total = sum(cases_new, na.rm = TRUE),
deaths_total = sum(deaths_new, na.rm = TRUE),
recovered_total = sum(recovered_new, na.rm = TRUE),
hosp_total = sum(hosp_new, na.rm = TRUE),
tested_total = sum(tested_new, na.rm = TRUE)
) %>%
dplyr::ungroup()
# Select correct data -------------------------------------------------
if (include_level_2_regions) {
data <- data %>%
dplyr::select(
region_level_2, level_2_region_code,
region_level_1, level_1_region_code, cases_total, deaths_total,
recovered_total, hosp_total, tested_total
)
} else {
data <- data %>%
dplyr::select(
region_level_1, level_1_region_code, cases_total, deaths_total,
recovered_total, hosp_total, tested_total
)
}
return(tibble::tibble(data))
}
# Global variables --------------------------------------------------------
utils::globalVariables(c(
".", ":=", "AnzahlFall", "Area type", "Specimen date", "casos",
"AnzahlTodesfall", "Area", "Bundesland", "CASES", "Cases",
"Code", "Country", "DATE", "DEATHS", "Date", "Deaths", "Landkreis", "Meldedatum",
"NEW_IN", "PROVINCE", "Province", "Province_State", "REGION", "Recoveries", "Specimen",
"date", "Status", "TT", "TotalCases", "all_of", "cases", "cases_new", "cases_total",
"casos_confirmados", "casos_fallecido", "city", "countriesAndTerritories",
"country", "county", "data", "data_type", "dateRep", "deaths", "deaths_new",
"deaths_total", "deceduti", "denominazione_regione", "departamento", "fecha", "fips", "geoId",
"hosp_new", "hosp_total", "iso_code", "level_1_region_code", "level_2_region_code",
"location", "location_code", "n", "n.x", "n.y", "newCases", "newDeaths", "numdeaths",
"numrecover", "numtested", "numtoday", "numtotal", "popData2019", "population_2019",
"prname", "province", "pruebas", "pruid", "recovered_new", "recovered_total", "region_level_1",
"region_level_2", "state", "state_name", "tamponi", "tested_new",
"Area", "type", "ID", "Specimen", "date", "date_admission_hospital", "date_confirm",
"date_confirmation", "date_death_or_discharge", "date_onset",
"date_onset_symptoms", "days_onset_to_report", "id", "read.csv", "tested_total",
"totalCases", "totale_casi", "un_region", "untar", "value", "who_region",
"areaCode", "areaName", "cumAdmissions", "cumCasesByPublishDate",
"cumCasesBySpecimenDate", "cumDeaths28DaysByPublishDate",
"cumTestsByPublishDate", "data_list", "newAdmissions", "newCasesByPublishDate",
"newCasesBySpecimenDate", "newDeaths28DaysByPublishDate",
"newTestsByPublishDate", "setTxtProgressBar", "txtProgressBar",
"level_2_region_code.x", "level_2_region_code.y", "cumDeaths28DaysByDeathDate", "newDeaths28DaysByDeathDate",
"outcome", "death", "delay_onset_report", "delay_onset_admission", "delay_onset_death",
"hosp_new_first_admissions",
"iso_3166_2", "min_date", "max_date", "iso_na",
"cases_new_na", "cases_total_na", "deaths_new_na", "deaths_total_na",
"ENTRY_DATE", "DATE_IMPLEMENTED",
"cases_weekly", "deaths_weekly",
"fecha_confirmacion", "provincia", "get_data_function",
"col_character", "YYYYMMDD", "total",
"Subdivision name", "cl_age90", "jour", "reg", "P", "incid_hosp", "incid_dc", "dep",
"In region(since 2016)", "Subdivision category", "Subdivision name (fr)",
"poblacion", "nombre", "cve_ent", "inegi_state"
))