-
Notifications
You must be signed in to change notification settings - Fork 6
/
xl_sheets_to_output.R
239 lines (216 loc) · 7.48 KB
/
xl_sheets_to_output.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#' Combine spreadsheets from excel workbooks and output common data structures.
#'
#' These functions combine spreadsheets from excel workbooks into common data
#' structures. `xl_sheets_to_csv()` and `xl_sheets_to_xl()` write a .csv or
#' excel (.xlsx) file per workbook -- combining all spreadsheets.
#' `xl_sheets_to_df` outputs a list where each dataframes combines all
#' spreadsheeets of a workbook.
#'
#' This is a rigid function with a very specific goal: To process data from a
#' specific sampling software -- FastField. Specifically, this is what this
#' function does:
#' * Reads each spreadsheet from each workbook and map it to a dataframe.
#' * Lowercases and links the names of each dataframe.
#' * Keeps only these dataframes: (1) original_stems, (2) new_secondary_stems,
#' and (3) "recruits".
#' * Dates the data by `submission_id` (`date` comes from the spreadsheet
#' `root`).
#' * Lowercases and links the names of each dataframe-variable.
#' * Drops fake stems.
#' * Output a common data structure of your choice.
#'
#' @param input_dir String giving the directory containing the excel workbooks
#' to read from.
#' @param output_dir String giving the directory where to write .csv files to.
#' @param first_census This argument tells these functions what sheets to expect
#' in the input.
#' * Use `TRUE` if this is your first census. The expected input must have
#' sheets (1) "root", (2) "multi_stems", (3) "secondary_stems", and
#' (4) "single_stems".
#' * Use `FALSE` (default) if this is not your first census. The expected
#' input must have sheets (1) "root", (2) "original_stems", (3)
#' "new_secondary_stems", and (4) "recruits".
#'
#' @return Writes one .csv file for each workbook.
#'
#' @author Mauro Lepore and Jessica Shue.
#'
#' @section Acknowledgment:
#' Sabrina Russo helped to make these functions useful with first censuses.
#'
#' @examples
#' library(fs)
#' library(readr)
#' library(readxl)
#'
#' # NOT A FIRST CENSUS
#' # Path to the folder I want to read excel files from
#' input_dir <- dirname(example_path("two_files/new_stem_1.xlsx"))
#' input_dir
#'
#' # Files I want to read
#' dir(input_dir, pattern = "xlsx")
#'
#' # Path to the folder I want to write .csv files to
#' output_dir <- tempdir()
#'
#' # Output a csv file
#' xl_sheets_to_csv(input_dir, output_dir)
#'
#' # Confirm
#' path_file(dir_ls(output_dir, regexp = "new_stem.*csv$"))
#'
#' # Also possible to output excel and a list of dataframe. See next section.
#'
#' # FIRST CENSUS
#' input_dir <- dirname(example_path("first_census/census.xlsx"))
#' # As a reminder you'll get a warning of missing sheets
#' # Output list of dataframes (one per input workbook -- here only one)
#' xl_sheets_to_df(input_dir, first_census = TRUE)
#'
#' # Output excel
#' xl_sheets_to_xl(input_dir, output_dir, first_census = TRUE)
#' # Read back
#' filename <- path(output_dir, "census.xlsx")
#' out <- read_excel(filename)
#' str(out, give.attr = FALSE)
#' @name xl_sheets_to_output
NULL
xl_sheets_to_file <- function(ext, fun_write) {
function(input_dir, output_dir = "./", first_census = FALSE) {
check_output_dir(output_dir = output_dir, print_as = "`output_dir`")
dfs <- xl_sheets_to_df(input_dir = input_dir, first_census = first_census)
files <- fs::path_ext_remove(names(dfs))
paths <- fs::path(output_dir, fs::path_ext_set(files, ext))
purrr::walk2(dfs, paths, fun_write)
}
}
#' @export
#' @rdname xl_sheets_to_output
xl_sheets_to_csv <- xl_sheets_to_file("csv", readr::write_csv)
#' @export
#' @rdname xl_sheets_to_output
xl_sheets_to_xl <- xl_sheets_to_file("xlsx", writexl::write_xlsx)
#' @export
#' @rdname xl_sheets_to_output
xl_sheets_to_df <- function(input_dir, first_census = FALSE) {
check_input_dir(input_dir = input_dir, print_as = "`input_dir`")
out <- purrr::map(
xl_workbooks_to_chr(input_dir),
xl_sheets_to_df_, first_census = first_census
)
purrr::set_names(out, basename(names(out)))
}
#' Do xl_sheets_to_df() for each excel file.
#' @noRd
xl_sheets_to_df_ <- function(file, first_census = FALSE) {
dfm_list <- fgeo.tool::nms_tidy(fgeo.tool::ls_list_spreadsheets(file))
if (first_census) {
key <- c("root", "multi_stems", "secondary_stems", "single_stems")
dfm_list <- ensure_key_sheets(dfm_list, key = key)
} else {
key <- c("original_stems", "new_secondary_stems", "recruits", "root")
dfm_list <- ensure_key_sheets(dfm_list, key = key)
}
# Piping functions to avoid useless intermediate variables
clean_dfm_list <- dfm_list %>%
purrr::keep(~!purrr::is_empty(.)) %>%
lapply(fgeo.tool::nms_tidy) %>%
drop_fake_stems() %>%
fgeo.tool::ls_name_df(name = "sheet") %>%
warn_if_empty("new_secondary_stems") %>%
warn_if_empty("recruits") %>%
# Avoid merge errors
coerce_as_character()
with_date <- join_and_date(clean_dfm_list)
# In columns matching "codes", replace commas by semicolon
.df <- purrr::modify_if(
with_date, grepl("codes", names(with_date)), ~gsub(",", ";", .x)
)
.df
}
#' Check that key spreadsheets exist.
#' @noRd
ensure_key_sheets <- function(x, key) {
missing_key_sheet <- !all(key %in% names(x))
if (missing_key_sheet) {
msg <- paste0(
"Data should contain these sheets:\n", commas(key), "\n",
"* Missing sheets: ", commas(setdiff(key, names(x)))
)
abort(msg)
}
x[intersect(key, names(x))]
}
#' Remove rows equal to cero from the spreadsheet sheet new_secondary_stem.
#' @noRd
drop_fake_stems <- function(.df) {
dropped <- purrr::modify_at(
.df, .at = "new_secondary_stems", ~.x[.x$new_stem != 0, ]
)
dropped
}
#' Warns if a dataframe in a list of dataframes has empty rows.
#' @noRd
warn_if_empty <- function(.x, dfm_nm) {
dfm <- .x[[dfm_nm]]
if (is.null(dfm)) {
warn(paste("`.x` has no dataframe", dfm_nm), ". Is this intentional?")
return(invisible(.x))
}
has_cero_rows <- nrow(dfm) == 0
if (has_cero_rows) {
warn(paste0("`", dfm_nm, "`", " has cero rows."))
}
invisible(.x)
}
coerce_as_character <- function(.x, ...) {
purrr::map(.x, ~purrr::modify(., .f = as.character, ...))
}
join_and_date <- function(.x) {
# From `root`, pull only `date` (plus a column to merge by)
date <- .x[["root"]][c("submission_id", "date")]
# Join data from all sheets except from `root`
is_not_root <- !grepl("root", names(.x))
not_root_dfm <- purrr::keep(.x, is_not_root)
# Nothing to join date with
first_census <- length(not_root_dfm) == 0
if (first_census) {
return(date)
}
# Collapse into a single dataframe, add variable, and join with date
not_root_dfm %>%
fgeo.tool::ls_join_df() %>%
dplyr::mutate(unique_stem = paste0(.data$tag, "_", .data$stem_tag)) %>%
dplyr::left_join(date, by = "submission_id")
}
check_input_dir <- function(input_dir, print_as) {
stopifnot(is.character(input_dir))
validate_dir(input_dir, "`input_dir`")
msg <- "`input_dir` must contain at least one excel file."
file_names <- xl_workbooks_to_chr(input_dir)
if (length(file_names) == 0) {
abort(msg)
}
invisible()
}
check_output_dir <- function(output_dir, print_as) {
stopifnot(is.character(output_dir))
validate_dir(output_dir, "`output_dir`")
invisible()
}
validate_dir <- function(dir, dir_name) {
invalid_dir <- !fs::dir_exists(dir)
if (invalid_dir) {
msg <- paste0(
dir_name, " must match a valid directory.\n",
"bad ", dir_name, ": ", "'", dir, "'"
)
abort(msg)
} else {
invisible(dir)
}
}
xl_workbooks_to_chr <- function(input_dir) {
fs::dir_ls(input_dir, regexp = "\\.xls")
}