Skip to content

Commit

Permalink
Allow first census (closes #31).
Browse files Browse the repository at this point in the history
  • Loading branch information
maurolepore committed May 15, 2018
1 parent a17dea7 commit b1dca96
Show file tree
Hide file tree
Showing 13 changed files with 508 additions and 28 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
Expand Up @@ -9,3 +9,5 @@
^man-roxygen$
^tmp\.R$
^inst/useless$
^\.github$
^inst/issues$
2 changes: 2 additions & 0 deletions NEWS.md
@@ -1,5 +1,7 @@
# fgeo.tool 0.0.0.9002

* `xl_sheets_to_csv()` and friends gain the argument `first_census` to now accept first census data.

* Clarify the internal structure of fgeo.tool with function's prefixes.
* `add_*()`
* `str_*()`
Expand Down
94 changes: 72 additions & 22 deletions R/xl_sheets_to_output.R
Expand Up @@ -11,26 +11,39 @@
#' function does:
#' * Reads each spreadsheet from each workbook and map it to a dataframe.
#' * Lowercases and links the names of each dataframe.
#' * Keeps only these dataframes: (1) original_stems; (2) new_secondary_stems;
#' and (3).
#' * Keeps only these dataframes: (1) original_stems, (2) new_secondary_stems,
#' and (3) "recruits".
#' * Dates the data by `submission_id` (`date` comes from the spreadsheet
#' `root`).
#' * Lowercases and links the names of each dataframe-variable.
#' * Drops fake stems.
#' * Outputs a common data structure of your choice.
#'
#'
#' @param input_dir String giving the directory containing the excel workbooks
#' to read from.
#' @param output_dir String giving the directory where to write .csv files to.
#' @param first_census This argument tells these functions what sheets to expect
#' in the input.
#' * Use `TRUE` if this is your first census. The expected input must have
#' sheets (1) "root", (2) "multi_stems", (3) "secondary_stems", and
#' (4) "single_stems".
#' * Use `FALSE` (default) if this is not your first census. The expected
#' input must have sheets (1) "root", (2) "original_stems", (3)
#' "new_secondary_stems", and (4) "recruits".
#'
#' @return Writes one .csv file for each workbook.
#'
#' @author Mauro Lepore and Jessica Shue.
#'
#' @section Acknowledgment:
#' Sabrina Russo helped to make these functions useful with first censuses.
#'
#' @examples
#' library(fs)
#' library(readr)
#' library(readxl)
#'
#' # NOT A FIRST CENSUS
#' # Path to the folder I want to read excel files from
#' input_dir <- dirname(example_path("two_files/new_stem_1.xlsx"))
#' input_dir
Expand All @@ -41,18 +54,33 @@
#' # Path to the folder I want to write .csv files to
#' output_dir <- tempdir()
#'
#' # Do the work
#' # Output a csv file
#' xl_sheets_to_csv(input_dir, output_dir)
#'
#' # Confirm
#' path_file(dir_ls(output_dir, regexp = "new_stem.*csv$"))
#'
#' # Also possible to output excel and a list of dataframes. See next section.
#'
#' # FIRST CENSUS
#' input_dir <- dirname(example_path("first_census/census.xlsx"))
#' # As a reminder, you'll get a warning about missing sheets
#' # Output list of dataframes (one per input workbook -- here only one)
#' xl_sheets_to_df(input_dir, first_census = TRUE)
#'
#' # Output excel
#' xl_sheets_to_xl(input_dir, output_dir, first_census = TRUE)
#' # Read back
#' filename <- path(output_dir, "census.xlsx")
#' out <- read_excel(filename)
#' str(out, give.attr = FALSE)
#' @name xl_sheets_to_output
NULL

xl_sheets_to_file <- function(ext, fun_write) {
function(input_dir, output_dir = "./") {
function(input_dir, output_dir = "./", first_census = FALSE) {
check_output_dir(output_dir = output_dir, print_as = "`output_dir`")
dfs <- xl_sheets_to_df(input_dir = input_dir)
dfs <- xl_sheets_to_df(input_dir = input_dir, first_census = first_census)
files <- fs::path_ext_remove(names(dfs))
paths <- fs::path(output_dir, fs::path_ext_set(files, ext))
purrr::walk2(dfs, paths, fun_write)
Expand All @@ -69,19 +97,30 @@ xl_sheets_to_xl <- xl_sheets_to_file("xlsx", writexl::write_xlsx)

#' @export
#' @rdname xl_sheets_to_output
xl_sheets_to_df <- function(input_dir, first_census = FALSE) {
  # Read every excel workbook in `input_dir` and return a named list with one
  # dataframe-list per workbook, named after the workbook's file name.
  check_input_dir(input_dir = input_dir, print_as = "`input_dir`")
  # `first_census` is forwarded so xl_sheets_to_df_() knows which sheet
  # names to expect in each workbook.
  out <- purrr::map(
    xl_workbooks_to_chr(input_dir),
    xl_sheets_to_df_,
    first_census = first_census
  )
  # Name each element after its source file, dropping the directory part.
  purrr::set_names(out, basename(names(out)))
}

#' Do xl_sheets_to_df() for each excel file.
#' @noRd
xl_sheets_to_df_ <- function(file) {
xl_sheets_to_df_ <- function(file, first_census = FALSE) {
dfm_list <- fgeo.tool::nms_tidy(fgeo.tool::ls_list_spreadsheets(file))

if (first_census) {
key <- c("root", "multi_stems", "secondary_stems", "single_stems")
dfm_list <- ensure_key_sheets(dfm_list, key = key)
} else {
key <- c("original_stems", "new_secondary_stems", "recruits", "root")
dfm_list <- ensure_key_sheets(dfm_list, key = key)
}

# Piping functions to avoid useless intermediate variables
clean_dfm_list <- fgeo.tool::ls_list_spreadsheets(file) %>%
fgeo.tool::nms_tidy() %>%
ensure_key_sheets() %>%
clean_dfm_list <- dfm_list %>%
purrr::keep(~!purrr::is_empty(.)) %>%
lapply(fgeo.tool::nms_tidy) %>%
drop_fake_stems() %>%
Expand All @@ -101,8 +140,7 @@ xl_sheets_to_df_ <- function(file) {

#' Check that key spreadsheets exist.
#' @noRd
ensure_key_sheets <- function(x) {
key <- c("original_stems", "new_secondary_stems", "recruits", "root")
ensure_key_sheets <- function(x, key) {
missing_key_sheet <- !all(key %in% names(x))
if (missing_key_sheet) {
msg <- paste0(
Expand All @@ -128,6 +166,12 @@ drop_fake_stems <- function(.df) {
#' @noRd
warn_if_empty <- function(.x, dfm_nm) {
dfm <- .x[[dfm_nm]]

if (is.null(dfm)) {
warn(paste("`.x` has no dataframe", dfm_nm), ". Is this intentional?")
return(invisible(.x))
}

has_cero_rows <- nrow(dfm) == 0
if (has_cero_rows) {
warn(paste0("`", dfm_nm, "`", " has cero rows."))
Expand All @@ -140,18 +184,24 @@ coerce_as_character <- function(.x, ...) {
}

#' Join all non-root sheets into one dataframe and date it via `root`.
#'
#' @param .x A named list of dataframes, one per spreadsheet; must contain a
#'   `root` element with columns `submission_id` and `date`.
#' @return A single dataframe. For a first census (no non-root sheets) the
#'   `submission_id`/`date` columns of `root`; otherwise the joined sheets
#'   with `unique_stem` and `date` added.
#' @noRd
join_and_date <- function(.x) {
  # Join data from all sheets except from `root`
  is_not_root <- !grepl("root", names(.x))
  not_root_dfm <- purrr::keep(.x, is_not_root)

  # From `root`, pull only `date` (plus a column to merge by)
  date <- .x[["root"]][c("submission_id", "date")]

  # In a first census every sheet other than `root` is dropped upstream, so
  # there is nothing to join `date` with -- return the dated columns as-is.
  first_census <- length(not_root_dfm) == 0
  if (first_census) {
    return(date)
  }

  # Collapse into a single dataframe, add `unique_stem`, and join with date
  not_root_dfm %>%
    fgeo.tool::ls_join_df() %>%
    dplyr::mutate(unique_stem = paste0(.data$tag, "_", .data$stem_tag)) %>%
    dplyr::left_join(date, by = "submission_id")
}

check_input_dir <- function(input_dir, print_as) {
Expand Down
Binary file added inst/extdata/first_census/census.xlsx
Binary file not shown.
24 changes: 24 additions & 0 deletions inst/extdata/first_census/position.csv
@@ -0,0 +1,24 @@
PtID,East,North
0816006,149.7519,311.8259
0816007,153.0656,300.8340
0816008,143.0636,303.4999
0816009,146.8176,301.9608
0916001,169.5130,317.3759
0916002,169.6355,313.8165
0916003,167.6531,310.4807
0916003,167.6606,310.4932
0916004,164.8548,316.7378
0916005,165.3046,311.6328
0916006,165.6578,309.7452
0916007,162.5376,309.1591
0916008,161.7250,305.1514
0916009,163.1855,303.5896
0916010,164.5169,303.0404
0916011,165.7609,301.5969
0916012,178.8555,303.8969
1017001,198.2785,325.3465
1017002,187.3418,327.3521
1017003,182.4314,331.8514
1017004,185.1744,322.1447
0818001,152.7749,358.5041
0818002,155.1088,353.1732
98 changes: 98 additions & 0 deletions inst/issues/31_first_census.Rmd
@@ -0,0 +1,98 @@
---
title: "Process and explore FastField data of a first census"
output: github_document
params:
input_dir: !r dirname(system.file("extdata", "first_census/census.xlsx", package = "fgeo.tool"))
output_dir: !r tempdir()
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(
echo = TRUE, # {mine}
comment = "#>",
collapse = TRUE,
out.width = "95%",
fig.align = "center",
fig.width = 6,
fig.asp = 0.618, # 1 / phi
fig.show = "hold",
rows.print = 3 # {mine}
)
```

```{r}
# https://forestgeo.github.io/fgeo.tool/#installation
library(fgeo.tool)
# Install with `install.packages("pkg")`
library(fs)
library(tidyverse)
```

Change your input and output directory before knitting this file, and re-run every day you come back from the field.

![](https://i.imgur.com/fGOtlVz.png)

![](https://i.imgur.com/C2SbGMb.png)

```{r}
# My input directory
params$input_dir
dir(params$input_dir, "xlsx")
# My output directory (temporary)
params$output_dir
```

```{r}
xl_sheets_to_csv(params$input_dir, params$output_dir, first_census = TRUE)
# Output is one .csv per workbook
dir(params$output_dir, "csv")
```

```{r}
# Now input multiple .csv and output a single dataframe
combo <- csv_to_df(params$output_dir)
combo
```

```{r}
# Read data of tree positions
where_dir <- example_path("first_census/position.csv")
where <- read_csv(where_dir)
# Compare
where
select(combo, quadrat, tag)
```

```{r}
# Create a variable that we can later use to merge the two datasets
combo <- mutate(combo, PtID = paste0(quadrat, tag))
combo2 <- left_join(combo, where)
# Reorganize columns for easier visualization
combo2 <- select(combo2, PtID, East, North, date, everything())
combo2
```

```{r}
# Helpers to avoid duplication
set_brk <- function(max, by) seq(0, max, by)
set_lim <- function(max) c(0, max)
xmax <- 560
ymax <- 360
ggplot(combo2, aes(East, North, color = date)) +
geom_point() +
coord_equal() +
scale_x_continuous(minor_breaks = set_brk(xmax, 20), limits = set_lim(xmax)) +
scale_y_continuous(minor_breaks = set_brk(ymax, 20), limits = set_lim(ymax)) +
theme_bw()
```

__ggplot2__ removes missing values from `East` and `North` but there may be missing values in `date`.

```{r}
filter(combo2, is.na(date))
```

0 comments on commit b1dca96

Please sign in to comment.