Skip to content

Commit

Permalink
Allow first census (closes #31).
Browse files Browse the repository at this point in the history
  • Loading branch information
maurolepore committed May 15, 2018
1 parent a17dea7 commit b1dca96
Show file tree
Hide file tree
Showing 13 changed files with 508 additions and 28 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
Expand Up @@ -9,3 +9,5 @@
^man-roxygen$
^tmp\.R$
^inst/useless$
^\.github$
^inst/issues$
2 changes: 2 additions & 0 deletions NEWS.md
@@ -1,5 +1,7 @@
# fgeo.tool 0.0.0.9002

* `xl_sheets_to_csv()` and friends gain the argument `first_census` to now accept first census data.

* Clarify the internal structure of fgeo.tool with function's prefixes.
* `add_*()`
* `str_*()`
Expand Down
94 changes: 72 additions & 22 deletions R/xl_sheets_to_output.R
Expand Up @@ -11,26 +11,39 @@
#' function does:
#' * Reads each spreadsheet from each workbook and map it to a dataframe.
#' * Lowercases and links the names of each dataframe.
#' * Keeps only these dataframes: (1) original_stems; (2) new_secondary_stems;
#' and (3).
#' * Keeps only these dataframes: (1) original_stems, (2) new_secondary_stems,
#' and (3) "recruits".
#' * Dates the data by `submission_id` (`date` comes from the spreadsheet
#' `root`).
#' * Lowercases and links the names of each dataframe-variable.
#' * Drops fake stems.
#' * Outputs a common data structure of your choice.
#'
#'
#' @param input_dir String giving the directory containing the excel workbooks
#' to read from.
#' @param output_dir String giving the directory where to write .csv files to.
#' @param first_census This argument tells these functions what sheets to expect
#' in the input.
#' * Use `TRUE` if this is your first census. The expected input must have
#' sheets (1) "root", (2) "multi_stems", (3) "secondary_stems", and
#' (4) "single_stems".
#' * Use `FALSE` (default) if this is not your first census. The expected
#' input must have sheets (1) "root", (2) "original_stems", (3)
#' "new_secondary_stems", and (4) "recruits".
#'
#' @return Writes one .csv file for each workbook.
#'
#' @author Mauro Lepore and Jessica Shue.
#'
#' @section Acknowledgment:
#' Sabrina Russo helped to make these functions useful with first censuses.
#'
#' @examples
#' library(fs)
#' library(readr)
#' library(readxl)
#'
#' # NOT A FIRST CENSUS
#' # Path to the folder I want to read excel files from
#' input_dir <- dirname(example_path("two_files/new_stem_1.xlsx"))
#' input_dir
Expand All @@ -41,18 +54,33 @@
#' # Path to the folder I want to write .csv files to
#' output_dir <- tempdir()
#'
#' # Do the work
#' # Output a csv file
#' xl_sheets_to_csv(input_dir, output_dir)
#'
#' # Confirm
#' path_file(dir_ls(output_dir, regexp = "new_stem.*csv$"))
#'
#' # Also possible to output excel and a list of dataframes. See next section.
#'
#' # FIRST CENSUS
#' input_dir <- dirname(example_path("first_census/census.xlsx"))
#' # As a reminder, you'll get a warning about missing sheets
#' # Output list of dataframes (one per input workbook -- here only one)
#' xl_sheets_to_df(input_dir, first_census = TRUE)
#'
#' # Output excel
#' xl_sheets_to_xl(input_dir, output_dir, first_census = TRUE)
#' # Read back
#' filename <- path(output_dir, "census.xlsx")
#' out <- read_excel(filename)
#' str(out, give.attr = FALSE)
#' @name xl_sheets_to_output
NULL

xl_sheets_to_file <- function(ext, fun_write) {
function(input_dir, output_dir = "./") {
function(input_dir, output_dir = "./", first_census = FALSE) {
check_output_dir(output_dir = output_dir, print_as = "`output_dir`")
dfs <- xl_sheets_to_df(input_dir = input_dir)
dfs <- xl_sheets_to_df(input_dir = input_dir, first_census = first_census)
files <- fs::path_ext_remove(names(dfs))
paths <- fs::path(output_dir, fs::path_ext_set(files, ext))
purrr::walk2(dfs, paths, fun_write)
Expand All @@ -69,19 +97,30 @@ xl_sheets_to_xl <- xl_sheets_to_file("xlsx", writexl::write_xlsx)

#' @export
#' @rdname xl_sheets_to_output
xl_sheets_to_df <- function(input_dir, first_census = FALSE) {
  # Read every excel workbook in `input_dir` and return a named list with one
  # dataframe-list per workbook, named after the workbook's file name.
  check_input_dir(input_dir = input_dir, print_as = "`input_dir`")
  # `first_census` is forwarded so xl_sheets_to_df_() knows which sheet
  # names to expect in each workbook.
  out <- purrr::map(
    xl_workbooks_to_chr(input_dir),
    xl_sheets_to_df_,
    first_census = first_census
  )
  # Name each element after its source file, dropping the directory part.
  purrr::set_names(out, basename(names(out)))
}

#' Do xl_sheets_to_df() for each excel file.
#' @noRd
xl_sheets_to_df_ <- function(file) {
xl_sheets_to_df_ <- function(file, first_census = FALSE) {
dfm_list <- fgeo.tool::nms_tidy(fgeo.tool::ls_list_spreadsheets(file))

if (first_census) {
key <- c("root", "multi_stems", "secondary_stems", "single_stems")
dfm_list <- ensure_key_sheets(dfm_list, key = key)
} else {
key <- c("original_stems", "new_secondary_stems", "recruits", "root")
dfm_list <- ensure_key_sheets(dfm_list, key = key)
}

# Piping functions to avoid useless intermediate variables
clean_dfm_list <- fgeo.tool::ls_list_spreadsheets(file) %>%
fgeo.tool::nms_tidy() %>%
ensure_key_sheets() %>%
clean_dfm_list <- dfm_list %>%
purrr::keep(~!purrr::is_empty(.)) %>%
lapply(fgeo.tool::nms_tidy) %>%
drop_fake_stems() %>%
Expand All @@ -101,8 +140,7 @@ xl_sheets_to_df_ <- function(file) {

#' Check that key spreadsheets exist.
#' @noRd
ensure_key_sheets <- function(x) {
key <- c("original_stems", "new_secondary_stems", "recruits", "root")
ensure_key_sheets <- function(x, key) {
missing_key_sheet <- !all(key %in% names(x))
if (missing_key_sheet) {
msg <- paste0(
Expand All @@ -128,6 +166,12 @@ drop_fake_stems <- function(.df) {
#' @noRd
warn_if_empty <- function(.x, dfm_nm) {
dfm <- .x[[dfm_nm]]

if (is.null(dfm)) {
warn(paste("`.x` has no dataframe", dfm_nm), ". Is this intentional?")
return(invisible(.x))
}

has_cero_rows <- nrow(dfm) == 0
if (has_cero_rows) {
warn(paste0("`", dfm_nm, "`", " has cero rows."))
Expand All @@ -140,18 +184,24 @@ coerce_as_character <- function(.x, ...) {
}

#' Join all non-root sheets into one dataframe and date it via `root`.
#'
#' @param .x A named list of dataframes, one per spreadsheet; must contain a
#'   `root` element with columns `submission_id` and `date`.
#' @return A single dataframe. For a first census (no non-root sheets) the
#'   `submission_id`/`date` columns of `root`; otherwise the joined sheets
#'   with `unique_stem` and `date` added.
#' @noRd
join_and_date <- function(.x) {
  # Join data from all sheets except from `root`
  is_not_root <- !grepl("root", names(.x))
  not_root_dfm <- purrr::keep(.x, is_not_root)

  # From `root`, pull only `date` (plus a column to merge by)
  date <- .x[["root"]][c("submission_id", "date")]

  # In a first census every sheet other than `root` is dropped upstream, so
  # there is nothing to join `date` with -- return the dated columns as-is.
  first_census <- length(not_root_dfm) == 0
  if (first_census) {
    return(date)
  }

  # Collapse into a single dataframe, add `unique_stem`, and join with date
  not_root_dfm %>%
    fgeo.tool::ls_join_df() %>%
    dplyr::mutate(unique_stem = paste0(.data$tag, "_", .data$stem_tag)) %>%
    dplyr::left_join(date, by = "submission_id")
}

check_input_dir <- function(input_dir, print_as) {
Expand Down
Binary file added inst/extdata/first_census/census.xlsx
Binary file not shown.
24 changes: 24 additions & 0 deletions inst/extdata/first_census/position.csv
@@ -0,0 +1,24 @@
PtID,East,North
0816006,149.7519,311.8259
0816007,153.0656,300.8340
0816008,143.0636,303.4999
0816009,146.8176,301.9608
0916001,169.5130,317.3759
0916002,169.6355,313.8165
0916003,167.6531,310.4807
0916003,167.6606,310.4932
0916004,164.8548,316.7378
0916005,165.3046,311.6328
0916006,165.6578,309.7452
0916007,162.5376,309.1591
0916008,161.7250,305.1514
0916009,163.1855,303.5896
0916010,164.5169,303.0404
0916011,165.7609,301.5969
0916012,178.8555,303.8969
1017001,198.2785,325.3465
1017002,187.3418,327.3521
1017003,182.4314,331.8514
1017004,185.1744,322.1447
0818001,152.7749,358.5041
0818002,155.1088,353.1732
98 changes: 98 additions & 0 deletions inst/issues/31_first_census.Rmd
@@ -0,0 +1,98 @@
---
title: "Process and explore FastField data of a first census"
output: github_document
params:
input_dir: !r dirname(system.file("extdata", "first_census/census.xlsx", package = "fgeo.tool"))
output_dir: !r tempdir()
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(
echo = TRUE, # {mine}
comment = "#>",
collapse = TRUE,
out.width = "95%",
fig.align = "center",
fig.width = 6,
fig.asp = 0.618, # 1 / phi
fig.show = "hold",
rows.print = 3 # {mine}
)
```

```{r}
# https://forestgeo.github.io/fgeo.tool/#installation
library(fgeo.tool)
# Install with `install.packages("pkg")`
library(fs)
library(tidyverse)
```

Change your input and output directory before knitting this file, and re-run every day you come back from the field.

![](https://i.imgur.com/fGOtlVz.png)

![](https://i.imgur.com/C2SbGMb.png)

```{r}
# My input directory
params$input_dir
dir(params$input_dir, "xlsx")
# My output directory (temporary)
params$output_dir
```

```{r}
xl_sheets_to_csv(params$input_dir, params$output_dir, first_census = TRUE)
# Output is one .csv per workbook
dir(params$output_dir, "csv")
```

```{r}
# Now input multiple .csv and output a single dataframe
combo <- csv_to_df(params$output_dir)
combo
```

```{r}
# Read data of tree positions
where_dir <- example_path("first_census/position.csv")
where <- read_csv(where_dir)
# Compare
where
select(combo, quadrat, tag)
```

```{r}
# Create a variable that we can later use to merge the two datasets
combo <- mutate(combo, PtID = paste0(quadrat, tag))
combo2 <- left_join(combo, where)
# Reorganize columns for easier visualization
combo2 <- select(combo2, PtID, East, North, date, everything())
combo2
```

```{r}
# Helpers to avoid duplication
set_brk <- function(max, by) seq(0, max, by)
set_lim <- function(max) c(0, max)
xmax <- 560
ymax <- 360
ggplot(combo2, aes(East, North, color = date)) +
geom_point() +
coord_equal() +
scale_x_continuous(minor_breaks = set_brk(xmax, 20), limits = set_lim(xmax)) +
scale_y_continuous(minor_breaks = set_brk(ymax, 20), limits = set_lim(ymax)) +
theme_bw()
```

__ggplot2__ removes missing values from `East` and `North` but there may be missing values in `date`.

```{r}
filter(combo2, is.na(date))
```

0 comments on commit b1dca96

Please sign in to comment.