Skip to content

Commit

Permalink
Fix #294 (#337)
Browse files Browse the repository at this point in the history
* Refactor

And it passes all existing tests

* Update doc on zip directory [no ci]

* Add tests for #294

* Update NEWS [no ci]
  • Loading branch information
chainsawriot committed Sep 4, 2023
1 parent 067a0ae commit 8bf9a23
Show file tree
Hide file tree
Showing 4 changed files with 113 additions and 76 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
- POTENTIALLY BREAKING: YAML are exported using yaml::write_yaml(). But it can't pass the UTF-8 check on older systems.
Disclaimer added. #318
- More check for the `file` argument #301
- `import_list` works with single Excel/HTML/Zip online #294
* Declutter
- remove the obsolete data.table option #323
- write all documentation blocks in markdown #311
Expand Down
167 changes: 93 additions & 74 deletions R/import_list.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#' @title Import list of data frames
#' @description Use [import()] to import a list of data frames from a vector of file names or from a multi-object file (Excel workbook, .Rdata file, zip directory, or HTML file)
#' @param file A character string containing a single file name for a multi-object file (e.g., Excel workbook, zip directory, or HTML file), or a vector of file paths for multiple files to be imported.
#' @description Use [import()] to import a list of data frames from a vector of file names or from a multi-object file (Excel workbook, .Rdata file, zipped directory in a zip file, or HTML file)
#' @param file A character string containing a single file name for a multi-object file (e.g., Excel workbook, zip file, or HTML file), or a vector of file paths for multiple files to be imported.
#' @param which If `file` is a single file path, this specifies which objects should be extracted (passed to [import()]'s `which` argument). Ignored otherwise.
#' @param rbind A logical indicating whether to pass the import list of data frames through [data.table::rbindlist()].
#' @param rbind_label If `rbind = TRUE`, a character string specifying the name of a column to add to the data frame indicating its source file.
Expand Down Expand Up @@ -38,79 +38,13 @@ function(file,
if (missing(setclass)) {
setclass <- NULL
}
strip_exts <- function(file) {
vapply(file, function(x) tools::file_path_sans_ext(basename(x)), character(1))
}
if (length(file) > 1) {
names(file) <- strip_exts(file)
x <- lapply(file, function(thisfile) {
out <- try(import(thisfile, setclass = setclass, ...), silent = TRUE)
if (inherits(out, "try-error")) {
warning(sprintf("Import failed for %s", thisfile))
out <- NULL
} else if (isTRUE(rbind)) {
out[[rbind_label]] <- thisfile
}
structure(out, filename = thisfile)
})
names(x) <- names(file)
## special cases
if (length(file) == 1) {
x <- .read_file_as_list(file = file, which = which, setclass = setclass, rbind = rbind, rbind_label = rbind_label, ...)
} else {
if (get_ext(file) == "rdata") {
e <- new.env()
load(file, envir = e)
x <- as.list(e)
} else {
if (get_ext(file) == "html") {
.check_pkg_availability("xml2")
tables <- xml2::xml_find_all(xml2::read_html(unclass(file)), ".//table")
if (missing(which)) {
which <- seq_along(tables)
}
whichnames <- vapply(xml2::xml_attrs(tables[which]),
function(x) if ("class" %in% names(x)) x["class"] else "",
FUN.VALUE = character(1))
names(which) <- whichnames
} else if (get_ext(file) %in% c("xls","xlsx")) {
.check_pkg_availability("readxl")
whichnames <- readxl::excel_sheets(path = file)
if (missing(which)) {
which <- seq_along(whichnames)
names(which) <- whichnames
} else if (is.character(which)) {
whichnames <- which
} else {
whichnames <- whichnames[which]
}
} else if (get_ext(file) %in% c("zip")) {
if (missing(which)) {
whichnames <- utils::unzip(file, list = TRUE)[, "Name"]
which <- seq_along(whichnames)
names(which) <- strip_exts(whichnames)
} else if (is.character(which)) {
whichnames <- utils::unzip(file, list = TRUE)[, "Name"]
whichnames <- whichnames[whichnames %in% which]
} else {
whichnames <- utils::unzip(file, list = TRUE)[, "Name"]
names(which) <- strip_exts(whichnames)
}
} else {
which <- 1
whichnames <- NULL
}
x <- lapply(which, function(thiswhich) {
out <- try(import(file, setclass = setclass, which = thiswhich, ...), silent = TRUE)
if (inherits(out, "try-error")) {
warning(sprintf("Import failed for %s from %s", thiswhich, file))
out <- NULL
} else if (isTRUE(rbind) && length(which) > 1) {
out[[rbind_label]] <- thiswhich
}
out
})
names(x) <- whichnames
}
## note the plural
x <- .read_multiple_files_as_list(files = file, setclass = setclass, rbind = rbind, rbind_label = rbind_label, ...)
}

# optionally rbind
if (isTRUE(rbind)) {
if (length(x) == 1) {
Expand All @@ -124,7 +58,7 @@ function(file,
x <- x2
}
}
# set class
## set class
a <- list(...)
if (is.null(setclass)) {
if ("data.table" %in% names(a) && isTRUE(a[["data.table"]])) {
Expand All @@ -148,3 +82,88 @@ function(file,

return(x)
}

.strip_exts <- function(file) {
vapply(file, function(x) tools::file_path_sans_ext(basename(x)), character(1))
}

.read_multiple_files_as_list <- function(files, setclass, rbind, rbind_label,...) {
names(files) <- .strip_exts(files)
x <- lapply(files, function(thisfile) {
out <- try(import(thisfile, setclass = setclass, ...), silent = TRUE)
if (inherits(out, "try-error")) {
warning(sprintf("Import failed for %s", thisfile))
out <- NULL
} else if (isTRUE(rbind)) {
out[[rbind_label]] <- thisfile
}
structure(out, filename = thisfile)
})
names(x) <- names(files)
return(x)
}

.read_file_as_list <- function(file, which, setclass, rbind, rbind_label,...) {
if (grepl("^http.*://", file)) {
file <- remote_to_local(file)
}
if (get_ext(file) == "rdata") {
e <- new.env()
load(file, envir = e)
return(as.list(e))
}
if (!get_ext(file) %in% c("html", "xlsx", "xls", "zip")) {
which <- 1
whichnames <- NULL
}
## getting list of `whichnames`
if (get_ext(file) == "html") {
.check_pkg_availability("xml2")
tables <- xml2::xml_find_all(xml2::read_html(unclass(file)), ".//table")
if (missing(which)) {
which <- seq_along(tables)
}
whichnames <- vapply(xml2::xml_attrs(tables[which]),
function(x) if ("class" %in% names(x)) x["class"] else "",
FUN.VALUE = character(1))
names(which) <- whichnames
}
if (get_ext(file) %in% c("xls","xlsx")) {
##.check_pkg_availability("readxl")
whichnames <- readxl::excel_sheets(path = file)
if (missing(which)) {
which <- seq_along(whichnames)
names(which) <- whichnames
} else if (is.character(which)) {
whichnames <- which
} else {
whichnames <- whichnames[which]
}
}
if (get_ext(file) %in% c("zip")) {
if (missing(which)) {
whichnames <- utils::unzip(file, list = TRUE)[, "Name"]
which <- seq_along(whichnames)
names(which) <- .strip_exts(whichnames)
} else if (is.character(which)) {
whichnames <- utils::unzip(file, list = TRUE)[, "Name"]
whichnames <- whichnames[whichnames %in% which]
} else {
whichnames <- utils::unzip(file, list = TRUE)[, "Name"]
names(which) <- .strip_exts(whichnames)
}
}
## reading all `whichnames`
x <- lapply(which, function(thiswhich) {
out <- try(import(file, setclass = setclass, which = thiswhich, ...), silent = TRUE)
if (inherits(out, "try-error")) {
warning(sprintf("Import failed for %s from %s", thiswhich, file))
out <- NULL
} else if (isTRUE(rbind) && length(which) > 1) {
out[[rbind_label]] <- thiswhich
}
out
})
names(x) <- whichnames
return(x)
}
4 changes: 2 additions & 2 deletions man/import_list.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions tests/testthat/test_import_list.R
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,23 @@ test_that("File names are added as attributes by import_list()", {
unlink(c("mtcars.csv", "mtcars.tsv"))
})

test_that("URL #294", {
skip_on_cran()
## url <- "https://evs.nci.nih.gov/ftp1/CDISC/SDTM/SDTM%20Terminology.xls" That's 10MB!
url <- "https://github.com/tidyverse/readxl/raw/main/tests/testthat/sheets/sheet-xml-lookup.xlsx"
expect_error(x <- import_list(url), NA)
expect_true(inherits(x, "list"))
expect_true("Asia" %in% names(x))
expect_true("Africa" %in% x[[1]]$continent)
expect_false("Africa" %in% x[[2]]$continent)
## double URLs; it reads twice the first sheet by default
urls <- c(url, url)
expect_error(x2 <- import_list(urls), NA)
expect_true("sheet-xml-lookup" %in% names(x2))
expect_true("Africa" %in% x2[[1]]$continent)
expect_true("Africa" %in% x2[[2]]$continent)
})

unlink("data.rdata")
unlink("mtcars.rds")
unlink("mtcars.csv.zip")

0 comments on commit 8bf9a23

Please sign in to comment.