version 0.2.3

cran · Nov 9, 2023 · 414d883 · 414d883
commit 414d883
Show file tree

Hide file tree

Showing 42 changed files with 1,875 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,28 @@
+Package: filecacher
+Type: Package
+Title: File Cacher
+Version: 0.2.3
+Authors@R: 
+    person("Or", "Gadish", email = "orgadish@gmail.com", role = c("aut", "cre", "cph"))
+Description: 
+    The main functions in this package are with_cache() and cached_read(). 
+    The former is a simple way to cache an R object into a file on disk, 
+    using 'cachem'. The latter is a wrapper around any standard read function, 
+    but caches both the output and the file list info. If the input file list
+    info hasn't changed, the cache is used; otherwise, the original files are
+    re-read. This can save time if the original operation requires reading from
+    many files, and/or involves lots of processing.
+License: MIT + file LICENSE
+Encoding: UTF-8
+Imports: cachem, glue, here, purrr, rlang, vctrs
+RoxygenNote: 7.2.3
+Suggests: arrow, data.table, dplyr, fs, readr, testthat (>= 3.0.0)
+Config/testthat/edition: 3
+URL: https://github.com/orgadish/filecacher
+BugReports: https://github.com/orgadish/filecacher/issues
+NeedsCompilation: no
+Packaged: 2023-11-09 08:17:04 UTC; orgadish
+Author: Or Gadish [aut, cre, cph]
+Maintainer: Or Gadish <orgadish@gmail.com>
+Repository: CRAN
+Date/Publication: 2023-11-09 10:30:02 UTC
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,2 @@
+YEAR: 2023
+COPYRIGHT HOLDER: filecacher authors
diff --git a/MD5 b/MD5
@@ -0,0 +1,41 @@
+ab5e50bc7a3736ac7b611ea2b8663d75 *DESCRIPTION
+a699489fbaa4c98f0be448cb32b5ce39 *LICENSE
+030e9df7f37ee257cff118182f32630b *NAMESPACE
+da6460df974ce8421fcc06a1bcca6a18 *NEWS.md
+3ed23ba66095a20862982211659b34b9 *R/cached_read.R
+db56f626aaf2ceb024cc59d2fb749043 *R/file_cache.R
+7cd4505a59b22a915f14ec3bd64fff52 *R/interpret_type.R
+57f8c3d43fca95fdb6eba3265ddde21f *R/util-dfs_equal.R
+47ba852b5c44ae23c92ab08a6fb43ed7 *R/util-fns_equal.R
+8e0b2dc83dd19e55a19f75d89c11e2dd *R/util-get_file_info.R
+370946680ccdc94145b2051f41eacd6d *R/vectorize_reader.R
+de2b0df4045c92ae0afc83c555991e67 *R/with_cache.R
+1e871536b2d1ba54e17007186c3a29e5 *README.md
+071c2fc2ab835760ab14c7ec44ab8d85 *inst/WORDLIST
+6087ff9f569743b5b84d6e0114aea7e5 *inst/examples/cache.R
+8b028e4421ceec1db98e204aba5adef2 *inst/examples/cached_read.R
+0be50cdfc61e4711032a55aaef73fa01 *inst/examples/vectorize_reader.R
+fa33100515046319623e73627fb82a50 *inst/extdata/create_extdata_from_iris.R
+d100dbed6c5ce6f74e0ba16127fa7e16 *inst/extdata/iris_complete.csv
+97a71b453f64dee5d29a4de436bbd739 *inst/extdata/iris_setosa_only.csv
+892af90bb2a229c30e8140658e2fc1d4 *inst/extdata/iris_versicolor_only.csv
+904ae0155a55b8618059a144f9e7e525 *inst/extdata/iris_virginica_only.csv
+a910434f5b1f1ededf32aea20efa7195 *man/cached_read.Rd
+f1f3a5d5d050fee4013f1a277d8baefd *man/dfs_equal.Rd
+abcc22cc06204a12f6067331f6711798 *man/file_cache.Rd
+24db7f85a969168ff6c54e66fe88cb60 *man/fns_equal.Rd
+ce98425ac564644c7590657217df6da2 *man/get_csv_fns.Rd
+1cc345250e7fa7d4cb305749701754bb *man/get_csv_read_fn.Rd
+d0f2ad5d3f676040e055272dfab3d664 *man/get_file_info.Rd
+48108a8b173d37ed634da43bd2f2947b *man/interpret_cache_type.Rd
+6bddb3b8d0d7c02d9c1db5da895d8d2c *man/vectorize_reader.Rd
+17f9bab2ff57b42f977e0dc72c5ed760 *man/with_cache.Rd
+7347c3a61edadfd418402153d613c76c *tests/testthat.R
+a943b01dbddfcd0b5067767b3d47ed6a *tests/testthat/test-cached_read.R
+e5321ba8ccb06d3fe9e9f0f720adfb5b *tests/testthat/test-file_cache.R
+f1169a22866b9364f64c0f42d71f277b *tests/testthat/test-interpret_type.R
+9b7423ed0fa99644850bdd05f0e3f6fc *tests/testthat/test-util-dfs_equal.R
+3d1ff45ccb9389492678f3a49eba2c34 *tests/testthat/test-util-fns_equal.R
+e11e606c90be9f3fb6d63a0ea0b60593 *tests/testthat/test-util-get_file_info.R
+14d1411a2e03dc3e5846f8cec10db5ae *tests/testthat/test-vectorize_reader.R
+5d2464789c4ea4ba83932c3125686316 *tests/testthat/test-with_cache.R
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,7 @@
+# Generated by roxygen2: do not edit by hand
+
+export(cached_read)
+export(cached_read_csv)
+export(file_cache)
+export(vectorize_reader)
+export(with_cache)
diff --git a/NEWS.md b/NEWS.md
@@ -0,0 +1,3 @@
+# filecacher 0.2.3
+
+* Initial CRAN submission.
diff --git a/R/cached_read.R b/R/cached_read.R
@@ -0,0 +1,79 @@
+#' Read files via cache of file list and contents
+#'
+#' @description
+#' Reads data and save to a local file for easier management and re-reading.
+#'
+#' By default, also saves the file info to determine whether the cache
+#' is valid, or whether the contents need to be updated because the files
+#' have been modified. To skip this, or force reading from scratch, set
+#' `skip_file_info=TRUE` or `force=TRUE`, respectively.
+#'
+#' If updating is called for, all the files are re-read.
+#'
+#' `cached_read_csv()` is a convenience function using a csv read function
+#'   based on `read_type`.
+#'
+#' @param files A file or files to read with `read_fn`.
+#' @param read_fn A function which takes file(s) as its first parameter and
+#'   reads them. To use a single-input read function such as
+#'   `arrow::read_csv_arrow()` with multiple files, use [vectorize_reader()],
+#'   e.g. `read_fn = vectorize_reader(arrow::read_csv_arrow)`.
+#' @param skip_file_info Whether to skip saving and/or checking the file info.
+#'  Use this when just querying the file system (without opening files) is slow.
+#' @inheritParams with_cache
+#'
+#' @seealso [vectorize_reader()] to convert a single-input read function into a
+#'   multiple-input function.
+#'
+#' @return The result of `read_fn(files)`.
+#' @export
+#' @example inst/examples/cached_read.R
+cached_read <- function(files, label, read_fn,
+                        cache = NULL, type = NULL, force = FALSE,
+                        skip_file_info = FALSE) {
+  .cache <- file_cache(cache = cache, type = type)
+
+  read_with_cache <- function() {
+    with_cache(
+      read_fn(files),
+      label = label, cache = .cache, type = type, force = force
+    )
+  }
+
+  # Option 1: If skipping file info, simply call `with_cache`.
+  if (skip_file_info) {
+    return(read_with_cache())
+  }
+
+  # Option 2: Caching via file info.
+  file_info_label <- paste0(label, "-file_info")
+  cached_file_info <- .cache$get(file_info_label)
+  # cached_file_info = <key_missing> if doesn't exist.
+  new_file_info <- get_file_info(files)
+
+  if (!dfs_equal(cached_file_info, new_file_info)) {
+    .cache$set(file_info_label, new_file_info)
+  }
+
+  read_with_cache()
+}
+
+
+#' @rdname cached_read
+#'
+#' @inheritParams get_csv_read_fn
+#'
+#' @export
+cached_read_csv <- function(files, label, read_type = NULL,
+                            cache = NULL, type = NULL,
+                            skip_file_info = FALSE, force = FALSE) {
+  cached_read(
+    files = files,
+    label = label,
+    read_fn = get_csv_read_fn(read_type),
+    cache = cache,
+    type = type,
+    skip_file_info = skip_file_info,
+    force = force
+  )
+}
diff --git a/R/file_cache.R b/R/file_cache.R
@@ -0,0 +1,37 @@
+#' Gets or creates a `cachem` object for use with other functions.
+#'
+#' @param cache The path to an existing directory to use for caching.
+#'   If `NULL` (default) uses the current path, using [here::here()].
+#'
+#'   For advanced use, also accepts (and passes on) an existing
+#'   `cachem` object. If so, all other parameters are ignored.
+#' @inheritParams interpret_cache_type
+#'
+#' @return A [cachem::cache_disk()] object.
+#' @export
+#'
+#' @seealso [cachem::cache_disk()]
+#'
+#' @example inst/examples/cache.R
+file_cache <- function(cache = NULL, type = NULL, ext_prefix = "cache_") {
+  if (inherits(cache, "cachem")) {
+    return(cache)
+  }
+
+  if (is.null(cache)) {
+    cache <- here::here()
+  } else if (!is.character(cache) || !dir.exists(cache)) {
+    stop(
+      "`cache` must be an existing cache or the path to an existing directory."
+    )
+  }
+
+  cache_type <- interpret_cache_type(type, ext_prefix = ext_prefix)
+  cachem::cache_disk(
+    dir = cache,
+    max_size = Inf,
+    read_fn = cache_type$read_fn,
+    write_fn = cache_type$write_fn,
+    extension = cache_type$extension
+  )
+}
diff --git a/R/interpret_type.R b/R/interpret_type.R
@@ -0,0 +1,134 @@
+#' Get the CSV read/write function
+#'
+#' @description
+#' Read functions are vectorized.
+#'
+#' @param type Type of csv read/write functions to get.
+#'   If `NULL`, returns the first installed.
+#'
+#' @return List of read/write functions.
+get_csv_fns <- function(type = NULL) {
+  if (!is.null(type)) vctrs::vec_assert(type, character(), size = 1)
+
+  rw_fns <- list(
+    "readr" = if (rlang::is_installed("readr")) {
+      list(
+        read = \(f) readr::read_csv(f, id = "file_path"),
+        write = readr::write_csv
+      )
+    },
+    "arrow" = if (rlang::is_installed("arrow")) {
+      list(
+        read = vectorize_reader(arrow::read_csv_arrow, "file_path"),
+        write = arrow::write_csv_arrow
+      )
+    },
+    "data.table" = if (rlang::is_installed("data.table")) {
+      list(
+        read = vectorize_reader(data.table::fread, "file_path"),
+        write = data.table::fwrite
+      )
+    },
+    "base" = list(
+      read = vectorize_reader(utils::read.csv, "file_path"),
+      write = utils::write.csv
+    )
+  )
+
+  if (is.null(type)) {
+    installed_fns <- rw_fns[!is.null(rw_fns)]
+    fn_list <- installed_fns[[1]]
+  } else if (type %in% names(rw_fns)) {
+    fn_list <- rw_fns[[type]]
+  } else {
+    collapsed_names <- glue::glue_collapse(
+      glue::glue("'{names(rw_fns)}'"),
+      sep = ", ", last = ", or "
+    )
+    stop(glue::glue(
+      "`type` must be NULL or one of {collapsed_names}, not '{type}'."
+    ))
+  }
+
+  fn_list
+}
+
+
+#' Get the first CSV Read function installed
+#'
+#' @param read_type Type of csv read function to use. One of:
+#'   * "readr": `readr::read_csv()`
+#'   * "arrow": `vectorize_reader(arrow::read_csv_arrow)()`
+#'   * "data.table": `vectorize_reader(data.table::fread)()`
+#'   * "base": `vectorize_reader(utils::read.csv)()`
+#'   * `NULL` (default): uses the first installed.
+#'
+#' @return Function that reads multiple paths to CSVs.
+get_csv_read_fn <- function(read_type = NULL) {
+  if (!is.null(read_type)) vctrs::vec_assert(read_type, character(), size = 1)
+
+  get_csv_fns(read_type)$read
+}
+
+
+#' Generate cache parameters from preexisting shorthand types.
+#'
+#' @param type A string describing the type of cache.
+#'   Must be `NULL` or one of 'rds', 'parquet', or 'csv'.
+#'   If `NULL` (default), uses 'rds'.
+#' @param ext_prefix The prefix to use with the file extension,
+#'   e.g. "cache_csv", instead of "csv".
+#'
+#' @return List of `read_fn`, `write_fn`, and `extension` for use with
+#'   [cachem::cache_disk()].
+interpret_cache_type <- function(type, ext_prefix = "cache_") {
+  if (is.null(type)) {
+    type <- "rds"
+  } else {
+    vctrs::vec_assert(type, character(), size = 1)
+  }
+
+  arrow_is_installed <- rlang::is_installed("arrow")
+  if (type == "parquet" && !arrow_is_installed) {
+    stop("The `arrow` package must be installed to use `type='parquet'`.")
+  }
+
+
+  build_ext <- function(ext) {
+    paste0(".", ext_prefix, ext)
+  }
+
+  csv_fn_list <- get_csv_fns()
+
+  types <- list(
+    "rds" = list(
+      read_fn = NULL,
+      write_fn = NULL,
+      extension = build_ext("rds")
+    ),
+    "parquet" = if (arrow_is_installed) {
+      list(
+        read_fn = arrow::read_parquet,
+        write_fn = arrow::write_parquet,
+        extension = build_ext("parquet")
+      )
+    },
+    "csv" = list(
+      read_fn = csv_fn_list$read,
+      write_fn = csv_fn_list$write,
+      extension = build_ext("csv")
+    )
+  )
+
+  if (!type %in% names(types)) {
+    collapsed_names <- glue::glue_collapse(
+      glue::glue("'{names(types)}'"),
+      sep = ", ", last = ", or "
+    )
+    stop(glue::glue(
+      "`type` must be NULL or one of {collapsed_names}, not '{type}'."
+    ))
+  }
+
+  types[[type]]
+}
diff --git a/R/util-dfs_equal.R b/R/util-dfs_equal.R
@@ -0,0 +1,29 @@
+#' Compare two data frames (ignoring row order) and ensure they are equal.
+#'
+#' @description
+#' Similar to `dplyr::all_equal(x, y, ignore_row_order=TRUE)`,
+#' which is now deprecated.
+#'
+#' If either argument is not a data.frame it returns `FALSE`,
+#' rather than raise an error.
+#'
+#' @inheritParams base::all.equal
+dfs_equal <- function(target, current) {
+  if (!is.data.frame(target) || !is.data.frame(current)) {
+    return(FALSE)
+  }
+
+  df_sort <- function(df) df[do.call(order, df), , drop = FALSE]
+
+  target_df <- df_sort(target)
+  current_df <- df_sort(current)
+
+  target_names <- names(target_df)
+  current_names <- names(current_df)
+
+  (
+    length(target_names) == length(current_names) &&
+      all(target_names == current_names) &&
+      setequal(target_df, current_df)
+  )
+}
diff --git a/R/util-fns_equal.R b/R/util-fns_equal.R
@@ -0,0 +1,16 @@
+#' Check whether two function objects have the same text definition.
+#'
+#' @param x First function to compare.
+#' @param y Second function to compare.
+#'
+#' @return Logical
+fns_equal <- function(x, y) {
+  if (!rlang::is_function(x) || !rlang::is_function(y)) {
+    stop("`x` and `y` must be functions.")
+  }
+
+  # Replacement for `base::deparse1`, which is only available in R >= 4.0.0.
+  deparse_1 <- function(z) paste(deparse(z), collapse = " ")
+
+  deparse_1(x) == deparse_1(y)
+}