Commit: version 0.1.4
orgadish authored and cran-robot committed Oct 25, 2023
0 parents commit d0e99ba
Showing 14 changed files with 534 additions and 0 deletions.
22 changes: 22 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,22 @@
Package: deduped
Type: Package
Title: Making "Deduplicated" Functions
Version: 0.1.4
Authors@R:
person("Or", "Gadish", email = "orgadish@gmail.com", role = c("aut", "cre", "cph"))
Description: Contains one main function deduped() that returns a
function that acts only on the unique values of the first input
and expands the results back to match the full input. This can
significantly speed up certain slow iterative functions.
License: MIT + file LICENSE
Encoding: UTF-8
Imports: collapse, fastmatch
RoxygenNote: 7.2.3
Suggests: dplyr, hashr, purrr, readr, testthat (>= 3.0.0), withr
Config/testthat/edition: 3
NeedsCompilation: no
Packaged: 2023-10-24 02:27:37 UTC; orgadish
Author: Or Gadish [aut, cre, cph]
Maintainer: Or Gadish <orgadish@gmail.com>
Repository: CRAN
Date/Publication: 2023-10-25 06:40:02 UTC
2 changes: 2 additions & 0 deletions LICENSE
@@ -0,0 +1,2 @@
YEAR: 2023
COPYRIGHT HOLDER: deduped authors
13 changes: 13 additions & 0 deletions MD5
@@ -0,0 +1,13 @@
895b260db25b3aed744415c33eea80ff *DESCRIPTION
3b044d4694d6fc79c80261e87aacaeff *LICENSE
740cfc93a761151c059234c2dcede7cb *NAMESPACE
66a0ea409392245a5abea194a5f14269 *NEWS.md
3b9ef76c6e04236529e83892a007489b *R/deduped.R
5251556e180ab40194426e052129a6a8 *R/deduped_map.R
d0cdff83abea5112517e721f165acabc *README.md
a23f56cec68f01553baff070e8bfa4f4 *inst/WORDLIST
34b53a4babff210ac654ebc9b986b2e4 *man/deduped.Rd
be728636a552fa809d17ba382432a7c2 *man/deduped_map.Rd
1ac7611399102b20ae5cf244cfdfcd4f *tests/testthat.R
1aca527ba7ba753138389a64080e4024 *tests/testthat/test-deduped.R
4aea0353bc7fbd52d3bf054aa6315d23 *tests/testthat/test-deduped_map.R
4 changes: 4 additions & 0 deletions NAMESPACE
@@ -0,0 +1,4 @@
# Generated by roxygen2: do not edit by hand

export(deduped)
export(deduped_map)
4 changes: 4 additions & 0 deletions NEWS.md
@@ -0,0 +1,4 @@
# deduped 0.1.4

* Initial successful CRAN submission.

42 changes: 42 additions & 0 deletions R/deduped.R
@@ -0,0 +1,42 @@
#' Deduplicate a vectorized function to act on _unique_ elements
#'
#' @description
#' Converts a vectorized function into one that only performs the computations
#' on unique values in the first argument. The result is then expanded so that
#' it is the same as if the computation had been performed on all elements.
#'
#' @param f Function to deduplicate.
#'
#' @return Deduplicated function.
#' @export
#'
#' @seealso [deduped_map()], a deduplicated version of [purrr::map()].
#'
#' @examples
#'
#' x <- sample(LETTERS, 10)
#' x
#'
#' large_x <- sample(rep(x, 10))
#' length(large_x)
#'
#' slow_func <- function(x) {
#' for (i in x) {
#' Sys.sleep(0.001)
#' }
#' }
#'
#' system.time({
#' y1 <- slow_func(large_x)
#' })
#' system.time({
#' y2 <- deduped(slow_func)(large_x)
#' })
#'
#' all(y1 == y2)
deduped <- function(f) {
function(x, ...) {
ux <- collapse::funique(x)
f(ux, ...)[fastmatch::fmatch(x, ux)]
}
}
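The two calls above are faster drop-in replacements for base R's `unique()` and `match()`; the same logic can be sketched without either dependency (illustrative only, trading speed for zero imports; `deduped_base` is a hypothetical name, not part of the package):

``` r
# Dependency-free sketch of deduped(): unique() stands in for
# collapse::funique(), match() for fastmatch::fmatch().
deduped_base <- function(f) {
  function(x, ...) {
    ux <- unique(x)           # run f on each distinct value once...
    f(ux, ...)[match(x, ux)]  # ...then index the results back out
  }
}

deduped_base(toupper)(c("a", "b", "a", "a"))
#> [1] "A" "B" "A" "A"
```

Because `match()` returns the position of each original element within the unique values, indexing `f(ux, ...)` by those positions restores the original length and order.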
77 changes: 77 additions & 0 deletions R/deduped_map.R
@@ -0,0 +1,77 @@
#' Apply a function to each _unique_ element
#'
#' @description
#' Acts like [purrr::map()] but only performs the computation on unique
#' elements.
#'
#' @inheritParams purrr::map
#'
#' @return
#' A list whose length is the same as the length of the input,
#' matching the output of [purrr::map()].
#'
#' @seealso [deduped()]
#'
#' @export
#'
#' @examples
#' slow_func <- function(x) {
#' for (i in x) {
#' Sys.sleep(0.001)
#' }
#' }
#' ux <- purrr::map(1:5, function(j) sample(LETTERS, j, replace = TRUE))
#' x <- sample(rep(ux, 10)) # Create a duplicated list
#'
#' system.time({
#' y1 <- purrr::map(x, slow_func)
#' })
#' system.time({
#' y2 <- deduped_map(x, slow_func)
#' })
#'
#' all.equal(y1, y2)
deduped_map <- function(.x, .f, ..., .progress = FALSE) {
check_map_pkgs()

# purrr::map can map over a vector, but a list is needed here.
.x <- as.list(.x)

# Use `hashr::hash(x, recursive=FALSE)` directly once markvanderloo/hashr#3
# is resolved.
nonrecursive_hash <- function(x) {
if (inherits(x, c("list", "character"))) {
hashr::hash(x, recursive = FALSE)
} else {
hashr::hash(x)
}
}

hashes <- purrr::map_int(.x, nonrecursive_hash)
unq_hashes <- collapse::funique(hashes)
unq_x <- purrr::map(unq_hashes, \(h) .x[[collapse::whichv(hashes, h)[1]]])

redup_indices <- fastmatch::fmatch(hashes, unq_hashes)
purrr::map(unq_x, .f, ..., .progress = .progress)[redup_indices]
}


check_map_pkgs <- function() {
not_installed <- c(
if (!requireNamespace("hashr", quietly = TRUE)) "hashr",
if (!requireNamespace("purrr", quietly = TRUE)) "purrr"
)

if (length(not_installed) == 0) {
return(invisible())
}

not_installed_txt <- paste(paste0("\"", not_installed, "\""), collapse = ", ")

stop(paste0(
"The following packages are missing and ",
"must be installed to use `deduped_map()`: ",
not_installed_txt,
"\nUse `install.packages(c(", not_installed_txt, "))` to install them."
))
}
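List elements cannot be matched as cheaply as atomic values, which is why the function above keys each element by its hash before deduplicating. The same hash-then-match pipeline can be sketched in base R, using serialization as a crude stand-in for hashr::hash() (illustrative only; `deduped_map_base` is a hypothetical name):

``` r
deduped_map_base <- function(.x, .f, ...) {
  .x <- as.list(.x)

  # Key each element by its serialized bytes (stand-in for hashr::hash()).
  keys <- vapply(.x, function(e) paste(serialize(e, NULL), collapse = ""), "")

  unq_keys <- unique(keys)
  unq_x <- .x[match(unq_keys, keys)]  # first occurrence of each distinct element

  # Map over unique elements only, then expand results to the full length.
  lapply(unq_x, .f, ...)[match(keys, unq_keys)]
}

deduped_map_base(list("a", "a", "b"), toupper)
```

This mirrors the shape of `deduped_map()` itself: hash, `funique()`, map over the unique elements, then re-index with `fmatch()`.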
164 changes: 164 additions & 0 deletions README.md
@@ -0,0 +1,164 @@

<!-- README.md is generated from README.Rmd. Please edit that file -->

# deduped

<!-- badges: start -->
<!-- badges: end -->

The goal of `deduped` is to provide utility functions that make it
easier to speed up vectorized functions (`deduped()`) or map functions
(`deduped_map()`) when the arguments contain significant duplication.

One particular use case of `deduped()` that I come across a lot is when
using `basename()` and `dirname()` on the `file_path` column after
reading multiple CSVs (e.g. with
`readr::read_csv(..., id="file_path")`). `basename()` and `dirname()`
are surprisingly slow (especially on Windows), and most of the column is
duplicated.
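How much the trick helps depends on the duplication ratio, and the fraction of `basename()`/`dirname()` calls that deduplication would skip is easy to estimate up front (base R, with a made-up path layout purely for illustration):

``` r
# 3,000 file_path values, only 3 distinct ones (hypothetical layout):
file_path <- rep(sprintf("/data/run_%d/results.csv", 1:3), each = 1000)

# Fraction of the work that deduplication skips:
1 - length(unique(file_path)) / length(file_path)
#> [1] 0.999
```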

## Installation

You can install the development version of `deduped` like so:

``` r
if (!requireNamespace("remotes")) install.packages("remotes")

remotes::install_github("orgadish/dedup")
```

## Examples

``` r
library(deduped)

slow_func <- function(x) {
for (i in x) {
Sys.sleep(0.001)
}
}

# deduped()
unique_vec <- sample(LETTERS, 10)
unique_vec
#> [1] "O" "A" "E" "C" "K" "F" "D" "S" "V" "G"

duplicated_vec <- sample(rep(unique_vec, 100))
length(duplicated_vec)
#> [1] 1000

system.time({
y1 <- slow_func(duplicated_vec)
})
#> user system elapsed
#> 0.026 0.019 1.268
system.time({
y2 <- deduped(slow_func)(duplicated_vec)
})
#> user system elapsed
#> 0.115 0.012 0.141
all(y1 == y2)
#> [1] TRUE


# deduped_map()
unique_list <- purrr::map(1:5, function(j) sample(LETTERS, j, replace = TRUE))
unique_list
#> [[1]]
#> [1] "I"
#>
#> [[2]]
#> [1] "A" "W"
#>
#> [[3]]
#> [1] "F" "C" "I"
#>
#> [[4]]
#> [1] "I" "X" "Y" "P"
#>
#> [[5]]
#> [1] "T" "H" "L" "R" "B"

duplicated_list <- sample(rep(unique_list, 100)) # Create a duplicated list
length(duplicated_list)
#> [1] 500

system.time({
z1 <- purrr::map(duplicated_list, slow_func)
})
#> user system elapsed
#> 0.040 0.025 1.916
system.time({
z2 <- deduped_map(duplicated_list, slow_func)
})
#> user system elapsed
#> 0.020 0.008 0.048

all.equal(z1, z2)
#> [1] TRUE
```

## `file_path` Example

``` r
# Create multiple CSVs to read
tf <- tempfile()
dir.create(tf)

# Duplicate mtcars 10,000x and write 1 CSV for each value of `am`
duplicated_mtcars <- dplyr::slice(mtcars, rep(1:nrow(mtcars), 10000))
invisible(sapply(
dplyr::group_split(duplicated_mtcars, am),
function(k) {
file_name <- paste0("mtcars_", unique(k$am), ".csv")
readr::write_csv(k, file.path(tf, file_name))
}
))

duplicated_mtcars_from_files <- readr::read_csv(
list.files(tf, full.names = TRUE),
id = "file_path"
)
#> Rows: 320000 Columns: 12
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> dbl (11): mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
duplicated_mtcars_from_files
#> # A tibble: 320,000 × 12
#> file_path mpg cyl disp hp drat wt qsec vs am gear carb
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 /var/folde… 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
#> 2 /var/folde… 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
#> 3 /var/folde… 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
#> 4 /var/folde… 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
#> 5 /var/folde… 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
#> 6 /var/folde… 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
#> 7 /var/folde… 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
#> 8 /var/folde… 17.8 6 168. 123 3.92 3.44 18.9 1 0 4 4
#> 9 /var/folde… 16.4 8 276. 180 3.07 4.07 17.4 0 0 3 3
#> 10 /var/folde… 17.3 8 276. 180 3.07 3.73 17.6 0 0 3 3
#> # ℹ 319,990 more rows

system.time({
df1 <- dplyr::mutate(duplicated_mtcars_from_files,
file_name = basename(file_path)
)
})
#> user system elapsed
#> 0.080 0.001 0.081
system.time({
df2 <- dplyr::mutate(duplicated_mtcars_from_files,
file_name = deduped(basename)(file_path)
)
})
#> user system elapsed
#> 0.006 0.000 0.007

all.equal(df1, df2)
#> [1] TRUE

unlink(tf)
```
6 changes: 6 additions & 0 deletions inst/WORDLIST
@@ -0,0 +1,6 @@
Deduplicate
Deduplicated
deduplicate
deduplicated
purrr
vectorized
45 changes: 45 additions & 0 deletions man/deduped.Rd

Some generated files are not rendered by default.