feat: New tbl_file() and tbl_query() to explicitly access tables and queries as dbplyr lazy tables
krlmlr committed Mar 3, 2024
1 parent 71ff5b1 commit 3b415d8
Showing 4 changed files with 141 additions and 0 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -23,6 +23,8 @@ export(duckdb_unregister)
export(duckdb_unregister_arrow)
export(read_csv_duckdb)
export(simulate_duckdb)
export(tbl_file)
export(tbl_query)
export(translate_duckdb)
exportClasses(duckdb_connection)
exportClasses(duckdb_driver)
42 changes: 42 additions & 0 deletions R/backend-dbplyr__duckdb_connection.R
@@ -381,5 +381,47 @@ tbl.duckdb_connection <- function(src, from, cache = FALSE, ...) {
NextMethod("tbl")
}

#' Create a lazy table from a Parquet, CSV, or JSON file
#'
#' `tbl_file()` is an experimental variant of [dplyr::tbl()] to directly access files on disk.
#' It is safer than `dplyr::tbl()` because there is no risk of misinterpreting the request,
#' and paths with special characters are supported.
#'
#' @param src A duckdb connection object
#' @param path Path to an existing Parquet, CSV, or JSON file
#' @param ... Must be empty
#' @param cache Enable the object cache for Parquet files
#' @export
#' @rdname backend-duckdb
tbl_file <- function(src, path, ..., cache = FALSE) {
if (...length() > 0) {
stop("... must be empty.", call. = FALSE)
}
if (!file.exists(path)) {
stop("File '", path, "' not found", call. = FALSE)
}
if (grepl("'", path)) {
stop("File '", path, "' contains a single quote, this is not supported", call. = FALSE)
}
tbl_query(src, paste0("'", path, "'"), cache = cache)
}
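# Usage sketch (illustration only), assuming the duckdb, DBI, and dplyr
# packages are installed and a Parquet file exists at the hypothetical path
# below. tbl_file() checks that the file exists, wraps the path in single
# quotes, and delegates to tbl_query():
library(dplyr)
con <- DBI::dbConnect(duckdb::duckdb())
flights <- duckdb::tbl_file(con, "data/flights.parquet")
flights %>% count() %>% collect()
DBI::dbDisconnect(con, shutdown = TRUE)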

#' Create a lazy table from a query
#'
#' `tbl_query()` is an experimental variant of [dplyr::tbl()]
#' to create a lazy table from a table-generating function,
#' useful for reading nonstandard CSV files or other data sources.
#' It is safer than `dplyr::tbl()` because there is no risk of misinterpreting the query.
#' Use `dplyr::tbl(src, dplyr::sql("SELECT ... FROM ..."))` for custom SQL queries.
#' See <https://duckdb.org/docs/data/overview> for details on data importing functions.
#'
#' @param query SQL text placed after the `FROM` keyword (the keyword itself is added automatically), such as a quoted file path or a call to a table-generating function
#' @export
#' @rdname backend-duckdb
tbl_query <- function(src, query, ..., cache = FALSE) {
if (cache) DBI::dbExecute(src, "PRAGMA enable_object_cache")
table <- dplyr::sql(paste0("FROM ", query))
dplyr::tbl(src, table)
}
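# Usage sketch (illustration only), assuming a pipe-delimited CSV file exists
# at the hypothetical path below. The query argument is everything that
# follows FROM, here a call to DuckDB's read_csv_auto() table function with
# an explicit delimiter:
library(dplyr)
con <- DBI::dbConnect(duckdb::duckdb())
messy <- duckdb::tbl_query(con, "read_csv_auto('data/messy.csv', delim = '|')")
messy %>% count() %>% collect()
DBI::dbDisconnect(con, shutdown = TRUE)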

# Needed to suppress the R CHECK notes (due to the use of sql_expr)
utils::globalVariables(c("REGEXP_MATCHES", "CAST", "%AS%", "INTEGER", "XOR", "%<<%", "%>>%", "LN", "LOG", "ROUND", "EXTRACT", "%FROM%", "MONTH", "STRFTIME", "QUARTER", "YEAR", "DATE_TRUNC", "DATE", "DOY", "TO_SECONDS", "BIGINT", "TO_MINUTES", "TO_HOURS", "TO_DAYS", "TO_WEEKS", "TO_MONTHS", "TO_YEARS", "STRPOS", "NOT", "REGEXP_REPLACE", "TRIM", "LPAD", "RPAD", "%||%", "REPEAT", "LENGTH", "STRING_AGG", "GREATEST", "LIST_EXTRACT", "LOG10", "LOG2", "STRING_SPLIT_REGEX", "FLOOR", "FMOD", "FDIV"))
25 changes: 25 additions & 0 deletions man/backend-duckdb.Rd

Generated documentation file; diff not rendered.

72 changes: 72 additions & 0 deletions tests/testthat/test_tbl__duckdb_connection.R
@@ -26,6 +26,29 @@ test_that("Parquet files can be registered with dplyr::tbl()", {
expect_true(tab3 %>% dplyr::count() %>% dplyr::collect() == 1000)
})

test_that("Parquet files can be registered with tbl_file() and tbl_query()", {
skip_if_not_installed("dbplyr")

con <- DBI::dbConnect(duckdb())
on.exit(DBI::dbDisconnect(con, shutdown = TRUE))

tab0 <- tbl_file(con, "data/userdata1.parquet")
expect_true(inherits(tab0, "tbl_duckdb_connection"))
expect_true(tab0 %>% dplyr::count() %>% dplyr::collect() == 1000)

tab1 <- tbl_query(con, "read_parquet(['data/userdata1.parquet'])")
expect_true(inherits(tab1, "tbl_duckdb_connection"))
expect_true(tab1 %>% dplyr::count() %>% dplyr::collect() == 1000)

tab2 <- tbl_query(con, "'data/userdata1.parquet'")
expect_true(inherits(tab2, "tbl_duckdb_connection"))
expect_true(tab2 %>% dplyr::count() %>% dplyr::collect() == 1000)

tab3 <- tbl_query(con, "parquet_scan(['data/userdata1.parquet'])")
expect_true(inherits(tab3, "tbl_duckdb_connection"))
expect_true(tab3 %>% dplyr::count() %>% dplyr::collect() == 1000)
})


test_that("Object cache can be enabled for parquet files with dplyr::tbl()", {
skip_if_not_installed("dbplyr")
@@ -44,6 +67,23 @@ test_that("Object cache can be enabled for parquet files with dplyr::tbl()", {
expect_true(DBI::dbGetQuery(con, "SELECT value FROM duckdb_settings() WHERE name='enable_object_cache';") == "false")
})

test_that("Object cache can be enabled for parquet files with tbl_file() and tbl_query()", {
skip_if_not_installed("dbplyr")
# https://github.com/tidyverse/dbplyr/issues/1384
skip_if(packageVersion("dbplyr") >= "2.4.0")

con <- DBI::dbConnect(duckdb())
on.exit(DBI::dbDisconnect(con, shutdown = TRUE))

DBI::dbExecute(con, "SET enable_object_cache=False;")
tab1 <- tbl_file(con, "data/userdata1.parquet", cache = TRUE)
expect_true(DBI::dbGetQuery(con, "SELECT value FROM duckdb_settings() WHERE name='enable_object_cache';") == "true")

DBI::dbExecute(con, "SET enable_object_cache=False;")
tab2 <- tbl_query(con, "'data/userdata1.parquet'", cache = FALSE)
expect_true(DBI::dbGetQuery(con, "SELECT value FROM duckdb_settings() WHERE name='enable_object_cache';") == "false")
})


test_that("CSV files can be registered with dplyr::tbl()", {
skip_if_not_installed("dbplyr")
@@ -64,6 +104,26 @@ test_that("CSV files can be registered with dplyr::tbl()", {
expect_true(tab2 %>% dplyr::count() %>% dplyr::collect() == 150)
})

test_that("CSV files can be registered with tbl_file() and tbl_query()", {
skip_if_not_installed("dbplyr")

path <- file.path(tempdir(), "duckdbtest.csv")
write.csv(iris, file = path)
on.exit(unlink(path))

con <- DBI::dbConnect(duckdb())
on.exit(DBI::dbDisconnect(con, shutdown = TRUE), add = TRUE)

tab1 <- tbl_file(con, path)
expect_true(inherits(tab1, "tbl_duckdb_connection"))
expect_true(tab1 %>% dplyr::count() %>% dplyr::collect() == 150)

tab2 <- tbl_query(con, paste0("read_csv_auto('", path, "')"))
expect_true(inherits(tab2, "tbl_duckdb_connection"))
expect_true(tab2 %>% dplyr::count() %>% dplyr::collect() == 150)
})


test_that("Other replacement scans or functions can be registered with dplyr::tbl()", {
skip_if_not_installed("dbplyr")

@@ -75,6 +135,18 @@ test_that("Other replacement scans or functions can be registered with dplyr::tb
expect_true(obj %>% dplyr::filter(keyword_name == "all") %>% dplyr::count() %>% dplyr::collect() == 1)
})

test_that("Other replacement scans or functions can be registered with tbl_query()", {
skip_if_not_installed("dbplyr")

con <- DBI::dbConnect(duckdb())
on.exit(DBI::dbDisconnect(con, shutdown = TRUE))

obj <- tbl_query(con, "duckdb_keywords()")
expect_true(inherits(obj, "tbl_duckdb_connection"))
expect_true(obj %>% dplyr::filter(keyword_name == "all") %>% dplyr::count() %>% dplyr::collect() == 1)
})


test_that("Strings tagged as SQL will be handled correctly with dplyr::tbl()", {
skip_if_not_installed("dbplyr")
