feat: New tbl_file() and tbl_query() to explicitly access tables and queries as dbplyr lazy tables
krlmlr committed Mar 3, 2024
1 parent 71ff5b1 commit 3b415d8
Showing 4 changed files with 141 additions and 0 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -23,6 +23,8 @@ export(duckdb_unregister)
export(duckdb_unregister_arrow)
export(read_csv_duckdb)
export(simulate_duckdb)
export(tbl_file)
export(tbl_query)
export(translate_duckdb)
exportClasses(duckdb_connection)
exportClasses(duckdb_driver)
42 changes: 42 additions & 0 deletions R/backend-dbplyr__duckdb_connection.R
@@ -381,5 +381,47 @@ tbl.duckdb_connection <- function(src, from, cache = FALSE, ...) {
NextMethod("tbl")
}

#' Create a lazy table from a Parquet, CSV, or JSON file
#'
#' `tbl_file()` is an experimental variant of [dplyr::tbl()] to directly access files on disk.
#' It is safer than `dplyr::tbl()` because there is no risk of misinterpreting the request,
#' and paths with special characters are supported.
#'
#' @param src A duckdb connection object
#' @param path Path to an existing Parquet, CSV, or JSON file
#' @param ... Must be empty
#' @param cache Enable the object cache for Parquet files
#' @export
#' @rdname backend-duckdb
tbl_file <- function(src, path, ..., cache = FALSE) {
if (...length() > 0) {
stop("... must be empty.", call. = FALSE)
}
if (!file.exists(path)) {
stop("File '", path, "' not found", call. = FALSE)
}
if (grepl("'", path)) {
stop("File '", path, "' contains a single quote, this is not supported", call. = FALSE)
}
tbl_query(src, paste0("'", path, "'"), cache = cache)
}
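# Usage sketch (illustration only), assuming the duckdb, DBI, and dplyr
# packages are installed and a Parquet file exists at the hypothetical path
# below. tbl_file() checks that the file exists, wraps the path in single
# quotes, and delegates to tbl_query():
library(dplyr)
con <- DBI::dbConnect(duckdb::duckdb())
flights <- duckdb::tbl_file(con, "data/flights.parquet")
flights %>% count() %>% collect()
DBI::dbDisconnect(con, shutdown = TRUE)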

#' Create a lazy table from a query
#'
#' `tbl_query()` is an experimental variant of [dplyr::tbl()]
#' to create a lazy table from a table-generating function,
#' useful for reading nonstandard CSV files or other data sources.
#' It is safer than `dplyr::tbl()` because there is no risk of misinterpreting the query.
#' Use `dplyr::tbl(src, dplyr::sql("SELECT ... FROM ..."))` for custom SQL queries.
#' See <https://duckdb.org/docs/data/overview> for details on data importing functions.
#'
#' @param query SQL text placed after the `FROM` keyword (the keyword itself is added automatically), such as a quoted file path or a call to a table-generating function
#' @export
#' @rdname backend-duckdb
tbl_query <- function(src, query, ..., cache = FALSE) {
if (cache) DBI::dbExecute(src, "PRAGMA enable_object_cache")
table <- dplyr::sql(paste0("FROM ", query))
dplyr::tbl(src, table)
}
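# Usage sketch (illustration only), assuming a pipe-delimited CSV file exists
# at the hypothetical path below. The query argument is everything that
# follows FROM, here a call to DuckDB's read_csv_auto() table function with
# an explicit delimiter:
library(dplyr)
con <- DBI::dbConnect(duckdb::duckdb())
messy <- duckdb::tbl_query(con, "read_csv_auto('data/messy.csv', delim = '|')")
messy %>% count() %>% collect()
DBI::dbDisconnect(con, shutdown = TRUE)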

# Needed to suppress the R CHECK notes (due to the use of sql_expr)
utils::globalVariables(c("REGEXP_MATCHES", "CAST", "%AS%", "INTEGER", "XOR", "%<<%", "%>>%", "LN", "LOG", "ROUND", "EXTRACT", "%FROM%", "MONTH", "STRFTIME", "QUARTER", "YEAR", "DATE_TRUNC", "DATE", "DOY", "TO_SECONDS", "BIGINT", "TO_MINUTES", "TO_HOURS", "TO_DAYS", "TO_WEEKS", "TO_MONTHS", "TO_YEARS", "STRPOS", "NOT", "REGEXP_REPLACE", "TRIM", "LPAD", "RPAD", "%||%", "REPEAT", "LENGTH", "STRING_AGG", "GREATEST", "LIST_EXTRACT", "LOG10", "LOG2", "STRING_SPLIT_REGEX", "FLOOR", "FMOD", "FDIV"))
25 changes: 25 additions & 0 deletions man/backend-duckdb.Rd

Generated documentation file; diff not rendered.

72 changes: 72 additions & 0 deletions tests/testthat/test_tbl__duckdb_connection.R
@@ -26,6 +26,29 @@ test_that("Parquet files can be registered with dplyr::tbl()", {
expect_true(tab3 %>% dplyr::count() %>% dplyr::collect() == 1000)
})

test_that("Parquet files can be registered with tbl_file() and tbl_query()", {
skip_if_not_installed("dbplyr")

con <- DBI::dbConnect(duckdb())
on.exit(DBI::dbDisconnect(con, shutdown = TRUE))

tab0 <- tbl_file(con, "data/userdata1.parquet")
expect_true(inherits(tab0, "tbl_duckdb_connection"))
expect_true(tab0 %>% dplyr::count() %>% dplyr::collect() == 1000)

tab1 <- tbl_query(con, "read_parquet(['data/userdata1.parquet'])")
expect_true(inherits(tab1, "tbl_duckdb_connection"))
expect_true(tab1 %>% dplyr::count() %>% dplyr::collect() == 1000)

tab2 <- tbl_query(con, "'data/userdata1.parquet'")
expect_true(inherits(tab2, "tbl_duckdb_connection"))
expect_true(tab2 %>% dplyr::count() %>% dplyr::collect() == 1000)

tab3 <- tbl_query(con, "parquet_scan(['data/userdata1.parquet'])")
expect_true(inherits(tab3, "tbl_duckdb_connection"))
expect_true(tab3 %>% dplyr::count() %>% dplyr::collect() == 1000)
})


test_that("Object cache can be enabled for parquet files with dplyr::tbl()", {
skip_if_not_installed("dbplyr")
@@ -44,6 +67,23 @@ test_that("Object cache can be enabled for parquet files with dplyr::tbl()", {
expect_true(DBI::dbGetQuery(con, "SELECT value FROM duckdb_settings() WHERE name='enable_object_cache';") == "false")
})

test_that("Object cache can be enabled for parquet files with tbl_file() and tbl_query()", {
skip_if_not_installed("dbplyr")
# https://github.com/tidyverse/dbplyr/issues/1384
skip_if(packageVersion("dbplyr") >= "2.4.0")

con <- DBI::dbConnect(duckdb())
on.exit(DBI::dbDisconnect(con, shutdown = TRUE))

DBI::dbExecute(con, "SET enable_object_cache=False;")
tab1 <- tbl_file(con, "data/userdata1.parquet", cache = TRUE)
expect_true(DBI::dbGetQuery(con, "SELECT value FROM duckdb_settings() WHERE name='enable_object_cache';") == "true")

DBI::dbExecute(con, "SET enable_object_cache=False;")
tab2 <- tbl_query(con, "'data/userdata1.parquet'", cache = FALSE)
expect_true(DBI::dbGetQuery(con, "SELECT value FROM duckdb_settings() WHERE name='enable_object_cache';") == "false")
})


test_that("CSV files can be registered with dplyr::tbl()", {
skip_if_not_installed("dbplyr")
@@ -64,6 +104,26 @@ test_that("CSV files can be registered with dplyr::tbl()", {
expect_true(tab2 %>% dplyr::count() %>% dplyr::collect() == 150)
})

test_that("CSV files can be registered with tbl_file() and tbl_query()", {
skip_if_not_installed("dbplyr")

path <- file.path(tempdir(), "duckdbtest.csv")
write.csv(iris, file = path)
on.exit(unlink(path))

con <- DBI::dbConnect(duckdb())
on.exit(DBI::dbDisconnect(con, shutdown = TRUE), add = TRUE)

tab1 <- tbl_file(con, path)
expect_true(inherits(tab1, "tbl_duckdb_connection"))
expect_true(tab1 %>% dplyr::count() %>% dplyr::collect() == 150)

tab2 <- tbl_query(con, paste0("read_csv_auto('", path, "')"))
expect_true(inherits(tab2, "tbl_duckdb_connection"))
expect_true(tab2 %>% dplyr::count() %>% dplyr::collect() == 150)
})


test_that("Other replacement scans or functions can be registered with dplyr::tbl()", {
skip_if_not_installed("dbplyr")

@@ -75,6 +135,18 @@ test_that("Other replacement scans or functions can be registered with dplyr::tb
expect_true(obj %>% dplyr::filter(keyword_name == "all") %>% dplyr::count() %>% dplyr::collect() == 1)
})

test_that("Other replacement scans or functions can be registered with tbl_query()", {
skip_if_not_installed("dbplyr")

con <- DBI::dbConnect(duckdb())
on.exit(DBI::dbDisconnect(con, shutdown = TRUE))

obj <- tbl_query(con, "duckdb_keywords()")
expect_true(inherits(obj, "tbl_duckdb_connection"))
expect_true(obj %>% dplyr::filter(keyword_name == "all") %>% dplyr::count() %>% dplyr::collect() == 1)
})


test_that("Strings tagged as SQL will be handled correctly with dplyr::tbl()", {
skip_if_not_installed("dbplyr")
