From 3b415d8d84eb1a35ab63d401262d86201fa40680 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Sun, 3 Mar 2024 12:13:13 +0100 Subject: [PATCH] feat: New `tbl_file()` and `tbl_query()` to explicitly access tables and queries as dbplyr lazy tables --- NAMESPACE | 2 + R/backend-dbplyr__duckdb_connection.R | 42 ++++++++++++ man/backend-duckdb.Rd | 25 +++++++ tests/testthat/test_tbl__duckdb_connection.R | 72 ++++++++++++++++++++ 4 files changed, 141 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 9e5ed86af..52648629b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,6 +23,8 @@ export(duckdb_unregister) export(duckdb_unregister_arrow) export(read_csv_duckdb) export(simulate_duckdb) +export(tbl_file) +export(tbl_query) export(translate_duckdb) exportClasses(duckdb_connection) exportClasses(duckdb_driver) diff --git a/R/backend-dbplyr__duckdb_connection.R b/R/backend-dbplyr__duckdb_connection.R index 6fc7f40f2..a9e2c9b01 100644 --- a/R/backend-dbplyr__duckdb_connection.R +++ b/R/backend-dbplyr__duckdb_connection.R @@ -381,5 +381,47 @@ tbl.duckdb_connection <- function(src, from, cache = FALSE, ...) { NextMethod("tbl") } +#' Create a lazy table from a Parquet or SQL file +#' +#' `tbl_file()` is an experimental variant of [dplyr::tbl()] to directly access files on disk. +#' It is safer than `dplyr::tbl()` because there is no risk of misinterpreting the request, +#' and paths with special characters are supported. +#' +#' @param src A duckdb connection object +#' @param path Path to existing Parquet, CSV or JSON file +#' @param cache Enable object cache for Parquet files +#' @export +#' @rdname backend-duckdb +tbl_file <- function(src, path, ..., cache = FALSE) { + if (...length() > 0) { + stop("... must be empty.", call. = FALSE) + } + if (!file.exists(path)) { + stop("File '", path, "' not found", call. = FALSE) + } + if (grepl("'", path)) { + stop("File '", path, "' contains a single quote, this is not supported", call. 
= FALSE) + } + tbl_query(src, paste0("'", path, "'"), cache = cache) +} + +#' Create a lazy table from a query +#' +#' `tbl_query()` is an experimental variant of [dplyr::tbl()] +#' to create a lazy table from a table-generating function, +#' useful for reading nonstandard CSV files or other data sources. +#' It is safer than `dplyr::tbl()` because there is no risk of misinterpreting the query. +#' Use `dplyr::tbl(src, dplyr::sql("SELECT ... FROM ..."))` for custom SQL queries. +#' See <https://duckdb.org/docs/data/overview> for details on data importing functions. +#' +#' @param query SQL code, omitting the `FROM` clause +#' @export +#' @rdname backend-duckdb +tbl_query <- function(src, query, ..., cache = FALSE) { + if (cache) DBI::dbExecute(src, "PRAGMA enable_object_cache") + table <- dplyr::sql(paste0("FROM ", query)) + dplyr::tbl(src, table) +} + # Needed to suppress the R CHECK notes (due to the use of sql_expr) utils::globalVariables(c("REGEXP_MATCHES", "CAST", "%AS%", "INTEGER", "XOR", "%<<%", "%>>%", "LN", "LOG", "ROUND", "EXTRACT", "%FROM%", "MONTH", "STRFTIME", "QUARTER", "YEAR", "DATE_TRUNC", "DATE", "DOY", "TO_SECONDS", "BIGINT", "TO_MINUTES", "TO_HOURS", "TO_DAYS", "TO_WEEKS", "TO_MONTHS", "TO_YEARS", "STRPOS", "NOT", "REGEXP_REPLACE", "TRIM", "LPAD", "RPAD", "%||%", "REPEAT", "LENGTH", "STRING_AGG", "GREATEST", "LIST_EXTRACT", "LOG10", "LOG2", "STRING_SPLIT_REGEX", "FLOOR", "FMOD", "FDIV")) diff --git a/man/backend-duckdb.Rd b/man/backend-duckdb.Rd index 3eab99358..4c052ce20 100644 --- a/man/backend-duckdb.Rd +++ b/man/backend-duckdb.Rd @@ -3,19 +3,44 @@ \name{backend-duckdb} \alias{simulate_duckdb} \alias{translate_duckdb} +\alias{tbl_file} +\alias{tbl_query} \title{DuckDB SQL backend for dbplyr} \usage{ simulate_duckdb(...) translate_duckdb(...) 
+ +tbl_file(src, path, ..., cache = FALSE) + +tbl_query(src, query, ..., cache = FALSE) } \arguments{ \item{...}{Any parameters to be forwarded} + +\item{src}{A duckdb connection object} + +\item{path}{Path to existing Parquet, CSV or JSON file} + +\item{cache}{Enable object cache for Parquet files} + +\item{query}{SQL code, omitting the \code{FROM} clause} } \description{ This is a SQL backend for dbplyr tailored to take into account DuckDB's possibilities. This mainly follows the backend for PostgreSQL, but contains more mapped functions. + +\code{tbl_file()} is an experimental variant of \code{\link[dplyr:tbl]{dplyr::tbl()}} to directly access files on disk. +It is safer than \code{dplyr::tbl()} because there is no risk of misinterpreting the request, +and paths with special characters are supported. + +\code{tbl_query()} is an experimental variant of \code{\link[dplyr:tbl]{dplyr::tbl()}} +to create a lazy table from a table-generating function, +useful for reading nonstandard CSV files or other data sources. +It is safer than \code{dplyr::tbl()} because there is no risk of misinterpreting the query. +Use \code{dplyr::tbl(src, dplyr::sql("SELECT ... FROM ..."))} for custom SQL queries. +See \url{https://duckdb.org/docs/data/overview} for details on data importing functions. 
} \examples{ \dontshow{if (rlang::is_installed("dbplyr")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} diff --git a/tests/testthat/test_tbl__duckdb_connection.R b/tests/testthat/test_tbl__duckdb_connection.R index eeda63dfe..d10d5d399 100644 --- a/tests/testthat/test_tbl__duckdb_connection.R +++ b/tests/testthat/test_tbl__duckdb_connection.R @@ -26,6 +26,29 @@ test_that("Parquet files can be registered with dplyr::tbl()", { expect_true(tab3 %>% dplyr::count() %>% dplyr::collect() == 1000) }) +test_that("Parquet files can be registered with tbl_file() and tbl_query()", { + skip_if_not_installed("dbplyr") + + con <- DBI::dbConnect(duckdb()) + on.exit(DBI::dbDisconnect(con, shutdown = TRUE)) + + tab0 <- tbl_file(con, "data/userdata1.parquet") + expect_true(inherits(tab0, "tbl_duckdb_connection")) + expect_true(tab0 %>% dplyr::count() %>% dplyr::collect() == 1000) + + tab1 <- tbl_query(con, "read_parquet(['data/userdata1.parquet'])") + expect_true(inherits(tab1, "tbl_duckdb_connection")) + expect_true(tab1 %>% dplyr::count() %>% dplyr::collect() == 1000) + + tab2 <- tbl_query(con, "'data/userdata1.parquet'") + expect_true(inherits(tab2, "tbl_duckdb_connection")) + expect_true(tab2 %>% dplyr::count() %>% dplyr::collect() == 1000) + + tab3 <- tbl_query(con, "parquet_scan(['data/userdata1.parquet'])") + expect_true(inherits(tab3, "tbl_duckdb_connection")) + expect_true(tab3 %>% dplyr::count() %>% dplyr::collect() == 1000) +}) + test_that("Object cache can be enabled for parquet files with dplyr::tbl()", { skip_if_not_installed("dbplyr") @@ -44,6 +67,23 @@ test_that("Object cache can be enabled for parquet files with dplyr::tbl()", { expect_true(DBI::dbGetQuery(con, "SELECT value FROM duckdb_settings() WHERE name='enable_object_cache';") == "false") }) +test_that("Object cache can be enabled for parquet files with tbl_file() and tbl_query()", { + skip_if_not_installed("dbplyr") + # https://github.com/tidyverse/dbplyr/issues/1384 + 
skip_if(packageVersion("dbplyr") >= "2.4.0") + + con <- DBI::dbConnect(duckdb()) + on.exit(DBI::dbDisconnect(con, shutdown = TRUE)) + + DBI::dbExecute(con, "SET enable_object_cache=False;") + tab1 <- tbl_file(con, "data/userdata1.parquet", cache = TRUE) + expect_true(DBI::dbGetQuery(con, "SELECT value FROM duckdb_settings() WHERE name='enable_object_cache';") == "true") + + DBI::dbExecute(con, "SET enable_object_cache=False;") + tab2 <- tbl_query(con, "'data/userdata1.parquet'", cache = FALSE) + expect_true(DBI::dbGetQuery(con, "SELECT value FROM duckdb_settings() WHERE name='enable_object_cache';") == "false") +}) + test_that("CSV files can be registered with dplyr::tbl()", { skip_if_not_installed("dbplyr") @@ -64,6 +104,26 @@ test_that("CSV files can be registered with dplyr::tbl()", { expect_true(tab2 %>% dplyr::count() %>% dplyr::collect() == 150) }) +test_that("CSV files can be registered with tbl_file() and tbl_query()", { + skip_if_not_installed("dbplyr") + + path <- file.path(tempdir(), "duckdbtest.csv") + write.csv(iris, file = path) + on.exit(unlink(path)) + + con <- DBI::dbConnect(duckdb()) + on.exit(DBI::dbDisconnect(con, shutdown = TRUE), add = TRUE) + + tab1 <- tbl_file(con, path) + expect_true(inherits(tab1, "tbl_duckdb_connection")) + expect_true(tab1 %>% dplyr::count() %>% dplyr::collect() == 150) + + tab2 <- tbl_query(con, paste0("read_csv_auto('", path, "')")) + expect_true(inherits(tab2, "tbl_duckdb_connection")) + expect_true(tab2 %>% dplyr::count() %>% dplyr::collect() == 150) +}) + + test_that("Other replacement scans or functions can be registered with dplyr::tbl()", { skip_if_not_installed("dbplyr") @@ -75,6 +135,18 @@ test_that("Other replacement scans or functions can be registered with dplyr::tb expect_true(obj %>% dplyr::filter(keyword_name == "all") %>% dplyr::count() %>% dplyr::collect() == 1) }) +test_that("Other replacement scans or functions can be registered with tbl_query()", { + skip_if_not_installed("dbplyr") + + con <- 
DBI::dbConnect(duckdb()) + on.exit(DBI::dbDisconnect(con, shutdown = TRUE)) + + obj <- tbl_query(con, "duckdb_keywords()") + expect_true(inherits(obj, "tbl_duckdb_connection")) + expect_true(obj %>% dplyr::filter(keyword_name == "all") %>% dplyr::count() %>% dplyr::collect() == 1) +}) + + test_that("Strings tagged as SQL will be handled correctly with dplyr::tbl()", { skip_if_not_installed("dbplyr")