From 3b415d8d84eb1a35ab63d401262d86201fa40680 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Sun, 3 Mar 2024 12:13:13 +0100 Subject: [PATCH] feat: New `tbl_file()` and `tbl_query()` to explicitly access tables and queries as dbplyr lazy tables --- NAMESPACE | 2 + R/backend-dbplyr__duckdb_connection.R | 42 ++++++++++++ man/backend-duckdb.Rd | 25 +++++++ tests/testthat/test_tbl__duckdb_connection.R | 72 ++++++++++++++++++++ 4 files changed, 141 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 9e5ed86af..52648629b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,6 +23,8 @@ export(duckdb_unregister) export(duckdb_unregister_arrow) export(read_csv_duckdb) export(simulate_duckdb) +export(tbl_file) +export(tbl_query) export(translate_duckdb) exportClasses(duckdb_connection) exportClasses(duckdb_driver) diff --git a/R/backend-dbplyr__duckdb_connection.R b/R/backend-dbplyr__duckdb_connection.R index 6fc7f40f2..a9e2c9b01 100644 --- a/R/backend-dbplyr__duckdb_connection.R +++ b/R/backend-dbplyr__duckdb_connection.R @@ -381,5 +381,47 @@ tbl.duckdb_connection <- function(src, from, cache = FALSE, ...) { NextMethod("tbl") } +#' Create a lazy table from a Parquet or SQL file +#' +#' `tbl_file()` is an experimental variant of [dplyr::tbl()] to directly access files on disk. +#' It is safer than `dplyr::tbl()` because there is no risk of misinterpreting the request, +#' and paths with special characters are supported. +#' +#' @param src A duckdb connection object +#' @param path Path to existing Parquet, CSV or JSON file +#' @param cache Enable object cache for Parquet files +#' @export +#' @rdname backend-duckdb +tbl_file <- function(src, path, ..., cache = FALSE) { + if (...length() > 0) { + stop("... must be empty.", call. = FALSE) + } + if (!file.exists(path)) { + stop("File '", path, "' not found", call. = FALSE) + } + if (grepl("'", path)) { + stop("File '", path, "' contains a single quote, this is not supported", call. 
= FALSE) + } + tbl_query(src, paste0("'", path, "'"), cache = cache) +} + +#' Create a lazy table from a query +#' +#' `tbl_query()` is an experimental variant of [dplyr::tbl()] +#' to create a lazy table from a table-generating function, +#' useful for reading nonstandard CSV files or other data sources. +#' It is safer than `dplyr::tbl()` because there is no risk of misinterpreting the query. +#' Use `dplyr::tbl(src, dplyr::sql("SELECT ... FROM ..."))` for custom SQL queries. +#' See <https://duckdb.org/docs/data/overview> for details on data importing functions. +#' +#' @param query SQL code, omitting the `FROM` clause +#' @export +#' @rdname backend-duckdb +tbl_query <- function(src, query, ..., cache = FALSE) { + if (cache) DBI::dbExecute(src, "PRAGMA enable_object_cache") + table <- dplyr::sql(paste0("FROM ", query)) + dplyr::tbl(src, table) +} + # Needed to suppress the R CHECK notes (due to the use of sql_expr) utils::globalVariables(c("REGEXP_MATCHES", "CAST", "%AS%", "INTEGER", "XOR", "%<<%", "%>>%", "LN", "LOG", "ROUND", "EXTRACT", "%FROM%", "MONTH", "STRFTIME", "QUARTER", "YEAR", "DATE_TRUNC", "DATE", "DOY", "TO_SECONDS", "BIGINT", "TO_MINUTES", "TO_HOURS", "TO_DAYS", "TO_WEEKS", "TO_MONTHS", "TO_YEARS", "STRPOS", "NOT", "REGEXP_REPLACE", "TRIM", "LPAD", "RPAD", "%||%", "REPEAT", "LENGTH", "STRING_AGG", "GREATEST", "LIST_EXTRACT", "LOG10", "LOG2", "STRING_SPLIT_REGEX", "FLOOR", "FMOD", "FDIV")) diff --git a/man/backend-duckdb.Rd b/man/backend-duckdb.Rd index 3eab99358..4c052ce20 100644 --- a/man/backend-duckdb.Rd +++ b/man/backend-duckdb.Rd @@ -3,19 +3,44 @@ \name{backend-duckdb} \alias{simulate_duckdb} \alias{translate_duckdb} +\alias{tbl_file} +\alias{tbl_query} \title{DuckDB SQL backend for dbplyr} \usage{ simulate_duckdb(...) translate_duckdb(...) 
+ +tbl_file(src, path, ..., cache = FALSE) + +tbl_query(src, query, ..., cache = FALSE) } \arguments{ \item{...}{Any parameters to be forwarded} + +\item{src}{A duckdb connection object} + +\item{path}{Path to existing Parquet, CSV or JSON file} + +\item{cache}{Enable object cache for Parquet files} + +\item{query}{SQL code, omitting the \code{FROM} clause} } \description{ This is a SQL backend for dbplyr tailored to take into account DuckDB's possibilities. This mainly follows the backend for PostgreSQL, but contains more mapped functions. + +\code{tbl_file()} is an experimental variant of \code{\link[dplyr:tbl]{dplyr::tbl()}} to directly access files on disk. +It is safer than \code{dplyr::tbl()} because there is no risk of misinterpreting the request, +and paths with special characters are supported. + +\code{tbl_query()} is an experimental variant of \code{\link[dplyr:tbl]{dplyr::tbl()}} +to create a lazy table from a table-generating function, +useful for reading nonstandard CSV files or other data sources. +It is safer than \code{dplyr::tbl()} because there is no risk of misinterpreting the query. +Use \code{dplyr::tbl(src, dplyr::sql("SELECT ... FROM ..."))} for custom SQL queries. +See \url{https://duckdb.org/docs/data/overview} for details on data importing functions. 
} \examples{ \dontshow{if (rlang::is_installed("dbplyr")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} diff --git a/tests/testthat/test_tbl__duckdb_connection.R b/tests/testthat/test_tbl__duckdb_connection.R index eeda63dfe..d10d5d399 100644 --- a/tests/testthat/test_tbl__duckdb_connection.R +++ b/tests/testthat/test_tbl__duckdb_connection.R @@ -26,6 +26,29 @@ test_that("Parquet files can be registered with dplyr::tbl()", { expect_true(tab3 %>% dplyr::count() %>% dplyr::collect() == 1000) }) +test_that("Parquet files can be registered with tbl_file() and tbl_query()", { + skip_if_not_installed("dbplyr") + + con <- DBI::dbConnect(duckdb()) + on.exit(DBI::dbDisconnect(con, shutdown = TRUE)) + + tab0 <- tbl_file(con, "data/userdata1.parquet") + expect_true(inherits(tab0, "tbl_duckdb_connection")) + expect_true(tab0 %>% dplyr::count() %>% dplyr::collect() == 1000) + + tab1 <- tbl_query(con, "read_parquet(['data/userdata1.parquet'])") + expect_true(inherits(tab1, "tbl_duckdb_connection")) + expect_true(tab1 %>% dplyr::count() %>% dplyr::collect() == 1000) + + tab2 <- tbl_query(con, "'data/userdata1.parquet'") + expect_true(inherits(tab2, "tbl_duckdb_connection")) + expect_true(tab2 %>% dplyr::count() %>% dplyr::collect() == 1000) + + tab3 <- tbl_query(con, "parquet_scan(['data/userdata1.parquet'])") + expect_true(inherits(tab3, "tbl_duckdb_connection")) + expect_true(tab3 %>% dplyr::count() %>% dplyr::collect() == 1000) +}) + test_that("Object cache can be enabled for parquet files with dplyr::tbl()", { skip_if_not_installed("dbplyr") @@ -44,6 +67,23 @@ test_that("Object cache can be enabled for parquet files with dplyr::tbl()", { expect_true(DBI::dbGetQuery(con, "SELECT value FROM duckdb_settings() WHERE name='enable_object_cache';") == "false") }) +test_that("Object cache can be enabled for parquet files with tbl_file() and tbl_query()", { + skip_if_not_installed("dbplyr") + # https://github.com/tidyverse/dbplyr/issues/1384 + 
skip_if(packageVersion("dbplyr") >= "2.4.0") + + con <- DBI::dbConnect(duckdb()) + on.exit(DBI::dbDisconnect(con, shutdown = TRUE)) + + DBI::dbExecute(con, "SET enable_object_cache=False;") + tab1 <- tbl_file(con, "data/userdata1.parquet", cache = TRUE) + expect_true(DBI::dbGetQuery(con, "SELECT value FROM duckdb_settings() WHERE name='enable_object_cache';") == "true") + + DBI::dbExecute(con, "SET enable_object_cache=False;") + tab2 <- tbl_query(con, "'data/userdata1.parquet'", cache = FALSE) + expect_true(DBI::dbGetQuery(con, "SELECT value FROM duckdb_settings() WHERE name='enable_object_cache';") == "false") +}) + test_that("CSV files can be registered with dplyr::tbl()", { skip_if_not_installed("dbplyr") @@ -64,6 +104,26 @@ test_that("CSV files can be registered with dplyr::tbl()", { expect_true(tab2 %>% dplyr::count() %>% dplyr::collect() == 150) }) +test_that("CSV files can be registered with tbl_file() and tbl_query()", { + skip_if_not_installed("dbplyr") + + path <- file.path(tempdir(), "duckdbtest.csv") + write.csv(iris, file = path) + on.exit(unlink(path)) + + con <- DBI::dbConnect(duckdb()) + on.exit(DBI::dbDisconnect(con, shutdown = TRUE), add = TRUE) + + tab1 <- tbl_file(con, path) + expect_true(inherits(tab1, "tbl_duckdb_connection")) + expect_true(tab1 %>% dplyr::count() %>% dplyr::collect() == 150) + + tab2 <- tbl_query(con, paste0("read_csv_auto('", path, "')")) + expect_true(inherits(tab2, "tbl_duckdb_connection")) + expect_true(tab2 %>% dplyr::count() %>% dplyr::collect() == 150) +}) + + test_that("Other replacement scans or functions can be registered with dplyr::tbl()", { skip_if_not_installed("dbplyr") @@ -75,6 +135,18 @@ test_that("Other replacement scans or functions can be registered with dplyr::tb expect_true(obj %>% dplyr::filter(keyword_name == "all") %>% dplyr::count() %>% dplyr::collect() == 1) }) +test_that("Other replacement scans or functions can be registered with tbl_query()", { + skip_if_not_installed("dbplyr") + + con <- 
DBI::dbConnect(duckdb()) + on.exit(DBI::dbDisconnect(con, shutdown = TRUE)) + + obj <- tbl_query(con, "duckdb_keywords()") + expect_true(inherits(obj, "tbl_duckdb_connection")) + expect_true(obj %>% dplyr::filter(keyword_name == "all") %>% dplyr::count() %>% dplyr::collect() == 1) +}) + + test_that("Strings tagged as SQL will be handled correctly with dplyr::tbl()", { skip_if_not_installed("dbplyr")