ARROW-17355: [R] Refactor the handle_* utility functions for a better dev experience (apache#14030)

For context: these `handle_*` functions originally caught an error and, if its message contained a particular string, raised an augmented error message with extra guidance for the user (and, if not, re-raised the original error message).

This became problematic in a later PR where we wanted to test multiple conditions and raise the original error only if none of them were met; the temporary workaround was to move responsibility for raising the original error outside of the `handle_*` functions. The issue with that approach is that it makes it easy for developers to forget to add that line of code.

The solution proposed here implements a generic error-augmentation function, `augment_io_error_msg()`, which tests all the conditions, raises an error with an augmented message if any condition is met, and re-raises the original error if not.
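As a rough sketch of the calling pattern this enables (the handler names, matched string, and guidance text below are invented for illustration and are not the actual arrow internals), a condition-specific handler only raises when its string matches, and the generic handler supplies the fallback:

```r
library(rlang)

# Condition-specific handler: raise an augmented error only when the message
# matches; otherwise return so the next check (or the fallback) can run.
handle_example_error <- function(msg, call) {
  if (grepl("conversion error", msg, fixed = TRUE)) {
    abort(c(msg, i = "Check that the supplied schema matches the file."), call = call)
  }
}

# Generic handler: run each specific check, then fall back to re-raising the
# original message so no error can be silently swallowed.
augment_example_error_msg <- function(e, call = caller_env()) {
  msg <- conditionMessage(e)
  handle_example_error(msg, call)
  abort(msg, call = call)
}

# Callers now need only a single call in their error handler:
try(
  tryCatch(
    stop("conversion error: column 'x' could not be parsed"),
    error = function(e) augment_example_error_msg(e)
  )
)
```

In the real change, `augment_io_error_msg()` plays this role and dispatches to `handle_csv_read_error()`, `handle_parquet_io_error()`, and `handle_augmented_field_misuse()` depending on which of `schema` and `format` are supplied, as the diff below shows.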

Authored-by: Nic Crane <thisisnic@gmail.com>
Signed-off-by: Nic Crane <thisisnic@gmail.com>
thisisnic authored and fatemehp committed Oct 17, 2022
1 parent 53ca96f commit 2b4413a
Showing 4 changed files with 25 additions and 28 deletions.
r/R/csv.R (4 changes: 2 additions & 2 deletions)
@@ -206,9 +206,9 @@ read_delim_arrow <- function(file,
   tryCatch(
     tab <- reader$Read(),
     # n = 4 because we want the error to show up as being from read_delim_arrow()
-    # and not handle_csv_read_error()
+    # and not augment_io_error_msg()
     error = function(e, call = caller_env(n = 4)) {
-      handle_csv_read_error(e, schema, call)
+      augment_io_error_msg(e, call, schema = schema)
     }
   )

r/R/dataset.R (5 changes: 2 additions & 3 deletions)
@@ -221,10 +221,9 @@ open_dataset <- function(sources,
     # Default is _not_ to inspect/unify schemas
     factory$Finish(schema, isTRUE(unify_schemas)),
     # n = 4 because we want the error to show up as being from open_dataset()
-    # and not handle_parquet_io_error()
+    # and not augment_io_error_msg()
     error = function(e, call = caller_env(n = 4)) {
-      handle_parquet_io_error(e, format, call)
-      abort(conditionMessage(e), call = call)
+      augment_io_error_msg(e, call, format = format)
     }
   )
 }
r/R/dplyr-collect.R (6 changes: 2 additions & 4 deletions)
@@ -22,11 +22,9 @@ collect.arrow_dplyr_query <- function(x, as_data_frame = TRUE, ...) {
   tryCatch(
     out <- as_arrow_table(x),
     # n = 4 because we want the error to show up as being from collect()
-    # and not handle_csv_read_error()
+    # and not augment_io_error_msg()
     error = function(e, call = caller_env(n = 4)) {
-      handle_csv_read_error(e, x$.data$schema, call)
-      handle_augmented_field_misuse(e, call)
-      abort(conditionMessage(e), call = call)
+      augment_io_error_msg(e, call, schema = x$.data$schema)
     }
   )

r/R/util.R (38 changes: 19 additions & 19 deletions)
@@ -134,12 +134,7 @@ read_compressed_error <- function(e) {
   stop(e)
 }
 
-# This function was refactored in ARROW-15260 to only raise an error if
-# the appropriate string was found and so errors must be raised manually after
-# calling this if matching error not found
-# TODO: Refactor as part of ARROW-17355 to prevent potential missed errors
-handle_parquet_io_error <- function(e, format, call) {
-  msg <- conditionMessage(e)
+handle_parquet_io_error <- function(msg, call, format) {
   if (grepl("Parquet magic bytes not found in footer", msg) && length(format) > 1 && is_character(format)) {
     # If length(format) > 1, that means it is (almost certainly) the default/not specified value
     # so let the user know that they should specify the actual (not parquet) format
@@ -209,13 +204,7 @@ repeat_value_as_array <- function(object, n) {
   return(Scalar$create(object)$as_array(n))
 }
 
-# This function was refactored in ARROW-15260 to only raise an error if
-# the appropriate string was found and so errors must be raised manually after
-# calling this if matching error not found
-# TODO: Refactor as part of ARROW-17355 to prevent potential missed errors
-handle_csv_read_error <- function(e, schema, call) {
-  msg <- conditionMessage(e)
-
+handle_csv_read_error <- function(msg, call, schema) {
   if (grepl("conversion error", msg) && inherits(schema, "Schema")) {
     msg <- c(
       msg,
@@ -229,12 +218,7 @@ handle_csv_read_error <- function(e, schema, call) {
   }
 }
 
-# This function only raises an error if
-# the appropriate string was found and so errors must be raised manually after
-# calling this if matching error not found
-# TODO: Refactor as part of ARROW-17355 to prevent potential missed errors
-handle_augmented_field_misuse <- function(e, call) {
-  msg <- conditionMessage(e)
+handle_augmented_field_misuse <- function(msg, call) {
   if (grepl("No match for FieldRef.Name(__filename)", msg, fixed = TRUE)) {
     msg <- c(
       msg,
@@ -251,3 +235,19 @@ handle_augmented_field_misuse <- function(e, call) {
 is_compressed <- function(compression) {
   !identical(compression, "uncompressed")
 }
+
+# handler function which checks for a number of different read errors
+augment_io_error_msg <- function(e, call, schema = NULL, format = NULL) {
+
+  msg <- conditionMessage(e)
+
+  if (!is.null(schema)) {
+    handle_csv_read_error(msg, call, schema)
+  }
+  if (!is.null(format)) {
+    handle_parquet_io_error(msg, call, format)
+  }
+
+  handle_augmented_field_misuse(msg, call)
+  abort(msg, call = call)
+}
