From da7946f16d82ee03316512c82bd433c5b7634f30 Mon Sep 17 00:00:00 2001 From: wendycwong Date: Thu, 18 Apr 2024 10:41:05 -0700 Subject: [PATCH 1/3] GH-15947: fixed column length discrepancies when skipped_columns are specified when calling h2o.H2OFrame. GH-15947: fixed skipped columns for normal import_file path. GH-15947: added same skipped column capability when transforming R data frames to h2O data frames. GH-15947: add parameter to avoid R cmd test failure. GH-15947: Incorporate Seb code review comments. --- h2o-py/h2o/h2o.py | 11 +++++----- .../Data_Manipulation/pyunit_h2oH2OFrame.py | 12 +++++----- .../pyunit_GH_15947_skipped_column_error.py | 15 +++++++++++++ h2o-r/h2o-package/R/frame.R | 22 ++++++++++++------- h2o-r/h2o-package/R/parse.R | 6 +++-- .../runit_hexdev_29_import_types.R | 4 ++-- .../runit_GH_15947_skipped_column_error.R | 10 +++++++++ 7 files changed, 56 insertions(+), 24 deletions(-) create mode 100644 h2o-py/tests/testdir_parser/pyunit_GH_15947_skipped_column_error.py create mode 100644 h2o-r/tests/testdir_parser/runit_GH_15947_skipped_column_error.R diff --git a/h2o-py/h2o/h2o.py b/h2o-py/h2o/h2o.py index 542cb117d872..394850dd47d0 100644 --- a/h2o-py/h2o/h2o.py +++ b/h2o-py/h2o/h2o.py @@ -868,14 +868,15 @@ def parse_setup(raw_frames, destination_frame=None, header=0, separator=None, co if ind in skipped_columns: use_type[ind]=False - if column_names is not None: + if column_names is not None: # used when python object is converted to H2O Frame if not isinstance(column_names, list): raise ValueError("col_names should be a list") if (skipped_columns is not None) and len(skipped_columns)>0: - if (len(column_names)) != parse_column_len: + # if ((len(column_names)-len(skipped_columns)) != parse_column_len) and (len(column_names) != parse_column_len): + if (len(column_names)-len(skipped_columns)) != parse_column_len: raise ValueError( - "length of col_names should be equal to the number of columns parsed: %d vs %d" - % (len(column_names), parse_column_len)) - else: + "length of col_names minus length of skipped_columns should equal the number of columns parsed: " + "%d vs %d" % ((len(column_names)-len(skipped_columns), parse_column_len))) + else: # no skipped columns here if len(column_names) != len(j["column_types"]): raise ValueError( "length of col_names should be equal to the number of columns: %d vs %d" % (len(column_names), len(j["column_types"]))) diff --git a/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py b/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py index b9f101dda2a7..b284f6e53f39 100644 --- a/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py +++ b/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py @@ -125,12 +125,10 @@ def H2OFrame_from_H2OFrame(): assert dupl4.columns == ["n1", "s1"] -def H2OFrame_skipped_columns_is_BUGGY(): - try: - h2o.H2OFrame(data, skipped_columns=[1]) - assert False, "skipped_columns handling may be fixed now" # parse_setup is absolutely weird, with only half parameters passed to build the ParseSetup, and then a bunch of logic done locally, that's why it's buggy: see issue https://github.com/h2oai/h2o-3/issues/15947 - except ValueError as e: - assert "length of col_names should be equal to the number of columns parsed: 4 vs 3" in str(e) +def H2OFrame_skipped_columns_BUG_fixed(): + f1 = h2o.H2OFrame(data, skipped_columns=[1]) + f2 = h2o.H2OFrame(data) + assert f1.ncol == (f2.ncol-1), "expected number of columns: {0}, actual column numbers: {1}".format(f1.ncol, (f2.ncol-1)) pu.run_tests([ @@ -141,5 +139,5 @@ def H2OFrame_skipped_columns_is_BUGGY(): H2OFrame_from_pandas, H2OFrame_from_scipy, H2OFrame_from_H2OFrame, - H2OFrame_skipped_columns_is_BUGGY + H2OFrame_skipped_columns_BUG_fixed ]) diff --git a/h2o-py/tests/testdir_parser/pyunit_GH_15947_skipped_column_error.py b/h2o-py/tests/testdir_parser/pyunit_GH_15947_skipped_column_error.py new file mode 100644 index 000000000000..76d4869f7af0 --- /dev/null +++ b/h2o-py/tests/testdir_parser/pyunit_GH_15947_skipped_column_error.py @@ -0,0 +1,15 @@ +import sys +sys.path.insert(1,"../../") +import h2o +from tests import pyunit_utils + +# Seb has reported that skipped_columns does not work if skipped_columns is called with h2o.H2OFrame +def test_skipped_columns(): + data = [[1, 4, "a", 1], [2, 5, "b", 0], [3, 6, "", 1]] + frame = h2o.H2OFrame(data, skipped_columns=[1, 2]) + assert frame.ncol == 2, "Expected column number: 2. Actual: {0}".format(frame.ncol) + +if __name__ == "__main__": + pyunit_utils.standalone_test(test_skipped_columns) +else: + test_skipped_columns() diff --git a/h2o-r/h2o-package/R/frame.R b/h2o-r/h2o-package/R/frame.R index c3a2fcf74925..2e7b9f23ad29 100644 --- a/h2o-r/h2o-package/R/frame.R +++ b/h2o-r/h2o-package/R/frame.R @@ -4109,6 +4109,7 @@ use.package <- function(package, #' #' @param x An \code{R} object. #' @param destination_frame A string with the desired name for the H2OFrame +#' @param skipped_columns A list of integer containing columns to be skipped and not parsed into the final frame #' @param use_datatable allow usage of data.table #' @param \dots arguments passed to method arguments. #' @export @@ -4135,15 +4136,19 @@ use.package <- function(package, #' stopifnot(is.h2o(m_hf), dim(m_hf) == dim(m)) #' } #' } -as.h2o <- function(x, destination_frame="", ...) { +as.h2o <- function(x, destination_frame="", skipped_columns=NULL, ...) { .key.validate(destination_frame) - UseMethod("as.h2o") + if (is.null(skipped_columns)) { + UseMethod("as.h2o") + } else { + as.h2o.data.frame(x, destination_frame=destination_frame, skipped_columns=skipped_columns) + } } #' @rdname as.h2o #' @method as.h2o default #' @export -as.h2o.default <- function(x, destination_frame="", ...) { +as.h2o.default <- function(x, destination_frame="", skipped_columns=NULL, ...) { if( destination_frame=="" ) { subx <- destination_frame.guess(deparse(substitute(x))) destination_frame <- .key.make(if(nzchar(subx)) subx else paste0(class(x), "_", collapse = "")) @@ -4152,13 +4157,13 @@ as.h2o.default <- function(x, destination_frame="", ...) { data.frame(C1=x) else as.data.frame(x, ...) - as.h2o.data.frame(x, destination_frame=destination_frame) + as.h2o.data.frame(x, destination_frame=destination_frame, skipped_columns=skipped_columns) } #' @rdname as.h2o #' @method as.h2o H2OFrame #' @export -as.h2o.H2OFrame <- function(x, destination_frame="", ...) { +as.h2o.H2OFrame <- function(x, destination_frame="", skipped_columns=NULL, ...) { if( destination_frame=="" ) { subx <- destination_frame.guess(deparse(substitute(x))) destination_frame <- .key.make(if(nzchar(subx)) subx else "H2OFrame_copy") @@ -4173,7 +4178,7 @@ as.h2o.H2OFrame <- function(x, destination_frame="", ...) { #' @seealso \code{\link{use.package}} #' @references \url{https://h2o.ai/blog/2016/fast-csv-writing-for-r/} #' @export -as.h2o.data.frame <- function(x, destination_frame="", use_datatable=TRUE, ...) { +as.h2o.data.frame <- function(x, destination_frame="", skipped_columns=NULL, use_datatable=TRUE, ...) { if( destination_frame=="" ) { subx <- destination_frame.guess(deparse(substitute(x))) destination_frame <- .key.make(if(nzchar(subx)) subx else "data.frame") @@ -4203,7 +4208,8 @@ as.h2o.data.frame <- function(x, destination_frame="", use_datatable=TRUE, ...) if (verbose) cat(sprintf("writing csv to disk using '%s' took %.2fs\n", fun, proc.time()[[3]]-pt)) #if (verbose) pt <- proc.time()[[3]] # timings inside h2f <- h2o.uploadFile(tmpf, destination_frame = destination_frame, header = TRUE, col.types=types, - col.names=colnames(x, do.NULL=FALSE, prefix="C"), na.strings=rep(c("NA_h2o"),ncol(x))) + col.names=colnames(x, do.NULL=FALSE, prefix="C"), na.strings=rep(c("NA_h2o"),ncol(x)), + skipped_columns=skipped_columns) #if (verbose) cat(sprintf("uploading csv to h2o using 'h2o.uploadFile' took %.2fs\n", proc.time()[[3]]-pt)) file.remove(tmpf) h2f @@ -4215,7 +4221,7 @@ as.h2o.data.frame <- function(x, destination_frame="", use_datatable=TRUE, ...) #' To speedup execution time for large sparse matrices, use h2o datatable. Make sure you have installed and imported data.table and slam packages. #' Turn on h2o datatable by options("h2o.use.data.table"=TRUE) #' @export -as.h2o.Matrix <- function(x, destination_frame="", use_datatable=TRUE, ...) { +as.h2o.Matrix <- function(x, destination_frame="", skipped_columns=NULL, use_datatable=TRUE, ...) { if( destination_frame=="") { subx <- destination_frame.guess(deparse(substitute(x))) destination_frame <- .key.make(if(nzchar(subx)) subx else "Matrix") diff --git a/h2o-r/h2o-package/R/parse.R b/h2o-r/h2o-package/R/parse.R index d49b8db5564c..0ba9ed0afbdf 100755 --- a/h2o-r/h2o-package/R/parse.R +++ b/h2o-r/h2o-package/R/parse.R @@ -219,8 +219,10 @@ h2o.parseSetup <- function(data, pattern="", destination_frame = "", header = NA else col.names if (!is.null(parseSetup$column_names) && - (length(parseSetup$column_names) != parsedColLength)) { - stop("length of col.names must equal to the number of columns in dataset") + (length(parseSetup$column_names) != parsedColLength)) { # should equal, if not, need to check skipped_columns + if ((!is.null(skipped_columns) && ((length(parseSetup$column_names)-length(skipped_columns)) != parsedColLength)) + || is.null(skipped_columns)) # if no skipped column, this is an error. If skipped columns, check length + stop("length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset") } # change column names to what the user specified if (!is.null(skipped_columns)) { diff --git a/h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R b/h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R index f26476d67026..1945309820d0 100644 --- a/h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R +++ b/h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R @@ -66,7 +66,7 @@ test.continuous.or.categorical <- function() { e <- tryCatch(h2o.importFile(locate("smalldata/iris/iris.csv"), col.names=c("C1","C2","C3","C4","C5","C6"), col.types=list(by.col.name=c("C4"),types=c("Enum"))), error = function(x) x) - expect_true(e[[1]] == "length of col.names must equal to the number of columns in dataset") + expect_true(e[[1]] == "length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset") # col.types as character vector df.hex4 <- h2o.importFile(locate("smalldata/iris/multiple_iris_files"), @@ -98,7 +98,7 @@ test.continuous.or.categorical <- function() { e <- tryCatch(h2o.importFile(locate("smalldata/iris/iris.csv"), col.names=c("C1","C2","C3","C4","C5","C6"), col.types=list(by.col.name=c("C4"),types=c("Enum"))), error = function(x) x) - expect_true(e[[1]] == "length of col.names must equal to the number of columns in dataset") + expect_true(e[[1]] == "length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset") # col.types as character vector df.hex6 <- h2o.importFile(locate("smalldata/iris/multiple_iris_files_wheader"), col.names=c("C1","C2","C3","C4","C5"), diff --git a/h2o-r/tests/testdir_parser/runit_GH_15947_skipped_column_error.R b/h2o-r/tests/testdir_parser/runit_GH_15947_skipped_column_error.R new file mode 100644 index 000000000000..94c7e016ef9a --- /dev/null +++ b/h2o-r/tests/testdir_parser/runit_GH_15947_skipped_column_error.R @@ -0,0 +1,10 @@ +setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) +source("../../scripts/h2o-r-test-setup.R") + +test.skipped_columns <- function() { + iris_hf <- as.h2o(iris, skipped_columns=c(1,2)) + expect_true(ncol(iris_hf) == (ncol(iris)-2)) + print("Columns are skipped!!!") +} + +doTest("Test skipped_columns when using as.h2o to change data frame to H2O Frame.", test.skipped_columns) From e82f0e1871f8da95aefeb93fd825ea2ea7233614 Mon Sep 17 00:00:00 2001 From: wendycwong Date: Thu, 23 May 2024 10:34:11 -0700 Subject: [PATCH 2/3] GH-15947: fix R test error message change, fix Python column_names contain only columns not skipped. --- h2o-py/h2o/h2o.py | 5 ++--- h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R | 3 ++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/h2o-py/h2o/h2o.py b/h2o-py/h2o/h2o.py index 394850dd47d0..7bf76a0f9397 100644 --- a/h2o-py/h2o/h2o.py +++ b/h2o-py/h2o/h2o.py @@ -871,11 +871,10 @@ def parse_setup(raw_frames, destination_frame=None, header=0, separator=None, co if column_names is not None: # used when python object is converted to H2O Frame if not isinstance(column_names, list): raise ValueError("col_names should be a list") if (skipped_columns is not None) and len(skipped_columns)>0: - # if ((len(column_names)-len(skipped_columns)) != parse_column_len) and (len(column_names) != parse_column_len): - if (len(column_names)-len(skipped_columns)) != parse_column_len: + if len(column_names) != parse_column_len: raise ValueError( "length of col_names minus length of skipped_columns should equal the number of columns parsed: " - "%d vs %d" % ((len(column_names)-len(skipped_columns), parse_column_len))) + "%d vs %d" % (len(column_names), parse_column_len)) else: # no skipped columns here if len(column_names) != len(j["column_types"]): raise ValueError( "length of col_names should be equal to the number of columns: %d vs %d" diff --git a/h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R b/h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R index 1945309820d0..bb9592e6a417 100644 --- a/h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R +++ b/h2o-r/tests/testdir_jira/runit_hexdev_29_import_types.R @@ -11,6 +11,7 @@ source("../../scripts/h2o-r-test-setup.R") test.continuous.or.categorical <- function() { df.hex <- h2o.uploadFile(locate("smalldata/jira/hexdev_29.csv"), col.types = c("enum", "enum", "enum")) + browser() expect_true(is.factor(df.hex$h1)) expect_true(is.factor(df.hex$h2)) @@ -36,7 +37,7 @@ test.continuous.or.categorical <- function() { e <- tryCatch(h2o.importFile(locate("smalldata/iris/iris.csv"), col.names=c("C1","C2","C3","C4","C5","C6"), col.types=list(by.col.name=c("C4"),types=c("Enum"))), error = function(x) x) - expect_true(e[[1]] == "length of col.names must equal to the number of columns in dataset") + expect_true(e[[1]] == "length of col.names (minus length of skipped_columns if present) must equal to the number of columns in dataset") # col.types as character vector df.hex2 <- h2o.importFile(locate("smalldata/iris/iris.csv"), col.types=c("Numeric","Numeric","Enum","Numeric","Enum")) From 0cb824539342b20349bf1d8150d69f3d1fa2c044 Mon Sep 17 00:00:00 2001 From: wendycwong Date: Thu, 23 May 2024 13:32:49 -0700 Subject: [PATCH 3/3] GH-15947: fixed test conditions when python object is tranforming to H2O Frame and when import_file is loaded into H2O frame. --- h2o-py/h2o/h2o.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/h2o-py/h2o/h2o.py b/h2o-py/h2o/h2o.py index 7bf76a0f9397..db67d4fb93be 100644 --- a/h2o-py/h2o/h2o.py +++ b/h2o-py/h2o/h2o.py @@ -868,10 +868,18 @@ def parse_setup(raw_frames, destination_frame=None, header=0, separator=None, co if ind in skipped_columns: use_type[ind]=False - if column_names is not None: # used when python object is converted to H2O Frame + if column_names is not None: if not isinstance(column_names, list): raise ValueError("col_names should be a list") if (skipped_columns is not None) and len(skipped_columns)>0: - if len(column_names) != parse_column_len: + # when we are converting a python object to H2OFrame, column_names will include all columns despite + # skipped columns are specified. In this case, we need to make sure that + # len(column_names)-len(skipped_columns)==parse_column_len + # When we are importing a file with skipped columns mentioned, column_names will only contain columns that + # are not skipped. Hence, in this case, we need to check len(column_names) == parse_column_len. + # To combine the two, correct parsing will have conditions len(column_names)-len(skipped_columns)==parse_column_len + # or len(column_names)==parse_column_len. Hence, we will raise an error when + # not(len(column_names)-len(skipped_columns)==parse_column_len or len(column_names)==parse_column_len happened. + if not((len(column_names) == parse_column_len) or ((len(column_names)-len(skipped_columns))==parse_column_len)): raise ValueError( "length of col_names minus length of skipped_columns should equal the number of columns parsed: " "%d vs %d" % (len(column_names), parse_column_len))