Recognize additional parameters (column names, etc.) for Python objects

h2oai · Oct 18, 2015 · a902a8e · a902a8e
1 parent 78da066
commit a902a8e
Show file tree

Hide file tree

Showing 3 changed files with 54 additions and 26 deletions.
diff --git a/h2o-py/h2o/frame.py b/h2o-py/h2o/frame.py
@@ -66,10 +66,11 @@ def __init__(self, python_obj=None, file_path=None, raw_id=None, expr=None, dest
     self._ast       = None
     self._data      = None  # any cached data
 
+    kwargs = {k:v for k, v in locals().items() if k in ["destination_frame", "header", "separator", "column_names", "column_types", "na_strings"]}
     if expr is not None:         self._ast = expr
-    elif python_obj is not None: self._upload_python_object(python_obj)
-    elif file_path is not None:  self._import_parse(file_path, destination_frame, header, separator, column_names, column_types, na_strings)
-    elif raw_id is not None:     self._handle_text_key(raw_id)
+    elif python_obj is not None: self._upload_python_object(python_obj, **kwargs)
+    elif file_path is not None:  self._import_parse(file_path, **kwargs)
+    elif raw_id is not None:     self._handle_text_key(raw_id, **kwargs)
     else: pass
 
   @staticmethod
@@ -88,21 +89,22 @@ def get_frame(frame_id):
 
   def __str__(self): return self._id
 
-  def _import_parse(self, file_path, destination_frame, header, separator, column_names, column_types, na_strings):
+  def _import_parse(self, file_path, **kwargs):
     rawkey = h2o.lazy_import(file_path)
-    setup = h2o.parse_setup(rawkey, destination_frame, header, separator, column_names, column_types, na_strings)
+    setup = h2o.parse_setup(rawkey, **kwargs)
     parse = h2o._parse(setup)
     self._update_post_parse(parse)
     thousands_sep = h2o.H2ODisplay.THOUSANDS
     if isinstance(file_path, str): print "Imported {}. Parsed {} rows and {} cols".format(file_path,thousands_sep.format(self._nrows), thousands_sep.format(self._ncols))
     else:                          h2o.H2ODisplay([["File"+str(i+1),f] for i,f in enumerate(file_path)],None, "Parsed {} rows and {} cols".format(thousands_sep.format(self._nrows), thousands_sep.format(self._ncols)))
 
-  def _upload_python_object(self, python_obj):
+  def _upload_python_object(self, python_obj, **kwargs):
     """
     Properly handle native python data types. For a discussion of the rules and
     permissible data types please refer to the main documentation for H2OFrame.
 
     :param python_obj: A tuple, list, dict, collections.OrderedDict
+    :param kwargs: Optional arguments for input into parse_setup(), such as column_names and column_types
     :return: None
     """
     # [] and () cases -- folded together since H2OFrame is mutable
@@ -128,32 +130,33 @@ def _upload_python_object(self, python_obj):
     tmp_file = os.fdopen(tmp_handle,'wb')
     # create a new csv writer object thingy
     csv_writer = csv.DictWriter(tmp_file, fieldnames=header, restval=None, dialect="excel", extrasaction="ignore", delimiter=",")
-    csv_writer.writeheader()             # write the header
-    csv_writer.writerows(data_to_write)  # write the data
-    tmp_file.close()                     # close the streams
-    self._upload_raw_data(tmp_path)      # actually upload the data to H2O
-    os.remove(tmp_path)                  # delete the tmp file
+    if isinstance(python_obj, (dict, collections.OrderedDict)):
+      csv_writer.writeheader()                     # write the header
+    csv_writer.writerows(data_to_write)            # write the data
+    tmp_file.close()                               # close the streams
+    self._upload_raw_data(tmp_path, **kwargs)      # actually upload the data to H2O
+    os.remove(tmp_path)                            # delete the tmp file
 
-  def _handle_text_key(self, text_key, check_header=None):
+  def _handle_text_key(self, text_key, **kwargs):
     """
     Handle result of upload_file
 
     :param text_key: A key pointing to raw text to be parsed
+    :param kwargs: Additional optional arguments for h2o.parse_setup(), such as column_names and column_types.
     :return: Part of the H2OFrame constructor.
     """
     # perform the parse setup
-    setup = h2o.parse_setup(text_key)
-    if check_header is not None: setup["check_header"] = check_header
+    setup = h2o.parse_setup(text_key, **kwargs)
     parse = h2o._parse(setup)
     self._update_post_parse(parse)
     thousands_sep = h2o.H2ODisplay.THOUSANDS
     print "Uploaded {} into cluster with {} rows and {} cols".format(text_key, thousands_sep.format(self._nrows), thousands_sep.format(self._ncols))
 
-  def _upload_raw_data(self, tmp_file_path):
+  def _upload_raw_data(self, tmp_file_path, **kwargs):
     fui = {"file": os.path.abspath(tmp_file_path)}                            # file upload info is the normalized path to a local file
     dest_key = _py_tmp_key()                                                  # create a random name for the data
     h2o.H2OConnection.post_json("PostFile", fui, destination_frame=dest_key)  # do the POST -- blocking, and "fast" (does not real data upload)
-    self._handle_text_key(dest_key, 1)                                        # actually parse the data and setup self._vecs
+    self._handle_text_key(dest_key, **kwargs)                                 # actually parse the data and setup self._vecs
 
   def __iter__(self):
     """

diff --git a/h2o-py/tests/testdir_misc/pyunit_colnames.py b/h2o-py/tests/testdir_misc/pyunit_colnames.py
@@ -1,19 +1,33 @@
 import sys
 sys.path.insert(1, "../../")
 import h2o, tests
+import numpy as np
 
 def col_names_check():
-
-
 
-    iris_wheader = h2o.import_file(tests.locate("smalldata/iris/iris_wheader.csv"))
-    assert iris_wheader.col_names == ["sepal_len","sepal_wid","petal_len","petal_wid","class"], \
-        "Expected {0} for column names but got {1}".format(["sepal_len","sepal_wid","petal_len","petal_wid","class"],
-                                                           iris_wheader.col_names)
+  iris_wheader = h2o.import_file(tests.locate("smalldata/iris/iris_wheader.csv"))
+  assert iris_wheader.col_names == ["sepal_len","sepal_wid","petal_len","petal_wid","class"], \
+      "Expected {0} for column names but got {1}".format(["sepal_len","sepal_wid","petal_len","petal_wid","class"],
+                                                         iris_wheader.col_names)
 
-    iris = h2o.import_file(tests.locate("smalldata/iris/iris.csv"))
-    assert iris.col_names == ["C1","C2","C3","C4","C5"], "Expected {0} for column names but got " \
-                                                           "{1}".format(["C1","C2","C3","C4","C5"], iris.col_names)
+  iris = h2o.import_file(tests.locate("smalldata/iris/iris.csv"))
+  assert iris.col_names == ["C1","C2","C3","C4","C5"], "Expected {0} for column names but got " \
+                                                         "{1}".format(["C1","C2","C3","C4","C5"], iris.col_names)
+
+  df = h2o.H2OFrame(np.random.randn(100,4).tolist(), column_names=list("ABCD"), column_types=["Enum"]*4)
+  df.head()
+  assert df.col_names == list("ABCD"), "Expected {} for column names but got {}".format(list("ABCD"), df.col_names)
+  assert df.types == {"A": "Enum", "C": "Enum", "B": "Enum", "D": "Enum"}, "Expected {} for column types " \
+                              "but got {}".format({"A": "Enum", "C": "Enum", "B": "Enum", "D": "Enum"},
+                                                  df.types)
+
+  df = h2o.H2OFrame(np.random.randn(100,4).tolist())
+  df.head()
+  assert df.col_names == ["C1","C2","C3","C4"], "Expected {} for column names but got {}".format(["C1","C2","C3","C4"]
+                                                                                                 , df.col_names)
+  assert df.types == {"C3": "Numeric", "C2": "Numeric", "C1": "Numeric", "C4": "Numeric"}, "Expected {}" \
+                      " for column types but got {}".format({"C3": "Numeric", "C2": "Numeric", "C1": "Numeric",
+                                                             "C4": "Numeric"}, df.types)
 
 if __name__ == "__main__":
-    tests.run_test(sys.argv, col_names_check)
+  tests.run_test(sys.argv, col_names_check)
diff --git a/h2o-py/tests/testdir_misc/pyunit_types.py b/h2o-py/tests/testdir_misc/pyunit_types.py
@@ -1,6 +1,7 @@
 import sys
 sys.path.insert(1, "../../")
 import h2o, tests
+import numpy as np
 
 
 def pyunit_types():
@@ -15,5 +16,15 @@ def pyunit_types():
 
   print types2
 
+  df = h2o.H2OFrame(np.random.randn(100,4).tolist(), column_names=list("ABCD"), column_types=["Enum"]*4)
+  assert df.types == {"A": "Enum", "C": "Enum", "B": "Enum", "D": "Enum"}, "Expected {} for column types " \
+                      "but got {}".format({"A": "Enum", "C": "Enum", "B": "Enum", "D": "Enum"}, df.types)
+
+  df = h2o.H2OFrame(np.random.randn(100,4).tolist())
+  assert df.types == {"C3": "Numeric", "C2": "Numeric", "C1": "Numeric", "C4": "Numeric"}, "Expected {}" \
+          " for column types but got {}".format({"C3": "Numeric", "C2": "Numeric", "C1": "Numeric",
+                                                "C4": "Numeric"}, df.types)
+
+
 if __name__ == "__main__":
   tests.run_test(sys.argv, pyunit_types)