move setup functionality from parse to parse_setup
allow col_types to be a dict w/o specifying names
allow na_strings to be a dict
clean up code
ludi317 committed Oct 7, 2015
1 parent b7863c9 commit 56f1c36
Showing 7 changed files with 148 additions and 123 deletions.
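
After this change, col_types and na_strings can be passed as dictionaries keyed by column name directly to import_file (and upload_file). A minimal usage sketch, illustrative only — the file name and the header columns a, b, c below are assumed, not part of this commit:

import h2o
h2o.init()  # connect to a running H2O cluster

# col_types as a dict no longer requires col_names; columns left out of the dict
# keep the type guessed by the parser. na_strings as a dict maps a column name
# to a single NA token or to a list of NA tokens for that column.
frame = h2o.import_file("data.csv",
                        col_types={"a": "enum", "c": "string"},
                        na_strings={"b": ["fish", "xyz"]})
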
4 changes: 2 additions & 2 deletions h2o-py/h2o/frame.py
@@ -91,7 +91,7 @@ def __str__(self): return self._id
def _import_parse(self, file_path, destination_frame, header, separator, column_names, column_types, na_strings):
rawkey = h2o.lazy_import(file_path)
setup = h2o.parse_setup(rawkey, destination_frame, header, separator, column_names, column_types, na_strings)
parse = h2o.parse(setup, _py_tmp_key()) # create a new key
parse = h2o.parse(setup)
self._id = parse["job"]["dest"]["name"]
self._computed=True
self._nrows = int(H2OFrame(expr=ExprNode("nrow", self))._scalar())
@@ -150,7 +150,7 @@ def _handle_text_key(self, text_key, check_header=None):
# perform the parse setup
setup = h2o.parse_setup(text_key)
if check_header is not None: setup["check_header"] = check_header
parse = h2o.parse(setup, _py_tmp_key())
parse = h2o.parse(setup)
self._computed=True
self._id = parse["destination_frame"]["name"]
self._ncols = parse["number_columns"]
184 changes: 92 additions & 92 deletions h2o-py/h2o/h2o.py
@@ -1,7 +1,6 @@
import warnings
warnings.simplefilter('always', DeprecationWarning)
import os
import itertools
import functools
import os.path
import re
@@ -43,19 +42,18 @@ def upload_file(path, destination_frame="", header=(-1, 0, 1), sep="", col_names
----------
path : str
A path specifying the location of the data to upload.
destination_frame : H2OFrame
The name of the H2O Frame in the H2O Cluster.
destination_frame : str, optional
The unique hex key assigned to the imported file. If none is given, a key will automatically be generated.
header : int, optional
-1 means the first line is data, 0 means guess, 1 means first line is header.
sep : string, optional
sep : str, optional
The field separator character. Values on each line of the file are separated by this character. If sep = "", the parser will automatically detect the separator.
col_names : optional
col_names : list, optional
A list of column names for the file.
col_types : optional
A list of types or a dictionary of column names to types to specify whether columns should be forced to a certain type upon import parsing.
na_strings : optional
A list of strings which are to be interpreted as missing values.
col_types : list or dict, optional
A list of types or a dictionary of column names to types to specify whether columns should be forced to a certain type upon import parsing. If a list, the types for elements that are None will be guessed.
na_strings : list or dict, optional
A list of strings, or a list of lists of strings (one list per column), or a dictionary of column names to strings which are to be interpreted as missing values.
:return: A new H2OFrame
"""
fui = {"file": os.path.abspath(path)}
@@ -72,20 +70,20 @@ def import_file(path=None, destination_frame="", parse=True, header=(-1, 0, 1),
----------
path : str
A path specifying the location of the data to import.
destination_frame :
(Optional) The unique hex key assigned to the imported file. If none is given, a key will automatically be generated.
parse :
(Optional) A logical value indicating whether the file should be parsed after import.
header :
(Optional) -1 means the first line is data, 0 means guess, 1 means first line is header.
sep :
(Optional) The field separator character. Values on each line of the file are separated by this character. If sep = "", the parser will automatically detect the separator.
col_names :
(Optional) A list of column names for the file.
col_types :
(Optional) A list of types or a dictionary of column names to types to specify whether columns should be forced to a certain type upon import parsing.
na_strings :
(Optional) A list of strings which are to be interpreted as missing values.
destination_frame : str, optional
The unique hex key assigned to the imported file. If none is given, a key will automatically be generated.
parse : boolean, optional
A logical value indicating whether the file should be parsed after import.
header : int, optional
-1 means the first line is data, 0 means guess, 1 means first line is header.
sep : str, optional
The field separator character. Values on each line of the file are separated by this character. If sep = "", the parser will automatically detect the separator.
col_names : list, optional
A list of column names for the file.
col_types : list or dict, optional
A list of types or a dictionary of column names to types to specify whether columns should be forced to a certain type upon import parsing. If a list, the types for elements that are None will be guessed.
na_strings : list or dict, optional
A list of strings, or a list of lists of strings (one list per column), or a dictionary of column names to strings which are to be interpreted as missing values.
:return: A new H2OFrame
"""
if not parse:
@@ -101,110 +99,112 @@ def parse_setup(raw_frames, destination_frame="", header=(-1, 0, 1), separator="
raw_frames : H2OFrame
A collection of imported file frames
destination_frame :
(Optional) The unique hex key assigned to the imported file. If none is given, a key will automatically be generated.
parse :
(Optional) A logical value indicating whether the file should be parsed after import.
header :
(Optional) -1 means the first line is data, 0 means guess, 1 means first line is header.
sep :
(Optional) The field separator character. Values on each line of the file are separated by this character. If sep = "", the parser will automatically detect the separator.
col_names :
(Optional) A list of column names for the file.
col_types :
(Optional) A list of types or a dictionary of column names to types to specify whether columns should be forced to a certain type upon import parsing.
na_strings :
(Optional) A list of strings which are to be interpreted as missing values.
destination_frame : str, optional
The unique hex key assigned to the imported file. If none is given, a key will automatically be generated.
parse : boolean, optional
A logical value indicating whether the file should be parsed after import.
header : int, optional
-1 means the first line is data, 0 means guess, 1 means first line is header.
sep : str, optional
The field separator character. Values on each line of the file are separated by this character. If sep = "", the parser will automatically detect the separator.
col_names : list, optional
A list of column names for the file.
col_types : list or dict, optional
A list of types or a dictionary of column names to types to specify whether columns should be forced to a certain type upon import parsing. If a list, the types for elements that are None will be guessed.
na_strings : list or dict, optional
A list of strings, or a list of lists of strings (one list per column), or a dictionary of column names to strings which are to be interpreted as missing values.
:return: A ParseSetup "object"
"""

# The H2O backend only accepts things that are quoted
if isinstance(raw_frames, unicode): raw_frames = [raw_frames]
j = H2OConnection.post_json(url_suffix="ParseSetup", source_frames=[_quoted(id) for id in raw_frames])

if destination_frame: j["destination_frame"] = destination_frame
if destination_frame: j["destination_frame"] = _quoted(destination_frame).replace("%",".").replace("&",".") # TODO: really should be url encoding...
if not isinstance(header, tuple):
if header not in (-1, 0, 1): raise ValueError("header should be -1, 0, or 1")
j["check_header"] = header
if separator:
if not isinstance(separator, basestring) or len(separator) != 1: raise ValueError("separator should be a single character string")
j["separator"] = separator
j["separator"] = ord(separator)
if column_names:
if not isinstance(column_names, list): raise ValueError("col_names should be a list")
if len(column_names) != len(j["column_types"]): raise ValueError("length of col_names should be equal to the number of columns")
j["column_names"] = column_names
if column_types:
if isinstance(column_types, dict):
if not column_names: raise ValueError("col_names should be specified if col_types is a dictionary of column names to types")
if set(column_names) != set(column_types.keys()): raise ValueError("col_names and column names in col_types are unequal")
elif not isinstance(column_types, list):
#overwrite dictionary to ordered list of column types. if user didn't specify column type for all names, use type provided by backend
if not j["column_names"]: raise ValueError("column names should be specified")
if not set(column_types.keys()).issubset(set(j["column_names"])): raise ValueError("names specified in col_types is not a subset of the column names")
idx = 0
column_types_list = []
for name in j["column_names"]:
if name in column_types:
column_types_list.append(column_types[name])
else:
column_types_list.append(j["column_types"][idx])
idx += 1
column_types = column_types_list
elif isinstance(column_types, list):
if len(column_types) != len(j["column_types"]): raise ValueError("length of col_types should be equal to the number of columns")
column_types = [column_types[i] if column_types[i] else j["column_types"][i] for i in range(len(column_types))]
else: #not dictionary or list
raise ValueError("col_types should be a list of types or a dictionary of column names to types")
j["column_types"] = column_types
if na_strings: j["na_strings"] = na_strings

if na_strings:
if isinstance(na_strings, dict):
#overwrite dictionary to ordered list of lists of na_strings
if not j["column_names"]: raise ValueError("column names should be specified")
if not set(na_strings.keys()).issubset(set(j["column_names"])): raise ValueError("names specified in na_strings is not a subset of the column names")
j["na_strings"] = [[] for _ in range(len(j["column_names"]))]
for name, na in na_strings.items():
idx = j["column_names"].index(name)
if isinstance(na, basestring): na = [na]
for n in na: j["na_strings"][idx].append(_quoted(n))
elif _is_list_of_lists(na_strings):
if len(na_strings) != len(j["column_types"]): raise ValueError("length of na_strings should be equal to the number of columns")
j["na_strings"] = [[_quoted(na) for na in col] if col is not None else [] for col in na_strings]
elif isinstance(na_strings, list):
j["na_strings"] = [[_quoted(na) for na in na_strings]] * len(j["column_names"])
else: #not a dictionary or list
raise ValueError("na_strings should be a list, a list of lists (one list per column), or a dictionary of column "
"names to strings which are to be interpreted as missing values")

#quote column names and column types also when not specified by user
if j["column_names"]: j["column_names"] = map(_quoted, j["column_names"])
j["column_types"] = map(_quoted, j["column_types"])
return j
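
For reference, a small standalone sketch (not the library code itself) of how parse_setup above expands a partial col_types dict and a na_strings dict against the backend-guessed setup, assuming three columns:

# Assumed example data, mirroring the merging logic in parse_setup above.
column_names  = ["a", "b", "c"]                    # names returned by ParseSetup
guessed_types = ["Numeric", "Numeric", "String"]   # types guessed by the backend
col_types     = {"a": "Enum"}                      # user overrides only column "a"
na_strings    = {"c": ["fish", "xyz"]}             # NA tokens only for column "c"

# col_types dict -> ordered list; columns not in the dict keep the guessed type
merged_types = [col_types.get(name, guessed_types[i]) for i, name in enumerate(column_names)]
# na_strings dict -> one list of NA tokens per column (a bare string value is
# wrapped in a one-element list by the real code)
merged_nas = [na_strings.get(name, []) for name in column_names]

print(merged_types)  # ['Enum', 'Numeric', 'String']
print(merged_nas)    # [[], [], ['fish', 'xyz']]
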


def parse(setup, h2o_name, first_line_is_header=(-1, 0, 1)):
def parse(setup):
"""
Trigger a parse; blocking; removeFrame just keeps the Vecs.
Parameters
----------
setup : dict
The result of calling parse_setup.
h2o_name : H2OFrame
The name of the H2O Frame on the back end.
first_line_is_header : int
-1 means data, 0 means guess, 1 means header.
:return: A new parsed object
"""
# Parse parameters (None values provided by setup)
p = { 'destination_frame' : h2o_name,
'parse_type' : None,
'separator' : None,
'single_quotes' : None,
'check_header' : None,
'number_columns' : None,
'chunk_size' : None,
'delete_on_done' : True,
'blocking' : False,
p = { "destination_frame" : _py_tmp_key(),
"parse_type" : None,
"separator" : None,
"single_quotes" : None,
"check_header" : None,
"number_columns" : None,
"chunk_size" : None,
"delete_on_done" : True,
"blocking" : False,
"column_types" : None
}

if setup["destination_frame"]:
setup["destination_frame"] = _quoted(setup["destination_frame"]).replace("%",".").replace("&",".") # TODO: really should be url encoding...

if isinstance(first_line_is_header, tuple):
first_line_is_header = setup["check_header"]

if isinstance(setup["separator"], basestring):
setup["separator"] = ord(setup["separator"])

if setup["column_types"]: #process column_types before column_names for matching keys before quoting
if isinstance(setup["column_types"], dict):
#overwrite dictionary to ordered list of column types
setup["column_types"] = [_quoted(setup["column_types"][name]) for name in setup["column_names"]]
else: #if list
setup["column_types"] = [_quoted(name) for name in setup["column_types"]]
p["column_types"] = None

if setup["column_names"]:
setup["column_names"] = [_quoted(name) for name in setup["column_names"]]
p["column_names"] = None
if setup["column_names"]: p["column_names"] = None
if setup["na_strings"]: p["na_strings"] = None

if setup["na_strings"]:
if _is_list_of_lists(setup["na_strings"]): setup["na_strings"] = [[_quoted(na) for na in col] if col is not None else [] for col in setup["na_strings"]]
else:
setup["na_strings"] = [_quoted(na) for na in setup["na_strings"]] # quote the strings
setup["na_strings"] = '\"' + str(list(itertools.repeat(setup["na_strings"], len(setup["column_types"])))) + '\"'
p["na_strings"] = None


# update the parse parameters with the parse_setup values
p.update({k: v for k, v in setup.iteritems() if k in p})

p["check_header"] = first_line_is_header

# Extract only 'name' from each src in the array of srcs
p['source_frames'] = [_quoted(src['name']) for src in setup['source_frames']]

@@ -231,7 +231,7 @@ def parse_raw(setup, id=None, first_line_is_header=(-1, 0, 1)):
"""
id = setup["destination_frame"]
fr = H2OFrame()
parsed = parse(setup, id, first_line_is_header)
parsed = parse(setup)
fr._computed = True
fr._id = id
fr._keep = True
Expand All @@ -240,7 +240,7 @@ def parse_raw(setup, id=None, first_line_is_header=(-1, 0, 1)):
fr._col_names = parsed['column_names'] if parsed["column_names"] else ["C" + str(x) for x in range(1,fr._ncols+1)]
return fr

def _quoted(key, replace=True):
def _quoted(key):
if key == None: return "\"\""
#mimic behavior in R to replace "%" and "&" characters, which break the call to /Parse, with "."
# key = key.replace("%", ".")
@@ -1,6 +1,5 @@
import sys
sys.path.insert(1, "../../")
from collections import OrderedDict as ODict
import h2o,tests
from h2o.assembly import *
from h2o.transforms.preprocessing import *
@@ -30,12 +29,12 @@ def lending_club_munging_assembly():
'Enum', 'Numeric', 'Numeric', 'Numeric', 'Numeric', 'Numeric', 'Numeric', 'Numeric',
'Numeric', 'Numeric', 'Enum', 'Numeric', 'Enum', 'Enum', 'Numeric', 'Enum', 'Numeric']

types = ODict(zip(col_names,col_types))
types = dict(zip(col_names,col_types))
types["int_rate"] = "String"
types["revol_util"] = "String"
types["emp_length"] = "String"

data = h2o.import_file(path=small_test, col_types=types.values())
data = h2o.import_file(path=small_test, col_types=types)
data[["int_rate","revol_util","emp_length"]].show()

assembly = H2OAssembly(
@@ -4,7 +4,7 @@
## column_names, and column_types and that certain characters are replaced.
##
################################################################################
import sys, os
import sys
sys.path.insert(1, "../../")
import h2o, tests

@@ -30,7 +30,7 @@ def additional_parameters():
#col_types as dictionary
dest_frame="dev29&hex%"
c_names = ["a", "b", "c"]
c_types = {"c":"string", "a":"enum", "b": "enum"}
c_types = {"c":"string", "a":"string"}

fhex = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"),
destination_frame=dest_frame,
@@ -42,7 +42,9 @@ def additional_parameters():
assert fhex.col_names == c_names
col_summary = h2o.frame(fhex._id)["frames"][0]["columns"]
for i in range(len(col_summary)):
assert col_summary[i]["type"] == c_types[c_names[i]]
name = c_names[i]
if name in c_types:
assert col_summary[i]["type"] == c_types[name]

if __name__ == "__main__":
tests.run_test(sys.argv, additional_parameters)
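
The "certain characters are replaced" behavior exercised by this test comes from parse_setup (see the h2o.py hunk above), which substitutes "." for "%" and "&" in destination_frame. A one-line sketch of that substitution using the test's frame name:

# Sketch of the "%" / "&" substitution applied to destination_frame in parse_setup.
dest_frame = "dev29&hex%"
print(dest_frame.replace("%", ".").replace("&", "."))  # dev29.hex.
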
8 changes: 1 addition & 7 deletions h2o-py/tests/testdir_jira/pyunit_hexdev_29_import_types.py
@@ -8,13 +8,7 @@
import h2o, tests

def continuous_or_categorical():
fraw = h2o.lazy_import(tests.locate("smalldata/jira/hexdev_29.csv"))
fsetup = h2o.parse_setup(fraw)
fsetup["column_types"][0] = "ENUM"
fsetup["column_types"][1] = "ENUM"
fsetup["column_types"][2] = "ENUM"

df_hex = h2o.parse_raw(fsetup)
df_hex = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"), col_types=["enum"]*3)

df_hex.summary()

33 changes: 32 additions & 1 deletion h2o-py/tests/testdir_jira/pyunit_hexdev_29_na_strings.py
@@ -15,14 +15,45 @@ def na_strings():
fhex.summary()
fhex_col_summary = h2o.H2OConnection.get_json("Frames/" + urllib.quote(fhex._id) + "/summary")["frames"][0]["columns"]
fhex_missing_count = sum([e["missing_count"] for e in fhex_col_summary])
assert fhex_missing_count == 0

#na_strings as list of lists
fhex_na_strings = h2o.import_file(tests.locate(path),
na_strings=[[],["fish", "xyz"],[]])
fhex_na_strings.summary()
fhex__na_strings_col_summary = h2o.H2OConnection.get_json("Frames/" + urllib.quote(fhex_na_strings._id) + "/summary")["frames"][0]["columns"]
fhex_na_strings_missing_count = sum([e["missing_count"] for e in fhex__na_strings_col_summary])
assert fhex_na_strings_missing_count == 2

assert fhex_missing_count == 0
#na_strings as single list
fhex_na_strings = h2o.import_file(tests.locate(path),
na_strings=["fish", "xyz"])
fhex_na_strings.summary()
fhex__na_strings_col_summary = h2o.H2OConnection.get_json("Frames/" + urllib.quote(fhex_na_strings._id) + "/summary")["frames"][0]["columns"]
fhex_na_strings_missing_count = sum([e["missing_count"] for e in fhex__na_strings_col_summary])
assert fhex_na_strings_missing_count == 2

#na_strings as dictionary with values as string
fhex_na_strings = h2o.import_file(tests.locate(path),
na_strings={"h2": "fish"})
fhex_na_strings.summary()
fhex__na_strings_col_summary = h2o.H2OConnection.get_json("Frames/" + urllib.quote(fhex_na_strings._id) + "/summary")["frames"][0]["columns"]
fhex_na_strings_missing_count = sum([e["missing_count"] for e in fhex__na_strings_col_summary])
assert fhex_na_strings_missing_count == 2

fhex_na_strings = h2o.import_file(tests.locate(path),
na_strings={"h1": "fish"})
fhex_na_strings.summary()
fhex__na_strings_col_summary = h2o.H2OConnection.get_json("Frames/" + urllib.quote(fhex_na_strings._id) + "/summary")["frames"][0]["columns"]
fhex_na_strings_missing_count = sum([e["missing_count"] for e in fhex__na_strings_col_summary])
assert fhex_na_strings_missing_count == 0

#na_strings as dictionary with values as list of strings
fhex_na_strings = h2o.import_file(tests.locate(path),
na_strings={"h2": ["fish","xyz"]})
fhex_na_strings.summary()
fhex__na_strings_col_summary = h2o.H2OConnection.get_json("Frames/" + urllib.quote(fhex_na_strings._id) + "/summary")["frames"][0]["columns"]
fhex_na_strings_missing_count = sum([e["missing_count"] for e in fhex__na_strings_col_summary])
assert fhex_na_strings_missing_count == 2

if __name__ == "__main__":
