move setup functionality from parse to parse_setup
allow col_types to be a dict w/o specifying names
allow na_strings to be a dict
clean up code
ludi317 committed Oct 7, 2015
1 parent b7863c9 commit 56f1c36
Showing 7 changed files with 148 additions and 123 deletions.
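
After this change, col_types and na_strings can be passed as dictionaries keyed by column name directly to import_file (and upload_file). A minimal usage sketch, illustrative only — the file name and the header columns a, b, c below are assumed, not part of this commit:

import h2o
h2o.init()  # connect to a running H2O cluster

# col_types as a dict no longer requires col_names; columns left out of the dict
# keep the type guessed by the parser. na_strings as a dict maps a column name
# to a single NA token or to a list of NA tokens for that column.
frame = h2o.import_file("data.csv",
                        col_types={"a": "enum", "c": "string"},
                        na_strings={"b": ["fish", "xyz"]})
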
4 changes: 2 additions & 2 deletions h2o-py/h2o/frame.py
@@ -91,7 +91,7 @@ def __str__(self): return self._id
def _import_parse(self, file_path, destination_frame, header, separator, column_names, column_types, na_strings):
rawkey = h2o.lazy_import(file_path)
setup = h2o.parse_setup(rawkey, destination_frame, header, separator, column_names, column_types, na_strings)
parse = h2o.parse(setup, _py_tmp_key()) # create a new key
parse = h2o.parse(setup)
self._id = parse["job"]["dest"]["name"]
self._computed=True
self._nrows = int(H2OFrame(expr=ExprNode("nrow", self))._scalar())
@@ -150,7 +150,7 @@ def _handle_text_key(self, text_key, check_header=None):
# perform the parse setup
setup = h2o.parse_setup(text_key)
if check_header is not None: setup["check_header"] = check_header
parse = h2o.parse(setup, _py_tmp_key())
parse = h2o.parse(setup)
self._computed=True
self._id = parse["destination_frame"]["name"]
self._ncols = parse["number_columns"]
184 changes: 92 additions & 92 deletions h2o-py/h2o/h2o.py
@@ -1,7 +1,6 @@
import warnings
warnings.simplefilter('always', DeprecationWarning)
import os
import itertools
import functools
import os.path
import re
@@ -43,19 +42,18 @@ def upload_file(path, destination_frame="", header=(-1, 0, 1), sep="", col_names
----------
path : str
A path specifying the location of the data to upload.
destination_frame : H2OFrame
The name of the H2O Frame in the H2O Cluster.
destination_frame : str, optional
The unique hex key assigned to the imported file. If none is given, a key will automatically be generated.
header : int, optional
-1 means the first line is data, 0 means guess, 1 means first line is header.
sep : string, optional
sep : str, optional
The field separator character. Values on each line of the file are separated by this character. If sep = "", the parser will automatically detect the separator.
col_names : optional
col_names : list, optional
A list of column names for the file.
col_types : optional
A list of types or a dictionary of column names to types to specify whether columns should be forced to a certain type upon import parsing.
na_strings : optional
A list of strings which are to be interpreted as missing values.
col_types : list or dict, optional
A list of types or a dictionary of column names to types to specify whether columns should be forced to a certain type upon import parsing. If a list, the types for elements that are None will be guessed.
na_strings : list or dict, optional
A list of strings, or a list of lists of strings (one list per column), or a dictionary of column names to strings which are to be interpreted as missing values.
:return: A new H2OFrame
"""
fui = {"file": os.path.abspath(path)}
@@ -72,20 +70,20 @@ def import_file(path=None, destination_frame="", parse=True, header=(-1, 0, 1),
----------
path : str
A path specifying the location of the data to import.
destination_frame :
(Optional) The unique hex key assigned to the imported file. If none is given, a key will automatically be generated.
parse :
(Optional) A logical value indicating whether the file should be parsed after import.
header :
(Optional) -1 means the first line is data, 0 means guess, 1 means first line is header.
sep :
(Optional) The field separator character. Values on each line of the file are separated by this character. If sep = "", the parser will automatically detect the separator.
col_names :
(Optional) A list of column names for the file.
col_types :
(Optional) A list of types or a dictionary of column names to types to specify whether columns should be forced to a certain type upon import parsing.
na_strings :
(Optional) A list of strings which are to be interpreted as missing values.
destination_frame : str, optional
The unique hex key assigned to the imported file. If none is given, a key will automatically be generated.
parse : boolean, optional
A logical value indicating whether the file should be parsed after import.
header : int, optional
-1 means the first line is data, 0 means guess, 1 means first line is header.
sep : str, optional
The field separator character. Values on each line of the file are separated by this character. If sep = "", the parser will automatically detect the separator.
col_names : list, optional
A list of column names for the file.
col_types : list or dict, optional
A list of types or a dictionary of column names to types to specify whether columns should be forced to a certain type upon import parsing. If a list, the types for elements that are None will be guessed.
na_strings : list or dict, optional
A list of strings, or a list of lists of strings (one list per column), or a dictionary of column names to strings which are to be interpreted as missing values.
:return: A new H2OFrame
"""
if not parse:
@@ -101,110 +99,112 @@ def parse_setup(raw_frames, destination_frame="", header=(-1, 0, 1), separator="
raw_frames : H2OFrame
A collection of imported file frames
destination_frame :
(Optional) The unique hex key assigned to the imported file. If none is given, a key will automatically be generated.
parse :
(Optional) A logical value indicating whether the file should be parsed after import.
header :
(Optional) -1 means the first line is data, 0 means guess, 1 means first line is header.
sep :
(Optional) The field separator character. Values on each line of the file are separated by this character. If sep = "", the parser will automatically detect the separator.
col_names :
(Optional) A list of column names for the file.
col_types :
(Optional) A list of types or a dictionary of column names to types to specify whether columns should be forced to a certain type upon import parsing.
na_strings :
(Optional) A list of strings which are to be interpreted as missing values.
destination_frame : str, optional
The unique hex key assigned to the imported file. If none is given, a key will automatically be generated.
parse : boolean, optional
A logical value indicating whether the file should be parsed after import.
header : int, optional
-1 means the first line is data, 0 means guess, 1 means first line is header.
sep : str, optional
The field separator character. Values on each line of the file are separated by this character. If sep = "", the parser will automatically detect the separator.
col_names : list, optional
A list of column names for the file.
col_types : list or dict, optional
A list of types or a dictionary of column names to types to specify whether columns should be forced to a certain type upon import parsing. If a list, the types for elements that are None will be guessed.
na_strings : list or dict, optional
A list of strings, or a list of lists of strings (one list per column), or a dictionary of column names to strings which are to be interpreted as missing values.
:return: A ParseSetup "object"
"""

# The H2O backend only accepts things that are quoted
if isinstance(raw_frames, unicode): raw_frames = [raw_frames]
j = H2OConnection.post_json(url_suffix="ParseSetup", source_frames=[_quoted(id) for id in raw_frames])

if destination_frame: j["destination_frame"] = destination_frame
if destination_frame: j["destination_frame"] = _quoted(destination_frame).replace("%",".").replace("&",".") # TODO: really should be url encoding...
if not isinstance(header, tuple):
if header not in (-1, 0, 1): raise ValueError("header should be -1, 0, or 1")
j["check_header"] = header
if separator:
if not isinstance(separator, basestring) or len(separator) != 1: raise ValueError("separator should be a single character string")
j["separator"] = separator
j["separator"] = ord(separator)
if column_names:
if not isinstance(column_names, list): raise ValueError("col_names should be a list")
if len(column_names) != len(j["column_types"]): raise ValueError("length of col_names should be equal to the number of columns")
j["column_names"] = column_names
if column_types:
if isinstance(column_types, dict):
if not column_names: raise ValueError("col_names should be specified if col_types is a dictionary of column names to types")
if set(column_names) != set(column_types.keys()): raise ValueError("col_names and column names in col_types are unequal")
elif not isinstance(column_types, list):
#overwrite dictionary to ordered list of column types. if user didn't specify column type for all names, use type provided by backend
if not j["column_names"]: raise ValueError("column names should be specified")
if not set(column_types.keys()).issubset(set(j["column_names"])): raise ValueError("names specified in col_types is not a subset of the column names")
idx = 0
column_types_list = []
for name in j["column_names"]:
if name in column_types:
column_types_list.append(column_types[name])
else:
column_types_list.append(j["column_types"][idx])
idx += 1
column_types = column_types_list
elif isinstance(column_types, list):
if len(column_types) != len(j["column_types"]): raise ValueError("length of col_types should be equal to the number of columns")
column_types = [column_types[i] if column_types[i] else j["column_types"][i] for i in range(len(column_types))]
else: #not dictionary or list
raise ValueError("col_types should be a list of types or a dictionary of column names to types")
j["column_types"] = column_types
if na_strings: j["na_strings"] = na_strings

if na_strings:
if isinstance(na_strings, dict):
#overwrite dictionary to ordered list of lists of na_strings
if not j["column_names"]: raise ValueError("column names should be specified")
if not set(na_strings.keys()).issubset(set(j["column_names"])): raise ValueError("names specified in na_strings is not a subset of the column names")
j["na_strings"] = [[] for _ in range(len(j["column_names"]))]
for name, na in na_strings.items():
idx = j["column_names"].index(name)
if isinstance(na, basestring): na = [na]
for n in na: j["na_strings"][idx].append(_quoted(n))
elif _is_list_of_lists(na_strings):
if len(na_strings) != len(j["column_types"]): raise ValueError("length of na_strings should be equal to the number of columns")
j["na_strings"] = [[_quoted(na) for na in col] if col is not None else [] for col in na_strings]
elif isinstance(na_strings, list):
j["na_strings"] = [[_quoted(na) for na in na_strings]] * len(j["column_names"])
else: #not a dictionary or list
raise ValueError("na_strings should be a list, a list of lists (one list per column), or a dictionary of column "
"names to strings which are to be interpreted as missing values")

#quote column names and column types also when not specified by user
if j["column_names"]: j["column_names"] = map(_quoted, j["column_names"])
j["column_types"] = map(_quoted, j["column_types"])
return j
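
For reference, a small standalone sketch (not the library code itself) of how parse_setup above expands a partial col_types dict and a na_strings dict against the backend-guessed setup, assuming three columns:

# Assumed example data, mirroring the merging logic in parse_setup above.
column_names  = ["a", "b", "c"]                    # names returned by ParseSetup
guessed_types = ["Numeric", "Numeric", "String"]   # types guessed by the backend
col_types     = {"a": "Enum"}                      # user overrides only column "a"
na_strings    = {"c": ["fish", "xyz"]}             # NA tokens only for column "c"

# col_types dict -> ordered list; columns not in the dict keep the guessed type
merged_types = [col_types.get(name, guessed_types[i]) for i, name in enumerate(column_names)]
# na_strings dict -> one list of NA tokens per column (a bare string value is
# wrapped in a one-element list by the real code)
merged_nas = [na_strings.get(name, []) for name in column_names]

print(merged_types)  # ['Enum', 'Numeric', 'String']
print(merged_nas)    # [[], [], ['fish', 'xyz']]
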


def parse(setup, h2o_name, first_line_is_header=(-1, 0, 1)):
def parse(setup):
"""
Trigger a parse; blocking; removeFrame just keeps the Vecs.
Parameters
----------
setup : dict
The result of calling parse_setup.
h2o_name : H2OFrame
The name of the H2O Frame on the back end.
first_line_is_header : int
-1 means data, 0 means guess, 1 means header.
:return: A new parsed object
"""
# Parse parameters (None values provided by setup)
p = { 'destination_frame' : h2o_name,
'parse_type' : None,
'separator' : None,
'single_quotes' : None,
'check_header' : None,
'number_columns' : None,
'chunk_size' : None,
'delete_on_done' : True,
'blocking' : False,
p = { "destination_frame" : _py_tmp_key(),
"parse_type" : None,
"separator" : None,
"single_quotes" : None,
"check_header" : None,
"number_columns" : None,
"chunk_size" : None,
"delete_on_done" : True,
"blocking" : False,
"column_types" : None
}

if setup["destination_frame"]:
setup["destination_frame"] = _quoted(setup["destination_frame"]).replace("%",".").replace("&",".") # TODO: really should be url encoding...

if isinstance(first_line_is_header, tuple):
first_line_is_header = setup["check_header"]

if isinstance(setup["separator"], basestring):
setup["separator"] = ord(setup["separator"])

if setup["column_types"]: #process column_types before column_names for matching keys before quoting
if isinstance(setup["column_types"], dict):
#overwrite dictionary to ordered list of column types
setup["column_types"] = [_quoted(setup["column_types"][name]) for name in setup["column_names"]]
else: #if list
setup["column_types"] = [_quoted(name) for name in setup["column_types"]]
p["column_types"] = None

if setup["column_names"]:
setup["column_names"] = [_quoted(name) for name in setup["column_names"]]
p["column_names"] = None
if setup["column_names"]: p["column_names"] = None
if setup["na_strings"]: p["na_strings"] = None

if setup["na_strings"]:
if _is_list_of_lists(setup["na_strings"]): setup["na_strings"] = [[_quoted(na) for na in col] if col is not None else [] for col in setup["na_strings"]]
else:
setup["na_strings"] = [_quoted(na) for na in setup["na_strings"]] # quote the strings
setup["na_strings"] = '\"' + str(list(itertools.repeat(setup["na_strings"], len(setup["column_types"])))) + '\"'
p["na_strings"] = None


# update the parse parameters with the parse_setup values
p.update({k: v for k, v in setup.iteritems() if k in p})

p["check_header"] = first_line_is_header

# Extract only 'name' from each src in the array of srcs
p['source_frames'] = [_quoted(src['name']) for src in setup['source_frames']]

@@ -231,7 +231,7 @@ def parse_raw(setup, id=None, first_line_is_header=(-1, 0, 1)):
"""
id = setup["destination_frame"]
fr = H2OFrame()
parsed = parse(setup, id, first_line_is_header)
parsed = parse(setup)
fr._computed = True
fr._id = id
fr._keep = True
Expand All @@ -240,7 +240,7 @@ def parse_raw(setup, id=None, first_line_is_header=(-1, 0, 1)):
fr._col_names = parsed['column_names'] if parsed["column_names"] else ["C" + str(x) for x in range(1,fr._ncols+1)]
return fr

def _quoted(key, replace=True):
def _quoted(key):
if key == None: return "\"\""
#mimic behavior in R to replace "%" and "&" characters, which break the call to /Parse, with "."
# key = key.replace("%", ".")
@@ -1,6 +1,5 @@
import sys
sys.path.insert(1, "../../")
from collections import OrderedDict as ODict
import h2o,tests
from h2o.assembly import *
from h2o.transforms.preprocessing import *
@@ -30,12 +29,12 @@ def lending_club_munging_assembly():
'Enum', 'Numeric', 'Numeric', 'Numeric', 'Numeric', 'Numeric', 'Numeric', 'Numeric',
'Numeric', 'Numeric', 'Enum', 'Numeric', 'Enum', 'Enum', 'Numeric', 'Enum', 'Numeric']

types = ODict(zip(col_names,col_types))
types = dict(zip(col_names,col_types))
types["int_rate"] = "String"
types["revol_util"] = "String"
types["emp_length"] = "String"

data = h2o.import_file(path=small_test, col_types=types.values())
data = h2o.import_file(path=small_test, col_types=types)
data[["int_rate","revol_util","emp_length"]].show()

assembly = H2OAssembly(
@@ -4,7 +4,7 @@
## column_names, and column_types and that certain characters are replaced.
##
################################################################################
import sys, os
import sys
sys.path.insert(1, "../../")
import h2o, tests

@@ -30,7 +30,7 @@ def additional_parameters():
#col_types as dictionary
dest_frame="dev29&hex%"
c_names = ["a", "b", "c"]
c_types = {"c":"string", "a":"enum", "b": "enum"}
c_types = {"c":"string", "a":"string"}

fhex = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"),
destination_frame=dest_frame,
@@ -42,7 +42,9 @@ def additional_parameters():
assert fhex.col_names == c_names
col_summary = h2o.frame(fhex._id)["frames"][0]["columns"]
for i in range(len(col_summary)):
assert col_summary[i]["type"] == c_types[c_names[i]]
name = c_names[i]
if name in c_types:
assert col_summary[i]["type"] == c_types[name]

if __name__ == "__main__":
tests.run_test(sys.argv, additional_parameters)
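
The "certain characters are replaced" behavior exercised by this test comes from parse_setup (see the h2o.py hunk above), which substitutes "." for "%" and "&" in destination_frame. A one-line sketch of that substitution using the test's frame name:

# Sketch of the "%" / "&" substitution applied to destination_frame in parse_setup.
dest_frame = "dev29&hex%"
print(dest_frame.replace("%", ".").replace("&", "."))  # dev29.hex.
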
8 changes: 1 addition & 7 deletions h2o-py/tests/testdir_jira/pyunit_hexdev_29_import_types.py
@@ -8,13 +8,7 @@
import h2o, tests

def continuous_or_categorical():
fraw = h2o.lazy_import(tests.locate("smalldata/jira/hexdev_29.csv"))
fsetup = h2o.parse_setup(fraw)
fsetup["column_types"][0] = "ENUM"
fsetup["column_types"][1] = "ENUM"
fsetup["column_types"][2] = "ENUM"

df_hex = h2o.parse_raw(fsetup)
df_hex = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"), col_types=["enum"]*3)

df_hex.summary()

33 changes: 32 additions & 1 deletion h2o-py/tests/testdir_jira/pyunit_hexdev_29_na_strings.py
@@ -15,14 +15,45 @@ def na_strings():
fhex.summary()
fhex_col_summary = h2o.H2OConnection.get_json("Frames/" + urllib.quote(fhex._id) + "/summary")["frames"][0]["columns"]
fhex_missing_count = sum([e["missing_count"] for e in fhex_col_summary])
assert fhex_missing_count == 0

#na_strings as list of lists
fhex_na_strings = h2o.import_file(tests.locate(path),
na_strings=[[],["fish", "xyz"],[]])
fhex_na_strings.summary()
fhex__na_strings_col_summary = h2o.H2OConnection.get_json("Frames/" + urllib.quote(fhex_na_strings._id) + "/summary")["frames"][0]["columns"]
fhex_na_strings_missing_count = sum([e["missing_count"] for e in fhex__na_strings_col_summary])
assert fhex_na_strings_missing_count == 2

assert fhex_missing_count == 0
#na_strings as single list
fhex_na_strings = h2o.import_file(tests.locate(path),
na_strings=["fish", "xyz"])
fhex_na_strings.summary()
fhex__na_strings_col_summary = h2o.H2OConnection.get_json("Frames/" + urllib.quote(fhex_na_strings._id) + "/summary")["frames"][0]["columns"]
fhex_na_strings_missing_count = sum([e["missing_count"] for e in fhex__na_strings_col_summary])
assert fhex_na_strings_missing_count == 2

#na_strings as dictionary with values as string
fhex_na_strings = h2o.import_file(tests.locate(path),
na_strings={"h2": "fish"})
fhex_na_strings.summary()
fhex__na_strings_col_summary = h2o.H2OConnection.get_json("Frames/" + urllib.quote(fhex_na_strings._id) + "/summary")["frames"][0]["columns"]
fhex_na_strings_missing_count = sum([e["missing_count"] for e in fhex__na_strings_col_summary])
assert fhex_na_strings_missing_count == 2

fhex_na_strings = h2o.import_file(tests.locate(path),
na_strings={"h1": "fish"})
fhex_na_strings.summary()
fhex__na_strings_col_summary = h2o.H2OConnection.get_json("Frames/" + urllib.quote(fhex_na_strings._id) + "/summary")["frames"][0]["columns"]
fhex_na_strings_missing_count = sum([e["missing_count"] for e in fhex__na_strings_col_summary])
assert fhex_na_strings_missing_count == 0

#na_strings as dictionary with values as list of strings
fhex_na_strings = h2o.import_file(tests.locate(path),
na_strings={"h2": ["fish","xyz"]})
fhex_na_strings.summary()
fhex__na_strings_col_summary = h2o.H2OConnection.get_json("Frames/" + urllib.quote(fhex_na_strings._id) + "/summary")["frames"][0]["columns"]
fhex_na_strings_missing_count = sum([e["missing_count"] for e in fhex__na_strings_col_summary])
assert fhex_na_strings_missing_count == 2

if __name__ == "__main__":
