Skip to content

Commit

Permalink
recognize add'l params (col names,etc) for python objects
Browse files Browse the repository at this point in the history
  • Loading branch information
ludi317 committed Oct 18, 2015
1 parent 78da066 commit a902a8e
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 26 deletions.
35 changes: 19 additions & 16 deletions h2o-py/h2o/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,11 @@ def __init__(self, python_obj=None, file_path=None, raw_id=None, expr=None, dest
self._ast = None
self._data = None # any cached data

kwargs = {k:v for k, v in locals().items() if k in ["destination_frame", "header", "separator", "column_names", "column_types", "na_strings"]}
if expr is not None: self._ast = expr
elif python_obj is not None: self._upload_python_object(python_obj)
elif file_path is not None: self._import_parse(file_path, destination_frame, header, separator, column_names, column_types, na_strings)
elif raw_id is not None: self._handle_text_key(raw_id)
elif python_obj is not None: self._upload_python_object(python_obj, **kwargs)
elif file_path is not None: self._import_parse(file_path, **kwargs)
elif raw_id is not None: self._handle_text_key(raw_id, **kwargs)
else: pass

@staticmethod
Expand All @@ -88,21 +89,22 @@ def get_frame(frame_id):

def __str__(self): return self._id

def _import_parse(self, file_path, destination_frame, header, separator, column_names, column_types, na_strings):
def _import_parse(self, file_path, **kwargs):
rawkey = h2o.lazy_import(file_path)
setup = h2o.parse_setup(rawkey, destination_frame, header, separator, column_names, column_types, na_strings)
setup = h2o.parse_setup(rawkey, **kwargs)
parse = h2o._parse(setup)
self._update_post_parse(parse)
thousands_sep = h2o.H2ODisplay.THOUSANDS
if isinstance(file_path, str): print "Imported {}. Parsed {} rows and {} cols".format(file_path,thousands_sep.format(self._nrows), thousands_sep.format(self._ncols))
else: h2o.H2ODisplay([["File"+str(i+1),f] for i,f in enumerate(file_path)],None, "Parsed {} rows and {} cols".format(thousands_sep.format(self._nrows), thousands_sep.format(self._ncols)))

def _upload_python_object(self, python_obj):
def _upload_python_object(self, python_obj, **kwargs):
"""
Properly handle native python data types. For a discussion of the rules and
permissible data types please refer to the main documentation for H2OFrame.
:param python_obj: A tuple, list, dict, collections.OrderedDict
:param kwargs: Optional arguments for input into parse_setup(), such as column_names and column_types
:return: None
"""
# [] and () cases -- folded together since H2OFrame is mutable
Expand All @@ -128,32 +130,33 @@ def _upload_python_object(self, python_obj):
tmp_file = os.fdopen(tmp_handle,'wb')
# create a new csv writer object thingy
csv_writer = csv.DictWriter(tmp_file, fieldnames=header, restval=None, dialect="excel", extrasaction="ignore", delimiter=",")
csv_writer.writeheader() # write the header
csv_writer.writerows(data_to_write) # write the data
tmp_file.close() # close the streams
self._upload_raw_data(tmp_path) # actually upload the data to H2O
os.remove(tmp_path) # delete the tmp file
if isinstance(python_obj, (dict, collections.OrderedDict)):
csv_writer.writeheader() # write the header
csv_writer.writerows(data_to_write) # write the data
tmp_file.close() # close the streams
self._upload_raw_data(tmp_path, **kwargs) # actually upload the data to H2O
os.remove(tmp_path) # delete the tmp file

def _handle_text_key(self, text_key, check_header=None):
def _handle_text_key(self, text_key, **kwargs):
"""
Handle result of upload_file
:param test_key: A key pointing to raw text to be parsed
:param kwargs: Additional optional arguments for h2o.parse_setup(), such as column_names and column_types.
:return: Part of the H2OFrame constructor.
"""
# perform the parse setup
setup = h2o.parse_setup(text_key)
if check_header is not None: setup["check_header"] = check_header
setup = h2o.parse_setup(text_key, **kwargs)
parse = h2o._parse(setup)
self._update_post_parse(parse)
thousands_sep = h2o.H2ODisplay.THOUSANDS
print "Uploaded {} into cluster with {} rows and {} cols".format(text_key, thousands_sep.format(self._nrows), thousands_sep.format(self._ncols))

def _upload_raw_data(self, tmp_file_path):
def _upload_raw_data(self, tmp_file_path, **kwargs):
fui = {"file": os.path.abspath(tmp_file_path)} # file upload info is the normalized path to a local file
dest_key = _py_tmp_key() # create a random name for the data
h2o.H2OConnection.post_json("PostFile", fui, destination_frame=dest_key) # do the POST -- blocking, and "fast" (does not real data upload)
self._handle_text_key(dest_key, 1) # actually parse the data and setup self._vecs
self._handle_text_key(dest_key, **kwargs) # actually parse the data and setup self._vecs

def __iter__(self):
"""
Expand Down
34 changes: 24 additions & 10 deletions h2o-py/tests/testdir_misc/pyunit_colnames.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,33 @@
import sys
sys.path.insert(1, "../../")
import h2o, tests
import numpy as np

def col_names_check():



iris_wheader = h2o.import_file(tests.locate("smalldata/iris/iris_wheader.csv"))
assert iris_wheader.col_names == ["sepal_len","sepal_wid","petal_len","petal_wid","class"], \
"Expected {0} for column names but got {1}".format(["sepal_len","sepal_wid","petal_len","petal_wid","class"],
iris_wheader.col_names)
iris_wheader = h2o.import_file(tests.locate("smalldata/iris/iris_wheader.csv"))
assert iris_wheader.col_names == ["sepal_len","sepal_wid","petal_len","petal_wid","class"], \
"Expected {0} for column names but got {1}".format(["sepal_len","sepal_wid","petal_len","petal_wid","class"],
iris_wheader.col_names)

iris = h2o.import_file(tests.locate("smalldata/iris/iris.csv"))
assert iris.col_names == ["C1","C2","C3","C4","C5"], "Expected {0} for column names but got " \
"{1}".format(["C1","C2","C3","C4","C5"], iris.col_names)
iris = h2o.import_file(tests.locate("smalldata/iris/iris.csv"))
assert iris.col_names == ["C1","C2","C3","C4","C5"], "Expected {0} for column names but got " \
"{1}".format(["C1","C2","C3","C4","C5"], iris.col_names)

df = h2o.H2OFrame(np.random.randn(100,4).tolist(), column_names=list("ABCD"), column_types=["Enum"]*4)
df.head()
assert df.col_names == list("ABCD"), "Expected {} for column names but got {}".format(list("ABCD"), df.col_names)
assert df.types == {"A": "Enum", "C": "Enum", "B": "Enum", "D": "Enum"}, "Expected {} for column types " \
"but got {}".format({"A": "Enum", "C": "Enum", "B": "Enum", "D": "Enum"},
df.types)

df = h2o.H2OFrame(np.random.randn(100,4).tolist())
df.head()
assert df.col_names == ["C1","C2","C3","C4"], "Expected {} for column names but got {}".format(["C1","C2","C3","C4"]
, df.col_names)
assert df.types == {"C3": "Numeric", "C2": "Numeric", "C1": "Numeric", "C4": "Numeric"}, "Expected {}" \
" for column types but got {}".format({"C3": "Numeric", "C2": "Numeric", "C1": "Numeric",
"C4": "Numeric"}, df.types)

if __name__ == "__main__":
tests.run_test(sys.argv, col_names_check)
tests.run_test(sys.argv, col_names_check)
11 changes: 11 additions & 0 deletions h2o-py/tests/testdir_misc/pyunit_types.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import sys
sys.path.insert(1, "../../")
import h2o, tests
import numpy as np


def pyunit_types():
Expand All @@ -15,5 +16,15 @@ def pyunit_types():

print types2

df = h2o.H2OFrame(np.random.randn(100,4).tolist(), column_names=list("ABCD"), column_types=["Enum"]*4)
assert df.types == {"A": "Enum", "C": "Enum", "B": "Enum", "D": "Enum"}, "Expected {} for column types " \
"but got {}".format({"A": "Enum", "C": "Enum", "B": "Enum", "D": "Enum"}, df.types)

df = h2o.H2OFrame(np.random.randn(100,4).tolist())
assert df.types == {"C3": "Numeric", "C2": "Numeric", "C1": "Numeric", "C4": "Numeric"}, "Expected {}" \
" for column types but got {}".format({"C3": "Numeric", "C2": "Numeric", "C1": "Numeric",
"C4": "Numeric"}, df.types)


if __name__ == "__main__":
tests.run_test(sys.argv, pyunit_types)

0 comments on commit a902a8e

Please sign in to comment.