Skip to content

Commit

Permalink
Adds int32 support for object encoding (#268)
Browse files Browse the repository at this point in the history
* Adds int32 support for object encoding

* Add test for int32 object encoding
  • Loading branch information
Bruno Studer authored and martindurant committed Jan 10, 2018
1 parent c2e6a39 commit 4507685
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 3 deletions.
8 changes: 8 additions & 0 deletions fastparquet/test/test_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -891,3 +891,11 @@ def test_bad_object_encoding(tempdir):
assert "INT64" in str(e)
assert "primitive" in str(e)
assert '"a"' in str(e)

def test_object_encoding_int32(tempdir):
    """Check an object column written as 'int32' is an optional INT32."""
    path = os.path.join(tempdir, 'temp.parq')
    # The None entry is what makes the column nullable.
    frame = pd.DataFrame({'a': ['15', None, '2']})
    write(path, frame, object_encoding={'a': 'int32'})

    parquet_file = ParquetFile(path)
    # Index 1 of the schema list is read as column 'a' by this test.
    column_schema = parquet_file._schema[1]
    assert column_schema.type == parquet_thrift.Type.INT32
    assert not parquet_file.schema.is_required('a')
9 changes: 6 additions & 3 deletions fastparquet/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def find_type(data, fixed_text=None, object_encoding=None, times='int64'):
fixed_text: int or None
For str and bytes, the fixed-string length to use. If None, object
column will remain variable length.
object_encoding: None or infer|bytes|utf8|json|bson|bool|int|float
object_encoding: None or infer|bytes|utf8|json|bson|bool|int|int32|float
How to encode object type into bytes. If None, bytes is assumed;
if 'infer', type is guessed from 10 first non-null values.
times: 'int64'|'int96'
Expand Down Expand Up @@ -111,12 +111,15 @@ def find_type(data, fixed_text=None, object_encoding=None, times='int64'):
elif object_encoding == 'int':
type, converted_type, width = (parquet_thrift.Type.INT64, None,
64)
elif object_encoding == 'int32':
type, converted_type, width = (parquet_thrift.Type.INT32, None,
32)
elif object_encoding == 'float':
type, converted_type, width = (parquet_thrift.Type.DOUBLE, None,
64)
else:
raise ValueError('Object encoding (%s) not one of '
'infer|utf8|bytes|json|bson|bool|int|float' %
'infer|utf8|bytes|json|bson|bool|int|int32|float' %
object_encoding)
if fixed_text:
width = fixed_text
Expand Down Expand Up @@ -762,7 +765,7 @@ def write(filename, data, row_group_offsets=50000000,
and the schema must match the input data.
object_encoding: str or {col: type}
For object columns, this gives the data type, so that the values can
be encoded to bytes. Possible values are bytes|utf8|json|bson|bool|int,
be encoded to bytes. Possible values are bytes|utf8|json|bson|bool|int|int32|float,
where bytes is assumed if not specified (i.e., no conversion). The
special value 'infer' will cause the type to be guessed from the first
ten non-null values.
Expand Down

0 comments on commit 4507685

Please sign in to comment.