Skip to content

Commit

Permalink
Merge pull request #179 from martindurant/string_stat_encoding
Browse files Browse the repository at this point in the history
String stat encoding
  • Loading branch information
martindurant committed Jul 7, 2017
2 parents 3a05329 + 4352b7e commit ff73af0
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 9 deletions.
16 changes: 11 additions & 5 deletions fastparquet/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -575,13 +575,19 @@ def statistics(obj):
return rv
if s.max is not None:
try:
rv['max'] = encoding.read_plain(ensure_bytes(s.max),
md.type, 1)[0]
if md.type == parquet_thrift.Type.BYTE_ARRAY:
rv['max'] = ensure_bytes(s.max)
else:
rv['max'] = encoding.read_plain(ensure_bytes(s.max),
md.type, 1)[0]
except:
rv['max'] = None
if s.min is not None:
try:
rv['min'] = encoding.read_plain(ensure_bytes(s.min),
if md.type == parquet_thrift.Type.BYTE_ARRAY:
rv['min'] = ensure_bytes(s.min)
else:
rv['min'] = encoding.read_plain(ensure_bytes(s.min),
md.type, 1)[0]
except:
rv['min'] = None
Expand All @@ -601,7 +607,7 @@ def statistics(obj):
for col in obj.columns}
for n in ['min', 'max', 'null_count', 'distinct_count']}
if not L:
return d
return d
schema = obj.schema
for col in obj.row_groups[0].columns:
column = '.'.join(col.meta_data.path_in_schema)
Expand All @@ -611,7 +617,7 @@ def statistics(obj):
try:
d[name][column] = (
[None] if d[name][column] is None
or None in d[name][column]
or None in d[name][column]
else list(converted_types.convert(
np.array(d[name][column]), se))
)
Expand Down
5 changes: 5 additions & 0 deletions fastparquet/test/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,11 @@ def test_sorted_row_group_columns(tempdir):

pf = ParquetFile(fn)

# string stats should be stored without byte-encoding
zcol = [c for c in pf.row_groups[0].columns
if c.meta_data.path_in_schema == ['z']][0]
assert zcol.meta_data.statistics.min == 'a'

result = sorted_partitioned_columns(pf)
expected = {'x': {'min': [1, 3], 'max': [2, 4]},
'z': {'min': ['a', 'c'], 'max': ['b', 'd']}}
Expand Down
18 changes: 14 additions & 4 deletions fastparquet/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,8 +502,13 @@ def write_column(f, data, selement, compression=None):
try:
if num_nulls == 0:
max, min = data.values.max(), data.values.min()
max = encode['PLAIN'](pd.Series([max]), selement)
min = encode['PLAIN'](pd.Series([min]), selement)
if selement.type == parquet_thrift.Type.BYTE_ARRAY:
if selement.converted_type is not None:
max = encode['PLAIN'](pd.Series([max]), selement)[4:]
min = encode['PLAIN'](pd.Series([min]), selement)[4:]
else:
max = encode['PLAIN'](pd.Series([max]), selement)
min = encode['PLAIN'](pd.Series([min]), selement)
except TypeError:
pass
data = data.cat.codes
Expand All @@ -519,8 +524,13 @@ def write_column(f, data, selement, compression=None):
try:
if encoding != 'PLAIN_DICTIONARY' and num_nulls == 0:
max, min = data.values.max(), data.values.min()
max = encode['PLAIN'](pd.Series([max], dtype=data.dtype), selement)
min = encode['PLAIN'](pd.Series([min], dtype=data.dtype), selement)
if selement.type == parquet_thrift.Type.BYTE_ARRAY:
if selement.converted_type is not None:
max = encode['PLAIN'](pd.Series([max]), selement)[4:]
min = encode['PLAIN'](pd.Series([min]), selement)[4:]
else:
max = encode['PLAIN'](pd.Series([max]), selement)
min = encode['PLAIN'](pd.Series([min]), selement)
except TypeError:
pass

Expand Down

0 comments on commit ff73af0

Please sign in to comment.