Skip to content

Commit

Permalink
Merge pull request #161 from martindurant/ordered_dicts
Browse files Browse the repository at this point in the history
Fail on ordering dict statistics
  • Loading branch information
martindurant committed Jun 5, 2017
2 parents 30742fb + bb112fb commit 13b7978
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 11 deletions.
12 changes: 8 additions & 4 deletions fastparquet/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,10 +603,14 @@ def sorted_partitioned_columns(pf):
min, max = s['min'][c], s['max'][c]
if any(x is None for x in min + max):
continue
if (sorted(min) == min and
sorted(max) == max and
all(mx < mn for mx, mn in zip(max[:-1], min[1:]))):
out[c] = {'min': min, 'max': max}
try:
if (sorted(min) == min and
sorted(max) == max and
all(mx < mn for mx, mn in zip(max[:-1], min[1:]))):
out[c] = {'min': min, 'max': max}
except TypeError:
# because some types, e.g. dicts, cannot be sorted/compared
continue
return out


Expand Down
6 changes: 5 additions & 1 deletion fastparquet/test/test_api.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from __future__ import unicode_literals

import os

Expand Down Expand Up @@ -70,18 +71,21 @@ def test_empty_statistics(tempdir):

def test_sorted_row_group_columns(tempdir):
df = pd.DataFrame({'x': [1, 2, 3, 4],
'v': [{'a': 0}, {'b': -1}, {'c': 5}, {'a': 0}],
'y': [1.0, 2.0, 1.0, 2.0],
'z': ['a', 'b', 'c', 'd']})

fn = os.path.join(tempdir, 'foo.parquet')
write(fn, df, row_group_offsets=[0, 2])
write(fn, df, row_group_offsets=[0, 2], object_encoding={'v': 'json',
'z': 'utf8'})

pf = ParquetFile(fn)

result = sorted_partitioned_columns(pf)
expected = {'x': {'min': [1, 3], 'max': [2, 4]},
'z': {'min': ['a', 'c'], 'max': ['b', 'd']}}

# NB: column v should not appear in the result, as dicts are unorderable
assert result == expected


Expand Down
15 changes: 9 additions & 6 deletions fastparquet/test/test_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,13 +174,16 @@ def test_roundtrip_complex(tempdir, scheme,):
])
def test_datetime_roundtrip(tempdir, df, capsys):
fname = os.path.join(tempdir, 'test.parquet')
write(fname, df)

r = ParquetFile(fname)
out, err = capsys.readouterr()
w = False
if 'x' in df and str(df.x.dtype.tz) == 'Europe/London':
# warning happens first time only
assert "UTC" in err
with pytest.warns(UserWarning) as w:
write(fname, df)
else:
write(fname, df)
r = ParquetFile(fname)

if w:
assert "UTC" in str(w.list[0].message)

df2 = r.to_pandas()
if 'x' in df:
Expand Down

0 comments on commit 13b7978

Please sign in to comment.