Skip to content

Commit

Permalink
More stats (#25)
Browse files Browse the repository at this point in the history
* plain stats encoding

* More explicit dealing with categories and stats derived therefrom
  • Loading branch information
martindurant committed Nov 17, 2016
1 parent 370f9ec commit 9b5133b
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 3 deletions.
15 changes: 15 additions & 0 deletions fastparquet/test/test_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,3 +255,18 @@ def test_cat_filters():

filters = [('cat', '==', 'freda'), ('catnum', '!=', 2.5)]
assert len(pf.to_pandas(filters=filters)) == 1000


def test_statistics(tempdir):
    """Check min/max statistics reported after a hive-scheme round trip.

    Writes a frame holding the same byte values as a plain column ('a'),
    an unordered categorical ('b') and an ordered categorical ('c'), then
    reads the dataset back and inspects ``ParquetFile.statistics``.

    The plain and ordered-categorical columns are expected to report real
    min/max values; the unordered categorical is expected to report None
    for both.
    """
    raw = pd.Series([b'a', b'b', b'c'] * 20)
    unordered = raw.astype('category')
    ordered = raw.astype('category').cat.as_ordered()
    frame = pd.DataFrame({'a': raw, 'b': unordered, 'c': ordered})

    fastparquet.write(tempdir, frame, file_scheme='hive')
    stats = fastparquet.ParquetFile(tempdir).statistics

    # (column, expected max, expected min), checked in the same order as
    # the original hand-written assertions.
    expectations = (
        ('a', [b'c'], [b'a']),
        ('b', [None], [None]),
        ('c', [b'c'], [b'a']),
    )
    for column, highest, lowest in expectations:
        assert stats['max'][column] == highest
        assert stats['min'][column] == lowest
13 changes: 10 additions & 3 deletions fastparquet/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,16 +389,23 @@ def write_column(f, data, selement, encoding='PLAIN', compression=None):
dict_start = f.tell()
write_thrift(f, ph)
f.write(bdata)
try:
max, min = data.max(), data.min()
max = encode['PLAIN'](pd.Series([max]), selement)
min = encode['PLAIN'](pd.Series([min]), selement)
except TypeError:
max, min = None, None
data = data.cat.codes.astype(np.int32)
cats = True
encoding = "PLAIN_DICTIONARY"

start = f.tell()
bdata = definition_data + repetition_data + encode[encoding](data, selement)
try:
max, min = data.max(), data.min()
max = encode['PLAIN'](pd.Series([max], dtype=data.dtype), selement)
min = encode['PLAIN'](pd.Series([min], dtype=data.dtype), selement)
if encoding != 'PLAIN_DICTIONARY':
max, min = data.max(), data.min()
max = encode['PLAIN'](pd.Series([max], dtype=data.dtype), selement)
min = encode['PLAIN'](pd.Series([min], dtype=data.dtype), selement)
except TypeError:
max, min = None, None

Expand Down

0 comments on commit 9b5133b

Please sign in to comment.