Merge pull request #179 from martindurant/string_stat_encoding

String stat encoding
dask · Jul 7, 2017 · ff73af0 · ff73af0
2 parents 3a05329 + 4352b7e
commit ff73af0
Show file tree

Hide file tree

Showing 3 changed files with 30 additions and 9 deletions.
diff --git a/fastparquet/api.py b/fastparquet/api.py
@@ -575,13 +575,19 @@ def statistics(obj):
             return rv
         if s.max is not None:
             try:
-                rv['max'] = encoding.read_plain(ensure_bytes(s.max),
-                                                md.type, 1)[0]
+                if md.type == parquet_thrift.Type.BYTE_ARRAY:
+                    rv['max'] = ensure_bytes(s.max)
+                else:
+                    rv['max'] = encoding.read_plain(ensure_bytes(s.max),
+                                                    md.type, 1)[0]
             except:
                 rv['max'] = None
         if s.min is not None:
             try:
-                rv['min'] = encoding.read_plain(ensure_bytes(s.min),
+                if md.type == parquet_thrift.Type.BYTE_ARRAY:
+                    rv['min'] = ensure_bytes(s.min)
+                else:
+                    rv['min'] = encoding.read_plain(ensure_bytes(s.min),
                                                 md.type, 1)[0]
             except:
                 rv['min'] = None
@@ -601,7 +607,7 @@ def statistics(obj):
                  for col in obj.columns}
              for n in ['min', 'max', 'null_count', 'distinct_count']}
         if not L:
-             return d
+            return d
         schema = obj.schema
         for col in obj.row_groups[0].columns:
             column = '.'.join(col.meta_data.path_in_schema)
@@ -611,7 +617,7 @@ def statistics(obj):
                     try:
                         d[name][column] = (
                             [None] if d[name][column] is None
-                                      or None in d[name][column]
+                            or None in d[name][column]
                             else list(converted_types.convert(
                                 np.array(d[name][column]), se))
                         )

diff --git a/fastparquet/test/test_api.py b/fastparquet/test/test_api.py
@@ -81,6 +81,11 @@ def test_sorted_row_group_columns(tempdir):
 
     pf = ParquetFile(fn)
 
+    # string stats should be stored without byte-encoding
+    zcol = [c for c in pf.row_groups[0].columns
+            if c.meta_data.path_in_schema == ['z']][0]
+    assert zcol.meta_data.statistics.min == 'a'
+
     result = sorted_partitioned_columns(pf)
     expected = {'x': {'min': [1, 3], 'max': [2, 4]},
                 'z': {'min': ['a', 'c'], 'max': ['b', 'd']}}

diff --git a/fastparquet/writer.py b/fastparquet/writer.py
@@ -502,8 +502,13 @@ def write_column(f, data, selement, compression=None):
         try:
             if num_nulls == 0:
                 max, min = data.values.max(), data.values.min()
-                max = encode['PLAIN'](pd.Series([max]), selement)
-                min = encode['PLAIN'](pd.Series([min]), selement)
+                if selement.type == parquet_thrift.Type.BYTE_ARRAY:
+                    if selement.converted_type is not None:
+                        max = encode['PLAIN'](pd.Series([max]), selement)[4:]
+                        min = encode['PLAIN'](pd.Series([min]), selement)[4:]
+                else:
+                    max = encode['PLAIN'](pd.Series([max]), selement)
+                    min = encode['PLAIN'](pd.Series([min]), selement)
         except TypeError:
             pass
         data = data.cat.codes
@@ -519,8 +524,13 @@ def write_column(f, data, selement, compression=None):
     try:
         if encoding != 'PLAIN_DICTIONARY' and num_nulls == 0:
             max, min = data.values.max(), data.values.min()
-            max = encode['PLAIN'](pd.Series([max], dtype=data.dtype), selement)
-            min = encode['PLAIN'](pd.Series([min], dtype=data.dtype), selement)
+            if selement.type == parquet_thrift.Type.BYTE_ARRAY:
+                if selement.converted_type is not None:
+                    max = encode['PLAIN'](pd.Series([max]), selement)[4:]
+                    min = encode['PLAIN'](pd.Series([min]), selement)[4:]
+            else:
+                max = encode['PLAIN'](pd.Series([max]), selement)
+                min = encode['PLAIN'](pd.Series([min]), selement)
     except TypeError:
         pass