Allow UTF8 column names to be read (#342)

The code was calling str() on the column names, causing a decode error for anything not ASCII.
dask · Jun 13, 2018 · 74643be · 74643be
1 parent 7313881
commit 74643be
Show file tree

Hide file tree

Showing 3 changed files with 21 additions and 10 deletions.
diff --git a/fastparquet/dataframe.py b/fastparquet/dataframe.py
@@ -5,6 +5,7 @@
 from pandas.core.internals import BlockManager
 from pandas import Categorical, DataFrame, Series
 from pandas.api.types import is_categorical_dtype
+import six
 from .util import STR_TYPE
 
 
@@ -56,12 +57,13 @@ def cat(col):
     df = OrderedDict()
     for t, col in zip(types, cols):
         if str(t) == 'category':
-            df[str(col)] = Categorical([], categories=cat(col), fastpath=True)
+            df[six.text_type(col)] = Categorical([], categories=cat(col),
+                                                 fastpath=True)
         else:
             d = np.empty(0, dtype=t)
-            if d.dtype.kind == "M" and str(col) in timezones:
-                d = Series(d).dt.tz_localize(timezones[str(col)])
-            df[str(col)] = d
+            if d.dtype.kind == "M" and six.text_type(col) in timezones:
+                d = Series(d).dt.tz_localize(timezones[six.text_type(col)])
+            df[six.text_type(col)] = d
 
     df = DataFrame(df)
     if not index_types:
@@ -79,8 +81,8 @@ def cat(col):
             views[col+'-catdef'] = index._data
         else:
             d = np.empty(size, dtype=t)
-            # if d.dtype.kind == "M" and str(col) in timezones:
-            #     d = Series(d).dt.tz_localize(timezones[str(col)])
+            # if d.dtype.kind == "M" and six.text_type(col) in timezones:
+            #     d = Series(d).dt.tz_localize(timezones[six.text_type(col)])
             index = Index(d)
             views[col] = index.values
     else:
@@ -103,8 +105,8 @@ def cat(col):
                 views[col+'-catdef'] = index._levels[i]
             else:
                 d = np.empty(size, dtype=index_types[i])
-                # if d.dtype.kind == "M" and str(col) in timezones:
-                #     d = Series(d).dt.tz_localize(timezones[str(col)])
+                # if d.dtype.kind == "M" and six.text_type(col) in timezones:
+                #     d = Series(d).dt.tz_localize(timezones[six.text_type(col)])
                 index._levels.append(Index(d))
                 index._labels.append(np.arange(size, dtype=int))
                 views[col] = index._levels[i]._data

diff --git a/fastparquet/test/test_api.py b/fastparquet/test/test_api.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 
 import io
@@ -663,3 +664,11 @@ def test_empty_df():
     df = p.to_pandas()
     assert list(p.columns) == ['a', 'b', 'c', '__index_level_0__']
     assert len(df) == 0
+
+
+def test_unicode_cols(tempdir):
+    fn = os.path.join(tempdir, 'test.parq')
+    df = pd.DataFrame({u"région": [1, 2, 3]})
+    write(fn, df)
+    pf = ParquetFile(fn)
+    pf.to_pandas()
diff --git a/fastparquet/writer.py b/fastparquet/writer.py
@@ -214,7 +214,7 @@ def convert(data, se):
 
 
 def infer_object_encoding(data):
-    head = data[:10] if isinstance(data, pd.Index) else data.valid()[:10]
+    head = data[:10] if isinstance(data, pd.Index) else data.dropna()[:10]
     if all(isinstance(i, STR_TYPE) for i in head) and not PY2:
         return "utf8"
     elif PY2 and all(isinstance(i, unicode) for i in head):
@@ -404,7 +404,7 @@ def make_definitions(data, no_nulls):
         head = temp.so_far().tostring()
 
         block = struct.pack('<i', len(head + out)) + head + out
-        out = data.valid()  # better, data[data.notnull()], from above ?
+        out = data.dropna()  # better, data[data.notnull()], from above ?
     return block, out