Skip to content

Commit

Permalink
Allow UTF8 column names to be read (#342)
Browse files Browse the repository at this point in the history
Was using str() on the names, causing a decode error for anything that is
not ASCII.
  • Loading branch information
martindurant committed Jun 13, 2018
1 parent 7313881 commit 74643be
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 10 deletions.
18 changes: 10 additions & 8 deletions fastparquet/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pandas.core.internals import BlockManager
from pandas import Categorical, DataFrame, Series
from pandas.api.types import is_categorical_dtype
import six
from .util import STR_TYPE


Expand Down Expand Up @@ -56,12 +57,13 @@ def cat(col):
df = OrderedDict()
for t, col in zip(types, cols):
if str(t) == 'category':
df[str(col)] = Categorical([], categories=cat(col), fastpath=True)
df[six.text_type(col)] = Categorical([], categories=cat(col),
fastpath=True)
else:
d = np.empty(0, dtype=t)
if d.dtype.kind == "M" and str(col) in timezones:
d = Series(d).dt.tz_localize(timezones[str(col)])
df[str(col)] = d
if d.dtype.kind == "M" and six.text_type(col) in timezones:
d = Series(d).dt.tz_localize(timezones[six.text_type(col)])
df[six.text_type(col)] = d

df = DataFrame(df)
if not index_types:
Expand All @@ -79,8 +81,8 @@ def cat(col):
views[col+'-catdef'] = index._data
else:
d = np.empty(size, dtype=t)
# if d.dtype.kind == "M" and str(col) in timezones:
# d = Series(d).dt.tz_localize(timezones[str(col)])
# if d.dtype.kind == "M" and six.text_type(col) in timezones:
# d = Series(d).dt.tz_localize(timezones[six.text_type(col)])
index = Index(d)
views[col] = index.values
else:
Expand All @@ -103,8 +105,8 @@ def cat(col):
views[col+'-catdef'] = index._levels[i]
else:
d = np.empty(size, dtype=index_types[i])
# if d.dtype.kind == "M" and str(col) in timezones:
# d = Series(d).dt.tz_localize(timezones[str(col)])
# if d.dtype.kind == "M" and six.text_type(col) in timezones:
# d = Series(d).dt.tz_localize(timezones[six.text_type(col)])
index._levels.append(Index(d))
index._labels.append(np.arange(size, dtype=int))
views[col] = index._levels[i]._data
Expand Down
9 changes: 9 additions & 0 deletions fastparquet/test/test_api.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import io
Expand Down Expand Up @@ -663,3 +664,11 @@ def test_empty_df():
df = p.to_pandas()
assert list(p.columns) == ['a', 'b', 'c', '__index_level_0__']
assert len(df) == 0


def test_unicode_cols(tempdir):
    """Round-trip a frame whose column name contains non-ASCII characters.

    Regression test: column names used to be passed through ``str()``,
    which raised a decode error for any name outside ASCII.
    """
    fn = os.path.join(tempdir, 'test.parq')
    df = pd.DataFrame({u"région": [1, 2, 3]})
    write(fn, df)
    pf = ParquetFile(fn)
    out = pf.to_pandas()
    # Not raising is not enough: the column name and its data must both
    # survive the write/read round trip unchanged.
    assert u"région" in out.columns
    assert out[u"région"].tolist() == [1, 2, 3]
4 changes: 2 additions & 2 deletions fastparquet/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def convert(data, se):


def infer_object_encoding(data):
head = data[:10] if isinstance(data, pd.Index) else data.valid()[:10]
head = data[:10] if isinstance(data, pd.Index) else data.dropna()[:10]
if all(isinstance(i, STR_TYPE) for i in head) and not PY2:
return "utf8"
elif PY2 and all(isinstance(i, unicode) for i in head):
Expand Down Expand Up @@ -404,7 +404,7 @@ def make_definitions(data, no_nulls):
head = temp.so_far().tostring()

block = struct.pack('<i', len(head + out)) + head + out
out = data.valid() # better, data[data.notnull()], from above ?
out = data.dropna() # better, data[data.notnull()], from above ?
return block, out


Expand Down

0 comments on commit 74643be

Please sign in to comment.