Skip to content

Commit

Permalink
Fix #2720 (#2729)
Browse files Browse the repository at this point in the history
  • Loading branch information
martindurant authored Oct 1, 2017
1 parent 68d8c95 commit 0f29e7b
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 2 deletions.
5 changes: 3 additions & 2 deletions dask/dataframe/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,9 @@ def _read_fastparquet(fs, paths, myopen, columns=None, filters=None,
dtypes)

for cat in categories:
meta[cat] = pd.Series(pd.Categorical([],
categories=[UNKNOWN_CATEGORIES]))
if cat in meta:
meta[cat] = pd.Series(pd.Categorical([],
categories=[UNKNOWN_CATEGORIES]))

if index_col:
meta = meta.set_index(index_col)
Expand Down
16 changes: 16 additions & 0 deletions dask/dataframe/io/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,3 +536,19 @@ def test_drill_scheme(fn):
out = df.compute()
assert 'dir0' in out
assert (np.unique(out.dir0) == ['test_data1', 'test_data2']).all()


def test_parquet_select_cats(fn):
    """Check column selection on a parquet dataset with a categorical column.

    A frame holding one categorical, one integer and one float column is
    written to ``fn`` as a single-partition dask dataframe. It is then read
    back twice: once asking only for ``'ints'`` (so the categorical column
    is not among the selected columns) and once with no column selection.
    Only the resulting column names are checked.
    """
    n_rows = 100
    labels = ['a', 'b', 'c', 'd', 'e', 'f']

    # The random label values are irrelevant to the assertions below; only
    # the presence of a categorical column in the written data matters.
    frame = pd.DataFrame({
        'categories': pd.Series(np.random.choice(labels, size=n_rows),
                                dtype='category'),
        'ints': pd.Series(list(range(0, n_rows)), dtype='int'),
        'floats': pd.Series(list(range(0, n_rows)), dtype='float'),
    })

    dd.from_pandas(frame, 1).to_parquet(fn)

    # Subset read that leaves the categorical column out.
    ints_only = dd.read_parquet(fn, columns=['ints'])
    assert list(ints_only.columns) == ['ints']

    # Unrestricted read returns every original column, in order.
    everything = dd.read_parquet(fn)
    assert list(everything.columns) == list(frame)

0 comments on commit 0f29e7b

Please sign in to comment.