Skip to content

Commit

Permalink
Fixed error if a certain partition is empty, when writing a partitioned… (
Browse files Browse the repository at this point in the history
#347)

* Fixed error if a certain partition is empty, when loading a partitioned parquet file

* Fixed an issue where, if a column is DateTime, GroupBy.indices returns keys differently than GroupBy.__iter__.

* Added a test for reading and filtering a partitioned parquet file and then writing it again with some empty partitions
  • Loading branch information
andrethrill authored and martindurant committed Jul 12, 2018
1 parent 74643be commit 25c5df5
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 2 deletions.
16 changes: 16 additions & 0 deletions fastparquet/test/test_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,22 @@ def test_too_many_partition_columns(tempdir):
assert "Cannot include all columns" in str(ve)


def test_read_partitioned_and_write_with_empty_partions(tempdir):
    """Round-trip a filtered, hive-partitioned dataset through the writer.

    A random frame is written partitioned on column ``a``, read back with
    a filter on ``a == 'b'``, written again to the same directory with the
    same partitioning, and finally reloaded without a filter. The reloaded
    frame must equal the filtered one (categorical dtype details are not
    compared).
    """
    n_rows = 1000
    source = pd.DataFrame({
        'a': np.random.choice(['a', 'b', 'c'], size=n_rows),
        'c': np.random.choice([True, False], size=n_rows),
    })

    # First pass: lay the full data set down on disk, split by 'a'.
    writer.write(tempdir, source, partition_on=['a'], file_scheme='hive')

    # Read it back, requesting only the 'b' partition.
    only_b = ParquetFile(tempdir).to_pandas(filters=[('a', '==', 'b')])

    # Second pass: write the filtered frame with the same partitioning.
    # NOTE(review): presumably 'a' comes back as a categorical that still
    # lists 'a' and 'c', which is what yields empty groups here -- confirm.
    writer.write(tempdir, only_b, partition_on=['a'], file_scheme='hive')

    reloaded = ParquetFile(tempdir).to_pandas()

    tm.assert_frame_equal(only_b, reloaded, check_categorical=False)


@pytest.mark.parametrize('compression', ['GZIP',
'gzip',
None,
Expand Down
6 changes: 4 additions & 2 deletions fastparquet/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -918,8 +918,10 @@ def partition_on_columns(data, columns, root_path, partname, fmd,
if not remaining:
raise ValueError("Cannot include all columns in partition_on")
rgs = []
for key in sorted(gb.indices):
df = gb.get_group(key)[remaining]
for key, group in zip(sorted(gb.indices), sorted(gb)):
if group[1].empty:
continue
df = group[1][remaining]
if not isinstance(key, tuple):
key = (key,)
if with_field:
Expand Down

0 comments on commit 25c5df5

Please sign in to comment.