Skip to content

Commit

Permalink
Allow lists of multi-file data-sets (#309)
Browse files Browse the repository at this point in the history
* Allow lists of multi-file data-sets

* Remove test that no longer fails
  • Loading branch information
martindurant committed Feb 27, 2018
1 parent 77e9720 commit fdb23ba
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 17 deletions.
17 changes: 17 additions & 0 deletions fastparquet/test/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,23 @@ def test_drill_list(tempdir):
assert out.dir0.tolist() == ['x'] * 3 + ['y'] * 3


def test_multi_list(tempdir):
df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})
dir1 = os.path.join(tempdir, 'x')
write(dir1, df, file_scheme='hive')
dir2 = os.path.join(tempdir, 'y')
write(dir2, df, file_scheme='hive')
dir3 = os.path.join(tempdir, 'z', 'deep')
write(dir3, df, file_scheme='hive')

pf = ParquetFile([dir1, dir2])
out = pf.to_pandas() # this version may have extra column!
assert out.a.tolist() == ['x', 'y', 'z'] * 2
pf = ParquetFile([dir1, dir2, dir3])
out = pf.to_pandas()
assert out.a.tolist() == ['x', 'y', 'z'] * 3


def test_hive_and_drill_list(tempdir):
df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})
dir1 = os.path.join(tempdir, 'x=0')
Expand Down
6 changes: 0 additions & 6 deletions fastparquet/test/test_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -716,12 +716,6 @@ def test_merge_fail(tempdir):
writer.merge([fn0, fn1])
assert 'schemas' in str(e)

os.remove(fn1)
write(fn1, df0, file_scheme='hive')
with pytest.raises(ValueError) as e:
writer.merge([fn0, fn1])
assert 'multi-file' in str(e)


def test_append_simple(tempdir):
fn = os.path.join(str(tempdir), 'test.parq')
Expand Down
23 changes: 14 additions & 9 deletions fastparquet/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,15 +131,20 @@ def metadata_from_many(file_list, verify_schema=False, open_with=default_open,

for pf, fn in zip(pfs, file_list):
if pf.file_scheme not in ['simple', 'empty']:
# should remove 'empty' datasets up front? Get ignored on load
# anyway.
raise ValueError('Cannot merge multi-file input', fn)
for rg in pf.row_groups:
rg = copy.copy(rg)
rg.columns = [copy.copy(c) for c in rg.columns]
for chunk in rg.columns:
chunk.file_path = fn
fmd.row_groups.append(rg)
for rg in pf.row_groups:
rg = copy.copy(rg)
rg.columns = [copy.copy(c) for c in rg.columns]
for chunk in rg.columns:
chunk.file_path = '/'.join([fn, chunk.file_path])
fmd.row_groups.append(rg)

else:
for rg in pf.row_groups:
rg = copy.copy(rg)
rg.columns = [copy.copy(c) for c in rg.columns]
for chunk in rg.columns:
chunk.file_path = fn
fmd.row_groups.append(rg)

fmd.num_rows = sum(rg.num_rows for rg in fmd.row_groups)
return basepath, fmd
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def fix_exts(sources):
description='Python support for Parquet file format',
author='Martin Durant',
author_email='mdurant@continuum.io',
url='https://github.com/martindurant/fastparquet/',
url='https://github.com/dask/fastparquet/',
license='Apache License 2.0',
classifiers=[
'Development Status :: 3 - Alpha',
Expand All @@ -67,7 +67,7 @@ def fix_exts(sources):
'Programming Language :: Python :: Implementation :: CPython',
],
packages=['fastparquet'],
cmdclass={'build_ext':build_ext},
cmdclass={'build_ext': build_ext},
install_requires=install_requires,
setup_requires=[
'pytest-runner',
Expand Down

0 comments on commit fdb23ba

Please sign in to comment.