Skip to content

Commit

Permalink
Merge pull request #164 from martindurant/index_not_in_columns
Browse files Browse the repository at this point in the history
If metadata gives index, put in columns
  • Loading branch information
martindurant committed Jun 7, 2017
2 parents abb3bd6 + 463749d commit d07d662
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 4 deletions.
14 changes: 10 additions & 4 deletions fastparquet/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,9 +270,12 @@ def iter_row_groups(self, columns=None, categories=None, filters=[],
-------
Generator yielding one Pandas data-frame per row-group
"""
check_column_names(self.columns, columns, categories)
index = self._get_index(index)
if index is None:
index = self._get_index(index)
columns = columns or self.columns
if index and index not in columns:
columns.append(index)
check_column_names(self.columns, columns, categories)
rgs = self.filter_row_groups(filters)
if all(column.file_path is None for rg in self.row_groups
for column in rg.columns):
Expand Down Expand Up @@ -333,11 +336,14 @@ def to_pandas(self, columns=None, categories=None, filters=[],
-------
Pandas data-frame
"""
check_column_names(self.columns, columns, categories, timestamp96)
rgs = self.filter_row_groups(filters)
size = sum(rg.num_rows for rg in rgs)
if index is None:
index = self._get_index(index)
columns = columns or self.columns
index = self._get_index(index)
if index and index not in columns:
columns.append(index)
check_column_names(self.columns, columns, categories)
df, views = self.pre_allocate(size, columns, categories, index,
timestamp96=timestamp96)
start = 0
Expand Down
10 changes: 10 additions & 0 deletions fastparquet/test/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,3 +230,13 @@ def test_filter_stats(tempdir):
pf = ParquetFile(tempdir)
out = pf.to_pandas(filters=[('x', '>=', 5)])
assert out.x.tolist() == [5, 6, 7]


def test_index_not_in_columns(tempdir):
    """Check index handling when the index column is not among ``columns``.

    A frame indexed by ``'a'`` is written with the hive file scheme and
    read back requesting only column ``'b'``. With the default ``index``
    argument the result's index holds the values of ``'a'``; with
    ``index=False`` the result's index is ``0, 1, 2``.
    """
    frame = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})
    frame = frame.set_index('a')
    write(tempdir, frame, file_scheme='hive')
    pf = ParquetFile(tempdir)

    # Only 'b' is requested, yet the index values of 'a' must come back.
    with_index = pf.to_pandas(columns=['b'])
    assert with_index.index.tolist() == ['x', 'y', 'z']

    # Explicitly disabling the index yields positions 0..2 instead.
    without_index = pf.to_pandas(columns=['b'], index=False)
    assert without_index.index.tolist() == [0, 1, 2]

0 comments on commit d07d662

Please sign in to comment.