Skip to content

Commit

Permalink
ARROW-16431: [C++][Python] Improve AppendRowGroups error when schemas…
Browse files Browse the repository at this point in the history
… differ (apache#14029)

Fix [ARROW-16431](https://issues.apache.org/jira/browse/ARROW-16431)

Feel free to opine on specific error messages or the implementation as a whole. 👌 

Examples

```python
# meta1 and meta2 differ in column types
meta1.append_row_groups(meta2)
*** RuntimeError: AppendRowGroups requires equal schemas.
The two columns with index 0 differ.
column descriptor = {
  name: col1,
  path: col1,
  physical_type: INT64,
  converted_type: NONE,
  logical_type: None,
  max_definition_level: 1,
  max_repetition_level: 0,
}
column descriptor = {
  name: col2,
  path: col2,
  physical_type: INT64,
  converted_type: NONE,
  logical_type: None,
  max_definition_level: 1,
  max_repetition_level: 0,
}


# meta1 and meta2 differ in number of columns
meta1.append_row_groups(meta2)
*** RuntimeError: This schema has 2 columns, other has 1
```

Authored-by: Miles Granger <miles59923@gmail.com>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
milesgranger authored and fatemehp committed Oct 17, 2022
1 parent 5c57662 commit 5e3c85c
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 5 deletions.
6 changes: 4 additions & 2 deletions cpp/src/parquet/metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -690,8 +690,10 @@ class FileMetaData::FileMetaDataImpl {
}

void AppendRowGroups(const std::unique_ptr<FileMetaDataImpl>& other) {
if (!schema()->Equals(*other->schema())) {
throw ParquetException("AppendRowGroups requires equal schemas.");
std::ostringstream diff_output;
if (!schema()->Equals(*other->schema(), &diff_output)) {
auto msg = "AppendRowGroups requires equal schemas.\n" + diff_output.str();
throw ParquetException(msg);
}

// ARROW-13654: `other` may point to self, be careful not to enter an infinite loop
Expand Down
12 changes: 11 additions & 1 deletion cpp/src/parquet/schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -794,13 +794,23 @@ void SchemaDescriptor::Init(NodePtr schema) {
}
}

bool SchemaDescriptor::Equals(const SchemaDescriptor& other) const {
bool SchemaDescriptor::Equals(const SchemaDescriptor& other,
std::ostream* diff_output) const {
if (this->num_columns() != other.num_columns()) {
if (diff_output != nullptr) {
*diff_output << "This schema has " << this->num_columns() << " columns, other has "
<< other.num_columns();
}
return false;
}

for (int i = 0; i < this->num_columns(); ++i) {
if (!this->Column(i)->Equals(*other.Column(i))) {
if (diff_output != nullptr) {
*diff_output << "The two columns with index " << i << " differ." << std::endl
<< this->Column(i)->ToString() << std::endl
<< other.Column(i)->ToString() << std::endl;
}
return false;
}
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/schema.h
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,7 @@ class PARQUET_EXPORT SchemaDescriptor {
// Get the index of a column by its node, or negative value if not found.
int ColumnIndex(const schema::Node& node) const;

bool Equals(const SchemaDescriptor& other) const;
bool Equals(const SchemaDescriptor& other, std::ostream* diff_output = NULLPTR) const;

// The number of physical columns appearing in the file
int num_columns() const { return static_cast<int>(leaves_.size()); }
Expand Down
35 changes: 34 additions & 1 deletion python/pyarrow/tests/parquet/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import datetime
import decimal
from collections import OrderedDict
import io

import numpy as np
import pytest
Expand Down Expand Up @@ -433,7 +434,9 @@ def test_write_metadata(tempdir):
assert parquet_meta_mult.num_row_groups == 2

# append metadata with different schema raises an error
with pytest.raises(RuntimeError, match="requires equal schemas"):
msg = ("AppendRowGroups requires equal schemas.\n"
"The two columns with index 0 differ.")
with pytest.raises(RuntimeError, match=msg):
pq.write_metadata(
pa.schema([("a", "int32"), ("b", "null")]),
path, metadata_collector=[parquet_meta, parquet_meta]
Expand Down Expand Up @@ -580,3 +583,33 @@ def test_metadata_equals():
match = "Argument 'other' has incorrect type"
with pytest.raises(TypeError, match=match):
original_metadata.equals(None)


@pytest.mark.parametrize("t1,t2,expected_error", (
({'col1': range(10)}, {'col1': range(10)}, None),
({'col1': range(10)}, {'col2': range(10)},
"The two columns with index 0 differ."),
({'col1': range(10), 'col2': range(10)}, {'col3': range(10)},
"This schema has 2 columns, other has 1")
))
def test_metadata_append_row_groups_diff(t1, t2, expected_error):
table1 = pa.table(t1)
table2 = pa.table(t2)

buf1 = io.BytesIO()
buf2 = io.BytesIO()
pq.write_table(table1, buf1)
pq.write_table(table2, buf2)
buf1.seek(0)
buf2.seek(0)

meta1 = pq.ParquetFile(buf1).metadata
meta2 = pq.ParquetFile(buf2).metadata

if expected_error:
# Error clearly defines it's happening at append row groups call
prefix = "AppendRowGroups requires equal schemas.\n"
with pytest.raises(RuntimeError, match=prefix + expected_error):
meta1.append_row_groups(meta2)
else:
meta1.append_row_groups(meta2)

0 comments on commit 5e3c85c

Please sign in to comment.