ARROW-16431: [C++][Python] Improve AppendRowGroups error when schemas…

… differ (apache#14029) Fix [ARROW-16431](https://issues.apache.org/jira/browse/ARROW-16431) Feel free to opine on specific error messages or the implementation as a whole. 👌 Examples ```python # meta1 and meta2 differ in column types meta1.append_row_groups(meta2) *** RuntimeError: AppendRowGroups requires equal schemas. The two columns with index 0 differ. column descriptor = { name: col1, path: col1, physical_type: INT64, converted_type: NONE, logical_type: None, max_definition_level: 1, max_repetition_level: 0, } column descriptor = { name: col2, path: col2, physical_type: INT64, converted_type: NONE, logical_type: None, max_definition_level: 1, max_repetition_level: 0, } # meta1 and meta2 differ in number of columns meta1.append_row_groups(meta2) *** RuntimeError: This schema has 2 columns, other has 1 ``` Authored-by: Miles Granger <miles59923@gmail.com> Signed-off-by: Antoine Pitrou <antoine@python.org>
fatemehp · Oct 17, 2022 · 5e3c85c · 5e3c85c
1 parent 5c57662
commit 5e3c85c
Show file tree

Hide file tree

Showing 4 changed files with 50 additions and 5 deletions.
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
@@ -690,8 +690,10 @@ class FileMetaData::FileMetaDataImpl {
   }
 
   void AppendRowGroups(const std::unique_ptr<FileMetaDataImpl>& other) {
-    if (!schema()->Equals(*other->schema())) {
-      throw ParquetException("AppendRowGroups requires equal schemas.");
+    std::ostringstream diff_output;
+    if (!schema()->Equals(*other->schema(), &diff_output)) {
+      auto msg = "AppendRowGroups requires equal schemas.\n" + diff_output.str();
+      throw ParquetException(msg);
     }
 
     // ARROW-13654: `other` may point to self, be careful not to enter an infinite loop

diff --git a/cpp/src/parquet/schema.cc b/cpp/src/parquet/schema.cc
@@ -794,13 +794,23 @@ void SchemaDescriptor::Init(NodePtr schema) {
   }
 }
 
-bool SchemaDescriptor::Equals(const SchemaDescriptor& other) const {
+bool SchemaDescriptor::Equals(const SchemaDescriptor& other,
+                              std::ostream* diff_output) const {
   if (this->num_columns() != other.num_columns()) {
+    if (diff_output != nullptr) {
+      *diff_output << "This schema has " << this->num_columns() << " columns, other has "
+                   << other.num_columns();
+    }
     return false;
   }
 
   for (int i = 0; i < this->num_columns(); ++i) {
     if (!this->Column(i)->Equals(*other.Column(i))) {
+      if (diff_output != nullptr) {
+        *diff_output << "The two columns with index " << i << " differ." << std::endl
+                     << this->Column(i)->ToString() << std::endl
+                     << other.Column(i)->ToString() << std::endl;
+      }
       return false;
     }
   }

diff --git a/cpp/src/parquet/schema.h b/cpp/src/parquet/schema.h
@@ -432,7 +432,7 @@ class PARQUET_EXPORT SchemaDescriptor {
   // Get the index of a column by its node, or negative value if not found.
   int ColumnIndex(const schema::Node& node) const;
 
-  bool Equals(const SchemaDescriptor& other) const;
+  bool Equals(const SchemaDescriptor& other, std::ostream* diff_output = NULLPTR) const;
 
   // The number of physical columns appearing in the file
   int num_columns() const { return static_cast<int>(leaves_.size()); }

diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py
@@ -18,6 +18,7 @@
 import datetime
 import decimal
 from collections import OrderedDict
+import io
 
 import numpy as np
 import pytest
@@ -433,7 +434,9 @@ def test_write_metadata(tempdir):
     assert parquet_meta_mult.num_row_groups == 2
 
     # append metadata with different schema raises an error
-    with pytest.raises(RuntimeError, match="requires equal schemas"):
+    msg = ("AppendRowGroups requires equal schemas.\n"
+           "The two columns with index 0 differ.")
+    with pytest.raises(RuntimeError, match=msg):
         pq.write_metadata(
             pa.schema([("a", "int32"), ("b", "null")]),
             path, metadata_collector=[parquet_meta, parquet_meta]
@@ -580,3 +583,33 @@ def test_metadata_equals():
     match = "Argument 'other' has incorrect type"
     with pytest.raises(TypeError, match=match):
         original_metadata.equals(None)
+
+
+@pytest.mark.parametrize("t1,t2,expected_error", (
+    ({'col1': range(10)}, {'col1': range(10)}, None),
+    ({'col1': range(10)}, {'col2': range(10)},
+     "The two columns with index 0 differ."),
+    ({'col1': range(10), 'col2': range(10)}, {'col3': range(10)},
+     "This schema has 2 columns, other has 1")
+))
+def test_metadata_append_row_groups_diff(t1, t2, expected_error):
+    table1 = pa.table(t1)
+    table2 = pa.table(t2)
+
+    buf1 = io.BytesIO()
+    buf2 = io.BytesIO()
+    pq.write_table(table1, buf1)
+    pq.write_table(table2, buf2)
+    buf1.seek(0)
+    buf2.seek(0)
+
+    meta1 = pq.ParquetFile(buf1).metadata
+    meta2 = pq.ParquetFile(buf2).metadata
+
+    if expected_error:
+        # Error clearly defines it's happening at append row groups call
+        prefix = "AppendRowGroups requires equal schemas.\n"
+        with pytest.raises(RuntimeError, match=prefix + expected_error):
+            meta1.append_row_groups(meta2)
+    else:
+        meta1.append_row_groups(meta2)