Parquet: Support reading INT96 column in row group filter (apache#8988)
manuzhang authored and geruh committed Jan 25, 2024
1 parent 0512806 commit 6118f2d
Showing 5 changed files with 69 additions and 31 deletions.
@@ -21,6 +21,7 @@
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;
import java.util.UUID;
import java.util.function.Function;
@@ -112,6 +113,10 @@ static Function<Object, Object> converterFromParquet(PrimitiveType type) {
case FIXED_LEN_BYTE_ARRAY:
case BINARY:
return binary -> ByteBuffer.wrap(((Binary) binary).getBytes());
case INT96:
return binary ->
ParquetUtil.extractTimestampInt96(
ByteBuffer.wrap(((Binary) binary).getBytes()).order(ByteOrder.LITTLE_ENDIAN));
default:
}

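For reference, a minimal sketch of the decoding that ParquetUtil.extractTimestampInt96 is expected to perform on the little-endian buffer above, assuming the conventional Impala/Hive INT96 layout (8 bytes of nanoseconds-of-day followed by a 4-byte Julian day). The Int96Timestamps class and toEpochMicros method are hypothetical names for illustration, not part of Iceberg:

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

// Hypothetical helper: converts an INT96 timestamp (nanos-of-day + Julian day,
// both little-endian) into microseconds since the Unix epoch.
final class Int96Timestamps {
  private static final long UNIX_EPOCH_JULIAN_DAY = 2_440_588L;
  private static final long MICROS_PER_DAY = 86_400_000_000L;

  static long toEpochMicros(ByteBuffer int96) {
    ByteBuffer buffer = int96.duplicate().order(ByteOrder.LITTLE_ENDIAN);
    long nanosOfDay = buffer.getLong(); // bytes 0-7
    long julianDay = buffer.getInt();   // bytes 8-11
    return (julianDay - UNIX_EPOCH_JULIAN_DAY) * MICROS_PER_DAY + nanosOfDay / 1_000L;
  }
}

2,440,588 is the Julian day number of 1970-01-01, so the subtraction yields days since the Unix epoch before the nanos-of-day are folded in as microseconds.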
@@ -453,6 +453,9 @@ private <T> Set<T> dict(int id, Comparator<T> comparator) {
case DOUBLE:
dictSet.add((T) conversion.apply(dict.decodeToDouble(i)));
break;
case INT96:
dictSet.add((T) conversion.apply(dict.decodeToBinary(i)));
break;
default:
throw new IllegalArgumentException(
"Cannot decode dictionary of type: "
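A hedged sketch of how the two changes compose: the dictionary filter decodes each INT96 entry to a Binary, and the new converter turns it into epoch microseconds that can be compared against the filter's timestamp literal. The class name Int96DictionaryValues is hypothetical, and this assumes ParquetUtil.extractTimestampInt96 is visible to the caller (it is invoked the same way in the converter above):

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.HashSet;
import java.util.Set;
import org.apache.iceberg.parquet.ParquetUtil;
import org.apache.parquet.column.Dictionary;

// Hypothetical sketch: materialize every INT96 dictionary entry as epoch microseconds
// so a row group filter can test the values against a timestamp literal.
final class Int96DictionaryValues {
  static Set<Long> asEpochMicros(Dictionary dict) {
    Set<Long> values = new HashSet<>();
    for (int i = 0; i <= dict.getMaxId(); i++) {
      ByteBuffer buffer =
          ByteBuffer.wrap(dict.decodeToBinary(i).getBytes()).order(ByteOrder.LITTLE_ENDIAN);
      values.add(ParquetUtil.extractTimestampInt96(buffer));
    }
    return values;
  }
}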
@@ -2156,22 +2156,32 @@ public void testTableWithInt96Timestamp() throws IOException {
stagingLocation);

// validate we get the expected results back
List<Row> expected = spark.table("parquet_table").select("tmp_col").collectAsList();
List<Row> actual =
spark
.read()
.format("iceberg")
.load(loadLocation(tableIdentifier))
.select("tmp_col")
.collectAsList();
Assertions.assertThat(actual)
.as("Rows must match")
.containsExactlyInAnyOrderElementsOf(expected);
testWithFilter("tmp_col < to_timestamp('2000-01-31 08:30:00')", tableIdentifier);
testWithFilter("tmp_col <= to_timestamp('2000-01-31 08:30:00')", tableIdentifier);
testWithFilter("tmp_col == to_timestamp('2000-01-31 08:30:00')", tableIdentifier);
testWithFilter("tmp_col > to_timestamp('2000-01-31 08:30:00')", tableIdentifier);
testWithFilter("tmp_col >= to_timestamp('2000-01-31 08:30:00')", tableIdentifier);
dropTable(tableIdentifier);
}
}
}

private void testWithFilter(String filterExpr, TableIdentifier tableIdentifier) {
List<Row> expected =
spark.table("parquet_table").select("tmp_col").filter(filterExpr).collectAsList();
List<Row> actual =
spark
.read()
.format("iceberg")
.load(loadLocation(tableIdentifier))
.select("tmp_col")
.filter(filterExpr)
.collectAsList();
Assertions.assertThat(actual)
.as("Rows must match")
.containsExactlyInAnyOrderElementsOf(expected);
}

private GenericData.Record manifestRecord(
Table manifestTable, Long referenceSnapshotId, ManifestFile manifest) {
GenericRecordBuilder builder =
@@ -2154,22 +2154,32 @@ public void testTableWithInt96Timestamp() throws IOException {
stagingLocation);

// validate we get the expected results back
List<Row> expected = spark.table("parquet_table").select("tmp_col").collectAsList();
List<Row> actual =
spark
.read()
.format("iceberg")
.load(loadLocation(tableIdentifier))
.select("tmp_col")
.collectAsList();
Assertions.assertThat(actual)
.as("Rows must match")
.containsExactlyInAnyOrderElementsOf(expected);
testWithFilter("tmp_col < to_timestamp('2000-01-31 08:30:00')", tableIdentifier);
testWithFilter("tmp_col <= to_timestamp('2000-01-31 08:30:00')", tableIdentifier);
testWithFilter("tmp_col == to_timestamp('2000-01-31 08:30:00')", tableIdentifier);
testWithFilter("tmp_col > to_timestamp('2000-01-31 08:30:00')", tableIdentifier);
testWithFilter("tmp_col >= to_timestamp('2000-01-31 08:30:00')", tableIdentifier);
dropTable(tableIdentifier);
}
}
}

private void testWithFilter(String filterExpr, TableIdentifier tableIdentifier) {
List<Row> expected =
spark.table("parquet_table").select("tmp_col").filter(filterExpr).collectAsList();
List<Row> actual =
spark
.read()
.format("iceberg")
.load(loadLocation(tableIdentifier))
.select("tmp_col")
.filter(filterExpr)
.collectAsList();
Assertions.assertThat(actual)
.as("Rows must match")
.containsExactlyInAnyOrderElementsOf(expected);
}

private GenericData.Record manifestRecord(
Table manifestTable, Long referenceSnapshotId, ManifestFile manifest) {
GenericRecordBuilder builder =
@@ -2181,20 +2181,30 @@ public void testTableWithInt96Timestamp() throws IOException {
stagingLocation);

// validate we get the expected results back
List<Row> expected = spark.table("parquet_table").select("tmp_col").collectAsList();
List<Row> actual =
spark
.read()
.format("iceberg")
.load(loadLocation(tableIdentifier))
.select("tmp_col")
.collectAsList();
assertThat(actual).as("Rows must match").containsExactlyInAnyOrderElementsOf(expected);
testWithFilter("tmp_col < to_timestamp('2000-01-31 08:30:00')", tableIdentifier);
testWithFilter("tmp_col <= to_timestamp('2000-01-31 08:30:00')", tableIdentifier);
testWithFilter("tmp_col == to_timestamp('2000-01-31 08:30:00')", tableIdentifier);
testWithFilter("tmp_col > to_timestamp('2000-01-31 08:30:00')", tableIdentifier);
testWithFilter("tmp_col >= to_timestamp('2000-01-31 08:30:00')", tableIdentifier);
dropTable(tableIdentifier);
}
}
}

private void testWithFilter(String filterExpr, TableIdentifier tableIdentifier) {
List<Row> expected =
spark.table("parquet_table").select("tmp_col").filter(filterExpr).collectAsList();
List<Row> actual =
spark
.read()
.format("iceberg")
.load(loadLocation(tableIdentifier))
.select("tmp_col")
.filter(filterExpr)
.collectAsList();
assertThat(actual).as("Rows must match").containsExactlyInAnyOrderElementsOf(expected);
}

private GenericData.Record manifestRecord(
Table manifestTable, Long referenceSnapshotId, ManifestFile manifest) {
GenericRecordBuilder builder =
