ARROW-17525: [Java] Read ORC files using NativeDatasetFactory (apache#13973)

Support the ORC file format in the Java Dataset API.

Authored-by: igor.suhorukov <igor.suhorukov@gmail.com>
Signed-off-by: David Li <li.davidm96@gmail.com>
igor-suhorukov authored and drin committed Sep 7, 2022
1 parent 751ea0e commit 133dcd8
Showing 5 changed files with 108 additions and 0 deletions.
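Taken together, the changes below let Java callers point the existing dataset factory at ORC data. A minimal sketch of the resulting API surface, distilled from the test added in this commit (the file URI is a placeholder and error handling is elided):

import org.apache.arrow.dataset.file.FileFormat;
import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
import org.apache.arrow.dataset.jni.NativeMemoryPool;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.types.pojo.Schema;

public class OrcDatasetSketch {
  public static void main(String[] args) throws Exception {
    try (RootAllocator allocator = new RootAllocator()) {
      // FileFormat.ORC is the enum constant introduced by this commit.
      FileSystemDatasetFactory factory = new FileSystemDatasetFactory(
          allocator, NativeMemoryPool.getDefault(), FileFormat.ORC,
          "file:///tmp/example.orc"); // placeholder URI
      // Schema inference now works from ORC footers as well.
      Schema schema = factory.inspect();
      System.out.println(schema);
      factory.close();
    }
  }
}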
32 changes: 32 additions & 0 deletions java/dataset/pom.xml
@@ -109,6 +109,38 @@
<artifactId>jackson-databind</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.arrow.orc</groupId>
<artifactId>arrow-orc</artifactId>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.orc</groupId>
<artifactId>orc-core</artifactId>
<version>1.7.6</version>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-storage-api</artifactId>
<version>2.8.1</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<resources>
2 changes: 2 additions & 0 deletions java/dataset/src/main/cpp/jni_wrapper.cc
@@ -91,6 +91,8 @@ arrow::Result<std::shared_ptr<arrow::dataset::FileFormat>> GetFileFormat(
return std::make_shared<arrow::dataset::ParquetFileFormat>();
case 1:
return std::make_shared<arrow::dataset::IpcFileFormat>();
case 2:
return std::make_shared<arrow::dataset::OrcFileFormat>();
default:
std::string error_message =
"illegal file format id: " + std::to_string(file_format_id);
1 change: 1 addition & 0 deletions java/dataset/src/main/java/org/apache/arrow/dataset/file/FileFormat.java
@@ -23,6 +23,7 @@
public enum FileFormat {
PARQUET(0),
ARROW_IPC(1),
ORC(2),
NONE(-1);

private final int id;
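The integer passed to each constant is the contract with the JNI layer: it must line up with the case labels in GetFileFormat in jni_wrapper.cc above, so ORC's id of 2 resolves to arrow::dataset::OrcFileFormat on the native side. A tiny check, assuming the enum's accessor below the fold is named id() (the hunk is truncated here):

import org.apache.arrow.dataset.file.FileFormat;

public class FormatIdContract {
  public static void main(String[] args) {
    // The id crosses the JNI boundary and selects the C++ FileFormat;
    // 2 must map to OrcFileFormat in the native switch.
    System.out.println(FileFormat.ORC.id()); // expected: 2
  }
}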
42 changes: 42 additions & 0 deletions java/dataset/src/test/java/org/apache/arrow/dataset/OrcWriteSupport.java
@@ -0,0 +1,42 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.arrow.dataset;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class OrcWriteSupport {
public static void writeTempFile(TypeDescription orcSchema, Path path, Integer[] values) throws IOException {
Writer writer = OrcFile.createWriter(path, OrcFile.writerOptions(new Configuration()).setSchema(orcSchema));
VectorizedRowBatch batch = orcSchema.createRowBatch();
LongColumnVector longColumnVector = (LongColumnVector) batch.cols[0];
for (int idx = 0; idx < values.length; idx++) {
longColumnVector.vector[idx] = values[idx];
}
batch.size = values.length;
writer.addRowBatch(batch);
writer.close();
}
}
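For reference, a short usage sketch for the helper above: write two rows, then read them back with the plain ORC reader to confirm the file round-trips. The path is an assumption; in the test below the file is created under a JUnit temporary folder instead.

import org.apache.arrow.dataset.OrcWriteSupport;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;

public class OrcRoundTripSketch {
  public static void main(String[] args) throws Exception {
    Path path = new Path("/tmp/round-trip.orc"); // assumed location; must not already exist
    TypeDescription schema = TypeDescription.fromString("struct<ints:int>");
    OrcWriteSupport.writeTempFile(schema, path, new Integer[]{1, 2});

    // Read back with the plain ORC reader to confirm the file is well formed.
    Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(new Configuration()));
    RecordReader rows = reader.rows();
    VectorizedRowBatch batch = reader.getSchema().createRowBatch();
    while (rows.nextBatch(batch)) {
      LongColumnVector ints = (LongColumnVector) batch.cols[0];
      for (int i = 0; i < batch.size; i++) {
        System.out.println(ints.vector[i]);
      }
    }
    rows.close();
  }
}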
31 changes: 31 additions & 0 deletions java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java
@@ -37,6 +37,7 @@
import java.util.concurrent.Executors;
import java.util.stream.Collectors;

import org.apache.arrow.dataset.OrcWriteSupport;
import org.apache.arrow.dataset.ParquetWriteSupport;
import org.apache.arrow.dataset.jni.NativeDataset;
import org.apache.arrow.dataset.jni.NativeInstanceReleasedException;
@@ -59,6 +60,8 @@
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.hadoop.fs.Path;
import org.apache.orc.TypeDescription;
import org.junit.Assert;
import org.junit.ClassRule;
import org.junit.Test;
@@ -357,6 +360,34 @@ public void testBaseArrowIpcRead() throws Exception {
AutoCloseables.close(factory);
}

@Test
public void testBaseOrcRead() throws Exception {
String dataName = "test-orc";
String basePath = TMP.getRoot().getAbsolutePath();

TypeDescription orcSchema = TypeDescription.fromString("struct<ints:int>");
Path path = new Path(basePath, dataName);
OrcWriteSupport.writeTempFile(orcSchema, path, new Integer[]{Integer.MIN_VALUE, Integer.MAX_VALUE});

String orcDatasetUri = new File(basePath, dataName).toURI().toString();
FileSystemDatasetFactory factory = new FileSystemDatasetFactory(rootAllocator(), NativeMemoryPool.getDefault(),
FileFormat.ORC, orcDatasetUri);
ScanOptions options = new ScanOptions(100);
Schema schema = inferResultSchemaFromFactory(factory, options);
List<ArrowRecordBatch> datum = collectResultFromFactory(factory, options);

assertSingleTaskProduced(factory, options);
assertEquals(1, datum.size());
assertEquals(1, schema.getFields().size());
assertEquals("ints", schema.getFields().get(0).getName());

String expectedJsonUnordered = "[[2147483647], [-2147483648]]";
checkParquetReadResult(schema, expectedJsonUnordered, datum);

AutoCloseables.close(datum);
AutoCloseables.close(factory);
}

private void checkParquetReadResult(Schema schema, String expectedJson, List<ArrowRecordBatch> actual)
throws IOException {
final ObjectMapper json = new ObjectMapper();
