diff --git a/Cargo.toml b/Cargo.toml index 0c0ef3c1..6f08d9f3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,7 +33,7 @@ semver = "1.0" [dev-dependencies] tempfile = "3.8" # Only used in the example -arrow2 = "0.18" +arrow = { version = "50.0", default-features = false, features = ["ffi"] } [workspace] members = ["gdal-sys"] diff --git a/examples/read_ogr_arrow.rs b/examples/read_ogr_arrow.rs index 42596a7d..7a125453 100644 --- a/examples/read_ogr_arrow.rs +++ b/examples/read_ogr_arrow.rs @@ -1,35 +1,36 @@ -//! Example of reading from OGR to a stream of Arrow arrays +//! Example of reading from OGR to a stream of Arrow array batches //! -//! As of this writing (Feb 2023), there are two competing low-level Arrow libraries in Rust. -//! [`arrow-rs`](https://github.com/apache/arrow-rs) is the "official" one but uses unsafe -//! transmutes. [`arrow2`](https://github.com/jorgecarleitao/arrow2) was written to be a fully safe -//! implementation of Arrow. +//! As of this writing (Jan 2024), there are two competing low-level Arrow libraries in Rust. +//! [`arrow`](https://github.com/apache/arrow-rs) is the "official" one, while +//! [`arrow2`](https://github.com/jorgecarleitao/arrow2) is a less active alternative. //! //! Each library implements the same Arrow memory standard, and each implements the //! ArrowArrayStream interface, so each can integrate with the GDAL `read_arrow_stream` API. //! -//! This example will use `arrow2` but the process should be similar using `arrow-rs`. +//! This example will use `arrow`, but the process is +//! [similar](https://github.com/georust/gdal/blob/87497bf28509ea1b66b8e64000bd6b33fde0f31b/examples/read_ogr_arrow.rs#L23) +//! when using `arrow2`. 
#[cfg(any(major_ge_4, all(major_is_3, minor_ge_6)))] fn run() -> gdal::errors::Result<()> { - use arrow2::array::{BinaryArray, StructArray}; - use arrow2::datatypes::DataType; + use arrow::array::{Array as _, BinaryArray}; + use arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream}; use gdal::cpl::CslStringList; use gdal::vector::*; use gdal::Dataset; use std::path::Path; // Open a dataset and access a layer - let dataset_a = Dataset::open(Path::new("fixtures/roads.geojson"))?; - let mut layer_a = dataset_a.layer(0)?; + let dataset = Dataset::open(Path::new("fixtures/roads.geojson"))?; + let mut layer = dataset.layer(0)?; // Instantiate an `ArrowArrayStream` for OGR to write into - let mut output_stream = Box::new(arrow2::ffi::ArrowArrayStream::empty()); + let mut output_stream = FFI_ArrowArrayStream::empty(); - // Access the unboxed pointer - let output_stream_ptr = &mut *output_stream as *mut arrow2::ffi::ArrowArrayStream; + // Take a pointer to it + let output_stream_ptr = &mut output_stream as *mut FFI_ArrowArrayStream; - // gdal includes its own copy of the ArrowArrayStream struct definition. These are guaranteed + // GDAL includes its own copy of the ArrowArrayStream struct definition. These are guaranteed // to be the same across implementations, but we need to manually cast between the two for Rust // to allow it. let gdal_pointer: *mut gdal::ArrowArrayStream = output_stream_ptr.cast(); @@ -38,50 +39,24 @@ fn run() -> gdal::errors::Result<()> { options.set_name_value("INCLUDE_FID", "NO")?; // Read the layer's data into our provisioned pointer - unsafe { layer_a.read_arrow_stream(gdal_pointer, &options).unwrap() } + unsafe { layer.read_arrow_stream(gdal_pointer, &options)? } - // The rest of this example is arrow2-specific. + // The rest of this example is specific to the `arrow` crate. 
- // `arrow2` has a helper class `ArrowArrayStreamReader` to assist with iterating over the raw + // `arrow` has a helper class `ArrowArrayStreamReader` to assist with iterating over the raw // batches - let mut arrow_stream_reader = - unsafe { arrow2::ffi::ArrowArrayStreamReader::try_new(output_stream).unwrap() }; + let arrow_stream_reader = ArrowArrayStreamReader::try_new(output_stream).unwrap(); // Iterate over the stream until it's finished - // arrow_stream_reader.next() will return None when the stream has no more data - while let Some(maybe_array) = unsafe { arrow_stream_reader.next() } { + for maybe_array in arrow_stream_reader { // Access the contained array let top_level_array = maybe_array.unwrap(); - // The top-level array is a single logical "struct" array which includes all columns of the - // dataset inside it. - assert!( - matches!(top_level_array.data_type(), DataType::Struct(..)), - "Top-level arrays from OGR are expected to be of struct type" - ); - - // Downcast from the Box<dyn Array> to a concrete StructArray - let struct_array = top_level_array - .as_any() - .downcast_ref::<StructArray>() - .unwrap(); - - // Access the underlying column metadata and data - // Clones are cheap because they do not copy the underlying data - let (fields, columns, _validity) = struct_array.clone().into_data(); - - // Find the index of the geometry column - let geom_column_index = fields - .iter() - .position(|field| field.name == "wkb_geometry") - .unwrap(); - - // Pick that column and downcast to a BinaryArray - let geom_column = &columns[geom_column_index]; - let binary_array = geom_column - .as_any() - .downcast_ref::<BinaryArray<i32>>() - .unwrap(); + // Find the geometry column + let geom_column = top_level_array.column_by_name("wkb_geometry").unwrap(); + + // Downcast it to a `BinaryArray` + let binary_array = geom_column.as_any().downcast_ref::<BinaryArray>().unwrap(); // Access the first row as WKB let _wkb_buffer = binary_array.value(0); @@ -97,6 +72,6 @@ fn run() -> gdal::errors::Result<()> { Ok(()) } -fn 
main() { - run().unwrap(); +fn main() -> gdal::errors::Result<()> { + run() }