Skip to content

Commit

Permalink
Use arrow instead of arrow2 in the example
Browse files Browse the repository at this point in the history
  • Loading branch information
lnicola committed Jan 27, 2024
1 parent 87497bf commit 72f6e9b
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 53 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ semver = "1.0"
[dev-dependencies]
tempfile = "3.8"
# Only used in the example
arrow2 = "0.18"
arrow = { version = "50.0", default-features = false, features = ["ffi"] }

[workspace]
members = ["gdal-sys"]
Expand Down
79 changes: 27 additions & 52 deletions examples/read_ogr_arrow.rs
Original file line number Diff line number Diff line change
@@ -1,35 +1,36 @@
//! Example of reading from OGR to a stream of Arrow arrays
//! Example of reading from OGR to a stream of Arrow array batches
//!
//! As of this writing (Feb 2023), there are two competing low-level Arrow libraries in Rust.
//! [`arrow-rs`](https://github.com/apache/arrow-rs) is the "official" one but uses unsafe
//! transmutes. [`arrow2`](https://github.com/jorgecarleitao/arrow2) was written to be a fully safe
//! implementation of Arrow.
//! As of this writing (Jan 2024), there are two competing low-level Arrow libraries in Rust.
//! [`arrow`](https://github.com/apache/arrow-rs) is the "official" one, while
//! [`arrow2`](https://github.com/jorgecarleitao/arrow2) is a less active alternative.
//!
//! Each library implements the same Arrow memory standard, and each implements the
//! ArrowArrayStream interface, so each can integrate with the GDAL `read_arrow_stream` API.
//!
//! This example will use `arrow2` but the process should be similar using `arrow-rs`.
//! This example will use `arrow`, but the process is
//! [similar](https://github.com/georust/gdal/blob/87497bf28509ea1b66b8e64000bd6b33fde0f31b/examples/read_ogr_arrow.rs#L23)
//! when using `arrow2`.

#[cfg(any(major_ge_4, all(major_is_3, minor_ge_6)))]
fn run() -> gdal::errors::Result<()> {
use arrow2::array::{BinaryArray, StructArray};
use arrow2::datatypes::DataType;
use arrow::array::{Array as _, BinaryArray};
use arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream};
use gdal::cpl::CslStringList;
use gdal::vector::*;
use gdal::Dataset;
use std::path::Path;

// Open a dataset and access a layer
let dataset_a = Dataset::open(Path::new("fixtures/roads.geojson"))?;
let mut layer_a = dataset_a.layer(0)?;
let dataset = Dataset::open(Path::new("fixtures/roads.geojson"))?;
let mut layer = dataset.layer(0)?;

// Instantiate an `ArrowArrayStream` for OGR to write into
let mut output_stream = Box::new(arrow2::ffi::ArrowArrayStream::empty());
let mut output_stream = FFI_ArrowArrayStream::empty();

// Access the unboxed pointer
let output_stream_ptr = &mut *output_stream as *mut arrow2::ffi::ArrowArrayStream;
// Take a pointer to it
let output_stream_ptr = &mut output_stream as *mut FFI_ArrowArrayStream;

// gdal includes its own copy of the ArrowArrayStream struct definition. These are guaranteed
// GDAL includes its own copy of the ArrowArrayStream struct definition. These are guaranteed
// to be the same across implementations, but we need to manually cast between the two for Rust
// to allow it.
let gdal_pointer: *mut gdal::ArrowArrayStream = output_stream_ptr.cast();
Expand All @@ -38,50 +39,24 @@ fn run() -> gdal::errors::Result<()> {
options.set_name_value("INCLUDE_FID", "NO")?;

// Read the layer's data into our provisioned pointer
unsafe { layer_a.read_arrow_stream(gdal_pointer, &options).unwrap() }
unsafe { layer.read_arrow_stream(gdal_pointer, &options)? }

// The rest of this example is arrow2-specific.
// The rest of this example is specific to the `arrow` crate.

// `arrow2` has a helper class `ArrowArrayStreamReader` to assist with iterating over the raw
// `arrow` has a helper struct `ArrowArrayStreamReader` to assist with iterating over the raw
// batches
let mut arrow_stream_reader =
unsafe { arrow2::ffi::ArrowArrayStreamReader::try_new(output_stream).unwrap() };
let arrow_stream_reader = ArrowArrayStreamReader::try_new(output_stream).unwrap();

// Iterate over the stream until it's finished
// arrow_stream_reader.next() will return None when the stream has no more data
while let Some(maybe_array) = unsafe { arrow_stream_reader.next() } {
for maybe_array in arrow_stream_reader {
// Access the contained array
let top_level_array = maybe_array.unwrap();

// The top-level array is a single logical "struct" array which includes all columns of the
// dataset inside it.
assert!(
matches!(top_level_array.data_type(), DataType::Struct(..)),
"Top-level arrays from OGR are expected to be of struct type"
);

// Downcast from the Box<dyn Array> to a concrete StructArray
let struct_array = top_level_array
.as_any()
.downcast_ref::<StructArray>()
.unwrap();

// Access the underlying column metadata and data
// Clones are cheap because they do not copy the underlying data
let (fields, columns, _validity) = struct_array.clone().into_data();

// Find the index of the geometry column
let geom_column_index = fields
.iter()
.position(|field| field.name == "wkb_geometry")
.unwrap();

// Pick that column and downcast to a BinaryArray
let geom_column = &columns[geom_column_index];
let binary_array = geom_column
.as_any()
.downcast_ref::<BinaryArray<i32>>()
.unwrap();
// Find the geometry column
let geom_column = top_level_array.column_by_name("wkb_geometry").unwrap();

// Downcast it to a `BinaryArray`
let binary_array = geom_column.as_any().downcast_ref::<BinaryArray>().unwrap();

// Access the first row as WKB
let _wkb_buffer = binary_array.value(0);
Expand All @@ -97,6 +72,6 @@ fn run() -> gdal::errors::Result<()> {
Ok(())
}

fn main() {
run().unwrap();
fn main() -> gdal::errors::Result<()> {
run()
}

0 comments on commit 72f6e9b

Please sign in to comment.