Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions src/duckdb_py/arrow/arrow_array_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,16 @@ void PythonTableArrowArrayStreamFactory::GetSchemaInternal(py::handle arrow_obj_
}

void PythonTableArrowArrayStreamFactory::GetSchema(uintptr_t factory_ptr, ArrowSchemaWrapper &schema) {
py::gil_scoped_acquire acquire;
auto factory = static_cast<PythonTableArrowArrayStreamFactory *>(reinterpret_cast<void *>(factory_ptr)); // NOLINT

// Fast path: return cached schema without GIL or Python calls
if (factory->schema_cached) {
schema.arrow_schema = factory->cached_schema; // struct copy
schema.arrow_schema.release = nullptr; // non-owning copy
return;
}

py::gil_scoped_acquire acquire;
D_ASSERT(factory->arrow_object);
py::handle arrow_obj_handle(factory->arrow_object);

Expand All @@ -188,8 +196,11 @@ void PythonTableArrowArrayStreamFactory::GetSchema(uintptr_t factory_ptr, ArrowS
auto schema_capsule = arrow_obj_handle.attr("__arrow_c_schema__")();
auto capsule = py::reinterpret_borrow<py::capsule>(schema_capsule);
auto arrow_schema = capsule.get_pointer<struct ArrowSchema>();
schema.arrow_schema = *arrow_schema;
arrow_schema->release = nullptr; // take ownership
factory->cached_schema = *arrow_schema; // factory takes ownership
arrow_schema->release = nullptr;
factory->schema_cached = true;
schema.arrow_schema = factory->cached_schema; // non-owning copy
schema.arrow_schema.release = nullptr;
return;
}
// Otherwise try to use .schema with _export_to_c
Expand All @@ -211,6 +222,13 @@ void PythonTableArrowArrayStreamFactory::GetSchema(uintptr_t factory_ptr, ArrowS
return; // stream_capsule goes out of scope, stream released by capsule destructor
}
GetSchemaInternal(arrow_obj_handle, schema);

// Cache for Table and Dataset (immutable schema)
if (type == PyArrowObjectType::Table || type == PyArrowObjectType::Dataset) {
factory->cached_schema = schema.arrow_schema; // factory takes ownership
schema.arrow_schema.release = nullptr; // caller gets non-owning copy
factory->schema_cached = true;
}
}

} // namespace duckdb
13 changes: 12 additions & 1 deletion src/duckdb_py/include/duckdb_python/arrow/arrow_array_stream.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,15 @@ class PythonTableArrowArrayStreamFactory {
public:
explicit PythonTableArrowArrayStreamFactory(PyObject *arrow_table, const ClientProperties &client_properties_p,
PyArrowObjectType arrow_type_p)
: arrow_object(arrow_table), client_properties(client_properties_p), cached_arrow_type(arrow_type_p) {};
: arrow_object(arrow_table), client_properties(client_properties_p), cached_arrow_type(arrow_type_p) {
cached_schema.release = nullptr;
}

~PythonTableArrowArrayStreamFactory() {
if (cached_schema.release) {
cached_schema.release(&cached_schema);
}
}

//! Produces an Arrow Scanner, should be only called once when initializing Scan States
static unique_ptr<ArrowArrayStreamWrapper> Produce(uintptr_t factory, ArrowStreamParameters &parameters);
Expand All @@ -77,6 +85,9 @@ class PythonTableArrowArrayStreamFactory {
const PyArrowObjectType cached_arrow_type;

private:
ArrowSchema cached_schema;
bool schema_cached = false;

static py::object ProduceScanner(py::object &arrow_scanner, py::handle &arrow_obj_handle,
ArrowStreamParameters &parameters, const ClientProperties &client_properties);
};
Expand Down
Loading