Merged
Changes from all commits (41 commits)
e8dbd62
ENH: add support for parquet list columns
theroggy Nov 18, 2025
d812965
Skip parquet if driver not available
theroggy Nov 18, 2025
8a92f9a
Update test_geopandas_io.py
theroggy Nov 18, 2025
f98c0e8
Fix linter issue
theroggy Nov 18, 2025
b87e799
Skip some tests if pyarrow.parquet is not available
theroggy Nov 18, 2025
89df381
Skip tests if parquet file cannot be created
theroggy Nov 18, 2025
3d637e8
Use a saved version of the test parquet files as it is a problem to a…
theroggy Nov 18, 2025
a429118
Give error instead of skip if test file cannot be created
theroggy Nov 18, 2025
68fd8f9
Fix tests for minimal ci env
theroggy Nov 18, 2025
9bb9472
Add libgdal-arrow-parquet to the lastest env to be able to run parque…
theroggy Nov 18, 2025
e3f2555
Skip test for nested columns in parquet without arrow for now
theroggy Nov 18, 2025
880182e
Update test_geopandas_io.py
theroggy Nov 18, 2025
fc5a4d7
Update latest.yml
theroggy Nov 18, 2025
74fdf5c
Update latest.yml
theroggy Nov 18, 2025
c64c8d9
Update latest.yml
theroggy Nov 18, 2025
59fc5dc
Only check GDAL_HAS_PARQUET in the tests
theroggy Nov 18, 2025
6c7738f
Apply feedback
theroggy Nov 18, 2025
30513c5
Update conftest.py
theroggy Nov 19, 2025
425c5d0
Update test_geopandas_io.py
theroggy Nov 19, 2025
cb115b1
Remove some redundant code
theroggy Nov 19, 2025
099f19f
Only import pyarrow, parquet if needed in conftest
theroggy Nov 19, 2025
223e686
Update conftest.py
theroggy Nov 19, 2025
4f2ac0e
Delay calling list_drivers
theroggy Nov 19, 2025
036df1e
Try importing pyarrow sooner
theroggy Nov 19, 2025
27d1bee
rollback previous commit as it didn't help
theroggy Nov 19, 2025
01a72be
Comment new tests
theroggy Nov 19, 2025
a100f2f
Try uncommenting GDAL_HAS_PARQUET
theroggy Nov 19, 2025
9aa348a
Update test_geopandas_io.py
theroggy Nov 19, 2025
0060a3f
Move test_read_arrow_dtypes up + reenable new tests
theroggy Nov 19, 2025
97634cc
Comment new tests again
theroggy Nov 19, 2025
0650ef9
Comment code to create parquet test files
theroggy Nov 19, 2025
9963d0b
Activate new tests again
theroggy Nov 19, 2025
7916ebc
Move new tests down
theroggy Nov 19, 2025
e4ef8bb
Add fix for ArrowDtype error
theroggy Nov 19, 2025
7c62698
Small improvements
theroggy Nov 19, 2025
a4f95f5
Revert reordering of tests
theroggy Nov 19, 2025
a996beb
Revert commenting parquet file creation code
theroggy Nov 19, 2025
da4f693
Update test_geopandas_io.py
theroggy Nov 19, 2025
45ee07a
Merge remote-tracking branch 'upstream/main' into ENH-add-support-for…
jorisvandenbossche Nov 20, 2025
424341d
fixup merge
jorisvandenbossche Nov 20, 2025
0835b58
Add link to gdal issue for None values in list
theroggy Nov 20, 2025
2 changes: 1 addition & 1 deletion CHANGES.md
@@ -11,7 +11,7 @@
### Improvements

- Add listing of GDAL data types and subtypes to `read_info` (#556).
-- Add support to read list fields without arrow (#558).
+- Add support to read list fields without arrow (#558, #597).

### Bug fixes

26 changes: 22 additions & 4 deletions pyogrio/geopandas.py
@@ -332,9 +332,19 @@ def read_dataframe(

        del table

-        for ogr_subtype, c in zip(meta["ogr_subtypes"], df.columns):
+        for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"]):
            if ogr_subtype == "OFSTJSON":
-                df[c] = df[c].map(json.loads, na_action="ignore")
+                # When reading .parquet files with arrow, JSON fields are already
+                # parsed, so only parse if the values are strings.
+                dtype = pd.api.types.infer_dtype(df[c])
+                if dtype == "string":
+                    try:
+                        df[c] = df[c].map(json.loads, na_action="ignore")
+                    except Exception:
+                        warnings.warn(
+                            f"Could not parse column '{c}' as JSON; leaving as string",
+                            stacklevel=2,
+                        )

        if fid_as_index:
            df = df.set_index(meta["fid_column"])
@@ -378,9 +388,17 @@
    for dtype, c in zip(meta["dtypes"], df.columns):
        if dtype.startswith("datetime"):
            df[c] = _try_parse_datetime(df[c])
-    for ogr_subtype, c in zip(meta["ogr_subtypes"], df.columns):
+    for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"]):
        if ogr_subtype == "OFSTJSON":
-            df[c] = df[c].map(json.loads, na_action="ignore")
+            dtype = pd.api.types.infer_dtype(df[c])
+            if dtype == "string":
+                try:
+                    df[c] = df[c].map(json.loads, na_action="ignore")
+                except Exception:
+                    warnings.warn(
+                        f"Could not parse column '{c}' as JSON; leaving as string",
+                        stacklevel=2,
+                    )

    if geometry is None or not read_geometry:
        return df
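The hunks above gate `json.loads` on `pd.api.types.infer_dtype`, so columns that are already materialized as Python lists (e.g. when a .parquet file is read through Arrow) are not parsed twice. A minimal standalone sketch of that check, using made-up sample data rather than pyogrio internals:

```python
import json

import pandas as pd

# A column read without Arrow still holds raw JSON strings...
raw = pd.Series(['[1, 2]', '[3]', None])
# ...while one read from .parquet via Arrow already holds parsed lists.
parsed = pd.Series([[1, 2], [3], None])

assert pd.api.types.infer_dtype(raw) == "string"
assert pd.api.types.infer_dtype(parsed) == "mixed"  # lists do not infer as "string"

# Only the string column needs decoding; na_action="ignore" skips missing values.
decoded = raw.map(json.loads, na_action="ignore")
assert decoded[0] == [1, 2]
assert pd.isna(decoded[2])
```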
110 changes: 103 additions & 7 deletions pyogrio/tests/conftest.py
@@ -1,14 +1,12 @@
"""Module with helper functions, fixtures, and common test data for pyogrio tests."""

from io import BytesIO
from pathlib import Path
from zipfile import ZIP_DEFLATED, ZipFile

import numpy as np

-from pyogrio import (
-    __gdal_version_string__,
-    __version__,
-    list_drivers,
-)
+from pyogrio import __gdal_version_string__, __version__, list_drivers
from pyogrio._compat import (
    GDAL_GE_37,
    HAS_ARROW_WRITE_API,
@@ -203,8 +201,7 @@ def no_geometry_file(tmp_path):
    return filename


-@pytest.fixture(scope="function")
-def list_field_values_file(tmp_path):
+def list_field_values_geojson_file(tmp_path):
    # Create a GeoJSON file with list values in a property
    list_geojson = """{
"type": "FeatureCollection",
@@ -279,6 +276,66 @@ def list_field_values_file(tmp_path):
    return filename


def list_field_values_parquet_file():
    """Return the path to a Parquet file with list values in a property.

    Because pyarrow.parquet is typically not available in the CI environments, the
    file is saved in the test data directory instead of always being created from
    scratch.

    The code to create it is kept here though, in case the file needs to be
    recreated later.
    """
    # Check if the file already exists in the test data dir
    fixture_path = _data_dir / "list_field_values_file.parquet"
    if fixture_path.exists():
        return fixture_path

    # The file doesn't exist, so create it
    try:
        import pyarrow as pa
        from pyarrow import parquet as pq

        import shapely
    except ImportError as ex:
        raise RuntimeError(
            f"test file {fixture_path} does not exist and could not be created "
            f"because an import failed: {ex}"
        ) from ex

    table = pa.table(
        {
            "geometry": shapely.to_wkb(shapely.points(np.ones((5, 2)))),
            "int": [1, 2, 3, 4, 5],
            "list_int": [[0, 1], [2, 3], [], None, None],
            "list_double": [[0.0, 1.0], [2.0, 3.0], [], None, None],
            "list_string": [
                ["string1", "string2"],
                ["string3", "string4", ""],
                [],
                None,
                [""],
            ],
            "list_int_with_null": [[0, None], [2, 3], [], None, None],
            "list_string_with_null": [
                ["string1", None],
                ["string3", "string4", ""],
                [],
                None,
                [""],
            ],
        }
    )
    pq.write_table(table, fixture_path)

    return fixture_path


@pytest.fixture(scope="function", params=[".geojson", ".parquet"])
def list_field_values_files(tmp_path, request):
if request.param == ".geojson":
return list_field_values_geojson_file(tmp_path)
elif request.param == ".parquet":
return list_field_values_parquet_file()
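For readers less familiar with parametrized fixtures: the fixture above makes every test that requests `list_field_values_files` run once per file format. A minimal self-contained sketch of the pattern (names here are illustrative, not part of the PR):

```python
import pytest


@pytest.fixture(params=[".geojson", ".parquet"])
def sample_path(request, tmp_path):
    # One fixture invocation per entry in params; request.param carries the suffix.
    return tmp_path / f"data{request.param}"


def test_runs_once_per_format(sample_path):
    # pytest collects this test twice: once for data.geojson, once for data.parquet.
    assert sample_path.suffix in (".geojson", ".parquet")
```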


@pytest.fixture(scope="function")
def nested_geojson_file(tmp_path):
    # create GeoJSON file with nested properties
@@ -308,6 +365,45 @@ def nested_geojson_file(tmp_path):
    return filename


@pytest.fixture(scope="function")
def list_nested_struct_parquet_file(tmp_path):
"""Create a Parquet file in tmp_path with nested values in a property.

Because in the CI environments pyarrow.parquet is typically not available, we save
the file in the test data directory instead of always creating it from scratch.

The code to create it is here though, in case it needs to be recreated later.
"""
# Check if the file already exists in the test data dir
fixture_path = _data_dir / "list_nested_struct_file.parquet"
if fixture_path.exists():
return fixture_path

# The file doesn't exist, so create it
try:
import pyarrow as pa
from pyarrow import parquet as pq

import shapely
except ImportError as ex:
raise RuntimeError(
f"test file {fixture_path} does not exist, but error importing: {ex}."
)

    table = pa.table(
        {
            "geometry": shapely.to_wkb(shapely.points(np.ones((3, 2)))),
            "col_flat": [0, 1, 2],
            "col_struct": [{"a": 1, "b": 2}] * 3,
            "col_nested": [[{"a": 1, "b": 2}] * 2] * 3,
            "col_list": [[1, 2, 3]] * 3,
        }
    )
    pq.write_table(table, fixture_path)

    return fixture_path


@pytest.fixture(scope="function")
def datetime_file(tmp_path):
    # create GeoJSON file with millisecond precision
Binary file list_field_values_file.parquet not shown.
Binary file list_nested_struct_file.parquet not shown.
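The tests below gate on a `GDAL_HAS_PARQUET_DRIVER` flag. Its definition is not shown in this diff; per the commit "Only check GDAL_HAS_PARQUET in the tests", a plausible sketch is to probe GDAL's driver list at runtime (an assumption, not the PR's literal code):

```python
from pyogrio import list_drivers

# Assumed definition: GDAL only registers the "Parquet" driver when it was built
# with Parquet support (e.g. via libgdal-arrow-parquet), so probe it at runtime.
GDAL_HAS_PARQUET_DRIVER = "Parquet" in list_drivers()
```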
137 changes: 119 additions & 18 deletions pyogrio/tests/test_geopandas_io.py
@@ -379,20 +379,29 @@ def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow):
    assert_series_equal(df_read.datetime_col, expected)


-def test_read_list_types(list_field_values_file, use_arrow):
+def test_read_list_types(list_field_values_files, use_arrow):
    """Test reading a file containing fields with lists."""
-    info = read_info(list_field_values_file)
-    result = read_dataframe(list_field_values_file, use_arrow=use_arrow)
+    if list_field_values_files.suffix == ".parquet" and not GDAL_HAS_PARQUET_DRIVER:
+        pytest.skip(
+            "Skipping test for parquet as the GDAL Parquet driver is not available"
+        )
+
+    info = read_info(list_field_values_files)
+    suffix = list_field_values_files.suffix
+
+    result = read_dataframe(list_field_values_files, use_arrow=use_arrow)

    # Check list_int column
    assert "list_int" in result.columns
    assert info["fields"][1] == "list_int"
-    assert info["ogr_types"][1] == "OFTIntegerList"
+    assert info["ogr_types"][1] in ("OFTIntegerList", "OFTInteger64List")
    assert result["list_int"][0].tolist() == [0, 1]
    assert result["list_int"][1].tolist() == [2, 3]
    assert result["list_int"][2].tolist() == []
    assert result["list_int"][3] is None
    assert result["list_int"][4] is None

    # Check list_double column
    assert "list_double" in result.columns
    assert info["fields"][2] == "list_double"
    assert info["ogr_types"][2] == "OFTRealList"
@@ -402,6 +411,7 @@ def test_read_list_types(list_field_values_file, use_arrow):
    assert result["list_double"][3] is None
    assert result["list_double"][4] is None

    # Check list_string column
    assert "list_string" in result.columns
    assert info["fields"][3] == "list_string"
    assert info["ogr_types"][3] == "OFTStringList"
@@ -411,31 +421,122 @@ def test_read_list_types(list_field_values_file, use_arrow):
    assert result["list_string"][3] is None
    assert result["list_string"][4] == [""]

-    # Once any row of a column contains a null value in a list (in the test geojson),
-    # the column isn't recognized as a list column anymore, but as a JSON column.
-    # Because JSON columns containing JSON Arrays are also parsed to python lists, the
-    # end result is the same...
+    # Check list_int_with_null column
+    if suffix == ".geojson":
+        # Once any row of a column contains a null value in a list, the column isn't
+        # recognized as a list column anymore for .geojson files, but as a JSON
+        # column. Because JSON columns containing JSON Arrays are also parsed to
+        # python lists, the end result is the same...
+        exp_type = "OFTString"
+        exp_subtype = "OFSTJSON"
+        exp_list_int_with_null_value = [0, None]
+    else:
+        # For .parquet files, the list column is preserved as a list column.
+        exp_type = "OFTInteger64List"
+        exp_subtype = "OFSTNone"
+        if use_arrow:
+            exp_list_int_with_null_value = [0.0, np.nan]
+        else:
+            exp_list_int_with_null_value = [0, 0]
+            # xfail: when reading a list of int with None values without Arrow from a
+            # .parquet file, the None values become 0, which is wrong.
+            # https://github.com/OSGeo/gdal/issues/13448

    assert "list_int_with_null" in result.columns
    assert info["fields"][4] == "list_int_with_null"
-    assert info["ogr_types"][4] == "OFTString"
-    assert info["ogr_subtypes"][4] == "OFSTJSON"
-    assert result["list_int_with_null"][0] == [0, None]
-    assert result["list_int_with_null"][1] == [2, 3]
-    assert result["list_int_with_null"][2] == []
+    assert info["ogr_types"][4] == exp_type
+    assert info["ogr_subtypes"][4] == exp_subtype
+    assert result["list_int_with_null"][0][0] == 0
+    if exp_list_int_with_null_value[1] == 0:
+        assert result["list_int_with_null"][0][1] == exp_list_int_with_null_value[1]
+    else:
+        assert pd.isna(result["list_int_with_null"][0][1])
+
+    if suffix == ".geojson":
+        # For .geojson, the lists are already python lists
+        assert result["list_int_with_null"][1] == [2, 3]
+        assert result["list_int_with_null"][2] == []
+    else:
+        # For .parquet, the lists are numpy arrays
+        assert result["list_int_with_null"][1].tolist() == [2, 3]
+        assert result["list_int_with_null"][2].tolist() == []
+
    assert pd.isna(result["list_int_with_null"][3])
    assert pd.isna(result["list_int_with_null"][4])

    # Check list_string_with_null column
    if suffix == ".geojson":
        # Once any row of a column contains a null value in a list, the column isn't
        # recognized as a list column anymore for .geojson files, but as a JSON
        # column. Because JSON columns containing JSON Arrays are also parsed to
        # python lists, the end result is the same...
        exp_type = "OFTString"
        exp_subtype = "OFSTJSON"
    else:
        # For .parquet files, the list column is preserved as a list column.
        exp_type = "OFTStringList"
        exp_subtype = "OFSTNone"

    assert "list_string_with_null" in result.columns
    assert info["fields"][5] == "list_string_with_null"
-    assert info["ogr_types"][5] == "OFTString"
-    assert info["ogr_subtypes"][5] == "OFSTJSON"
-    assert result["list_string_with_null"][0] == ["string1", None]
-    assert result["list_string_with_null"][1] == ["string3", "string4", ""]
-    assert result["list_string_with_null"][2] == []
+    assert info["ogr_types"][5] == exp_type
+    assert info["ogr_subtypes"][5] == exp_subtype
+
+    if suffix == ".geojson":
+        # For .geojson, the lists are already python lists
+        assert result["list_string_with_null"][0] == ["string1", None]
+        assert result["list_string_with_null"][1] == ["string3", "string4", ""]
+        assert result["list_string_with_null"][2] == []
+    else:
+        # For .parquet, the lists are numpy arrays.
+        # When use_arrow=False, the None becomes an empty string, which is wrong.
+        exp_value = ["string1", ""] if not use_arrow else ["string1", None]
+        assert result["list_string_with_null"][0].tolist() == exp_value
+        assert result["list_string_with_null"][1].tolist() == ["string3", "string4", ""]
+        assert result["list_string_with_null"][2].tolist() == []
+
    assert pd.isna(result["list_string_with_null"][3])
    assert result["list_string_with_null"][4] == [""]

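A note on why the Arrow path yields `[0.0, np.nan]` above: NumPy integer arrays cannot represent missing values, so a list of ints containing a null is promoted to float64 with `np.nan` when converted to a NumPy-backed column. A small illustration of that promotion (not pyogrio code):

```python
import numpy as np

# An int list with a null cannot stay integer in NumPy...
promoted = np.array([0, np.nan])
assert promoted.dtype == np.float64  # promoted to float64
assert np.isnan(promoted[1])

# ...whereas a list without nulls keeps an integer dtype.
intact = np.array([2, 3])
assert intact.dtype.kind == "i"
```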

@pytest.mark.requires_arrow_write_api
@pytest.mark.skipif(
    not GDAL_HAS_PARQUET_DRIVER, reason="Parquet driver is not available"
)
def test_read_list_nested_struct_parquet_file(
    list_nested_struct_parquet_file, use_arrow
):
    """Test reading a Parquet file containing nested struct and list types."""
    if not use_arrow:
        pytest.skip(
            "When use_arrow=False, gdal flattens nested columns to separate columns. "
            "Not sure how we want to deal with this case, but for now just skip."
        )

    result = read_dataframe(list_nested_struct_parquet_file, use_arrow=use_arrow)

    assert "col_flat" in result.columns
    assert np.array_equal(result["col_flat"].to_numpy(), np.array([0, 1, 2]))

    assert "col_list" in result.columns
    assert result["col_list"].dtype == object
    assert result["col_list"][0].tolist() == [1, 2, 3]
    assert result["col_list"][1].tolist() == [1, 2, 3]
    assert result["col_list"][2].tolist() == [1, 2, 3]

    assert "col_nested" in result.columns
    assert result["col_nested"].dtype == object
    assert result["col_nested"][0].tolist() == [{"a": 1, "b": 2}, {"a": 1, "b": 2}]
    assert result["col_nested"][1].tolist() == [{"a": 1, "b": 2}, {"a": 1, "b": 2}]
    assert result["col_nested"][2].tolist() == [{"a": 1, "b": 2}, {"a": 1, "b": 2}]

    assert "col_struct" in result.columns
    assert result["col_struct"].dtype == object
    assert result["col_struct"][0] == {"a": 1, "b": 2}
    assert result["col_struct"][1] == {"a": 1, "b": 2}
    assert result["col_struct"][2] == {"a": 1, "b": 2}

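The expected values in this test follow from how pyarrow converts nested columns to pandas: struct and list columns come back as object-dtype columns holding dicts and NumPy arrays. A minimal round-trip sketch (assuming pyarrow is installed; this mirrors the fixture data, not pyogrio internals):

```python
import pyarrow as pa

# Mini version of the fixture table: one struct and one list column.
table = pa.table(
    {
        "col_struct": [{"a": 1, "b": 2}] * 2,
        "col_list": [[1, 2, 3]] * 2,
    }
)
df = table.to_pandas()

assert df["col_struct"].dtype == object
assert df["col_struct"][0] == {"a": 1, "b": 2}  # structs become dicts
assert df["col_list"][0].tolist() == [1, 2, 3]  # lists become numpy arrays
```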

@pytest.mark.filterwarnings(
    "ignore: Non-conformant content for record 1 in column dates"
)