diff --git a/CHANGES.md b/CHANGES.md index ec185c65..6659e6d5 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -11,7 +11,7 @@ ### Improvements - Add listing of GDAL data types and subtypes to `read_info` (#556). -- Add support to read list fields without arrow (#558). +- Add support to read list fields without arrow (#558, #597). ### Bug fixes diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py index cf265241..d5455420 100644 --- a/pyogrio/geopandas.py +++ b/pyogrio/geopandas.py @@ -332,9 +332,19 @@ def read_dataframe( del table - for ogr_subtype, c in zip(meta["ogr_subtypes"], df.columns): + for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"]): if ogr_subtype == "OFSTJSON": - df[c] = df[c].map(json.loads, na_action="ignore") + # When reading .parquet files with arrow, JSON fields are already + # parsed, so only parse if strings. + dtype = pd.api.types.infer_dtype(df[c]) + if dtype == "string": + try: + df[c] = df[c].map(json.loads, na_action="ignore") + except Exception: + warnings.warn( + f"Could not parse column '{c}' as JSON; leaving as string", + stacklevel=2, + ) if fid_as_index: df = df.set_index(meta["fid_column"]) @@ -378,9 +388,17 @@ def read_dataframe( for dtype, c in zip(meta["dtypes"], df.columns): if dtype.startswith("datetime"): df[c] = _try_parse_datetime(df[c]) - for ogr_subtype, c in zip(meta["ogr_subtypes"], df.columns): + for ogr_subtype, c in zip(meta["ogr_subtypes"], meta["fields"]): if ogr_subtype == "OFSTJSON": - df[c] = df[c].map(json.loads, na_action="ignore") + dtype = pd.api.types.infer_dtype(df[c]) + if dtype == "string": + try: + df[c] = df[c].map(json.loads, na_action="ignore") + except Exception: + warnings.warn( + f"Could not parse column '{c}' as JSON; leaving as string", + stacklevel=2, + ) if geometry is None or not read_geometry: return df diff --git a/pyogrio/tests/conftest.py b/pyogrio/tests/conftest.py index 6eb5713a..ffe851ac 100644 --- a/pyogrio/tests/conftest.py +++ b/pyogrio/tests/conftest.py @@ -1,14 +1,12 @@ +"""Module with helper functions, fixtures, and common test data for pyogrio tests.""" + from io import BytesIO from pathlib import Path from zipfile import ZIP_DEFLATED, ZipFile import numpy as np -from pyogrio import ( - __gdal_version_string__, - __version__, - list_drivers, -) +from pyogrio import __gdal_version_string__, __version__, list_drivers from pyogrio._compat import ( GDAL_GE_37, HAS_ARROW_WRITE_API, @@ -203,8 +201,7 @@ def no_geometry_file(tmp_path): return filename -@pytest.fixture(scope="function") -def list_field_values_file(tmp_path): +def list_field_values_geojson_file(tmp_path): # Create a GeoJSON file with list values in a property list_geojson = """{ "type": "FeatureCollection", @@ -279,6 +276,66 @@ def list_field_values_file(tmp_path): return filename +def list_field_values_parquet_file(): + """Return the path to a Parquet file with list values in a property. + + Because in the CI environments pyarrow.parquet is typically not available, we save + the file in the test data directory instead of always creating it from scratch. + + The code to create it is here though, in case it needs to be recreated later. + """ + # Check if the file already exists in the test data dir + fixture_path = _data_dir / "list_field_values_file.parquet" + if fixture_path.exists(): + return fixture_path + + # The file doesn't exist, so create it + try: + import pyarrow as pa + from pyarrow import parquet as pq + + import shapely + except ImportError as ex: + raise RuntimeError( + f"test file {fixture_path} does not exist, but error importing: {ex}." + ) + + table = pa.table( + { + "geometry": shapely.to_wkb(shapely.points(np.ones((5, 2)))), + "int": [1, 2, 3, 4, 5], + "list_int": [[0, 1], [2, 3], [], None, None], + "list_double": [[0.0, 1.0], [2.0, 3.0], [], None, None], + "list_string": [ + ["string1", "string2"], + ["string3", "string4", ""], + [], + None, + [""], + ], + "list_int_with_null": [[0, None], [2, 3], [], None, None], + "list_string_with_null": [ + ["string1", None], + ["string3", "string4", ""], + [], + None, + [""], + ], + } + ) + pq.write_table(table, fixture_path) + + return fixture_path + + +@pytest.fixture(scope="function", params=[".geojson", ".parquet"]) +def list_field_values_files(tmp_path, request): + if request.param == ".geojson": + return list_field_values_geojson_file(tmp_path) + elif request.param == ".parquet": + return list_field_values_parquet_file() + + @pytest.fixture(scope="function") def nested_geojson_file(tmp_path): # create GeoJSON file with nested properties @@ -308,6 +365,45 @@ def nested_geojson_file(tmp_path): return filename +@pytest.fixture(scope="function") +def list_nested_struct_parquet_file(tmp_path): + """Create a Parquet file in tmp_path with nested values in a property. + + Because in the CI environments pyarrow.parquet is typically not available, we save + the file in the test data directory instead of always creating it from scratch. + + The code to create it is here though, in case it needs to be recreated later. + """ + # Check if the file already exists in the test data dir + fixture_path = _data_dir / "list_nested_struct_file.parquet" + if fixture_path.exists(): + return fixture_path + + # The file doesn't exist, so create it + try: + import pyarrow as pa + from pyarrow import parquet as pq + + import shapely + except ImportError as ex: + raise RuntimeError( + f"test file {fixture_path} does not exist, but error importing: {ex}." + ) + + table = pa.table( + { + "geometry": shapely.to_wkb(shapely.points(np.ones((3, 2)))), + "col_flat": [0, 1, 2], + "col_struct": [{"a": 1, "b": 2}] * 3, + "col_nested": [[{"a": 1, "b": 2}] * 2] * 3, + "col_list": [[1, 2, 3]] * 3, + } + ) + pq.write_table(table, fixture_path) + + return fixture_path + + @pytest.fixture(scope="function") def datetime_file(tmp_path): # create GeoJSON file with millisecond precision diff --git a/pyogrio/tests/fixtures/list_field_values_file.parquet b/pyogrio/tests/fixtures/list_field_values_file.parquet new file mode 100644 index 00000000..5c074b98 Binary files /dev/null and b/pyogrio/tests/fixtures/list_field_values_file.parquet differ diff --git a/pyogrio/tests/fixtures/list_nested_struct_file.parquet b/pyogrio/tests/fixtures/list_nested_struct_file.parquet new file mode 100644 index 00000000..68ac16b2 Binary files /dev/null and b/pyogrio/tests/fixtures/list_nested_struct_file.parquet differ diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py index 20e9af0d..e3a29603 100644 --- a/pyogrio/tests/test_geopandas_io.py +++ b/pyogrio/tests/test_geopandas_io.py @@ -379,20 +379,29 @@ def test_read_datetime_tz(datetime_tz_file, tmp_path, use_arrow): assert_series_equal(df_read.datetime_col, expected) -def test_read_list_types(list_field_values_file, use_arrow): +def test_read_list_types(list_field_values_files, use_arrow): """Test reading a geojson file containing fields with lists.""" - info = read_info(list_field_values_file) - result = read_dataframe(list_field_values_file, use_arrow=use_arrow) + if list_field_values_files.suffix == ".parquet" and not GDAL_HAS_PARQUET_DRIVER: + pytest.skip( + "Skipping test for parquet as the GDAL Parquet driver is not available" + ) + + info = read_info(list_field_values_files) + suffix = list_field_values_files.suffix + result = read_dataframe(list_field_values_files, use_arrow=use_arrow) + + # Check list_int column assert "list_int" in result.columns assert info["fields"][1] == "list_int" - assert info["ogr_types"][1] == "OFTIntegerList" + assert info["ogr_types"][1] in ("OFTIntegerList", "OFTInteger64List") assert result["list_int"][0].tolist() == [0, 1] assert result["list_int"][1].tolist() == [2, 3] assert result["list_int"][2].tolist() == [] assert result["list_int"][3] is None assert result["list_int"][4] is None + # Check list_double column assert "list_double" in result.columns assert info["fields"][2] == "list_double" assert info["ogr_types"][2] == "OFTRealList" @@ -402,6 +411,7 @@ def test_read_list_types(list_field_values_file, use_arrow): assert result["list_double"][3] is None assert result["list_double"][4] is None + # Check list_string column assert "list_string" in result.columns assert info["fields"][3] == "list_string" assert info["ogr_types"][3] == "OFTStringList" @@ -411,31 +421,122 @@ def test_read_list_types(list_field_values_file, use_arrow): assert result["list_string"][3] is None assert result["list_string"][4] == [""] - # Once any row of a column contains a null value in a list (in the test geojson), - # the column isn't recognized as a list column anymore, but as a JSON column. - # Because JSON columns containing JSON Arrays are also parsed to python lists, the - # end result is the same... + # Check list_int_with_null column + if suffix == ".geojson": + # Once any row of a column contains a null value in a list, the column isn't + # recognized as a list column anymore for .geojson files, but as a JSON column. + # Because JSON columns containing JSON Arrays are also parsed to python lists, + # the end result is the same... + exp_type = "OFTString" + exp_subtype = "OFSTJSON" + exp_list_int_with_null_value = [0, None] + else: + # For .parquet files, the list column is preserved as a list column. + exp_type = "OFTInteger64List" + exp_subtype = "OFSTNone" + if use_arrow: + exp_list_int_with_null_value = [0.0, np.nan] + else: + exp_list_int_with_null_value = [0, 0] + # xfail: when reading a list of int with None values without Arrow from a + # .parquet file, the None values become 0, which is wrong. + # https://github.com/OSGeo/gdal/issues/13448 + assert "list_int_with_null" in result.columns assert info["fields"][4] == "list_int_with_null" - assert info["ogr_types"][4] == "OFTString" - assert info["ogr_subtypes"][4] == "OFSTJSON" - assert result["list_int_with_null"][0] == [0, None] - assert result["list_int_with_null"][1] == [2, 3] - assert result["list_int_with_null"][2] == [] + assert info["ogr_types"][4] == exp_type + assert info["ogr_subtypes"][4] == exp_subtype + assert result["list_int_with_null"][0][0] == 0 + if exp_list_int_with_null_value[1] == 0: + assert result["list_int_with_null"][0][1] == exp_list_int_with_null_value[1] + else: + assert pd.isna(result["list_int_with_null"][0][1]) + + if suffix == ".geojson": + # For .geojson, the lists are already python lists + assert result["list_int_with_null"][1] == [2, 3] + assert result["list_int_with_null"][2] == [] + else: + # For .parquet, the lists are numpy arrays + assert result["list_int_with_null"][1].tolist() == [2, 3] + assert result["list_int_with_null"][2].tolist() == [] + assert pd.isna(result["list_int_with_null"][3]) assert pd.isna(result["list_int_with_null"][4]) + # Check list_string_with_null column + if suffix == ".geojson": + # Once any row of a column contains a null value in a list, the column isn't + # recognized as a list column anymore for .geojson files, but as a JSON column. + # Because JSON columns containing JSON Arrays are also parsed to python lists, + # the end result is the same... + exp_type = "OFTString" + exp_subtype = "OFSTJSON" + else: + # For .parquet files, the list column is preserved as a list column. + exp_type = "OFTStringList" + exp_subtype = "OFSTNone" + assert "list_string_with_null" in result.columns assert info["fields"][5] == "list_string_with_null" - assert info["ogr_types"][5] == "OFTString" - assert info["ogr_subtypes"][5] == "OFSTJSON" - assert result["list_string_with_null"][0] == ["string1", None] - assert result["list_string_with_null"][1] == ["string3", "string4", ""] - assert result["list_string_with_null"][2] == [] + assert info["ogr_types"][5] == exp_type + assert info["ogr_subtypes"][5] == exp_subtype + + if suffix == ".geojson": + # For .geojson, the lists are already python lists + assert result["list_string_with_null"][0] == ["string1", None] + assert result["list_string_with_null"][1] == ["string3", "string4", ""] + assert result["list_string_with_null"][2] == [] + else: + # For .parquet, the lists are numpy arrays + # When use_arrow=False, the None becomes an empty string, which is wrong. + exp_value = ["string1", ""] if not use_arrow else ["string1", None] + assert result["list_string_with_null"][0].tolist() == exp_value + assert result["list_string_with_null"][1].tolist() == ["string3", "string4", ""] + assert result["list_string_with_null"][2].tolist() == [] + assert pd.isna(result["list_string_with_null"][3]) assert result["list_string_with_null"][4] == [""] +@pytest.mark.requires_arrow_write_api +@pytest.mark.skipif( + not GDAL_HAS_PARQUET_DRIVER, reason="Parquet driver is not available" +) +def test_read_list_nested_struct_parquet_file( + list_nested_struct_parquet_file, use_arrow +): + """Test reading a Parquet file containing nested struct and list types.""" + if not use_arrow: + pytest.skip( + "When use_arrow=False, gdal flattens nested columns to seperate columns. " + "Not sure how we want to deal with this case, but for now just skip." + ) + + result = read_dataframe(list_nested_struct_parquet_file, use_arrow=use_arrow) + + assert "col_flat" in result.columns + assert np.array_equal(result["col_flat"].to_numpy(), np.array([0, 1, 2])) + + assert "col_list" in result.columns + assert result["col_list"].dtype == object + assert result["col_list"][0].tolist() == [1, 2, 3] + assert result["col_list"][1].tolist() == [1, 2, 3] + assert result["col_list"][2].tolist() == [1, 2, 3] + + assert "col_nested" in result.columns + assert result["col_nested"].dtype == object + assert result["col_nested"][0].tolist() == [{"a": 1, "b": 2}, {"a": 1, "b": 2}] + assert result["col_nested"][1].tolist() == [{"a": 1, "b": 2}, {"a": 1, "b": 2}] + assert result["col_nested"][2].tolist() == [{"a": 1, "b": 2}, {"a": 1, "b": 2}] + + assert "col_struct" in result.columns + assert result["col_struct"].dtype == object + assert result["col_struct"][0] == {"a": 1, "b": 2} + assert result["col_struct"][1] == {"a": 1, "b": 2} + assert result["col_struct"][2] == {"a": 1, "b": 2} + + @pytest.mark.filterwarnings( "ignore: Non-conformant content for record 1 in column dates" )