Skip to content

Commit

Permalink
Automatically downcast data types in from_geopandas (#195)
Browse files Browse the repository at this point in the history
* Automatically downcast data types

* fix with geodataframe
  • Loading branch information
kylebarron committed Nov 2, 2023
1 parent 408644a commit 9a1fb0d
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 1 deletion.
15 changes: 14 additions & 1 deletion lonboard/_layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from lonboard._constants import EPSG_4326, EXTENSION_NAME, OGC_84
from lonboard._geoarrow.geopandas_interop import geopandas_to_geoarrow
from lonboard._serialization import infer_rows_per_chunk
from lonboard._utils import auto_downcast as _auto_downcast
from lonboard.traits import ColorAccessor, FloatAccessor, PyarrowTableTrait

if TYPE_CHECKING:
Expand Down Expand Up @@ -82,7 +83,9 @@ def _default_rows_per_chunk(self):
return infer_rows_per_chunk(self.table)

@classmethod
def from_geopandas(cls, gdf: gpd.GeoDataFrame, **kwargs) -> Self:
def from_geopandas(
cls, gdf: gpd.GeoDataFrame, *, auto_downcast: bool = True, **kwargs
) -> Self:
"""Construct a Layer from a geopandas GeoDataFrame.
The GeoDataFrame will be reprojected to EPSG:4326 if it is not already in that
Expand All @@ -91,13 +94,23 @@ def from_geopandas(cls, gdf: gpd.GeoDataFrame, **kwargs) -> Self:
Args:
gdf: The GeoDataFrame to set on the layer.
Other parameters:
auto_downcast: If `True`, automatically downcast to smaller-size data types
if possible without loss of precision. This calls
[pandas.DataFrame.convert_dtypes][pandas.DataFrame.convert_dtypes] and
[pandas.to_numeric][pandas.to_numeric] under the hood.
Returns:
A Layer with the initialized data.
"""
if gdf.crs and gdf.crs not in [EPSG_4326, OGC_84]:
warnings.warn("GeoDataFrame being reprojected to EPSG:4326")
gdf = gdf.to_crs(OGC_84) # type: ignore

if auto_downcast:
# Note: we don't deep copy because we don't need to clone geometries
gdf = _auto_downcast(gdf.copy())

table = geopandas_to_geoarrow(gdf)
return cls(table=table, **kwargs)

Expand Down
53 changes: 53 additions & 0 deletions lonboard/_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
from typing import TypeVar

import numpy as np
import pandas as pd
import pyarrow as pa

from lonboard._constants import EXTENSION_NAME

DF = TypeVar("DF", bound=pd.DataFrame)

GEOARROW_EXTENSION_TYPE_NAMES = {e.value for e in EXTENSION_NAME}


Expand All @@ -17,3 +23,50 @@ def get_geometry_column_index(schema: pa.Schema) -> int:
return field_idx

raise ValueError("No geometry column in table schema.")


def auto_downcast(df: DF) -> DF:
    """Automatically downcast column types to the smallest data size.

    Non-geometry columns are first converted to pyarrow-backed dtypes with
    `DataFrame.convert_dtypes`. Integer and floating point columns are then passed
    through `pandas.to_numeric` with `downcast` set.

    Note that `df` is modified in place and returned; pass a copy if the original
    data types must be preserved.

    Args:
        df: pandas DataFrame or geopandas GeoDataFrame

    Returns:
        DataFrame with downcasted data types
    """
    # Pick the non-geometry columns by dtype name instead of
    # `select_dtypes(exclude="geometry")`: pandas only understands the "geometry"
    # string once geopandas has registered its extension dtype, so that call raises
    # `TypeError` for a plain DataFrame when geopandas has not been imported.
    is_attribute_column = [str(dtype) != "geometry" for dtype in df.dtypes]

    # Convert objects to numeric types where possible.
    # Note: we have to exclude geometry because
    # `convert_dtypes(dtype_backend="pyarrow")` fails on the geometry column, but we
    # also have to manually cast to a non-geo data frame because it'll fail to convert
    # dtypes on a GeoDataFrame without a geom col
    casted_df = pd.DataFrame(df.loc[:, is_attribute_column]).convert_dtypes(
        infer_objects=True,
        convert_string=True,
        convert_integer=True,
        convert_boolean=True,
        convert_floating=True,
        dtype_backend="pyarrow",
    )
    df[casted_df.columns] = casted_df

    # Downcast in three passes; the column selection is re-evaluated before each pass
    # so that it sees the dtypes produced by the previous one:
    #   1. Try to convert _all_ integer columns to unsigned integer columns. Columns
    #      holding negative integers cannot be represented and stay signed.
    #   2. Any integer columns that are still signed are downcast to smaller signed
    #      types.
    #   3. Floating point columns are downcast to smaller float types.
    #
    # `errors="ignore"` is deliberately not passed to `to_numeric`: it is deprecated
    # since pandas 2.2 and it only governs parsing failures, which cannot occur for
    # columns that are already numeric. When no smaller dtype can hold the values,
    # `to_numeric` simply performs no downcasting.
    downcast_passes = (
        (np.integer, "unsigned"),
        (np.signedinteger, "signed"),
        (np.floating, "float"),
    )
    for dtype_kind, downcast in downcast_passes:
        for col_name in df.select_dtypes(dtype_kind).columns:  # type: ignore
            df[col_name] = pd.to_numeric(
                df[col_name], downcast=downcast, dtype_backend="pyarrow"
            )

    return df

0 comments on commit 9a1fb0d

Please sign in to comment.