In [1]:
# Utility function to ensure DataFrames with geometry are converted to GeoDataFrames
import geopandas as gpd
import pandas as pd
from shapely import wkt

def ensure_geodataframe(df, geometry_col='geometry', crs='EPSG:4326'):
    """
    Return ``df`` as a GeoDataFrame, decoding its geometry column if needed.

    Behaviour by input:
    - GeoDataFrame: returned as-is, with ``crs`` assigned only if it has none.
    - DataFrame without ``geometry_col``: returned unchanged.
    - DataFrame with ``geometry_col``: every value is decoded (shapely
      geometry, WKT string, WKB hex string or WKB bytes), rows whose geometry
      cannot be decoded are dropped with a printed warning, and the result is
      wrapped in a GeoDataFrame with ``crs`` assigned if none is set.

    Decoding is done on a copy, so the caller's DataFrame keeps its original
    geometry values.

    Args:
        df: DataFrame or GeoDataFrame
        geometry_col: Name of the geometry column
        crs: CRS assigned when the data carries none. Defaults to WGS84
            ('EPSG:4326'). The coordinates are only labelled, not
            reprojected, so this must match the CRS the data is really in.

    Returns:
        GeoDataFrame with a CRS set, or ``df`` itself when it has no
        ``geometry_col`` column.

    Raises:
        Exception: whatever ``gpd.GeoDataFrame`` raises when the conversion
            fails (the error is printed, then re-raised).
    """
    import binascii

    # Import the submodules explicitly: a bare `import shapely` does not by
    # itself guarantee that `shapely.wkb` / `shapely.wkt` are loaded, and a
    # missing attribute inside the decoder below would be swallowed and
    # misreported as "invalid geometry" for every row.
    import shapely.wkb
    import shapely.wkt
    from shapely.geometry.base import BaseGeometry

    def try_decode_geometry(val):
        """
        Decode a single geometry value, returning None when it cannot be decoded.

        Accepted inputs:
        - a shapely geometry (returned unchanged)
        - a WKT string
        - a WKB hex string
        - WKB as bytes / bytearray
        Anything else (None, NaN, other types) yields None.
        """
        if isinstance(val, BaseGeometry):
            return val

        if isinstance(val, str):
            text = val.strip()
            if not text:
                return None
            try:
                return shapely.wkt.loads(text)
            except Exception:
                pass  # not WKT; fall through and try WKB hex
            try:
                return shapely.wkb.loads(binascii.unhexlify(text))
            except Exception:
                return None

        if isinstance(val, (bytes, bytearray)):
            try:
                return shapely.wkb.loads(bytes(val))
            except Exception:
                return None

        # None, NaN and unsupported types are treated as missing geometry.
        return None

    # Already a GeoDataFrame: only make sure a CRS is present.
    if isinstance(df, gpd.GeoDataFrame):
        return df.set_crs(crs) if df.crs is None else df

    # Nothing to convert: hand the frame back untouched.
    if geometry_col not in df.columns:
        return df

    raw_geometry = df[geometry_col]
    if raw_geometry.dtype.name == 'geometry':
        # Column already holds geometry objects in a geometry-typed array;
        # leave it alone so any CRS attached to it is preserved.
        frame = df
    else:
        decoded = raw_geometry.apply(try_decode_geometry)
        decodable = decoded.notna()
        n_invalid = int((~decodable).sum())
        if n_invalid > 0:
            print(f"⚠️ {n_invalid} rows had invalid geometry and will be dropped.")
        # Work on a copy so the caller's DataFrame is not overwritten.
        frame = df.loc[decodable].copy()
        frame[geometry_col] = decoded.loc[decodable]

    try:
        gdf = gpd.GeoDataFrame(frame, geometry=geometry_col)
    except Exception as exc:
        print(f"❌ Could not convert to GeoDataFrame after robust decode: {exc}")
        raise

    # Set CRS if not already set
    if gdf.crs is None:
        gdf = gdf.set_crs(crs)

    return gdf

# Confirmation message printed when this cell runs, i.e. after ensure_geodataframe has been defined.
print("✅ Utility function loaded: ensure_geodataframe()")


✅ Utility function loaded: ensure_geodataframe()


In [5]:
import sys
import pandas as pd
sys.path.append('..')  # Add parent directory to path
from cloud_utils import get_feature_data, get_feature_data_with_geometry
from lvt_utils import model_split_rate_tax, calculate_current_tax, model_full_building_abatement, model_stacking_improvement_exemption
from census_utils import get_census_data, get_census_blockgroups_shapefile, get_census_data_with_boundaries, match_to_census_blockgroups

# Scrape toggle read by the data-loading cell (`if scrape_data == 1`).
# NOTE(review): the data-loading cell reassigns scrape_data = 1 before testing it,
# so the value set here is currently overridden — keep a single definition.
scrape_data = 0

In [15]:

import os
from datetime import datetime
import glob

# Toggle: 1 = download the parcels again and save a dated snapshot;
# any other value = reuse the newest snapshot already present in data_dir.
scrape_data = 1

# Directory to save/load data
data_dir = "data/st_paul"
os.makedirs(data_dir, exist_ok=True)


def _scrape_date(path):
    """Return the date encoded in a snapshot file name, or None if it has no valid date."""
    date_part = os.path.basename(path).replace("st_paul_parcels_", "").replace(".parquet", "")
    try:
        return datetime.strptime(date_part, "%Y_%m_%d")
    except ValueError:
        return None


if scrape_data == 1:
    # Base URL for the ArcGIS services
    base_url = "https://maps.co.ramsey.mn.us/arcgis/rest/services/OpenData/OpenData/FeatureServer"
    # Fetch the main parcel dataset with tax info
    parcel_civic_df = get_feature_data_with_geometry('12/query', base_url)
    # Save with geometry to parquet, with current date
    today_str = datetime.now().strftime("%Y_%m_%d")
    out_path = os.path.join(data_dir, f"st_paul_parcels_{today_str}.parquet")
    parcel_civic_df.to_parquet(out_path, index=False)
    print(f"Saved new scrape to {out_path}")
else:
    # Find the most recent parquet file in the data_dir
    files = glob.glob(os.path.join(data_dir, "st_paul_parcels_*.parquet"))
    # Sort newest first by the date in the file name. Files that match the
    # glob but carry no parsable YYYY_MM_DD date are ignored, so one stray
    # file cannot abort the load with a ValueError.
    files_sorted = sorted(
        (f for f in files if _scrape_date(f) is not None),
        key=_scrape_date,
        reverse=True,
    )
    if not files_sorted:
        raise FileNotFoundError(f"No previously scraped parcel files found in {data_dir}/")
    latest_file = files_sorted[0]
    print(f"Loading most recent scrape: {latest_file}")
    # pd.read_parquet returns a plain DataFrame; the geometry column is
    # decoded by ensure_geodataframe below.
    parcel_civic_df = pd.read_parquet(latest_file)

# Ensure parcel_civic_df is a proper GeoDataFrame
parcel_civic_df = ensure_geodataframe(parcel_civic_df)
print(f"✅ Parcel data loaded as {type(parcel_civic_df).__name__} with CRS: {parcel_civic_df.crs}")


Total records in 12/query: 167601
Fetched records 0 to 1000 of 167601
Saved new scrape to data/st_paul/st_paul_parcels_2025_09_19.parquet
✅ Parcel data loaded as GeoDataFrame with CRS: EPSG:4326
