In [8]:
import geopandas as gpd
from pathlib import Path
#import pyarrow

In [7]:
# Function to find root directory
def find_project_root(start:Path | None=None)-> Path:
    '''
    Objective is to find a marker such as .toml, .git or a directory that points to root
    '''
    start = start or Path.cwd()
    for p in [start, *start.parents]:
        if (p / ".git").exists() or (p / "pyproject.toml").exists() or (p / "data").exists():
            return p
    raise FileNotFoundError("Project root not found")

In [9]:
# Function to convert shapefile to geoparquet
def convert_shp_to_geoparquet(
        shp_path: Path, 
        output_path: Path,
        target_epsg: int = 4326
        ) -> None:
    '''
    Convert a shapefile to a GeoParquet file in a specified target EPSG.

    The data is read, reprojected when its CRS does not already report the
    target EPSG code, checked for invalid geometries, given lower-case column
    names, and written to ``output_path``. A short summary is printed after
    the file has been written.

    Parameters:
    - shp_path: Path to the input shapefile.
    - output_path: Path to the output GeoParquet file. Missing parent
      directories are created.
    - target_epsg: EPSG code to which the geometries should be projected (default is 4326).

    Raises:
    - ValueError: if the input has no CRS defined, or if any geometry is
      reported as invalid after reprojection.
    '''
    gdf = gpd.read_file(shp_path)

    # Without a source CRS there is nothing to reproject from, so fail early.
    if gdf.crs is None:
        raise ValueError("Input shapefile does not have a CRS defined.")

    # Reproject only when the reported EPSG code differs from the target.
    if gdf.crs.to_epsg() != target_epsg:
        gdf = gdf.to_crs(epsg=target_epsg)

    # Validity is checked on the (possibly reprojected) geometries that will
    # actually be written.
    if not gdf.geometry.is_valid.all():
        raise ValueError("Input shapefile contains invalid geometries.")

    # Normalize column names
    gdf.columns = [c.lower() for c in gdf.columns]

    # Create the destination folder only once all checks have passed, so a
    # rejected input does not leave empty directories behind. Without this the
    # write fails when the output directory does not exist yet.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Save to GeoParquet
    gdf.to_parquet(output_path, index=False, engine='pyarrow')
    print(f'First few rows of the dataframe are: {gdf.head()}\n')
    print(f"GeoParquet file saved to {output_path}\n rows = {len(gdf)} \n columns = {gdf.columns.tolist()}")

In [10]:
# Convert place boundaries to geoparquet
# Both paths are built from the detected project root so the cell does not
# depend on one user's absolute directory layout.
# NOTE(review): assumes the raw shapefile lives under
# <project root>/data/raw/boundaries/cb_2024_us_place_500k — confirm that
# find_project_root() resolves to the folder that contains this data directory.
project_root = find_project_root()
shp_path = project_root / "data" / "raw" / "boundaries" / "cb_2024_us_place_500k" / "cb_2024_us_place_500k.shp"
output_path = project_root / "data" / "intermediate" / "boundaries" / "place_boundaries_2024.parquet"
convert_shp_to_geoparquet(shp_path, output_path)

First few rows of the dataframe are:   statefp placefp   placens           geoidfq    geoid          name  \
0      12   53150  02404445  1600000US1253150  1253150  Ormond Beach   
1      17   55899  02399571  1600000US1755899  1755899       Olmsted   
2      06   02000  02409704  1600000US0602000  0602000       Anaheim   
3      17   47774  02399280  1600000US1747774  1747774       Maywood   
4      19   76665  02396013  1600000US1976665  1976665          Swan   

            namelsad stusps  state_name lsad      aland    awater  \
0  Ormond Beach city     FL     Florida   25   90951810  10688642   
1    Olmsted village     IL    Illinois   47    8714516    338372   
2       Anaheim city     CA  California   25  130232149   1567025   
3    Maywood village     IL    Illinois   47    7038039         0   
4          Swan city     IA        Iowa   25    1597912         0   

                                            geometry  
0  MULTIPOLYGON (((-81.13629 29.35995, -81.13166 ...  
1  PO