# In [None]:
import os
import io
import shapely.wkb
import geopandas as gpd
from google.cloud import storage

# --- Configuration ---
# Placeholder path: replace it with the location of a real credentials file.
# It is set before the storage client is constructed below, presumably so the
# client picks it up when it is created -- confirm against the
# google-cloud-storage authentication docs.
# NOTE(review): this assignment overwrites any value already present in the
# environment for this process.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/your/credentials.json"

# Where the raw parquet files are listed and read from.
SOURCE_BUCKET = "geospatial-projects"
SOURCE_PREFIX = "infra_parcels/wetlands_v2/county/"
# Where the cleaned files are written. Same bucket, different prefix; the
# destination prefix does not start with SOURCE_PREFIX ("county_cleaned/" vs
# "county/"), so cleaned files are not picked up again by the source listing.
DESTINATION_BUCKET = "geospatial-projects"
DESTINATION_PREFIX = "infra_parcels/wetlands_v2/county_cleaned/"

# --- Script Body ---
# Single client instance, used below to list the source objects.
storage_client = storage.Client()

def clean_geometry(geom):
    """Parse a geometry, repair it if invalid, and return it as a WKT string.

    Args:
        geom: WKB-encoded geometry bytes, an already-parsed shapely geometry
            (anything exposing ``is_valid``), or None.

    Returns:
        The WKT text of the (possibly repaired) geometry, or None when
        ``geom`` is None or could not be parsed or repaired.
    """
    if geom is None:
        return None
    try:
        # Already-parsed geometries are used as-is; anything else is treated
        # as WKB and decoded.
        if hasattr(geom, "is_valid"):
            shape = geom
        else:
            shape = shapely.wkb.loads(geom)

        # The .buffer(0) trick rebuilds the geometry and fixes many
        # invalid geometries.
        if not shape.is_valid:
            shape = shape.buffer(0)

        # Use the `.wkt` property: the `to_wkt()` method was deprecated in
        # Shapely 1.8 and removed in 2.0, where calling it raised inside this
        # try block and silently turned every geometry into None.
        return shape.wkt
    except Exception:
        # Deliberate best-effort: any geometry that cannot be loaded or
        # repaired is reported as None instead of aborting the caller.
        return None

# List every object under the source prefix, then keep only the parquet
# files whose name does not begin with "<SOURCE_PREFIX>AK_" (Alaska).
blobs = storage_client.list_blobs(SOURCE_BUCKET, prefix=SOURCE_PREFIX)
object_names = (blob.name for blob in blobs)
files_to_process = [
    name
    for name in object_names
    if name.endswith(".parquet")
    and not name.startswith(f"{SOURCE_PREFIX}AK_")
]

print(f"Found {len(files_to_process)} files to clean and convert.")

# Each file is handled inside its own try/except so that one failure does not
# abort the rest of the run.
total_files = len(files_to_process)

for file_number, gcs_path in enumerate(files_to_process, start=1):
    print(f"\nProcessing file {file_number} of {total_files}: {gcs_path}")

    try:
        # Read the parquet file directly from GCS into a GeoPandas DataFrame
        gdf = gpd.read_parquet(f"gs://{SOURCE_BUCKET}/{gcs_path}")

        # Apply the cleaning function to the geometry column.
        # This assumes the raw geometry column is named 'geometry'.
        # NOTE(review): confirm that the values gpd.read_parquet yields for
        # these files are in the form clean_geometry expects.
        gdf['geometry_cleaned'] = gdf['geometry'].apply(clean_geometry)

        # clean_geometry returns None for anything it could not handle;
        # drop those rows.
        gdf.dropna(subset=['geometry_cleaned'], inplace=True)

        # Replace the original geometry column with the cleaned one
        gdf.drop(columns=['geometry'], inplace=True)
        gdf.rename(columns={'geometry_cleaned': 'geometry'}, inplace=True)

        # Swap only the leading prefix. str.replace would rewrite every
        # occurrence of the prefix text anywhere in the object name. This
        # assumes each path starts with SOURCE_PREFIX, which holds because
        # the paths were listed with that prefix.
        destination_path = DESTINATION_PREFIX + gcs_path[len(SOURCE_PREFIX):]
        gdf.to_parquet(f"gs://{DESTINATION_BUCKET}/{destination_path}")
        print(f"  SUCCESS: Wrote cleaned file to gs://{DESTINATION_BUCKET}/{destination_path}")

    except Exception as e:
        # Top-level boundary for this file: report the error, append the path
        # to failed_conversion.log, and continue with the next file.
        print(f"  ERROR: Failed to process {gcs_path}. Reason: {e}")
        with open("failed_conversion.log", "a", encoding="utf-8") as log_file:
            log_file.write(f"{gcs_path}\n")

print("\nConversion complete.")
