In [1]:
# Importing required libraries
import os
import subprocess
from datetime import datetime, timedelta
import time
import logging
from typing import Literal

import pandas as pd
import numpy as np
import geopandas as gpd
import json
from shapely.geometry import mapping
from shapely.validation import explain_validity, make_valid
import fiona

from google.cloud import bigquery
from google.cloud import storage

# Import utility constants and functions
import utils

In [2]:
# Initialize global constants here
# County FIPS codes (5-character strings) for the proof-of-concept counties,
# grouped by urbanicity. They are compared against the 'GEOID' column of the
# county shapefile and used as keys of FIPS_TO_STATE further down the notebook.
# NOTE(review): Census numbering lists 29181 as Ripley County, MO, while the
# FIPS_TO_STATE comments call it Warren County, MO (29219) — confirm which
# county is intended.
POC_FINALIZED_COUNTIES = [
    # urban
    '17031',
    '13121',
    '53033',
    # sub-urban
    '48491',
    '29181',
    '42011',
    # rural 
    '55107',
    '35051',
    '17127',
]

In [3]:
# Define constants
#POC_DATASET = 'encumbered_parcels'
#POC_TABLE = 'parcels'
# Target geographic CRS used for every output written by this notebook.
geo_crs = "EPSG:4326"
# Projected CRS; not referenced by any cell visible in this notebook.
projected_crs = "EPSG:3857" 
# Encumbrance types accepted by load_encumbrance_for_county (runtime check).
ENCUMBRANCES = [
    'roadways',
    'railways',
    'protected_lands',
    'wetlands',
    'transmission_lines',
    ]
# Static-typing mirror of ENCUMBRANCES; the two lists must be kept in sync by hand.
EncumbranceType = Literal[
    'roadways',
    'railways',
    'protected_lands',
    'wetlands',
    'transmission_lines',
]

# Root folder holding all local source data (machine-specific absolute path).
LOCAL_DATA_FOLDER = r"C:\Users\eprashar\OneDrive - CoreLogic Solutions, LLC\github\jan_25_proj_infra_parcels\data"
# The following paths are relative and are joined onto LOCAL_DATA_FOLDER when read.
COUNTY_DATA = r"counties\tl_2024_us_county\tl_2024_us_county.shp"
RAILWAYS_DATA = r"NTAD_North_American_Rail_Network_Lines\NARN.gdb" 
TRANSMISSION_LINES_DATA = r"transmission_lines\Transmission_Lines.shp"
ROADWAYS_DATA = r"NTAD_North_American_Roads\North_American_Roads.shp"
# Absolute folder containing one sub-folder per state (see DATASET_CONFIG['protected_lands']).
PROTECTED_LANDS = r"C:\Users\eprashar\OneDrive - CoreLogic Solutions, LLC\github\jan_25_proj_infra_parcels\data\Protected Lands"
PROTECTED_LANDS_NATIONAL = r"protected_lands_national\PADUS4_1VectorAnalysis_PADUS_Only\PADUS4_1VectorAnalysis_PADUS_Only.gdb"
# Absolute folder containing one sub-folder per state (see DATASET_CONFIG['wetlands']).
WETLANDS = r"C:\Users\eprashar\OneDrive - CoreLogic Solutions, LLC\github\jan_25_proj_infra_parcels\data\Wetlands"
# Relative path (joined onto LOCAL_DATA_FOLDER) of the wetland code-definition geodatabase.
WETLAND_ATTRIBUTES = r"Wetlands\NWI-Code-Definitions\NWI-Code-Definitions\NWI_Code_Definitions.gdb"
# Absolute folder where every cleaned/filtered parquet file is written and later read back.
PARQUET_INGESTION_PATH = r"C:\Users\eprashar\OneDrive - CoreLogic Solutions, LLC\github\jan_25_proj_infra_parcels\data\ingestion_parquets"

#### 1.1 Cleaning and saving source encumbrance data in parquet format

In [4]:
# Function to make valid geometries
def validate_and_fix_geometries(
        gdf,
        dataset,
        state=None):
    """
    Repair invalid geometries, serialise them as GeoJSON text and log a summary.

    Steps:
      1. Bring the frame to ``geo_crs`` (assign it when the CRS is undefined,
         reproject when it differs).
      2. For every invalid geometry try ``make_valid`` first and ``buffer(0)``
         second; geometries that are missing, or that cannot be repaired, are
         dropped.
      3. Replace the ``geometry`` column with GeoJSON strings.
      4. Write a small text report to ``logs/``.

    The input frame is never modified; a new frame is returned.

    Args:
        gdf: GeoDataFrame whose geometry column is named ``geometry``.
        dataset (str): Dataset label, used in the log file name and content.
        state (str, optional): State label, used in the log file name and content.

    Returns:
        pandas.DataFrame: the non-geometry columns of ``gdf`` followed by a
        ``geometry`` column holding GeoJSON strings, without the dropped rows.
    """
    # Initialize summary stats for logging
    fixed_make_valid = 0
    fixed_buffer_count = 0

    # Capture the CRS BEFORE assigning/reprojecting, so the log reports the
    # CRS the data actually arrived in rather than the target CRS.
    original_crs = gdf.crs.to_string() if gdf.crs is not None else "undefined"

    # Ensure CRS is 4326 (WGS84); use the non-inplace variants so the caller's
    # frame is left untouched.
    if gdf.crs is None:
        print(f"CRS is undefined; assigning EPSG:4326 directly.")
        gdf = gdf.set_crs(geo_crs)
    elif gdf.crs != geo_crs:
        print(f"CRS is {gdf.crs}, transforming to {geo_crs}")
        gdf = gdf.to_crs(geo_crs)

    initial_len = len(gdf)

    # Collect the repaired geometry (or None when the row must be dropped) for
    # each row, in row order. Working on a list avoids label-based writes,
    # which misbehave on non-unique indexes.
    repaired = []
    for idx, geom in gdf['geometry'].items():
        if geom is None:
            # Missing geometries have no is_valid attribute; treat as dropped.
            print(f"Missing geometry at index {idx}. Dropping.")
            repaired.append(None)
            continue
        if geom.is_valid:
            repaired.append(geom)
            continue

        print(f"Invalid geometry at index {idx}: {explain_validity(geom)}")
        replacement = None
        try:
            # Attempt to fix with make_valid
            fixed = make_valid(geom)
            if fixed.is_valid and not fixed.is_empty:
                replacement = fixed
                fixed_make_valid += 1
            else:
                print(f"make_valid failed to fix geometry at index {idx}, falling back to buffer(0)")
                fixed_w_buffer = geom.buffer(0)
                if fixed_w_buffer.is_valid and not fixed_w_buffer.is_empty:
                    replacement = fixed_w_buffer
                    fixed_buffer_count += 1
                else:
                    print(f"Geometry at index {idx} is still invalid or empty after fixing. Dropping.")
        except Exception as e:
            print(f"Exception while fixing geometry at index {idx}: {e}. Dropping.")
        repaired.append(replacement)

    # Build the output as a plain DataFrame: the geometry column now holds
    # text, so GeoDataFrame semantics no longer apply.
    result = pd.DataFrame(gdf.drop(columns='geometry'))
    result['geometry'] = [
        json.dumps(mapping(g)) if g is not None else None
        for g in repaired
    ]

    # Drop rows whose geometry was missing or could not be repaired
    keep_mask = np.array([g is not None for g in repaired], dtype=bool)
    result = result[keep_mask]
    dropped_count = initial_len - len(result)
    print(f"Converted geometries to GeoJSON strings.")

    # Log summary
    os.makedirs('logs', exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    log_filename = os.path.join('logs', f"{state or 'no_state'}_{dataset}_geo_clean_log_{timestamp}.txt")
    print(f'Writing to log file {log_filename}...')

    with open(log_filename, "w") as log_file:
        log_file.write(f"Dataset: {dataset}\n")
        log_file.write(f"State: {state}\n")
        log_file.write(f"Timestamp: {timestamp}\n")
        log_file.write(f"Original CRS: {original_crs}\n")
        log_file.write(f"Output CRS: {geo_crs}\n")
        log_file.write(f"Initial length of dataframe: {initial_len}\n")
        log_file.write(f"Geometries fixed using make_valid: {fixed_make_valid}\n")
        log_file.write(f"Geometries fixed using buffer(0): {fixed_buffer_count}\n")
        log_file.write(f"Geometries dropped after failed fix: {dropped_count}\n")

    return result

In [5]:
# Function to add wetland attributes
def add_wetland_attributes(gdf_wetland):
    """
    Left-join the wetland code definitions onto a wetlands GeoDataFrame.

    The definitions are read from the geodatabase at
    LOCAL_DATA_FOLDER/WETLAND_ATTRIBUTES, stripped of the verbose
    system/class/modifier columns (and of their geometry column), and merged
    onto ``gdf_wetland`` using the shared 'ATTRIBUTE' column.

    Args:
        gdf_wetland: wetlands frame containing an 'ATTRIBUTE' column.

    Returns:
        The result of ``gdf_wetland.merge(..., how='left', on='ATTRIBUTE')``;
        every wetland row is kept even when no definition matches.
    """
    # Definition columns that are not wanted in the cleaned output
    unwanted_columns = [
        'SYSTEM', 'SYSTEM_NAME', 'SYSTEM_DEFINITION',
        'SUBSYSTEM', 'SUBSYSTEM_DEFINITION',
        'CLASS', 'CLASS_DEFINITION',
        'SUBCLASS', 'SUBCLASS_DEFINITION',
        'SPLIT_CLASS', 'SPLIT_CLASS_DEFINITION',
        'SPLIT_SUBCLASS', 'SPLIT_SUBCLASS_NAME', 'SPLIT_SUBCLASS_DEFINITION',
        'WATER_REGIME', 'WATER_REGIME_DEFINITION',
        'MODIFIER1', 'MODIFIER1_NAME', 'MODIFIER1_GROUP', 'MODIFIER1_SUBGROUP', 'MODIFIER1_DEFINITION',
        'MODIFIER2', 'MODIFIER2_NAME', 'MODIFIER2_GROUP', 'MODIFIER2_SUBGROUP', 'MODIFIER2_DEFINITION',
        'geometry'
    ]

    definitions_path = os.path.join(LOCAL_DATA_FOLDER, WETLAND_ATTRIBUTES)
    code_definitions = gpd.read_file(definitions_path)

    # Only drop the columns that are actually present in the definitions table
    code_definitions = code_definitions.drop(columns=unwanted_columns, errors='ignore')

    return gdf_wetland.merge(code_definitions, how='left', on='ATTRIBUTE')

In [6]:
# Define global config variable to store dataset configurations
# Config includes paths, read arguments, and cleanup functions for each dataset
#
# Keys understood by clean_and_save_dataset:
#   'path'           - source path relative to LOCAL_DATA_FOLDER (single national file)
#   'read_kwargs'    - extra keyword arguments forwarded to gpd.read_file
#   'gdb_config'     - callable(state) -> dict with 'folder', 'subfolder', 'gdb_name';
#                      used instead of 'path' for per-state geodatabases
#   'requires_state' - True when a state must be supplied by the caller
#   'cleanup'        - callable(gdf) -> gdf, applied right after reading
#   'postprocess'    - callable(gdf) -> gdf, applied after 'cleanup'
DATASET_CONFIG = {
    # National transmission-line shapefile. Note: this drop has no errors='ignore',
    # so a missing column raises KeyError (unlike the other datasets).
    'transmission_lines': {
        'path': TRANSMISSION_LINES_DATA,
        'read_kwargs': {},
        'requires_state': False,
        'cleanup': lambda gdf: gdf
            .drop(columns=[
                'OBJECTID', 'SOURCE', 'SOURCEDATE', 'VAL_METHOD', 'VOLTAGE',
                'INFERRED', 'SUB_1', 'SUB_2'
            ])
            .assign(Shape__Len=gdf['Shape__Len'].round(2))
    },
    # National rail network geodatabase; a single named layer is read.
    'railways': {
        'path': RAILWAYS_DATA,
        'read_kwargs': {'layer': 'North_American_Rail_Network_Lines'},
        'requires_state': False,
        'cleanup': lambda gdf: gdf
            .drop(columns=[
                'FRFRANODE', 'TOFRANODE', 'STFIPS', 'CNTYFIPS', 'STATEAB', 'COUNTRY',
                'FRADISTRCT', 'RROWNER1', 'RROWNER2', 'RROWNER3',
                'TRKRGHTS1', 'TRKRGHTS2', 'TRKRGHTS3', 'TRKRGHTS4', 'TRKRGHTS5',
                'TRKRGHTS6', 'TRKRGHTS7', 'TRKRGHTS8', 'TRKRGHTS9', 'DIVISION',
                'SUBDIV', 'BRANCH', 'YARDNAME', 'PASSNGR', 'STRACNET', 'TRACKS',
                'NET', 'MILES', 'TIMEZONE', 'SHAPE_Length'
            ], errors='ignore')
            .assign(KM=gdf['KM'].round(2))
    },
    # North American roads shapefile; rows are filtered on COUNTRY == 2
    # (presumably the code for the United States — TODO confirm against the data dictionary).
    'roadways': {
        'path': ROADWAYS_DATA,
        'read_kwargs': {},
        'requires_state': False,
        'cleanup': lambda gdf: gdf[gdf['COUNTRY'] == 2]
            .drop(columns=['DIR', 'LINKID', 'JURISCODE', 'ROADNUM', 'CLASS', 'NHS'], errors='ignore')
    },
    # Per-state wetlands geodatabases. 'folder' is an absolute path, so joining it
    # onto LOCAL_DATA_FOLDER with os.path.join simply yields 'folder' itself.
    'wetlands': {
        'gdb_config': lambda state: {
            'folder': WETLANDS,
            'subfolder': f"{state}_geodatabase_wetlands",
            'gdb_name': f"{state}_geodatabase_wetlands.gdb"
        },
        'requires_state': True,
        'postprocess': lambda gdf: add_wetland_attributes(gdf)
    },
    # Per-state protected-lands geodatabases (absolute 'folder', as for wetlands).
    'protected_lands': {
        'gdb_config': lambda state: {
            'folder': PROTECTED_LANDS,
            'subfolder': f"PADUS4_1_State_{state}_GDB_KMZ",
            'gdb_name': f"PADUS4_1_State{state}.gdb"
        },
        'requires_state': True,
        'cleanup':  lambda gdf: gdf.drop(columns=[
                'FeatClass',
                'Category',
                'Own_Name',
                'Mang_Type',
                'Mang_Name',
                'Des_Tp',
                'Agg_Src',
                'GIS_Src',
                'Src_Date',
                'GIS_Acres',
                'Source_PAID',
                'Pub_Access',
                'Access_Src',
                'GAP_Sts',
                'IUCN_Cat',
                'Date_Est',
                'Comments',
                'Term',
                'Duration',
        ], errors='ignore'),
    },
    # National protected-lands geodatabase; a single named layer is read and a
    # rounded copy of 'Shape_Area' is added as 'area'.
    'protected_lands_national': {
        'path': PROTECTED_LANDS_NATIONAL,
        'read_kwargs': {'layer':'PADUS4_1VectorAnalysis_PADUS_Only_Simp_SingP'},
        'requires_state': False,
        'cleanup': lambda gdf: gdf
            .drop(columns=[ 
                'FID_GAP_Sts14_13_12_12_11',
                'Agg_Src',
                'ShL_ShA',
                'DupShL_ShA',
                'RevOID',
                'Shp_AreaDup',
                'GIS_Acres',
                'BndryName',
                'BndryExten',
                'BndryID',
                'GIS_AcrsDb',
                'InPoly_FID',
                'SimPgnFlag',
                'MaxSimpTol',
                'MinSimpTol',
                'Shape_Length',
            ], errors='ignore')
            .assign(area=gdf['Shape_Area'].round(2))

    }
}


In [7]:
# Function to clean and save dataset
def clean_and_save_dataset(
    dataset='railways',
    output_format='parquet',
    destination_path=PARQUET_INGESTION_PATH,
    state=None
):
    """
    Read one source dataset, clean it, repair its geometries and save it.

    The dataset is looked up in DATASET_CONFIG. Entries with a 'path' are read
    directly; entries with a 'gdb_config' are per-state geodatabases, for which
    the layer with the most features is read. The optional 'cleanup' and
    'postprocess' callables are applied in that order, then
    validate_and_fix_geometries is run and the result is written to
    ``destination_path`` as ``{state}_{dataset}.parquet`` (or
    ``{dataset}.parquet`` when no state is given).

    Args:
        dataset (str): key of DATASET_CONFIG.
        output_format (str): only 'parquet' is supported.
        destination_path (str): folder the output file is written to; created
            if it does not exist.
        state (str, optional): state label, required when the dataset's
            config has ``requires_state`` set.

    Raises:
        ValueError: unknown dataset, missing state, unsupported output format,
            or a geodatabase without any layer.
    """
    config = DATASET_CONFIG.get(dataset)
    if not config:
        raise ValueError(f"Unsupported dataset: {dataset}")

    if config.get('requires_state') and not state:
        raise ValueError(f"State must be provided for {dataset}")

    # Fail before the expensive read/clean work instead of silently saving
    # nothing at the very end.
    if output_format != 'parquet':
        raise ValueError(f"Unsupported output format: {output_format}")

    # Determine path
    if 'path' in config:
        full_path = os.path.join(LOCAL_DATA_FOLDER, config['path'])
        gdf = gpd.read_file(full_path, **config.get('read_kwargs', {}))
    else:
        # Handle GDB datasets
        gdb_info = config['gdb_config'](state)
        gdb_path = os.path.join(LOCAL_DATA_FOLDER, gdb_info['folder'], gdb_info['subfolder'], gdb_info['gdb_name'])

        layers = fiona.listlayers(gdb_path)
        if not layers:
            raise ValueError(f"No layers found in geodatabase: {gdb_path}")

        def _feature_count(layer):
            # Open each layer in a context manager so its handle is released
            with fiona.open(gdb_path, layer=layer) as src:
                return len(src)

        # Find largest layer
        largest_layer = max(layers, key=_feature_count)

        gdf = gpd.read_file(gdb_path, layer=largest_layer)
        print(f"Loaded {dataset} data from layer: {largest_layer} with {len(gdf)} features")

    # Optional post-processing
    if 'cleanup' in config:
        gdf = config['cleanup'](gdf)

    if 'postprocess' in config:
        gdf = config['postprocess'](gdf)
        print(f"Postprocessed {dataset} data")

    # Check that geometries are valid
    print(f'State name is {state}')
    gdf = validate_and_fix_geometries(
        gdf,
        dataset,
        state=state)

    # Save output
    os.makedirs(destination_path, exist_ok=True)
    filename = f"{state}_{dataset}.parquet" if state else f"{dataset}.parquet"
    filepath = os.path.join(destination_path, filename)
    gdf.to_parquet(filepath)
    print(f"{filename} data cleaned and saved successfully!")


In [8]:
# Constants for the GCS upload cells further down.
# This cell uploads nothing itself: it defines the target bucket/folder,
# points GOOGLE_APPLICATION_CREDENTIALS at the local credentials file and
# verifies that file.
BUCKET = 'geospatial-projects'
BUCKET_FOLDER = 'infra_parcels'
# Machine-specific absolute path to the gcloud application-default credentials
CREDENTIALS_PATH =  r"C:\Users\eprashar\AppData\Roaming\gcloud\application_default_credentials.json"
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = str(CREDENTIALS_PATH)

# Verify credentials (helper lives in the project's utils module; its exact
# behaviour is not visible in this notebook)
utils.check_and_authenticate(CREDENTIALS_PATH)

Credentials file is valid.


In [None]:
# Loop through all states and datasets to save cleaned parquet files
# List of states and datasets
OVERWRITE_EXISTING = True
states = [
    'PA',
    'GA',
    'MO',
    'WI',
    'NM',
    'IL',
    'WA',
    'TX'
    ]

datasets = [ 
    'roadways',
    'railways',
    'transmission_lines'
    ]

# Iterate through datasets; state-partitioned datasets (per DATASET_CONFIG)
# produce one file per state, national datasets a single file with state=None.
for dataset in datasets:
    requires_state = DATASET_CONFIG.get(dataset, {}).get('requires_state', False)

    for state in (states if requires_state else [None]):
        # Construct the filename
        filename = f"{state}_{dataset}.parquet" if state else f"{dataset}.parquet"
        print(f'filename is {filename}...')

        # Always compute the destination before it is used. Previously the
        # national-dataset branch never assigned it, so on a fresh kernel the
        # status message raised NameError inside the try block and every
        # national dataset was reported as "Failed to process".
        destination_file = os.path.join(PARQUET_INGESTION_PATH, filename)
        file_exists = os.path.exists(destination_file)

        # Check if the file already exists
        if file_exists and not OVERWRITE_EXISTING:
            print(f"File {filename} already exists. Skipping.")
            continue

        try:
            # Print the reason for running the function
            print(f'Running the function because over-write status: {OVERWRITE_EXISTING} and file existence status: {file_exists}...')
            clean_and_save_dataset(
                dataset=dataset,
                output_format='parquet',
                destination_path=PARQUET_INGESTION_PATH,
                state=state
            )
            print(f"File {filename} processed and saved.")
        except Exception as e:
            # Log the error and continue with the next file
            print(f"Failed to process {filename}: {e}")

#### 1.2 Uploading cleaned encumbrance parquets to GCS 

In [76]:
# Function to upload locally saved parquet to GCS
def upload_parquet_to_gcs(
        bucket_name,
        bucket_folder,
        dataset,
        local_ingestion_path = PARQUET_INGESTION_PATH,
        state=None):
    """
    Upload one locally saved parquet file to a GCS bucket.

    The local file is ``{state}_{dataset}.parquet`` when ``state`` is given,
    otherwise ``{dataset}.parquet``, inside ``local_ingestion_path``. It is
    uploaded to ``gs://{bucket_name}/{bucket_folder}/{dataset}/<filename>``.

    Args:
        bucket_name (str): Target GCS bucket.
        bucket_folder (str): Top-level folder (blob prefix) inside the bucket.
        dataset (str): Dataset name; used for the file name and the sub-folder.
        local_ingestion_path (str): Local folder containing the parquet file.
        state (str, optional): State label for datasets saved per state.

    Raises:
        FileNotFoundError: the expected local parquet file does not exist.
    """
    # Resolve the local file first so a missing file fails before any GCS call
    if state:
        filename = f"{state}_{dataset}.parquet"
    else:
        filename = f"{dataset}.parquet"

    local_file_path = os.path.join(local_ingestion_path, filename)
    if not os.path.exists(local_file_path):
        raise FileNotFoundError(f"File {local_file_path} does not exist.")

    gcs_folder = f'{bucket_folder}/{dataset}'
    destination_blob_path = f"{gcs_folder}/{filename}"

    # GCS client upload
    target_blob = (
        storage.Client()
        .bucket(bucket_name)
        .blob(destination_blob_path)
    )
    target_blob.upload_from_filename(local_file_path)

    print(f"Uploaded {filename} to gs://{bucket_name}/{destination_blob_path}")

In [None]:
# Loop over states and encumbrances to upload to GCS
# List of states and datasets
states = [
    #'PA',
    'GA',
    #'MO',
    'WI',
    'NM',
    'IL',
    'WA', 
    'TX'
    ]
datasets = [
    'wetlands',
    #'protected_lands'
    ] # To re-run all, replace this with ENCUMBRANCES

for dataset in datasets:
    # Decide per-state vs. national from DATASET_CONFIG rather than from a
    # hand-maintained list: with the old list, switching `datasets` to
    # ENCUMBRANCES would have looked for a state-less protected_lands.parquet.
    requires_state = DATASET_CONFIG.get(dataset, {}).get('requires_state', False)

    for state in (states if requires_state else [None]):
        try:
            upload_parquet_to_gcs(
                bucket_name=BUCKET,
                bucket_folder=BUCKET_FOLDER,
                dataset=dataset,
                local_ingestion_path=PARQUET_INGESTION_PATH,
                state=state  # None for national datasets
            )
        except Exception as e:
            # Report the failure and carry on with the remaining uploads
            if state:
                print(f"Failed to upload {dataset} for {state}: {e}")
            else:
                print(f"Failed to upload {dataset}: {e}")

#### 2. Reproject county boundaries to EPSG:4326 and upload to GCS and BigQuery

In [47]:
# Constants for loading the county boundaries into BigQuery.
# This cell uploads nothing itself: it defines the target project/dataset,
# re-points GOOGLE_APPLICATION_CREDENTIALS at the local credentials file and
# verifies that file.
GIS_PROJECT = 'clgx-gis-app-dev-06e3'
POC_DATASET = 'encumbered_parcels'
# Same machine-specific credentials path as in the GCS constants cell
CREDENTIALS_PATH =  r"C:\Users\eprashar\AppData\Roaming\gcloud\application_default_credentials.json"
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = str(CREDENTIALS_PATH)

# Verify credentials
utils.check_and_authenticate(CREDENTIALS_PATH)

Credentials file is valid.


In [None]:
# Upload county data in GCS bucket after converting CRS to EPSG:4326
def convert_upload_parquet_to_gcs_and_bigquery(
    source_folder,
    output_name,
    bucket_name,
    gcs_blob,
    project_id,
    dataset_id,
    table_name
):
    """
    Reproject a vector file to EPSG:4326, save it as parquet, upload it to GCS
    and load it into a BigQuery table.

    Args:
        source_folder (str): path of the source vector file, relative to
            LOCAL_DATA_FOLDER (anything ``gpd.read_file`` can open).
        output_name (str): file name of the parquet written to
            PARQUET_INGESTION_PATH and used as the blob name.
        bucket_name (str): target GCS bucket.
        gcs_blob (str): folder (blob prefix) inside the bucket.
        project_id (str): BigQuery project.
        dataset_id (str): BigQuery dataset.
        table_name (str): BigQuery table; existing contents are replaced
            (WRITE_TRUNCATE).

    Raises:
        ValueError: the source file has no CRS, so it cannot be reprojected.
    """
    # Load and convert CRS
    input_path = os.path.join(LOCAL_DATA_FOLDER, source_folder)
    gdf = gpd.read_file(input_path)
    print(f"Loaded {len(gdf)} features from {input_path}")

    if gdf.crs is None:
        # to_crs cannot transform geometries without a source CRS; fail with a
        # message that names the file instead of a generic geopandas error.
        raise ValueError(
            f"{input_path} has no CRS defined; cannot reproject to {geo_crs}."
        )

    # Compare CRS objects directly. The previous check compared the integer
    # returned by to_epsg() with the string "EPSG:4326", which is never equal,
    # so the "already in EPSG:4326" branch was unreachable.
    if gdf.crs != geo_crs:
        gdf = gdf.to_crs(geo_crs)
        print("CRS converted to EPSG:4326")
    else:
        print("CRS already in EPSG:4326")

    # Save converted file locally
    os.makedirs(PARQUET_INGESTION_PATH, exist_ok=True)
    output_path = os.path.join(PARQUET_INGESTION_PATH, f'{output_name}')
    gdf.to_parquet(output_path)
    print(f"Saved converted file to {output_path}")

    # Upload to GCS
    gcs_blob_path = f'{gcs_blob}/{output_name}'
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(gcs_blob_path)
    blob.upload_from_filename(output_path)
    print(f"Uploaded to GCS: gs://{bucket_name}/{gcs_blob_path}")

    # Load into BigQuery
    client = bigquery.Client()
    table_id = f"{project_id}.{dataset_id}.{table_name}"

    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.PARQUET,
        autodetect=True,
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
    )

    uri = f"gs://{bucket_name}/{gcs_blob_path}"
    load_job = client.load_table_from_uri(uri, table_id, job_config=job_config)
    load_job.result()  # Wait for job to complete

    print(f"Loaded data into BigQuery table: {table_id}")

In [49]:
# Executing the county data upload:
# reads the county shapefile, reprojects it to EPSG:4326, writes
# county_bounds.parquet locally, uploads it under BUCKET_FOLDER/county_bounds
# and loads it into GIS_PROJECT.POC_DATASET.county_boundaries.
convert_upload_parquet_to_gcs_and_bigquery(
    source_folder=COUNTY_DATA,
    output_name='county_bounds.parquet',
    bucket_name=BUCKET,
    gcs_blob= f'{BUCKET_FOLDER}/county_bounds',
    project_id=GIS_PROJECT,
    dataset_id=POC_DATASET,
    table_name="county_boundaries"
)

Loaded 3235 features from C:\Users\eprashar\OneDrive - CoreLogic Solutions, LLC\github\jan_25_proj_infra_parcels\data\counties\tl_2024_us_county\tl_2024_us_county.shp
CRS converted to EPSG:4326
Saved converted file to C:\Users\eprashar\OneDrive - CoreLogic Solutions, LLC\github\jan_25_proj_infra_parcels\data\ingestion_parquets\county_bounds.parquet
Uploaded to GCS: gs://geospatial-projects/infra_parcels/county_bounds/county_bounds.parquet
Loaded data into BigQuery table: clgx-gis-app-dev-06e3.encumbered_parcels.county_boundaries


#### 3.1 [Relevant for local development in python]: Get county-level encumbrance parquet files

In [40]:
# Function to load county boundary
def get_county_boundary(fips_code):
    """
    Load the boundary of a single county from the county shapefile.

    Args:
        fips_code (str | int): county FIPS code. It is converted to a string
            and left-padded with zeros to 5 characters before being compared
            with the shapefile's 'GEOID' column, so ``1001`` and ``'01001'``
            select the same county.

    Returns:
        GeoDataFrame: the matching county row(s), reprojected to ``geo_crs``.

    Raises:
        ValueError: no county in the shapefile has this FIPS code. Raising
            here prevents an empty boundary from silently producing empty
            per-county output files downstream.
    """
    # GEOID is stored as text, so normalise the lookup key to the same form
    fips_code = str(fips_code).zfill(5)

    # Read the shapefile
    gdf_county = gpd.read_file(os.path.join(LOCAL_DATA_FOLDER, COUNTY_DATA))

    # Filter by FIPS code
    gdf_county = gdf_county[gdf_county['GEOID'] == fips_code]
    if gdf_county.empty:
        raise ValueError(f"No county found for FIPS code '{fips_code}'.")

    # Convert to EPSG:4326
    return gdf_county.to_crs(geo_crs)

In [41]:
# Filter encumbrance data for county
def filter_gdf_using_boundary(gdf_encumbrance, county_boundary, predicate='within'):
    """
    Keep the encumbrance features that satisfy a spatial predicate against a
    county boundary.

    Parameters:
        gdf_encumbrance (GeoDataFrame): encumbrance features to filter.
        county_boundary (GeoDataFrame): boundary of the county of interest.
        predicate (str): spatial predicate passed to ``gpd.sjoin``. The default
            'within' keeps only features lying entirely inside the boundary,
            so features that cross the county line are excluded; pass
            'intersects' to keep those as well.

    Returns:
        GeoDataFrame: the matching encumbrance rows in ``geo_crs``, with the
        county columns listed below removed (county columns not in that list
        are kept).
    """
    # Convert both dataframes to the same projection system
    gdf_encumbrance = gdf_encumbrance.to_crs(county_boundary.crs)

    # Perform a spatial join
    filtered_encumbrance = gpd.sjoin(gdf_encumbrance, county_boundary, predicate=predicate)

    # Drop unnecessary columns from county database. errors='ignore' because
    # the exact set of county columns depends on the shapefile being used.
    county_columns = [
        'index_right',
        'STATEFP',
        'COUNTYFP',
        'COUNTYNS',
        'GEOID',
        'GEOIDFQ',
        'LSAD',
        'CLASSFP',
        'MTFCC',
        'CSAFP',
        'CBSAFP',
        'METDIVFP',
        'FUNCSTAT',
        'ALAND',
        'AWATER',
        'INTPTLAT',
        'INTPTLON']
    filtered_encumbrance = filtered_encumbrance.drop(columns=county_columns, errors='ignore')

    # Convert epsg for filtered gdf to 4326
    filtered_encumbrance = filtered_encumbrance.set_geometry('geometry').to_crs(geo_crs)
    print(f'CRS of the filtered dataframe is {filtered_encumbrance.crs}')
    return filtered_encumbrance

In [None]:
# Load data for encumbrance type
def load_encumbrance_for_county(
        encumbrance_type: EncumbranceType,
        fips,
        state=None) -> gpd.GeoDataFrame:
    '''
    Filter one encumbrance dataset to a county and save the result as parquet.

    The source is ``{state}_{encumbrance_type}.parquet`` (or
    ``{encumbrance_type}.parquet`` when no state is given) inside
    PARQUET_INGESTION_PATH. The filtered rows are written to
    ``{fips}_{encumbrance_type}.parquet`` in the same folder.

    Args:
        encumbrance_type: one of ENCUMBRANCES.
        fips: county FIPS code passed to get_county_boundary.
        state: state label for encumbrances stored per state.

    Returns:
        GeoDataFrame: the county-level encumbrance features in ``geo_crs``.

    Raises:
        ValueError: ``encumbrance_type`` is not in ENCUMBRANCES.
        FileNotFoundError: the source parquet is missing or cannot be read
            (read failures keep this type for backward compatibility; the
            original error is chained as ``__cause__``).
    '''
    # Flag error if chosen encumbrance is not defined
    if encumbrance_type not in ENCUMBRANCES:
        raise ValueError(
            f"Invalid encumbrance type '{encumbrance_type}'. "
            f"Valid options are: {', '.join(ENCUMBRANCES)}."
        )
    # Obtain county boundary
    county_boundary = get_county_boundary(fips)

    # Resolve the source parquet for the encumbrance type
    source_name = f'{state}_{encumbrance_type}.parquet' if state else f'{encumbrance_type}.parquet'
    file_path = os.path.join(PARQUET_INGESTION_PATH, source_name)
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File {file_path} not found.")

    # NOTE(review): gpd.read_parquet expects GeoParquet metadata, whereas
    # validate_and_fix_geometries stores geometry as GeoJSON text — confirm
    # the files read here are genuine GeoParquet.
    try:
        gdf = gpd.read_parquet(file_path)
    except Exception as e:
        raise FileNotFoundError(f"File {file_path} could not be read. Error: {e}") from e

    # Use spatial join to get encumbrance data for county; the result is
    # already returned in geo_crs, so no further reprojection is needed.
    county_gdf = filter_gdf_using_boundary(gdf, county_boundary)
    print(f"{encumbrance_type} data loaded for FIPS code: {fips}")

    # Save county_gdf as parquet file
    county_gdf.to_parquet(os.path.join(PARQUET_INGESTION_PATH,f'{fips}_{encumbrance_type}.parquet'))
    print(f'{encumbrance_type} parquet created for {fips}!')

    return county_gdf

In [64]:
# Mapping of county FIPS to state abbreviations
# NOTE(review): Census numbering lists 29181 as Ripley County, MO (Warren
# County is 29219) — confirm which county is intended.
FIPS_TO_STATE = {
    '17031': 'IL',  # Cook County, IL
    '13121': 'GA',  # Fulton County, GA
    '53033': 'WA',  # King County, WA
    '48491': 'TX',  # Williamson County, TX
    '29181': 'MO',  # Warren County, MO (see review note above)
    '42011': 'PA',  # Berks County, PA
    '55107': 'WI',  # Rusk County, WI
    '35051': 'NM',  # Sierra County, NM
    '17127': 'IL',  # Massac County, IL
}

poc_fips = POC_FINALIZED_COUNTIES
encumbrances = ENCUMBRANCES

# Encumbrances whose source parquet is stored per state
STATE_REQUIRED_TYPES = {'wetlands', 'protected_lands'}

# Build every missing {fips}_{encumbrance}.parquet; existing files are left alone
for fips in poc_fips:
    for encumbrance in encumbrances:
        filename = f"{fips}_{encumbrance}.parquet"
        destination_file = os.path.join(PARQUET_INGESTION_PATH, filename)

        if os.path.exists(destination_file):
            print(f"File {filename} already exists. Skipping.")
            continue

        # Only state-partitioned encumbrances need a state label
        state = None
        if encumbrance in STATE_REQUIRED_TYPES:
            state = FIPS_TO_STATE.get(fips)

        try:
            load_encumbrance_for_county(
                encumbrance_type=encumbrance,
                fips=fips,
                state=state
            )
            print(f"File {filename} processed and saved.")
        except Exception as e:
            # Report and move on so one failure does not stop the batch
            print(f"Failed to process {filename}: {e}")


File 17031_roadways.parquet already exists. Skipping.
File 17031_railways.parquet already exists. Skipping.
File 17031_protected_lands.parquet already exists. Skipping.
File 17031_wetlands.parquet already exists. Skipping.
CRS of the filtered dataframe is {"$schema": "https://proj.org/schemas/v0.7/projjson.schema.json", "type": "GeographicCRS", "name": "WGS 84", "datum_ensemble": {"name": "World Geodetic System 1984 ensemble", "members": [{"name": "World Geodetic System 1984 (Transit)"}, {"name": "World Geodetic System 1984 (G730)"}, {"name": "World Geodetic System 1984 (G873)"}, {"name": "World Geodetic System 1984 (G1150)"}, {"name": "World Geodetic System 1984 (G1674)"}, {"name": "World Geodetic System 1984 (G1762)"}, {"name": "World Geodetic System 1984 (G2139)"}], "ellipsoid": {"name": "WGS 84", "semi_major_axis": 6378137, "inverse_flattening": 298.257223563}, "accuracy": "2.0", "id": {"authority": "EPSG", "code": 6326}}, "coordinate_system": {"subtype": "ellipsoidal", "axis": [{"

#### 3.2 [Relevant for local development in Python]: Fetch parcel data for POC counties from BigQuery and save as parquet

In [None]:
# Define constants here
# BigQuery location of the parcel table queried by fetch_and_save_county_parcels
PROJECT = 'clgx-gis-app-dev-06e3'
# Not referenced by any cell visible in this notebook
DATASET = 'property'
# Re-declares the value already assigned in the county-boundary section
POC_DATASET = 'encumbered_parcels'
POC_TABLE = 'parcels'
# Same machine-specific credentials path as in the earlier credential cells
CREDENTIALS_PATH =  r"C:\Users\eprashar\AppData\Roaming\gcloud\application_default_credentials.json"
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = str(CREDENTIALS_PATH)

# Credentials verification
utils.check_and_authenticate(CREDENTIALS_PATH)

Credentials file is valid.


In [38]:
# Define function to get parcel data for the defined county
def fetch_and_save_county_parcels(fips_code: str) -> gpd.GeoDataFrame:
    """
    Fetch the parcels of one county from BigQuery and save them as parquet.

    Rows of ``PROJECT.POC_DATASET.POC_TABLE`` whose ``fips_code`` matches are
    read through ``utils.read_bigquery_to_gdf``, reprojected to ``geo_crs``
    and written to ``{fips_code}_parcels.parquet`` in PARQUET_INGESTION_PATH.

    Args:
        fips_code (str): 5-digit county FIPS code. Integers and unpadded
            strings are left-padded with zeros to 5 characters.

    Returns:
        GeoDataFrame: the county's parcels in ``geo_crs``.

    Raises:
        ValueError: ``fips_code`` is not exactly five ASCII digits after
            padding. The value is interpolated into the SQL text, so anything
            else is rejected rather than sent to BigQuery.
    """
    fips_code = str(fips_code).zfill(5)
    if not (len(fips_code) == 5 and fips_code.isascii() and fips_code.isdigit()):
        raise ValueError(f"Invalid FIPS code '{fips_code}': expected 5 digits.")

    # Define the SQL query to filter by FIPS code
    # TO-DO: Create a table in BQ with processed data
    query = f"""
        SELECT * 
        FROM `{PROJECT}.{POC_DATASET}.{POC_TABLE}`
        WHERE fips_code = '{fips_code}'
    """
    # Read the data into a GeoDataFrame
    gdf_parcel = utils.read_bigquery_to_gdf(project=PROJECT, dataset=POC_DATASET, table=POC_TABLE, query=query, output='gpd', geometry_col='geometry')

    # Convert to EPSG:4326
    gdf_parcel = gdf_parcel.to_crs(geo_crs)
    print(f'CRS of the parcel dataframe is {gdf_parcel.crs}')
    gdf_parcel.to_parquet(os.path.join(PARQUET_INGESTION_PATH,f'{fips_code}_parcels.parquet'))
    print(f'parcel parquet created for {fips_code}!')

    # Honour the declared return type so callers can use the data directly
    return gdf_parcel

In [None]:
# Save parcel parquets for all POC counties
# (one BigQuery query and one {fips}_parcels.parquet file per county; an
# exception for any county stops the loop, as nothing is caught here)
for county_fips in POC_FINALIZED_COUNTIES:
    fetch_and_save_county_parcels(fips_code=county_fips)