In [1]:
# Importing required libraries
import os
import subprocess
from datetime import datetime, timedelta
import time
import logging
from typing import Literal
from collections import defaultdict

import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Polygon
from shapely import wkt
import fiona
import matplotlib.pyplot as plt
import seaborn as sns
from pygris import counties

from google.cloud import bigquery
from pandas_gbq import to_gbq

# Import utility constants and functions
import utils

In [2]:
# Global constants: county FIPS codes selected for the proof of concept,
# grouped by the urbanicity tier each county was picked to represent.
_POC_COUNTIES_BY_TIER = {
    'urban': ['17031', '13121', '53033'],
    'sub-urban': ['48491', '29181', '42011'],
    'rural': ['55107', '35051', '17127'],
}

# Flat list of all POC counties, in tier order (urban, sub-urban, rural).
POC_FINALIZED_COUNTIES = [
    county_fips
    for tier_counties in _POC_COUNTIES_BY_TIER.values()
    for county_fips in tier_counties
]

In [6]:
# Define constants
# (POC_DATASET / POC_TABLE are defined together with the BigQuery settings in section 2.)

# Coordinate reference systems: geographic WGS 84 and projected Web Mercator.
geo_crs = "EPSG:4326"
projected_crs = "EPSG:3857"

# Supported encumbrance layers. ENCUMBRANCES is the runtime list used for
# validation and iteration; EncumbranceType is the matching type-hint alias.
# Keep the two in sync when adding a layer.
ENCUMBRANCES = [
    'roadways',
    'railways',
    'protected_lands',
    'wetlands',
    'transmission_lines',
    ]
EncumbranceType = Literal[
    'roadways',
    'railways',
    'protected_lands',
    'wetlands',
    'transmission_lines',
]

# Root folder holding every local source dataset. This is the only
# machine-specific path; everything below is expressed relative to it.
LOCAL_DATA_FOLDER = r"C:\Users\eprashar\OneDrive - CoreLogic Solutions, LLC\github\jan_25_proj_infra_parcels\data"

# Source datasets, relative to LOCAL_DATA_FOLDER (joined at read time).
COUNTY_DATA = r"counties\tl_2024_us_county\tl_2024_us_county.shp"
RAILWAYS_DATA = r"NTAD_North_American_Rail_Network_Lines\NARN.gdb"
TRANSMISSION_LINES_DATA = r"transmission_lines\Transmission_Lines.shp"
ROADWAYS_DATA = r"NTAD_North_American_Roads\North_American_Roads.shp"
WETLAND_ATTRIBUTES = r"Wetlands\NWI-Code-Definitions\NWI-Code-Definitions\NWI_Code_Definitions.gdb"

# Absolute folders, derived from the root instead of repeating it so that
# moving the data folder only requires editing LOCAL_DATA_FOLDER.
PROTECTED_LANDS = os.path.join(LOCAL_DATA_FOLDER, "protected_lands")
WETLANDS = os.path.join(LOCAL_DATA_FOLDER, "Wetlands")
PARQUET_INGESTION_PATH = os.path.join(LOCAL_DATA_FOLDER, "ingestion_parquets")

#### 1. Cleaning and saving source encumbrance data in parquet format

In [4]:
# Function to add wetland attributes
def add_wetland_attributes(gdf_wetland):
    """
    Left-join the NWI code-definition lookup onto a wetlands GeoDataFrame.

    The lookup table is read from the geodatabase at
    ``LOCAL_DATA_FOLDER/WETLAND_ATTRIBUTES``, trimmed of the verbose
    definition columns listed below, and merged on the ``ATTRIBUTE`` column.

    The function is safe to call more than once on the same data: lookup
    columns that ``gdf_wetland`` already carries are not merged again.
    Without that guard a second call would make pandas emit ``_x``/``_y``
    suffixed duplicates of every lookup column.

    Parameters:
        gdf_wetland (GeoDataFrame): Wetlands features with an 'ATTRIBUTE' column.

    Returns:
        GeoDataFrame: A new frame with the remaining lookup columns added.
        Rows without a matching 'ATTRIBUTE' keep missing values in those columns.

    Raises:
        KeyError: If 'ATTRIBUTE' is missing from either table (raised by the merge).
    """
    # Read the geodatabase
    gdf_wetland_attributes = gpd.read_file(os.path.join(LOCAL_DATA_FOLDER, WETLAND_ATTRIBUTES))

    # Drop unnecessary columns (only those actually present in the lookup)
    columns_to_drop = [
        'SYSTEM', 'SYSTEM_NAME', 'SYSTEM_DEFINITION',
        'SUBSYSTEM', 'SUBSYSTEM_DEFINITION',
        'CLASS', 'CLASS_DEFINITION',
        'SUBCLASS', 'SUBCLASS_DEFINITION',
        'SPLIT_CLASS', 'SPLIT_CLASS_DEFINITION',
        'SPLIT_SUBCLASS', 'SPLIT_SUBCLASS_NAME', 'SPLIT_SUBCLASS_DEFINITION',
        'WATER_REGIME', 'WATER_REGIME_DEFINITION',
        'MODIFIER1', 'MODIFIER1_NAME', 'MODIFIER1_GROUP', 'MODIFIER1_SUBGROUP', 'MODIFIER1_DEFINITION',
        'MODIFIER2', 'MODIFIER2_NAME', 'MODIFIER2_GROUP', 'MODIFIER2_SUBGROUP', 'MODIFIER2_DEFINITION',
        'geometry'
    ]
    gdf_wetland_attributes = gdf_wetland_attributes.drop(
        columns=[col for col in columns_to_drop if col in gdf_wetland_attributes.columns])

    # Idempotency guard: leave out lookup columns the wetlands frame already
    # has (for example because the attributes were attached at ingestion time).
    already_present = [
        col for col in gdf_wetland_attributes.columns
        if col != 'ATTRIBUTE' and col in gdf_wetland.columns
    ]
    if already_present:
        gdf_wetland_attributes = gdf_wetland_attributes.drop(columns=already_present)
        # Nothing left to add besides the join key: skip the merge entirely.
        if list(gdf_wetland_attributes.columns) == ['ATTRIBUTE']:
            return gdf_wetland.copy()

    # Join datasets using the 'ATTRIBUTE' column
    wetlands_with_attributes = gdf_wetland.merge(
        gdf_wetland_attributes,
        how='left',
        on='ATTRIBUTE'
    )

    return wetlands_with_attributes

In [7]:
# Per-dataset ingestion settings consumed by clean_and_save_dataset.
#
# Each entry may define:
#   path / read_kwargs : file (relative to LOCAL_DATA_FOLDER) and extra read_file arguments
#   gdb_config         : callable(state) -> folder / subfolder / gdb_name of a per-state geodatabase
#   requires_state     : whether a state abbreviation must be supplied
#   cleanup            : callable(gdf) -> gdf applied right after reading
#   postprocess        : callable(gdf) -> gdf applied after cleanup

def _clean_transmission_lines(gdf):
    """Drop unused transmission-line columns and round Shape__Len to 2 decimals."""
    trimmed = gdf.drop(columns=[
        'OBJECTID', 'SOURCE', 'SOURCEDATE', 'VAL_METHOD', 'VOLTAGE',
        'INFERRED', 'SUB_1', 'SUB_2'
    ])
    return trimmed.assign(Shape__Len=gdf['Shape__Len'].round(2))


def _clean_railways(gdf):
    """Drop unused rail-network columns (missing ones are ignored) and round KM to 2 decimals."""
    trimmed = gdf.drop(columns=[
        'FRFRANODE', 'TOFRANODE', 'STFIPS', 'CNTYFIPS', 'STATEAB', 'COUNTRY',
        'FRADISTRCT', 'RROWNER1', 'RROWNER2', 'RROWNER3',
        'TRKRGHTS1', 'TRKRGHTS2', 'TRKRGHTS3', 'TRKRGHTS4', 'TRKRGHTS5',
        'TRKRGHTS6', 'TRKRGHTS7', 'TRKRGHTS8', 'TRKRGHTS9', 'DIVISION',
        'SUBDIV', 'BRANCH', 'YARDNAME', 'PASSNGR', 'STRACNET', 'TRACKS',
        'NET', 'MILES', 'TIMEZONE', 'SHAPE_Length'
    ], errors='ignore')
    return trimmed.assign(KM=gdf['KM'].round(2))


def _clean_roadways(gdf):
    """Keep rows whose COUNTRY equals 2, then drop unused road columns (missing ones are ignored)."""
    # NOTE(review): COUNTRY == 2 presumably selects US roads -- confirm against the dataset's metadata.
    in_country = gdf['COUNTRY'] == 2
    return gdf[in_country].drop(
        columns=['DIR', 'LINKID', 'JURISCODE', 'ROADNUM', 'CLASS', 'NHS'],
        errors='ignore',
    )


def _wetlands_gdb_config(state):
    """Locate the per-state wetlands geodatabase."""
    return {
        'folder': WETLANDS,
        'subfolder': f"{state}_geodatabase_wetlands",
        'gdb_name': f"{state}_geodatabase_wetlands.gdb"
    }


def _protected_lands_gdb_config(state):
    """Locate the per-state PAD-US 4.1 geodatabase."""
    return {
        'folder': PROTECTED_LANDS,
        'subfolder': f"PADUS4_1_State_{state}_GDB_KMZ",
        'gdb_name': f"PADUS4_1_State{state}.gdb"
    }


def _attach_wetland_attributes(gdf):
    """Delegate to add_wetland_attributes (resolved at call time)."""
    return add_wetland_attributes(gdf)


DATASET_CONFIG = {
    'transmission_lines': {
        'path': TRANSMISSION_LINES_DATA,
        'read_kwargs': {},
        'requires_state': False,
        'cleanup': _clean_transmission_lines,
    },
    'railways': {
        'path': RAILWAYS_DATA,
        'read_kwargs': {'layer': 'North_American_Rail_Network_Lines'},
        'requires_state': False,
        'cleanup': _clean_railways,
    },
    'roadways': {
        'path': ROADWAYS_DATA,
        'read_kwargs': {},
        'requires_state': False,
        'cleanup': _clean_roadways,
    },
    'wetlands': {
        'gdb_config': _wetlands_gdb_config,
        'requires_state': True,
        'postprocess': _attach_wetland_attributes,
    },
    'protected_lands': {
        'gdb_config': _protected_lands_gdb_config,
        'requires_state': True,
    },
}


In [8]:
# Function to clean and save dataset
def _find_largest_layer(gdb_path):
    """Return the name of the layer in ``gdb_path`` that holds the most features."""
    def feature_count(layer):
        # Open each layer in a context manager so the handle is closed as
        # soon as it has been counted instead of being left to the GC.
        with fiona.open(gdb_path, layer=layer) as collection:
            return len(collection)

    return max(fiona.listlayers(gdb_path), key=feature_count)


def clean_and_save_dataset(
    dataset='railways',
    output_format='parquet',
    destination_path=PARQUET_INGESTION_PATH,
    state=None
):
    """
    Read one source dataset, apply its configured cleaning steps, reproject it
    to ``geo_crs`` and write it to ``destination_path``.

    File-based datasets (those with a 'path' in DATASET_CONFIG) are read
    directly. Per-state geodatabases are read from their largest layer, i.e.
    the one with the most features.

    Parameters:
        dataset (str): Key of DATASET_CONFIG to process.
        output_format (str): Output format; only 'parquet' is supported.
        destination_path (str): Folder the output file is written to
            (created if it does not exist).
        state (str | None): State abbreviation, required for datasets whose
            config sets 'requires_state'.

    Raises:
        ValueError: If the dataset is unknown, the output format is
            unsupported, or a required state is missing.

    The output file is named ``<state>_<dataset>.parquet`` when a state is
    given and ``<dataset>.parquet`` otherwise.
    """
    config = DATASET_CONFIG.get(dataset)
    if not config:
        raise ValueError(f"Unsupported dataset: {dataset}")

    if config.get('requires_state') and not state:
        raise ValueError(f"State must be provided for {dataset}")

    # Fail before the (slow) read rather than silently producing no output.
    if output_format != 'parquet':
        raise ValueError(f"Unsupported output format: {output_format}")

    # Determine path
    if 'path' in config:
        full_path = os.path.join(LOCAL_DATA_FOLDER, config['path'])
        gdf = gpd.read_file(full_path, **config.get('read_kwargs', {}))
    else:
        # Handle GDB datasets
        gdb_info = config['gdb_config'](state)
        gdb_path = os.path.join(LOCAL_DATA_FOLDER, gdb_info['folder'], gdb_info['subfolder'], gdb_info['gdb_name'])

        # Find largest layer
        largest_layer = _find_largest_layer(gdb_path)

        gdf = gpd.read_file(gdb_path, layer=largest_layer)
        print(f"Loaded {dataset} data from layer: {largest_layer} with {len(gdf)} features")

    # Optional post-processing
    if 'cleanup' in config:
        gdf = config['cleanup'](gdf)

    if 'postprocess' in config:
        gdf = config['postprocess'](gdf)
        print(f"Postprocessed {dataset} data")

    # Set CRS
    gdf = gdf.to_crs(crs=geo_crs)
    print(f"CRS of data is {gdf.crs}")

    # Save output
    filename = f"{state}_{dataset}.parquet" if state else f"{dataset}.parquet"
    os.makedirs(destination_path, exist_ok=True)
    filepath = os.path.join(destination_path, filename)
    gdf.to_parquet(filepath)
    print(f"{filename} data cleaned and saved successfully!")


In [52]:
# Ingest the per-state layers (wetlands and protected lands) for every POC state.
# Outputs that already exist on disk are left alone, so the cell can be re-run safely.
states = ['GA', 'MO', 'PA', 'WI', 'NM', 'IL','WA','TX']
datasets = ['wetlands', 'protected_lands']

for state in states:
    for dataset in datasets:
        # Same naming scheme clean_and_save_dataset uses for its output file
        if state:
            filename = f"{state}_{dataset}.parquet"
        else:
            filename = f"{dataset}.parquet"
        destination_file = os.path.join(PARQUET_INGESTION_PATH, filename)

        # Already ingested: report and move on
        if os.path.exists(destination_file):
            print(f"File {filename} already exists. Skipping.")
            continue

        try:
            clean_and_save_dataset(
                dataset=dataset,
                output_format='parquet',
                destination_path=PARQUET_INGESTION_PATH,
                state=state
            )
            print(f"File {filename} processed and saved.")
        except Exception as exc:
            # Report the failure and carry on with the remaining files
            print(f"Failed to process {filename}: {exc}")

File GA_wetlands.parquet already exists. Skipping.
File GA_protected_lands.parquet already exists. Skipping.
File MO_wetlands.parquet already exists. Skipping.
File MO_protected_lands.parquet already exists. Skipping.
File PA_wetlands.parquet already exists. Skipping.
File PA_protected_lands.parquet already exists. Skipping.
File WI_wetlands.parquet already exists. Skipping.
File WI_protected_lands.parquet already exists. Skipping.
File NM_wetlands.parquet already exists. Skipping.
File NM_protected_lands.parquet already exists. Skipping.
File IL_wetlands.parquet already exists. Skipping.
Loaded protected_lands data from layer: PADUS4_1Comb_DOD_Trib_NGP_Fee_Desig_Ease_State_IL with 14768 features
CRS of data is EPSG:4326
IL_protected_lands.parquet data cleaned and saved successfully!
File IL_protected_lands.parquet processed and saved.
File WA_wetlands.parquet already exists. Skipping.
File WA_protected_lands.parquet already exists. Skipping.
File TX_wetlands.parquet already exists. Sk

#### 2. Parcel data for POC counties from BigQuery in parquet

In [31]:
# Define constants here
# BigQuery project plus the dataset/table identifiers used to build the parcel query.
PROJECT = 'clgx-gis-app-dev-06e3'
# NOTE(review): DATASET is not referenced by the parcel-export code in the cells reviewed here -- confirm it is still needed.
DATASET = 'property'
POC_DATASET = 'encumbered_parcels'
POC_TABLE = 'base_parcels_poc_counties_0419'
# NOTE(review): machine-specific absolute path to the gcloud application-default credentials file;
# consider making it configurable so the notebook runs on other machines.
CREDENTIALS_PATH =  r"C:\Users\eprashar\AppData\Roaming\gcloud\application_default_credentials.json"
# Expose the credentials file through the environment before the authentication helper runs.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = str(CREDENTIALS_PATH)

# Credentials verification
# Per this cell's recorded output, the helper re-authenticates through gcloud when the
# credentials file is older than 24 hours.
# NOTE(review): that recorded run ended with "'<' not supported between instances of
# 'datetime.timedelta' and 'int'" -- the comparison appears to live in utils; verify and fix there.
utils.check_and_authenticate(CREDENTIALS_PATH)

Credentials file is older than 24 hours. Re-authenticating...
Trying reauthentication on gcloud server using shell command...
Login window opened...please complete authentication
Waiting for credentials file to update...
Authentication failed because of '<' not supported between instances of 'datetime.timedelta' and 'int'


In [30]:
# Define function to get parcel data for the defined county
def fetch_and_save_county_parcels(fips_code: str) -> gpd.GeoDataFrame:
    """
    Query the POC parcel table in BigQuery for one county, reproject the
    result to ``geo_crs`` and save it as ``<fips_code>_parcels.parquet`` in
    PARQUET_INGESTION_PATH.

    Parameters:
        fips_code (str): 5-digit county FIPS code, e.g. '17031'.

    Returns:
        GeoDataFrame: The county's parcels in ``geo_crs`` (the same data that
        was written to disk).

    Raises:
        ValueError: If ``fips_code`` is not a string of exactly five ASCII digits.
    """
    # The code is interpolated into the SQL text below, so accept nothing but
    # a plain 5-digit string.
    if not (
        isinstance(fips_code, str)
        and len(fips_code) == 5
        and fips_code.isascii()
        and fips_code.isdigit()
    ):
        raise ValueError(f"fips_code must be a 5-digit string, got {fips_code!r}")

    # Define the SQL query to filter by FIPS code
    # TO-DO: Create a table in BQ with processed data
    query = f"""
        SELECT * 
        FROM `{PROJECT}.{POC_DATASET}.{POC_TABLE}`
        WHERE fips_code = '{fips_code}'
    """
    # Read the data into a GeoDataFrame
    gdf_parcel = utils.read_bigquery_to_gdf(project=PROJECT, dataset=POC_DATASET, table=POC_TABLE, query=query, output='gpd', geometry_col='geometry')

    # Convert to EPSG:4326
    gdf_parcel = gdf_parcel.to_crs(geo_crs)
    print(f'CRS of the parcel dataframe is {gdf_parcel.crs}')
    gdf_parcel.to_parquet(os.path.join(PARQUET_INGESTION_PATH,f'{fips_code}_parcels.parquet'))
    print(f'parcel parquet created for {fips_code}!')

    # Return the frame so the declared return type holds and callers can reuse it
    return gdf_parcel

In [34]:
# Export one parcel parquet per POC county.
# Each call queries BigQuery and writes <fips>_parcels.parquet to PARQUET_INGESTION_PATH.
for county_fips in POC_FINALIZED_COUNTIES:
    fetch_and_save_county_parcels(county_fips)

CRS of the parcel dataframe is EPSG:4326
parcel parquet created for 17031!
CRS of the parcel dataframe is EPSG:4326
parcel parquet created for 13121!
CRS of the parcel dataframe is EPSG:4326
parcel parquet created for 53033!
CRS of the parcel dataframe is EPSG:4326
parcel parquet created for 48491!
CRS of the parcel dataframe is EPSG:4326
parcel parquet created for 29181!
CRS of the parcel dataframe is EPSG:4326
parcel parquet created for 42011!
CRS of the parcel dataframe is EPSG:4326
parcel parquet created for 55107!
CRS of the parcel dataframe is EPSG:4326
parcel parquet created for 35051!
CRS of the parcel dataframe is EPSG:4326
parcel parquet created for 17127!


#### 3. Save county-level data for encumbrances in parquet

In [53]:
# Function to load county boundary
def get_county_boundary(fips_code):
    """
    Load the boundary of one county from the US county shapefile.

    Parameters:
        fips_code (str | int): County FIPS code. Integers and un-padded
            strings are zero-padded to five characters before matching, so
            1001 and '1001' both select GEOID '01001'.

    Returns:
        GeoDataFrame: Rows of the county shapefile whose 'GEOID' equals the
        code, reprojected to ``geo_crs``. The frame is empty when no county
        matches.
    """
    # The rest of the notebook passes GEOIDs as 5-character strings; normalise
    # other spellings so they do not silently match nothing.
    geoid = str(fips_code).zfill(5)

    # Read the shapefile
    gdf_county = gpd.read_file(os.path.join(LOCAL_DATA_FOLDER, COUNTY_DATA))

    # Filter by FIPS code
    gdf_county = gdf_county[gdf_county['GEOID'] == geoid]

    # Convert to EPSG:4326
    gdf_county = gdf_county.to_crs(geo_crs)

    return gdf_county

In [54]:
# Filter encumbrance data for county
def filter_gdf_using_boundary(gdf_encumbrance, county_boundary, predicate='within'):
    """
    Keep the encumbrance features that satisfy a spatial predicate against a
    county boundary.

    The encumbrance layer is reprojected to the boundary's CRS, spatially
    joined to it, stripped of the county columns listed below, and returned
    in ``geo_crs``. County columns that are not in that list stay on the result.

    Parameters:
        gdf_encumbrance (GeoDataFrame): Encumbrance features to filter.
        county_boundary (GeoDataFrame): County boundary row(s) in the
            TIGER county schema.
        predicate (str): Spatial predicate passed to ``geopandas.sjoin``.
            The default 'within' keeps only features lying entirely inside
            the county, so features that cross the county line are dropped;
            pass 'intersects' to keep those as well.

    Returns:
        GeoDataFrame: The matching encumbrance features in ``geo_crs``.

    Raises:
        KeyError: If one of the county columns to drop is absent from the
            joined frame.
    """

    # Convert both dataframes to the same projection system
    gdf_encumbrance = gdf_encumbrance.to_crs(county_boundary.crs)

    # Perform a spatial join
    filtered_encumbrance = gpd.sjoin(gdf_encumbrance, county_boundary, predicate=predicate)

    # Drop unnecessary columns from county database
    county_columns_to_drop = [
        'index_right',
        'STATEFP',
        'COUNTYFP',
        'COUNTYNS',
        'GEOID',
        'GEOIDFQ',
        'LSAD',
        'CLASSFP',
        'MTFCC',
        'CSAFP',
        'CBSAFP',
        'METDIVFP',
        'FUNCSTAT',
        'ALAND',
        'AWATER',
        'INTPTLAT',
        'INTPTLON']
    filtered_encumbrance = filtered_encumbrance.drop(columns=county_columns_to_drop)

    # Convert epsg for filtered gdf to 4326
    filtered_encumbrance = filtered_encumbrance.set_geometry('geometry').to_crs(geo_crs)
    print(f'CRS of the filtered dataframe is {filtered_encumbrance.crs}')
    return filtered_encumbrance

In [57]:
# Load data for encumbrance type
# TODO: Set up logging module and timing using @log_time decorator

def load_encumbrance_for_county(
        encumbrance_type: EncumbranceType,
        fips,
        state=None) -> gpd.GeoDataFrame:
    '''
    Clip one encumbrance layer to a county and save it as a parquet file.

    The layer is read from the ingestion parquet in PARQUET_INGESTION_PATH
    (``<type>.parquet`` for roadways, railways and transmission lines;
    ``<state>_<type>.parquet`` for wetlands and protected lands), trimmed of
    unused columns, filtered to the county boundary and written to
    ``<fips>_<type>.parquet`` in the same folder.

    Parameters:
        encumbrance_type: One of ENCUMBRANCES.
        fips: County FIPS code used to look up the boundary and name the output.
        state: State abbreviation; required for 'wetlands' and 'protected_lands'.

    Returns:
        GeoDataFrame: The county-level encumbrance features in ``geo_crs``.

    Raises:
        ValueError: If the encumbrance type is unknown, a required state is
            missing, or no county boundary matches ``fips``.
    '''
    # Flag error if chosen encumbrance is not defined
    if encumbrance_type not in ENCUMBRANCES:
        raise ValueError(
            f"Invalid encumbrance type '{encumbrance_type}'. "
            f"Valid options are: {', '.join(ENCUMBRANCES)}."
        )

    needs_state = encumbrance_type in ('wetlands', 'protected_lands')

    # Raise error if state not provided (checked before any file is read)
    if needs_state and state is None:
        raise ValueError("State must be provided for wetlands and protected_lands encumbrance types.")

    # Obtain county boundary
    county_boundary = get_county_boundary(fips)

    # An unknown FIPS would otherwise yield an empty result that is still
    # written to disk and then treated as "already exists" on later runs.
    if county_boundary.empty:
        raise ValueError(f"No county boundary found for FIPS code: {fips}")

    # Define method for each encumbrance type
    if not needs_state:

        # Read encumbrance parquet file
        gdf = gpd.read_parquet(os.path.join(PARQUET_INGESTION_PATH, f"{encumbrance_type}.parquet"))

    else:

        # Read encumbrance parquet file
        gdf = gpd.read_parquet(os.path.join(PARQUET_INGESTION_PATH, f"{state}_{encumbrance_type}.parquet"))

        # Pre-processing for analysis
        # Ideal to execute this in ingestion function.
        # TODO for later

        if encumbrance_type == 'wetlands':
            gdf = gdf.drop(columns=[
                'NWI_ID',
            ])

            # Add attributes related to wetlands
            # NOTE(review): the ingestion config also applies add_wetland_attributes
            # as a postprocess step, so parquets written by the current ingestion
            # code may already carry these columns.
            gdf = add_wetland_attributes(gdf)

        else:
            gdf = gdf.drop(columns=[
                'FeatClass',
                'Category',
                'Own_Name',
                'Mang_Type',
                'Mang_Name',
                'Des_Tp',
                'Agg_Src',
                'GIS_Src',
                'Src_Date',
                'GIS_Acres',
                'Source_PAID',
                'Pub_Access',
                'Access_Src',
                'GAP_Sts',
                'IUCN_Cat',
                'Date_Est',
                'Comments',
                'Term',
                'Duration',
            ])

    # Use spatial join to get encumbrance data for county
    county_gdf = filter_gdf_using_boundary(gdf, county_boundary)

    # save county_gdf as parquet file
    print(f"{encumbrance_type} data loaded for FIPS code: {fips}")

    # Convert to EPSG:4326
    county_gdf = county_gdf.to_crs(geo_crs)
    county_gdf.to_parquet(os.path.join(PARQUET_INGESTION_PATH,f'{fips}_{encumbrance_type}.parquet'))
    print(f'{encumbrance_type} parquet created for {fips}!')

    # Return the frame so the declared return type holds and callers can reuse it
    return county_gdf

In [64]:
# Mapping of county FIPS to state abbreviations
# Used to pick the per-state wetlands / protected-lands parquet for each POC county.
# The first two digits of each key are the state FIPS prefix; the value is the
# two-letter abbreviation used in the per-state file names.
FIPS_TO_STATE = {
    '17031': 'IL',  # Cook County, IL
    '13121': 'GA',  # Fulton County, GA
    '53033': 'WA',  # King County, WA
    '48491': 'TX',  # Williamson County, TX
    '29181': 'MO',  # Warren County, MO -- NOTE(review): 29181 appears to be Ripley County in the Census FIPS list (Warren County, MO is 29219); confirm which county is intended
    '42011': 'PA',  # Berks County, PA
    '55107': 'WI',  # Rusk County, WI
    '35051': 'NM',  # Sierra County, NM
    '17127': 'IL',  # Massac County, IL
}

# Build one county-level parquet per (POC county, encumbrance layer) pair.
# Outputs that already exist on disk are left alone, so the cell can be re-run safely.
poc_fips = POC_FINALIZED_COUNTIES
encumbrances = ENCUMBRANCES

# Encumbrances that require state information
STATE_REQUIRED_TYPES = {'wetlands', 'protected_lands'}

for fips in poc_fips:
    for encumbrance in encumbrances:
        filename = f"{fips}_{encumbrance}.parquet"
        destination_file = os.path.join(PARQUET_INGESTION_PATH, filename)

        # Already produced: report and move on
        if os.path.exists(destination_file):
            print(f"File {filename} already exists. Skipping.")
            continue

        try:
            # Only the per-state layers need the state abbreviation
            if encumbrance in STATE_REQUIRED_TYPES:
                state = FIPS_TO_STATE.get(fips)
            else:
                state = None

            load_encumbrance_for_county(
                encumbrance_type=encumbrance,
                fips=fips,
                state=state
            )
            print(f"File {filename} processed and saved.")
        except Exception as exc:
            # Report the failure and carry on with the remaining files
            print(f"Failed to process {filename}: {exc}")


File 17031_roadways.parquet already exists. Skipping.
File 17031_railways.parquet already exists. Skipping.
File 17031_protected_lands.parquet already exists. Skipping.
File 17031_wetlands.parquet already exists. Skipping.
CRS of the filtered dataframe is {"$schema": "https://proj.org/schemas/v0.7/projjson.schema.json", "type": "GeographicCRS", "name": "WGS 84", "datum_ensemble": {"name": "World Geodetic System 1984 ensemble", "members": [{"name": "World Geodetic System 1984 (Transit)"}, {"name": "World Geodetic System 1984 (G730)"}, {"name": "World Geodetic System 1984 (G873)"}, {"name": "World Geodetic System 1984 (G1150)"}, {"name": "World Geodetic System 1984 (G1674)"}, {"name": "World Geodetic System 1984 (G1762)"}, {"name": "World Geodetic System 1984 (G2139)"}], "ellipsoid": {"name": "WGS 84", "semi_major_axis": 6378137, "inverse_flattening": 298.257223563}, "accuracy": "2.0", "id": {"authority": "EPSG", "code": 6326}}, "coordinate_system": {"subtype": "ellipsoidal", "axis": [{"