In [8]:
import os
from google.cloud import bigquery, storage 
from google.cloud.exceptions import GoogleCloudError
from utils import check_and_authenticate

In [9]:
# --- Configuration ---
# First, validate the authentication token
CREDENTIALS_PATH =  r"C:\Users\eprashar\AppData\Roaming\gcloud\application_default_credentials.json"
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = str(CREDENTIALS_PATH)
check_and_authenticate(CREDENTIALS_PATH)

PROJECT_ID = "clgx-gis-app-dev-06e3"
DATASET_ID = "encumbered_parcels"
FINAL_TABLE_ID = "wetlands"
GCS_BUCKET = "geospatial-projects"
GCS_PREFIX = "infra_parcels/wetlands_v2/county/"

Credentials file is older than 24 hours. Re-authenticating...
Trying reauthentication on gcloud server using shell command...
Login window opened...please complete authentication
Waiting for credentials file to update...
Authentication confirmed! Credentials file updated.


In [10]:
# --- Initialization ---
# This script assumes you have authenticated with GCP CLI using:
# gcloud auth application-default login
bq_client = bigquery.Client(project=PROJECT_ID)
storage_client = storage.Client()

In [11]:
# This step ensures the target table is ready before the loop starts.
# The schema is defined to match the output of the append query.
create_table_sql = f"""
CREATE TABLE IF NOT EXISTS `{PROJECT_ID}.{DATASET_ID}.{FINAL_TABLE_ID}` (
    NWI_ID STRING,
    state STRING,
    fips STRING,
    county STRING,
    county_full_name STRING,
    ATTRIBUTE STRING,
    WETLAND_TYPE STRING,
    ACRES FLOAT64,
    SUBSYSTEM_NAME STRING,
    CLASS_NAME STRING,
    SUBCLASS_NAME STRING,
    SPLIT_CLASS_NAME STRING,
    WATER_REGIME_NAME STRING,
    WATER_REGIME_SUBGROUP STRING,
    geometry GEOGRAPHY
)
CLUSTER BY state, fips, geometry;
"""
print(f"Ensuring final table `{FINAL_TABLE_ID}` exists...")
create_job = bq_client.query(create_table_sql)
create_job.result() # Wait for the create operation to complete
print("Table check complete.")

Ensuring final table `wetlands` exists...
Table check complete.


In [12]:
# Log files to track progress and failures
PROCESSED_LOG_FILE = "processed_files.log"
FAILED_LOG_FILE = "failed_files.log"

# --- Initialization ---
# This script assumes you have authenticated with GCP CLI using:
# gcloud auth application-default login

def get_processed_files(log_file):
    """Reads a log file and returns a set of GCS URIs that have been successfully processed."""
    if not os.path.exists(log_file):
        return set()
    with open(log_file, "r") as f:
        # Read lines and strip any whitespace/newlines
        return set(line.strip() for line in f if line.strip())

def get_files_to_process(bucket_name, prefix, processed_files_set):
    """Lists files in GCS, filtering out those already processed."""
    blobs = storage_client.list_blobs(bucket_name, prefix=prefix)
    all_files = [f"gs://{bucket_name}/{blob.name}" for blob in blobs if blob.name.endswith(".parquet")]
    
    files_to_process = [f for f in all_files if f not in processed_files_set]
    print(f"Found {len(all_files)} total files in GCS.")
    print(f"Found {len(processed_files_set)} files already processed.")
    print(f"-> {len(files_to_process)} new files to process.")
    return files_to_process

# --- Main Processing Loop ---
print("--- Starting Resumable BigQuery Upload Process ---")

# 1. Get the set of files that have already been processed successfully.
processed_files = get_processed_files(PROCESSED_LOG_FILE)

# 2. Get the list of all source files in GCS, excluding the ones we've already done.
files_to_process = get_files_to_process(GCS_BUCKET, GCS_PREFIX, processed_files)

# 3. Loop over the remaining files and upload them one by one.
for i, gcs_uri in enumerate(files_to_process):
    print(f"\nProcessing file {i+1} of {len(files_to_process)}: {gcs_uri}")
    
    # Use a temporary table for each file load to make the process atomic
    temp_table_id = f"temp_load_{i}_{os.path.basename(gcs_uri).split('.')[0]}"
    temp_table_ref = bq_client.dataset(DATASET_ID).table(temp_table_id)

    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.PARQUET,
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    )

    try:
        # Step 1: Load the single file into a temporary BQ table.
        load_job = bq_client.load_table_from_uri(
            gcs_uri, temp_table_ref, job_config=job_config
        )
        load_job.result()  # Wait for the job to complete
        print(f"  SUCCESS: Loaded {gcs_uri} to temp table.")

        # Step 2: Append the data to the final table, handling geometry conversion.
        append_sql = f"""
            INSERT INTO `{PROJECT_ID}.{DATASET_ID}.{FINAL_TABLE_ID}`
            SELECT
                * EXCEPT (geometry),
                -- This CASE statement robustly handles both WKB and GeoJSON
                CASE
                    WHEN STARTS_WITH(SAFE_CONVERT_BYTES_TO_STRING(geometry), '{{')
                        THEN ST_GEOGFROMGEOJSON(SAFE_CONVERT_BYTES_TO_STRING(geometry), make_valid => TRUE)
                    ELSE ST_GEOGFROMWKB(geometry)
                END AS geometry
            FROM `{PROJECT_ID}.{DATASET_ID}.{temp_table_id}`;
        """
        append_job = bq_client.query(append_sql)
        append_job.result() # Wait for the append to finish
        print(f"  SUCCESS: Appended data to final table.")

        # Step 3: If both steps succeed, log the file as processed.
        with open(PROCESSED_LOG_FILE, "a") as f:
            f.write(f"{gcs_uri}\n")
        print(f"  SUCCESS: Logged {gcs_uri} as completed.")

    except GoogleCloudError as e:
        # If the load fails, log the error and the file name, then continue.
        print(f"  ERROR: Failed to process {gcs_uri}.")
        error_message = e.errors[0]['message'] if e.errors else str(e)
        print(f"  REASON: {error_message}")
        with open(FAILED_LOG_FILE, "a") as f:
            f.write(f"{gcs_uri}\t{error_message}\n")
            
    finally:
        # Clean up the temporary table regardless of success or failure
        bq_client.delete_table(temp_table_ref, not_found_ok=True)
        print(f"  INFO: Cleaned up temp table {temp_table_id}.")

print("\nProcessing complete.")

--- Starting Resumable BigQuery Upload Process ---
Found 2723 total files in GCS.
Found 0 files already processed.
-> 2723 new files to process.

Processing file 1 of 2723: gs://geospatial-projects/infra_parcels/wetlands_v2/county/AK_02013_wetlands.parquet
  SUCCESS: Loaded gs://geospatial-projects/infra_parcels/wetlands_v2/county/AK_02013_wetlands.parquet to temp table.
  ERROR: Failed to process gs://geospatial-projects/infra_parcels/wetlands_v2/county/AK_02013_wetlands.parquet.
  REASON: No matching signature for function SAFE_CONVERT_BYTES_TO_STRING
  Argument types: STRING
  Signature: SAFE_CONVERT_BYTES_TO_STRING(BYTES)
    Argument 1: Unable to coerce type STRING to expected type BYTES at [7:38]
  INFO: Cleaned up temp table temp_load_0_AK_02013_wetlands.

Processing file 2 of 2723: gs://geospatial-projects/infra_parcels/wetlands_v2/county/AK_02013_wetlands.parquet/part-00000-b3b40955-611d-4032-be73-be012b8be1d0-c000.snappy.parquet
  SUCCESS: Loaded gs://geospatial-projects/infr

KeyboardInterrupt: 