# Colorado Shapefile Import and Schema Setup


In [10]:
# Colorado Shapefile and CSV/XLSX Import and Schema Setup

import os
import geopandas as gpd
import pandas as pd
import fiona
from sqlalchemy import create_engine, text
from zipfile import ZipFile
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Define target database (og_impact)
db_url = os.getenv("OG_IMPACT_DB_URL")
engine = create_engine(db_url)

# Ensure 'co' schema exists
with engine.begin() as conn:
    conn.execute(text("CREATE SCHEMA IF NOT EXISTS co"))

# Define download directory and file map
base_path = "/home/dadams/Downloads"
files = {
    "wells": "WELLS_SHP.ZIP",
    "fields": "COGCC_FIELDS_SHP.zip",
    "inspections": "Inspections_csv_20240601.zip",
    "violations": "NOAV.zip",
    "complaints": "Complaints.zip"
}

for name, zip_file in files.items():
    print(f"Processing {name}...")
    zip_path = os.path.join(base_path, zip_file)

    if zip_file.lower().endswith(".zip"):
        try:
            # Try reading as shapefile layer
            layers = fiona.listlayers(f"zip://{zip_path}")
            print(f"  Found layers: {layers}")
            gdf = gpd.read_file(f"zip://{zip_path}", layer=layers[0])

            # Assign CRS if missing
            if gdf.crs is None:
                gdf.set_crs("EPSG:4326", inplace=True)

            gdf.to_postgis(name=name, con=engine, schema="co", if_exists="replace", index=False)

        except Exception as shapefile_error:
            # Fallback: handle CSV or XLSX
            with ZipFile(zip_path, 'r') as zip_ref:
                csv_names = [f for f in zip_ref.namelist() if f.endswith(".csv")]
                xlsx_names = [f for f in zip_ref.namelist() if f.endswith(".xlsx")]

                if csv_names:
                    csv_filename = csv_names[0]
                    zip_ref.extract(csv_filename, base_path)
                    df = pd.read_csv(os.path.join(base_path, csv_filename))
                    df.to_sql(name=name, con=engine, schema="co", if_exists="replace", index=False)
                    os.remove(os.path.join(base_path, csv_filename))

                elif xlsx_names:
                    xlsx_filename = xlsx_names[0]
                    zip_ref.extract(xlsx_filename, base_path)
                    df = pd.read_excel(os.path.join(base_path, xlsx_filename))
                    df.to_sql(name=name, con=engine, schema="co", if_exists="replace", index=False)
                    os.remove(os.path.join(base_path, xlsx_filename))

                else:
                    raise ValueError(f"No CSV or XLSX file found in {zip_file}")

print("✅ All Colorado files processed and stored in 'co' schema.")

# Summarize row counts in 'co' schema
with engine.connect() as conn:
    result = conn.execute(text("""
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = 'co'
        ORDER BY table_name;
    """)).fetchall()

    print("📋 Table summary in 'co' schema:")
    for row in result:
        table = row.table_name
        count = conn.execute(text(f"SELECT COUNT(*) FROM co.{table}")).scalar()
        print(f"- {table}: {count} rows")

# Attempt spatial join to counties if geometry column exists
counties_gdf = gpd.read_postgis("SELECT county_fips, name, geom FROM counties", con=engine, geom_col="geom")

print("🔁 Performing spatial joins where applicable:")
for row in result:
    table = row.table_name
    try:
        gdf = gpd.read_postgis(f"SELECT * FROM co.{table}", con=engine, geom_col='geometry')
        if gdf.crs is None:
            gdf.set_crs("EPSG:4326", inplace=True)
        gdf = gdf.to_crs(counties_gdf.crs)
        if 'geometry' in gdf.columns:
            gdf = gdf.set_geometry('geometry')
        elif 'geom' in gdf.columns:
            gdf = gdf.set_geometry('geom')
        else:
            raise ValueError("No geometry column found")

        joined = gpd.sjoin(gdf, counties_gdf, how="left", predicate="within")
        joined.to_postgis(name=f"{table}_joined", con=engine, schema="co", if_exists="replace", index=False)
        print(f"  ✅ {table}_joined created")
    except Exception as e:
        print(f"  ⚠️  Skipping {table} (no geometry or join failed): {e}")


Processing wells...
  Found layers: ['Wells']
Processing fields...
  Found layers: ['COGCC_Fields', 'COGCC_Horizontal_Fields', 'COGCC_OGDP', 'COGCC_staff_contacts', 'COGCC_Wattenberg_Field']
Processing inspections...
  Found layers: ['BasicInfo', 'Bradenhead', 'Cement', 'Complaints', 'CorrectiveActions', 'Drill', 'Environmental', 'Idle', 'Locations', 'Pits', 'ProductionComments', 'Reclamations', 'Spills', 'Stimulation', 'StormWater', 'TankBerm', 'Uic', 'WasteManagement', 'WellInfo', 'WorkoverComments']
Processing violations...
Processing complaints...
✅ All Colorado files processed and stored in 'co' schema.
📋 Table summary in 'co' schema:
- complaints: 6604 rows
- fields: 1454 rows
- inspections: 229109 rows
- violations: 3841 rows
- wells: 123576 rows
🔁 Performing spatial joins where applicable:
  ⚠️  Skipping complaints (no geometry or join failed): Query missing geometry column 'geometry'
  ✅ fields_joined created
  ⚠️  Skipping inspections (no geometry or join failed): Query missi