# Texas Well Data Import and Spatial Join to Counties


In [9]:
# Texas Well Data Import and Spatial Join to Counties

import os
import geopandas as gpd
import pandas as pd
from sqlalchemy import create_engine, text
from dotenv import load_dotenv

# Load environment variables
load_dotenv()


def _require_env(var_name):
    """Return the value of environment variable *var_name*.

    Raises:
        RuntimeError: if the variable is unset or empty, so a missing
            setting is reported by name instead of surfacing later as an
            opaque ``create_engine(None)`` argument error.
    """
    value = os.getenv(var_name)
    if not value:
        raise RuntimeError(
            f"Environment variable {var_name} is not set; "
            "define it in the environment or in the .env file."
        )
    return value


# Define source and target database URLs
SRC_DB = _require_env("TEXAS_OIL_DB_URL")  # postgresql://user:pass@localhost:5432/texas_oil
TGT_DB = _require_env("OG_IMPACT_DB_URL")  # postgresql://user:pass@localhost:5432/og_impact

src_engine = create_engine(SRC_DB)
tgt_engine = create_engine(TGT_DB)

# Ensure tx schema exists (begin() commits on successful exit)
with tgt_engine.begin() as conn:
    conn.execute(text("CREATE SCHEMA IF NOT EXISTS tx"))

# Load county geometries from og_impact for spatial join
counties_gdf = gpd.read_postgis(
    "SELECT county_fips, name, geom FROM counties",
    con=tgt_engine,
    geom_col="geom",
)

# Tables to import from texas_oil
tables = ["well_data", "well_shapes", "well_inspections", "well_violations", "inspections", "violations"]
schema = "tx"  # Target schema for imported Texas data

# Ensure spatial reference match: well geometries are reprojected to this CRS
# before the spatial join, so it must be known up front. Checking here avoids
# failing halfway through the import, after some tables were already replaced.
counties_crs = counties_gdf.crs
if counties_crs is None:
    raise RuntimeError(
        "The counties geometry column has no CRS (SRID); "
        "cannot reproject well geometries for the spatial join."
    )

# Rows per INSERT batch when writing plain tables; without a chunk size
# pandas writes the whole frame (millions of rows here) in a single batch.
WRITE_CHUNK_ROWS = 50_000

# Copy every source table into the target schema. Tables that carry
# geometry are reprojected to the county CRS and tagged with the county
# they fall within; all other tables are copied as-is.
for table in tables:
    print(f"Processing {table}...")

    # Determine if table has geometry (by naming convention: "*shapes*")
    has_geom = "shapes" in table

    if has_geom:
        # Read from source
        gdf = gpd.read_postgis(f"SELECT * FROM {table}", con=src_engine, geom_col="geometry")

        # If geometry has no CRS, assume WGS 84
        if gdf.crs is None:
            gdf = gdf.set_crs("EPSG:4326")

        gdf = gdf.to_crs(counties_crs)

        # Spatial join to counties. how="left" keeps wells that match no
        # county (their county columns are null).
        # NOTE(review): a left join emits one row per matching county, so
        # overlapping county polygons would duplicate wells - confirm the
        # county layer has no overlaps.
        gdf = gpd.sjoin(gdf, counties_gdf, how="left", predicate="within")

        # sjoin adds "index_right" (the index label of the matched row in
        # counties_gdf). It is join bookkeeping, not data, so do not persist it.
        gdf = gdf.drop(columns="index_right", errors="ignore")

        # Export to target
        gdf.to_postgis(name=table, con=tgt_engine, schema=schema, if_exists="replace", index=False)
    else:
        # Read from source
        gdf = pd.read_sql(f"SELECT * FROM {table}", con=src_engine)

        # Export to target
        gdf.to_sql(
            name=table,
            con=tgt_engine,
            schema=schema,
            if_exists="replace",
            index=False,
            chunksize=WRITE_CHUNK_ROWS,
        )

print("✅ All tables imported and joined to counties (where applicable).")


Processing well_data...
Processing well_shapes...
Processing well_inspections...
Processing well_violations...
Processing inspections...
Processing violations...
✅ All tables imported and joined to counties (where applicable).


In [13]:
# Verify imported tables and row counts
with tgt_engine.connect() as conn:
    # List every table currently present in the 'tx' schema.
    result = conn.execute(text("""
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = 'tx'
        ORDER BY table_name;
    """)).fetchall()

    # information_schema reports names exactly as stored; quote them with the
    # dialect's own rules so mixed-case or otherwise special names are not
    # case-folded or mis-parsed. Ordinary lowercase names are left unchanged.
    quote_identifier = conn.dialect.identifier_preparer.quote

    print("\n📋 Table summary in 'tx' schema:")
    for row in result:
        table = row.table_name
        count = conn.execute(
            text(f"SELECT COUNT(*) FROM tx.{quote_identifier(table)}")
        ).scalar()
        print(f"- {table}: {count} rows")



📋 Table summary in 'tx' schema:
- inspections: 3207011 rows
- violations: 1675195 rows
- well_data: 1182773 rows
- well_inspections: 1889031 rows
- well_shapes: 964914 rows
- well_violations: 218623 rows
