# Jupyter Notebook: Connect to DB and Load External Data


In [2]:
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
import pandas as pd
import geopandas as gpd
from geoalchemy2 import Geometry
from census import Census

# Bring the variables defined in the .env file into the process environment
load_dotenv()

# PostgreSQL settings. Name, user and password are mandatory (a missing one
# raises KeyError); host and port fall back to a local default server.
DB_NAME = os.environ["DB_NAME"]
DB_USER = os.environ["DB_USER"]
DB_PASSWORD = os.environ["DB_PASSWORD"]
DB_HOST = os.getenv("DB_HOST", "localhost")
DB_PORT = os.getenv("DB_PORT", "5432")

# Census API client, keyed from the environment (mandatory as well)
CENSUS_API_KEY = os.environ["CENSUS_API_KEY"]
c = Census(CENSUS_API_KEY)

# Assemble the connection URL from its individual parts, then build the
# engine. echo=True makes the engine log every SQL statement it issues.
_url_parts = {
    "username": DB_USER,
    "password": DB_PASSWORD,
    "host": DB_HOST,
    "port": DB_PORT,
    "database": DB_NAME,
}
connection_url = URL.create("postgresql+psycopg2", **_url_parts)
engine = create_engine(connection_url, echo=True)


In [4]:
import requests
import zipfile
import io
from sqlalchemy import text


# ----------------------------------------
# Load TIGER/Line County Shapefile and Insert into 'counties'
# ----------------------------------------

tiger_url = "https://www2.census.gov/geo/tiger/TIGER2020/COUNTY/tl_2020_us_county.zip"
tiger_path = "tl_2020_us_county.zip"

# Download if not already there.
# The archive is streamed into a ".part" file and only renamed to its final
# name after a complete, successful transfer. Without this, an HTTP error page
# or an interrupted download would be saved under the final name and every
# later run would skip the download and try to read a corrupt zip.
if not os.path.exists(tiger_path):
    partial_path = f"{tiger_path}.part"
    try:
        with requests.get(tiger_url, stream=True, timeout=60) as r:
            # Fail loudly on 4xx/5xx instead of writing the error body to disk
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, tiger_path)
    finally:
        # After a successful rename the partial file no longer exists; on any
        # failure this removes the incomplete download.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Read shapefile straight out of the zip archive
counties_gdf = gpd.read_file(f"zip://{tiger_path}")

# Filter for Colorado (08) and Texas (48)
counties_gdf = counties_gdf[counties_gdf["STATEFP"].isin(["08", "48"])]

# If the file carries no CRS, assume EPSG:4269; then reproject to EPSG:4326
if counties_gdf.crs is None:
    counties_gdf = counties_gdf.set_crs("EPSG:4269")
counties_gdf = counties_gdf.to_crs("EPSG:4326")

# Build the columns the 'counties' table expects:
# county_fips = state FIPS + county FIPS
counties_gdf["county_fips"] = counties_gdf["STATEFP"] + counties_gdf["COUNTYFP"]
counties_gdf["name"] = counties_gdf["NAME"]
counties_gdf["state"] = counties_gdf["STATEFP"]
counties_gdf = counties_gdf[["county_fips", "name", "state", "geometry"]]

# Stage the rows in a temporary table first (replaced on every run), so the
# real table can be upserted with plain SQL below
temp_table = "counties_temp"
counties_gdf.to_postgis(
    temp_table, engine, if_exists="replace", index=False,
    dtype={"geometry": Geometry(geometry_type="MULTIPOLYGON", srid=4326)}
)

# Copy from the staging table into 'counties', touching only known columns.
# engine.begin() wraps the check, the upsert and the DROP in one transaction.
with engine.begin() as conn:
    # Check if 'geometry' exists in the real table
    result = conn.execute(text("""
        SELECT column_name FROM information_schema.columns
        WHERE table_name = 'counties' AND column_name = 'geometry';
    """)).fetchone()

    if result:
        # Insert including geometry if it exists
        conn.execute(text(f"""
            INSERT INTO counties (county_fips, name, state, geometry)
            SELECT county_fips, name, state, geometry FROM {temp_table}
            ON CONFLICT (county_fips) DO UPDATE
            SET name = EXCLUDED.name,
                state = EXCLUDED.state,
                geometry = EXCLUDED.geometry;
        """))
    else:
        # Insert without geometry
        conn.execute(text(f"""
            INSERT INTO counties (county_fips, name, state)
            SELECT county_fips, name, state FROM {temp_table}
            ON CONFLICT (county_fips) DO UPDATE
            SET name = EXCLUDED.name,
                state = EXCLUDED.state;
        """))
    # The staging table is no longer needed once the upsert has run
    conn.execute(text(f"DROP TABLE {temp_table};"))

print("Loaded county shapefiles for CO and TX into 'counties' table.")


2025-04-19 10:27:36,898 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-19 10:27:36,900 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_namespace.nspname = %(nspname_1)s
2025-04-19 10:27:36,900 INFO sqlalchemy.engine.Engine [cached since 76.97s ago] {'table_name': 'counties_temp', 'param_1': 'r', 'param_2': 'p', 'param_3': 'f', 'param_4': 'v', 'param_5': 'm', 'nspname_1': 'public'}
2025-04-19 10:27:36,901 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.rel

In [None]:
# ----------------------------------------
# Load ACS Economic Indicators (Time Series)
# ----------------------------------------

# ACS variable code -> short field name used while building each record
acs_vars = {
    "B01003_001E": "population",
    "B19013_001E": "median_income",
    "B23025_005E": "unemployed",
    "B23025_003E": "labor_force"
}


def _acs_int(raw):
    """Convert one raw ACS cell to a non-negative int, or None if unusable.

    Returns None for None, the string "null", anything that cannot be parsed
    as a number, and negative values. All four variables loaded here are
    counts or dollar amounts, so a negative value can only be one of the
    placeholder codes the ACS uses for unavailable estimates
    (e.g. -666666666) and must not be stored as real data.
    """
    if raw is None or raw == "null":
        return None
    try:
        # Go through float so values formatted like "123.0" are also accepted
        value = int(float(raw))
    except (TypeError, ValueError, OverflowError):
        return None
    return value if value >= 0 else None


data = []
for year in [2015, 2016, 2017, 2018, 2019, 2020, 2021]:
    for state_fips in ["08", "48"]:
        results = c.acs5.get(
            fields=list(acs_vars.keys()),
            geo={"for": "county:*", "in": f"state:{state_fips}"},
            year=year
        )
        for row in results:
            # Each field is parsed on its own, so one bad cell no longer
            # discards the other valid values of the same county/year.
            parsed = {name: _acs_int(row.get(code)) for code, name in acs_vars.items()}
            labor_force = parsed["labor_force"]
            unemployed = parsed["unemployed"]

            # Rate is only defined for a non-zero labor force and a known
            # unemployed count
            if labor_force and unemployed is not None:
                unemployment_rate = round((unemployed / labor_force) * 100, 2)
            else:
                unemployment_rate = None

            data.append({
                "county_fips": f"{row['state']}{row['county']}",
                "year": year,
                "population": parsed["population"],
                "median_income": parsed["median_income"],
                "unemployment_rate": unemployment_rate,
                # Not supplied by this loader; left empty here
                "net_migration": None,
                "personal_income": None
            })

# Create DataFrame and load to DB.
# NOTE(review): if_exists="append" adds the same rows again each time this
# cell is re-run — confirm that is intended.
df = pd.DataFrame(data)
df.to_sql("economic_indicators", engine, if_exists="append", index=False)

print("ACS time series data (2015–2021) for CO and TX loaded into economic_indicators table.")

2025-04-19 10:32:20,298 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-19 10:32:20,300 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname_1)s
2025-04-19 10:32:20,300 INFO sqlalchemy.engine.Engine [cached since 107.4s ago] {'table_name': 'economic_indicators', 'param_1': 'r', 'param_2': 'p', 'param_3': 'f', 'param_4': 'v', 'param_5': 'm', 'nspname_1': 'pg_catalog'}
2025-04-19 10:32:20,302 INFO sqlalchemy.engine.Engine 
CREATE TABLE economic_indicators (
	county_fips TEXT, 
	year BIGINT, 
	population BIGINT, 
	median_income FLOAT(53), 
	unemployment_rate FLOAT(53), 
	net_migration