# Jupyter Notebook: Load BLS QCEW Employment Data from API (NAICS 211 + 213111) into economic_indicators


In [8]:
# Jupyter Notebook: Load BLS QCEW Employment Data from API (NAICS 211 + 213111) into economic_indicators

import os
from dotenv import load_dotenv
import pandas as pd
import requests
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL
from time import sleep
from tqdm import tqdm

# Load environment variables
load_dotenv()
BLS_API_KEY = os.environ["BLS_API_KEY"]

# Database config
DB_NAME = os.environ["DB_NAME"]
DB_USER = os.environ["DB_USER"]
DB_PASSWORD = os.environ["DB_PASSWORD"]
DB_HOST = os.environ.get("DB_HOST", "localhost")
DB_PORT = os.environ.get("DB_PORT", "5432")

connection_url = URL.create(
    "postgresql+psycopg2",
    username=DB_USER,
    password=DB_PASSWORD,
    host=DB_HOST,
    port=DB_PORT,
    database=DB_NAME,
)
engine = create_engine(connection_url, echo=True)

# ----------------------------------------
# Query target county-year pairs
# ----------------------------------------

with engine.connect() as conn:
    result = conn.execute(text("""
        SELECT DISTINCT county_fips, year
        FROM economic_indicators
        WHERE (county_fips LIKE '08%' OR county_fips LIKE '48%')
          AND year BETWEEN 2015 AND 2021
        ORDER BY year
    """))
    county_years = list(result.mappings())

    # Check which columns already exist
    result = conn.execute(text("""
        SELECT column_name 
        FROM information_schema.columns 
        WHERE table_name = 'economic_indicators'
    """))
    existing_columns = [row[0] for row in result]
    print("Existing columns in economic_indicators:", existing_columns)

# Add QCEW columns if they don't exist
columns_to_add = []
if 'qcew_emp_211' not in existing_columns:
    columns_to_add.append(('qcew_emp_211', 'NUMERIC'))
if 'qcew_emp_213111' not in existing_columns:
    columns_to_add.append(('qcew_emp_213111', 'NUMERIC'))

if columns_to_add:
    with engine.begin() as conn:
        for col_name, col_type in columns_to_add:
            print(f"Adding column {col_name} to economic_indicators table...")
            conn.execute(text(f"ALTER TABLE economic_indicators ADD COLUMN {col_name} {col_type}"))
    print("✅ Columns added successfully.")
else:
    print("QCEW columns already exist in the table.")

# ----------------------------------------
# Build list of all series IDs for NAICS 211 and 213111
# ----------------------------------------

series_map = {}
for row in county_years:
    state = row["county_fips"][:2]
    county = row["county_fips"][2:]
    year = row["year"]
    for naics in ["211", "213111"]:
        naics6 = naics.zfill(6)
        # Corrected series ID format:
        # EN + U + area code (5 digits) + data type (1) + size code (0) + ownership (5) + industry (6 digits)
        sid = f"ENU{state}{county}105{naics6}"
        series_map[sid] = {"county_fips": state + county, "year": year, "naics": naics}

# ----------------------------------------
# Batch query 50 at a time
# ----------------------------------------

all_series = list(series_map.keys())
batch_size = 50
records = []

for i in tqdm(range(0, len(all_series), batch_size), desc="Fetching BLS data"):
    batch_ids = all_series[i:i+batch_size]
    payload = {
        "seriesid": batch_ids,
        "startyear": "2015",
        "endyear": "2021",
        "registrationkey": BLS_API_KEY,
        "annualaverage": True,  # Request annual average data
        "catalog": False  # Don't include catalog data
    }
    url = "https://api.bls.gov/publicAPI/v2/timeseries/data/"
    headers = {"Content-type": "application/json"}

    try:
        response = requests.post(url, json=payload, headers=headers)
        data = response.json()
        
        if "Results" not in data:
            print(f"⚠️ No results in batch {i}")
            if "message" in data:
                print(f"API message: {data['message']}")
            continue

        for series in data["Results"]["series"]:
            sid = series["seriesID"]
            meta = series_map.get(sid, {})
            for obs in series["data"]:
                # API returns 'M13' for annual average
                if obs["period"] == "M13":
                    value = obs["value"]
                    # Skip values that are missing (represented as '-')
                    if value == '-':
                        continue
                    records.append({
                        "county_fips": meta["county_fips"],
                        "year": int(obs["year"]),
                        f"qcew_emp_{meta['naics']}": float(value)
                    })
    except Exception as e:
        print(f"❌ Batch {i} failed: {e}")
        # Print more debugging info
        if 'data' in locals() and 'Status' in data:
            print(f"API Status: {data['Status']}")
        if 'data' in locals() and 'message' in data:
            print(f"API Message: {data['message']}")
    sleep(1.5)  # avoid rate limits

# ----------------------------------------
# Convert to DataFrame and reshape for update
# ----------------------------------------

if not records:
    print("❌ No records found.")
    print(f"Total series requested: {len(all_series)}")
else:
    df = pd.DataFrame(records)
    
    # Show statistics about the retrieved data
    print(f"Total records retrieved: {len(df)}")
    
    # Show columns available
    print("Columns in data:", df.columns.tolist())
    
    # Show data for the two NAICS codes
    emp_211_count = df['qcew_emp_211'].count() if 'qcew_emp_211' in df.columns else 0
    emp_213_count = df['qcew_emp_213111'].count() if 'qcew_emp_213111' in df.columns else 0
    
    print(f"NAICS 211 records: {emp_211_count}")
    print(f"NAICS 213111 records: {emp_213_count}")
    
    # Pivot the data to have one row per county-year with both NAICS codes
    df_pivot = df.pivot_table(
        index=['county_fips', 'year'],
        values=[col for col in df.columns if col.startswith('qcew_emp_')],
        aggfunc='first'
    ).reset_index()
    
    print(f"✅ {len(df_pivot)} county-year records ready to update.")

    updated = 0
    with engine.begin() as conn:
        for _, row in df_pivot.iterrows():
            # Prepare update SQL based on available columns and data
            update_parts = []
            params = {
                "county_fips": row["county_fips"],
                "year": row["year"]
            }
            
            # Check which columns to update based on both existence and data
            if 'qcew_emp_211' in df_pivot.columns and pd.notna(row.get('qcew_emp_211')):
                update_parts.append("qcew_emp_211 = :emp211")
                params["emp211"] = row['qcew_emp_211']
            
            if 'qcew_emp_213111' in df_pivot.columns and pd.notna(row.get('qcew_emp_213111')):
                update_parts.append("qcew_emp_213111 = :emp213")
                params["emp213"] = row['qcew_emp_213111']
            
            if update_parts:
                update_sql = f"""
                    UPDATE economic_indicators
                    SET {', '.join(update_parts)}
                    WHERE county_fips = :county_fips AND year = :year
                    RETURNING county_fips, year
                """
                result = conn.execute(text(update_sql), params)
                if result.rowcount > 0:
                    updated += 1

    print(f"✅ Updated {updated} rows in economic_indicators table.")
    print("✅ BLS API QCEW employment data updated for NAICS 211 and 213111.")

2025-04-20 18:29:22,137 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2025-04-20 18:29:22,137 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-20 18:29:22,138 INFO sqlalchemy.engine.Engine select current_schema()
2025-04-20 18:29:22,138 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-20 18:29:22,138 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2025-04-20 18:29:22,139 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-20 18:29:22,139 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-20 18:29:22,139 INFO sqlalchemy.engine.Engine 
        SELECT DISTINCT county_fips, year
        FROM economic_indicators
        WHERE (county_fips LIKE '08%%' OR county_fips LIKE '48%%')
          AND year BETWEEN 2015 AND 2021
        ORDER BY year
    
2025-04-20 18:29:22,140 INFO sqlalchemy.engine.Engine [generated in 0.00065s] {}
2025-04-20 18:29:22,147 INFO sqlalchemy.engine.Engine 
        SELECT column_name 
        FROM information_schema.columns 
        WHE

Adding column qcew_emp_211 to economic_indicators table...
2025-04-20 18:29:22,153 INFO sqlalchemy.engine.Engine ALTER TABLE economic_indicators ADD COLUMN qcew_emp_211 NUMERIC
2025-04-20 18:29:22,153 INFO sqlalchemy.engine.Engine [generated in 0.00038s] {}
Adding column qcew_emp_213111 to economic_indicators table...
2025-04-20 18:29:22,154 INFO sqlalchemy.engine.Engine ALTER TABLE economic_indicators ADD COLUMN qcew_emp_213111 NUMERIC
2025-04-20 18:29:22,154 INFO sqlalchemy.engine.Engine [generated in 0.00025s] {}
2025-04-20 18:29:22,155 INFO sqlalchemy.engine.Engine COMMIT
✅ Columns added successfully.


Fetching BLS data: 100%|██████████| 13/13 [00:34<00:00,  2.65s/it]

Total records retrieved: 198
Columns in data: ['county_fips', 'year', 'qcew_emp_213111']
NAICS 211 records: 0
NAICS 213111 records: 198
✅ 198 county-year records ready to update.
2025-04-20 18:29:56,554 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-20 18:29:56,554 INFO sqlalchemy.engine.Engine 
                    UPDATE economic_indicators
                    SET qcew_emp_213111 = %(emp213)s
                    WHERE county_fips = %(county_fips)s AND year = %(year)s
                    RETURNING county_fips, year
                
2025-04-20 18:29:56,555 INFO sqlalchemy.engine.Engine [generated in 0.00035s] {'emp213': 77.0, 'county_fips': '08001', 'year': 2018}
2025-04-20 18:29:56,565 INFO sqlalchemy.engine.Engine 
                    UPDATE economic_indicators
                    SET qcew_emp_213111 = %(emp213)s
                    WHERE county_fips = %(county_fips)s AND year = %(year)s
                    RETURNING county_fips, year
                
2025-04-20 18:29:56,565 I




2025-04-20 18:29:56,750 INFO sqlalchemy.engine.Engine 
                    UPDATE economic_indicators
                    SET qcew_emp_213111 = %(emp213)s
                    WHERE county_fips = %(county_fips)s AND year = %(year)s
                    RETURNING county_fips, year
                
2025-04-20 18:29:56,750 INFO sqlalchemy.engine.Engine [cached since 0.1959s ago] {'emp213': 48.0, 'county_fips': '48485', 'year': 2018}
2025-04-20 18:29:56,752 INFO sqlalchemy.engine.Engine 
                    UPDATE economic_indicators
                    SET qcew_emp_213111 = %(emp213)s
                    WHERE county_fips = %(county_fips)s AND year = %(year)s
                    RETURNING county_fips, year
                
2025-04-20 18:29:56,753 INFO sqlalchemy.engine.Engine [cached since 0.1982s ago] {'emp213': 40.0, 'county_fips': '48485', 'year': 2019}
2025-04-20 18:29:56,754 INFO sqlalchemy.engine.Engine 
                    UPDATE economic_indicators
                    SET qcew_emp_2