# Jupyter Notebook: Load IRS SOI Net Migration Data to Database


In [11]:
# Jupyter Notebook: Load IRS Net Migration to economic_indicators Table

import os
from dotenv import load_dotenv
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

# Read settings from a .env file into the process environment
load_dotenv()

# Connection settings. Name, user and password are mandatory: a missing one
# raises KeyError right here. Host and port default to a local PostgreSQL.
DB_NAME = os.environ["DB_NAME"]
DB_USER = os.environ["DB_USER"]
DB_PASSWORD = os.environ["DB_PASSWORD"]
DB_HOST = os.getenv("DB_HOST", "localhost")
DB_PORT = os.getenv("DB_PORT", "5432")

# Build the URL from its parts rather than by string formatting, then open
# the engine. echo=True makes SQLAlchemy log every statement it emits.
connection_url = URL.create(
    drivername="postgresql+psycopg2",
    database=DB_NAME,
    host=DB_HOST,
    port=DB_PORT,
    username=DB_USER,
    password=DB_PASSWORD,
)
engine = create_engine(
    connection_url,
    echo=True,
)

# ----------------------------------------
# Load IRS Migration Data for CO and TX (Multiple Years)
# ----------------------------------------

def fetch_irs_net_migration(
    file_year,
    state_fips=(8, 48),
    base_url="https://www.irs.gov/pub/irs-soi",
):
    """Read one IRS SOI county inflow CSV and total ``n2`` per destination county.

    The file name is built from the last two digits of ``file_year - 1`` and
    ``file_year`` (e.g. 2020 -> ``countyinflow1920.csv``).

    NOTE(review): every row of the file is summed, and only the inflow file
    is read. If the file also carries summary or non-migrant rows, or if a
    true "net" figure (inflow minus outflow) is intended, the result
    overstates migration -- confirm against the IRS SOI users guide.

    Args:
        file_year: Integer year; also written to the ``year`` column.
        state_fips: Destination state FIPS codes (``y2_statefips``) to keep.
            Defaults to 8 and 48.
        base_url: URL prefix or local directory that holds the
            ``countyinflowYYZZ.csv`` files, without a trailing slash.

    Returns:
        DataFrame with columns ``county_fips`` (5-character zero-padded
        string), ``year`` and ``net_migration``. An empty DataFrame with no
        columns is returned when the file cannot be read or lacks the
        required columns; a message is printed in either case.
    """
    short_prev = str(file_year - 1)[-2:]
    short_curr = str(file_year)[-2:]
    url = f"{base_url}/countyinflow{short_prev}{short_curr}.csv"

    # Best-effort fetch: any failure is reported and the year is skipped.
    try:
        try:
            df = pd.read_csv(url)
        except UnicodeDecodeError:
            # Retry as Latin-1 rather than dropping the whole year because
            # the file is not valid UTF-8.
            df = pd.read_csv(url, encoding="latin-1")
    except Exception as e:
        print(f"❌ Failed to fetch {url}: {e}")
        return pd.DataFrame()

    required_columns = {"y2_statefips", "y2_countyfips", "n2"}
    if not required_columns.issubset(df.columns):
        print(f"⚠️ Unexpected format in {url}, skipping")
        return pd.DataFrame()

    df = df[df["y2_statefips"].isin(list(state_fips))]
    df = df.groupby(["y2_statefips", "y2_countyfips"], as_index=False)["n2"].sum()

    # 2-digit state code + 3-digit county code = 5-character county FIPS.
    state_part = df["y2_statefips"].astype(str).str.zfill(2)
    county_part = df["y2_countyfips"].astype(str).str.zfill(3)
    df["county_fips"] = state_part + county_part
    df["year"] = file_year
    df = df.rename(columns={"n2": "net_migration"})
    return df[["county_fips", "year", "net_migration"]]

# Fetch multiple years
years = [2016, 2017, 2018, 2019, 2020, 2021]
all_years = [fetch_irs_net_migration(y) for y in years]

# pd.concat raises ValueError on an empty list, so fall back to an empty,
# correctly-shaped frame when every download failed or was skipped.
fetched_frames = [df for df in all_years if not df.empty]
if fetched_frames:
    net_migration_df = pd.concat(fetched_frames, ignore_index=True)
else:
    net_migration_df = pd.DataFrame(columns=["county_fips", "year", "net_migration"])

# Check for matches in economic_indicators
with engine.begin() as conn:
    result = conn.execute(text("SELECT county_fips, year FROM economic_indicators"))
    existing_keys = {(row["county_fips"], row["year"]) for row in result.mappings()}

# Keep only matching rows. The mask is built in one pass over the two key
# columns; wrapping it in a Series keeps the filter valid for an empty frame.
has_match = pd.Series(
    [
        key in existing_keys
        for key in zip(net_migration_df["county_fips"], net_migration_df["year"])
    ],
    index=net_migration_df.index,
    dtype=bool,
)
net_migration_df = net_migration_df[has_match]

# Update economic_indicators with net_migration.
# Values are cast to plain int so the DB driver never receives numpy scalars.
update_params = [
    {"net_migration": int(net), "county_fips": fips, "year": int(year)}
    for fips, year, net in zip(
        net_migration_df["county_fips"],
        net_migration_df["year"],
        net_migration_df["net_migration"],
    )
]

if update_params:
    # One execute() call with a list of parameter sets runs as executemany
    # inside a single transaction.
    with engine.begin() as conn:
        conn.execute(
            text("""
                UPDATE economic_indicators
                SET net_migration = :net_migration
                WHERE county_fips = :county_fips AND year = :year
            """),
            update_params,
        )
    # Report the years actually written, not a hard-coded range.
    years_loaded = sorted({params["year"] for params in update_params})
    print(
        f"✅ IRS net migration data for {years_loaded[0]}–{years_loaded[-1]} "
        f"loaded into economic_indicators table ({len(update_params)} rows)."
    )
else:
    print("⚠️ No IRS net migration rows matched economic_indicators; nothing was updated.")


2025-04-19 11:30:33,776 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2025-04-19 11:30:33,776 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-19 11:30:33,777 INFO sqlalchemy.engine.Engine select current_schema()
2025-04-19 11:30:33,777 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-19 11:30:33,778 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2025-04-19 11:30:33,778 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-19 11:30:33,779 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-19 11:30:33,779 INFO sqlalchemy.engine.Engine SELECT county_fips, year FROM economic_indicators
2025-04-19 11:30:33,779 INFO sqlalchemy.engine.Engine [generated in 0.00026s] {}
2025-04-19 11:30:33,783 INFO sqlalchemy.engine.Engine COMMIT
2025-04-19 11:30:33,793 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-19 11:30:33,793 INFO sqlalchemy.engine.Engine 
                UPDATE economic_indicators
                SET net_migration = %(net_migration)s
            