In [3]:
import pandas as pd
import ssl
import certifi
from pathlib import Path 
import os
import time
from datetime import datetime

In [4]:
# Verify HTTPS certificates against certifi's CA bundle for this session instead of
# disabling verification globally (ssl._create_unverified_context would accept any
# certificate, which allows silent tampering with the downloaded data).
#
# ssl._create_default_https_context must be a *callable* that returns an SSLContext,
# so a factory function is assigned here rather than an already-built context object.
# NOTE(review): behind a TLS-intercepting corporate proxy the certifi bundle may not
# contain the proxy's CA; in that case pass that CA bundle as `cafile` rather than
# turning verification off.
def _certifi_https_context(*args, **kwargs):
    '''
    Build a verifying SSL context whose trusted CAs default to certifi's bundle.
    '''
    kwargs.setdefault("cafile", certifi.where())
    return ssl.create_default_context(*args, **kwargs)


ssl._create_default_https_context = _certifi_https_context

**Formalizing Download Approach**

In [5]:
# CONSTANTS
# Year of the ACS release to download
YEAR = 2023

# ACS dataset span in years (5 -> 5-year dataset)
DATASET = 5

# Common URL prefix of every table file; build_file_url appends "-<table id>.dat"
ACS_TABLE_DOWNLOAD_URL = (
    "https://www2.census.gov/programs-surveys/acs/summary_file/"
    f"{YEAR}/table-based-SF/data/{DATASET}YRData/acsdt{DATASET}y{YEAR}"
)

# TABLE DOCUMENTATION
# For reference, check documentation: https://www2.census.gov/programs-surveys/acs/summary_file/2023/table-based-SF/documentation/ACS20235YR_Table_Shells.txt
#
# Maps each ACS table ID to its metadata:
#   title / topic / why_we_care / notes -> free-text descriptions printed during download
#   uids -> row identifiers ("<table>_<3-digit line>") that are kept from the table
ACS_TABLE_DOC = {
    "B01003": dict(
        title="TOTAL POPULATION",
        topic="Population",
        why_we_care="Baseline population size for network demand and market scale.",
        uids=["B01003_001"],
        notes="Single total population estimate.",
    ),
    "B07003": dict(
        title="GEOGRAPHICAL MOBILITY IN THE PAST YEAR",
        topic="Migration",
        why_we_care="Captures population churn and mobility, useful for network expansion risk.",
        uids=[
            f"B07003_{line:03d}"
            for line in (
                1,   # Total
                4,   # Same house
                7,   # Moved within same county
                10,  # Moved within state, different county
                13,  # Moved from different state
                16,  # Moved from abroad
            )
        ],
        notes="Totals only; sex-specific rows intentionally excluded.",
    ),
    "B11001": dict(
        title="HOUSEHOLD TYPE",
        topic="Households",
        why_we_care="Household composition informs dwelling demand and service mix.",
        uids=[
            f"B11001_{line:03d}"
            for line in (
                1,  # Total households
                2,  # Family households
                3,  # Married-couple family
                4,  # Other family
                7,  # Nonfamily households
                8,  # Living alone
                9,  # Not living alone
            )
        ],
        notes="High-level household structure only.",
    ),
    "C17002": dict(
        title="RATIO OF INCOME TO POVERTY LEVEL",
        topic="Poverty",
        why_we_care="Income-to-poverty ratios provide a normalized hardship measure.",
        uids=[f"C17002_{line:03d}" for line in range(1, 9)],    # _001 .. _008
        notes="Collapsed table preferred over B17002 for stability.",
    ),
    "B19001": dict(
        title="HOUSEHOLD INCOME DISTRIBUTION",
        topic="Income",
        why_we_care="Income distribution is more informative than a single median value.",
        uids=[f"B19001_{line:03d}" for line in range(1, 18)],   # _001 .. _017
        notes="Full distribution retained.",
    ),
    "B19013": dict(
        title="MEDIAN HOUSEHOLD INCOME",
        topic="Income",
        why_we_care="Common headline metric; useful for benchmarking.",
        uids=["B19013_001"],
        notes="Supplemental to B19001.",
    ),
    "B25001": dict(
        title="HOUSING UNITS",
        topic="Housing",
        why_we_care="Total housing units",
        uids=["B25001_001"],  # Total housing units
        notes="",
    ),
    "B25002": dict(
        title="OCCUPANCY STATUS",
        topic="Housing",
        why_we_care="Vacancy vs occupancy informs spare capacity.",
        uids=[f"B25002_{line:03d}" for line in range(1, 4)],    # _001 .. _003
        notes="",
    ),
    "B25003": dict(
        title="TENURE (OWNER / RENTER)",
        topic="Housing",
        why_we_care="Ownership vs rental mix impacts infrastructure usage patterns.",
        uids=[f"B25003_{line:03d}" for line in range(1, 4)],    # _001 .. _003
        notes="",
    ),
    "B25024": dict(
        title="UNITS IN STRUCTURE",
        topic="Housing Stock",
        why_we_care="Direct measure of housing density and structure type.",
        uids=[f"B25024_{line:03d}" for line in range(1, 12)],   # _001 .. _011
        notes="Preferred over household-type-by-structure tables.",
    ),
    "B25077": dict(
        title="MEDIAN HOME VALUE",
        topic="Home Value",
        why_we_care="Asset value proxy for long-term investment stability.",
        uids=["B25077_001"],
        notes="",
    ),
}

In [6]:
# Helper function to find root folder
# Downloaded ACS data should be stored in root folder/ data / raw / ACS
def find_project_root(start:Path | None=None)-> Path:
    '''
    Objective is to find a marker such as .toml, .git or a directory that points to root
    '''
    start = start or Path.cwd()
    for p in [start, *start.parents]:
        if (p / ".git").exists() or (p / "pyproject.toml").exists() or (p / "data").exists():
            return p
    raise FileNotFoundError("Project root not found")

In [7]:
# Helper function to make sure the output directory is present
def ensure_outdir(outdir:Path) -> None:
    '''
    Create `outdir` (including any missing parent directories) unless it is
    already a directory.
    The output data directory should ideally be root / data / raw / ACS.
    '''
    if os.path.isdir(outdir):
        return
    os.makedirs(outdir, exist_ok=True)

In [8]:
# Helper function to build URL for 5 year ACS
def build_file_url(table_id:str, year:int=YEAR) -> str:
    '''
    Build the download URL for ACS table data.
    
    Args:
        table_id: The ACS table identifier (e.g., 'B01001')
        year: The year for the data (defaults to YEAR constant)
    Returns:
        Complete URL string for downloading the table
    '''
    base_url = ACS_TABLE_DOWNLOAD_URL
    filename = f"-{table_id.lower()}.dat"
    return base_url + filename

In [9]:
# Helper to translate UIDs from the ACS data dictionary into the column names used in the .dat files
def get_acs_downloadable_columns(table_id:str, uids:list[str]) -> list[str]:
    '''
    Expand each UID into its two .dat column names.

    For every uid of the form "<anything>_<num>" (e.g. "B01003_001") this yields
    "<table_id>_E<num>" followed by "<table_id>_M<num>", preserving the uid order.
    (Presumably E = estimate and M = margin of error -- confirm against the ACS docs.)

    Args:
        table_id: ACS table ID used as the column prefix
        uids: UIDs as listed in ACS_TABLE_DOC
    Returns:
        Flat list of column names, two per uid
    '''
    line_numbers = (uid.split("_")[1] for uid in uids)  # e.g. "001"
    return [
        f"{table_id}_{kind}{number}"
        for number in line_numbers
        for kind in ("E", "M")
    ]

In [10]:
# Function to download and filter ACS data
def download_and_filter(file_url:str,
                        sumlevel:str,
                        state:str,
                        usecols:list[str]) -> pd.DataFrame:
    '''
    Reads a pipe-delimited ACS table and keeps only the rows whose GEO_ID matches:
        -sumlevel: geographic level; for example "140" for tract and "150" for block group
        -state: state code; for example "06" for California, or "all"
                (any letter case) to keep every state

    Args:
        file_url: Location passed straight to pandas.read_csv
        sumlevel: Summary-level code that GEO_ID must start with
        state: "all" or a state code (left-padded with zeros to two characters)
        usecols: Columns to read; must include "GEO_ID"
    Returns:
        DataFrame (all columns as strings) restricted to the matching rows
    '''
    # Read data from the URL provided
    df = pd.read_csv(file_url, sep="|", dtype=str, usecols=usecols)
    geo_id = df["GEO_ID"]

    # Filter to summary level; na=False drops rows with a missing GEO_ID instead of
    # producing a mask that cannot be used for boolean indexing
    keep = geo_id.str.startswith(sumlevel, na=False)

    # Filter to state unless every state was requested. The comparison is
    # case-insensitive so "All" is not mistaken for a state code.
    if state.lower() != "all":
        # GEO_ID looks like: {sumlevel}0000US{state}-{county}-{tract}
        prefix = f"{sumlevel}0000US{state.zfill(2)}"
        keep = keep & geo_id.str.startswith(prefix, na=False)

    return df[keep]

In [11]:
# Helper function to confirm the requested columns are present in the dataframe header
def validate_columns_exist(df: pd.DataFrame, required_cols: list[str]) -> None:
    '''
    Check that every name in `required_cols` is a column of `df`.

    Returns None when nothing is missing. Otherwise raises ValueError listing
    the missing names (in the order requested) together with the first five
    available column names in sorted order, as a debugging aid.
    '''
    header = df.columns.tolist()
    absent = [name for name in required_cols if name not in header]

    # Nothing missing -> nothing to report
    if not absent:
        return

    preview = sorted(header)[:5]
    raise ValueError(
        f"Missing {len(absent)} columns: {absent}.\n"
        f"Sample available columns: {preview}"
    )

In [12]:
# Function to obtain one table from ACS data
# Right now, we obtain for one state
# TODO: Extend this function to handle multiple tables and multiple states
def get_acs_for_sumlevel(
        table_id:str,
        state:str,
        sumlevel: str,
        outdir:Path,
        year:int) -> pd.DataFrame:
    '''
    Orchestrates: pick cols -> download -> filter -> save output
    Parameters:
        - table_id: ACS table ID to download; must be a key of ACS_TABLE_DOC,
          because the columns to extract come from its 'uids' entry
        - state: This can either be "all" or a two-digit state code (e.g., "06" for California)
        - sumlevel: Geographic summary level; for example "140" for tract and "150" for block group
        - outdir: Directory to save the downloaded data (created if missing)
        - year: Year of the ACS data; used for the download URL and the output file name
    Returns:
        The filtered DataFrame. It is also written to
        outdir / {table_id}_{geo}_state_{state}_{year}.csv
    Raises:
        ValueError: if table_id is not documented in ACS_TABLE_DOC, or if an
            expected column is missing from the downloaded data
    '''
    # Fail fast with a clear message: without a doc entry there is no column list,
    # and continuing would only crash later on a None lookup.
    meta = ACS_TABLE_DOC.get(table_id)
    if meta is None:
        raise ValueError(
            f"{table_id} not in ACS TABLE DOCS. Add it (including its 'uids') before downloading."
        )

    start_time = time.perf_counter()

    # Ensure output directory exists
    ensure_outdir(outdir)

    print(f"\n Table: {table_id} - {meta['title']}")
    print(f"Why: {meta['why_we_care']}")
    print(f"Columns: {meta['uids']}")
    print(f"Notes: {meta['notes']}")

    # Define the file url; forward the year so the URL and the saved file name agree
    file_url = build_file_url(table_id=table_id, year=year)
    print(f"\n Fetching from: {file_url}")

    # Define the cols that need to be extracted
    keep_cols = ["GEO_ID"] + get_acs_downloadable_columns(table_id, meta['uids'])
    print(f"Keeping {len(keep_cols)} columns including GEO_ID + {table_id} E/M cols")

    # Extract the relevant data
    # Normalize state just in case (e.g. "All" -> "all")
    state = state.lower()
    df = download_and_filter(file_url, sumlevel=sumlevel, state=state, usecols=keep_cols)

    # Validate that all columns exist in the dataframe
    # Raises an exception even if one column is missing
    validate_columns_exist(df, keep_cols)

    # Save the extracted data; these short names are part of the file-name scheme
    geo_names = {"160": "place", "140": "tract", "150": "block_group"}
    geo_name = geo_names.get(sumlevel, "unknown_geo")
    outpath = os.path.join(outdir, f"{table_id}_{geo_name}_state_{state}_{year}.csv")
    df.to_csv(outpath, index=False)
    print(f"Saved: {outpath}")

    # Ops measures
    print(f'File size {os.path.getsize(outpath) / (1024 * 1024):.2f} MB')
    elapsed = time.perf_counter() - start_time
    print(f"Time taken to download and save data: {elapsed:.2f} seconds")
    return df

In [None]:
# Test run for one table, one state selection
# ACS DOWNLOAD URL
# NOTE: these assignments override the constants set earlier in the notebook for
# every cell executed after this one.
DATASET = 5
YEAR = 2024 # New ACS 5 year data published on Jan 29, 2026
ACS_TABLE_DOWNLOAD_URL = f"https://www2.census.gov/programs-surveys/acs/summary_file/{YEAR}/table-based-SF/data/{DATASET}YRData/acsdt{DATASET}y{YEAR}"
STATE = "All"
TABLE = "B25001"

# Summary level dictionary for reference
SUM_LEVEL_DICT = {
    '010': 'nation',
    '040': 'state', 
    '050': 'county',
    '060': 'county_subdivision', 
    '140': 'census_tract',
    '150': 'block_group',
    '160': 'place',
    '310': 'cbsa',
    '860': 'zcta', 
    '950': 'el_sch_dist', 
    '960': 'sec_sch_dist',
    '970': 'uni_sch_dist'
}

# Change this single value to switch summary level; the output directory and the
# download call below both derive from it, so they cannot drift apart.
SUMLEVEL = '160' # For place
OUTDIR = find_project_root() / f"data/raw/ACS/5yr/sumlevel_{SUM_LEVEL_DICT[SUMLEVEL]}"

# Get data for the selected summary level (places here).
# NOTE: the variable keeps its historical name `df_tracts` even though it holds
# place-level rows when SUMLEVEL is '160'.
df_tracts = get_acs_for_sumlevel(
    table_id = TABLE,
    state=STATE,
    sumlevel=SUMLEVEL,
    outdir=OUTDIR,
    year=YEAR
)


 Table: B25001 - HOUSING UNITS
Why: Total housing units
Columns: ['B25001_001']
Notes: 

 Fetching from: https://www2.census.gov/programs-surveys/acs/summary_file/2024/table-based-SF/data/5YRData/acsdt5y2024-b25001.dat
Keeping 3 columns including GEO_ID + B25001 E/M cols
Saved: c:\Users\eprashar\OneDrive - CoreLogic Solutions, LLC\github\2026\geo similarity teu\data\raw\ACS\5yr\sumlevel_place\B25001_place_state_all_2024.csv
File size 0.79 MB
Time taken to download and save data: 1.93 seconds


: 

In [18]:
# Function to run a batch download
def batch_download_acs(
        outdir:Path,
        sumlevels: list[str] | None = None,
        states: list[str] | None = None,
        tables: list[str] | None = None,
        year:int = YEAR) -> None:
    '''
    Orchestrates downloads across every combination of:
        - tables (default: all keys in ACS_TABLE_DOC)
        - sumlevels (default: ["140", "150", "160"])
        - states (default: ['all'])
    Parameters:
        - outdir: Directory that receives every CSV (shared by all sumlevels)
        - year: Year of the ACS data. The default is the value YEAR had when this
          function was defined; pass it explicitly after changing YEAR.
    Writes:
        - CSV files for each table, sumlevel, and state combination in the specified output directory
    A failing job does not stop the batch: its error is printed, and all failures
    are listed again once every job has been attempted.
    '''
    # Adding some basic checks first
    if sumlevels is None:
        sumlevels = ["140", "150", "160"]
    if states is None:
        states = ['all']
    if tables is None:
        tables = list(ACS_TABLE_DOC.keys())

    # TODO: In case we want to capture timestamps in a metadata file later
    # start_time = datetime.now()

    total_jobs = len(tables) * len(sumlevels) * len(states)
    job_num = 0
    failures = []  # one description per failed job, reported at the end
    for table_id in tables:
        for sumlevel in sumlevels:
            for state in states:
                job_num += 1
                job_label = f"job {job_num}/{total_jobs}: Table {table_id}, Sumlevel {sumlevel}, State {state}"
                print(f"Starting {job_label}")

                # Call the function to download the ACS data for the given table, sumlevel, and state
                try:
                    get_acs_for_sumlevel(
                        table_id=table_id,
                        state=state,
                        sumlevel=sumlevel,
                        year=year,
                        outdir=outdir)
                except Exception as e:
                    # Best effort: keep going, but surface the reason for the failure
                    reason = f"{type(e).__name__}: {e}"
                    failures.append(f"{job_label} -> {reason}")
                    print(f"Error in {job_label}: {reason}")

    if failures:
        print(f"\n{len(failures)} of {total_jobs} jobs failed:")
        for failure in failures:
            print(f"  - {failure}")
                

In [19]:
# Run the orchestrator to download all documented tables for the selected
# summary level and all states
# ACS DOWNLOAD URL
DATASET = 5
YEAR = 2024 # New ACS 5 year data published on Jan 29, 2026
ACS_TABLE_DOWNLOAD_URL = f"https://www2.census.gov/programs-surveys/acs/summary_file/{YEAR}/table-based-SF/data/{DATASET}YRData/acsdt{DATASET}y{YEAR}"

# Change this single value to switch summary level; the output directory and the
# batch call below both derive from it, so they cannot drift apart.
# SUM_LEVEL_DICT is defined in the single-table test cell.
SUMLEVEL = '160' # For place
OUTDIR = find_project_root() / f"data/raw/ACS/5yr/sumlevel_{SUM_LEVEL_DICT[SUMLEVEL]}"

# Function call 
batch_download_acs(
    outdir=OUTDIR,
    sumlevels=[SUMLEVEL],
    states=['all'],
    tables=list(ACS_TABLE_DOC.keys()),
    year=YEAR
)

Starting job 1/10: Table B01003, Sumlevel 160, State all

 Table: B01003 - TOTAL POPULATION
Why: Baseline population size for network demand and market scale.
Columns: ['B01003_001']
Notes: Single total population estimate.

 Fetching from: https://www2.census.gov/programs-surveys/acs/summary_file/2024/table-based-SF/data/5YRData/acsdt5y2024-b01003.dat
Keeping 3 columns including GEO_ID + B01003 E/M cols
Saved: c:\Users\eprashar\OneDrive - CoreLogic Solutions, LLC\github\2026\geo similarity teu\data\raw\ACS\5yr\sumlevel_place\B01003_place_state_all_2024.csv
File size 0.81 MB
Time taken to download and save data: 1.12 seconds
Starting job 2/10: Table B07003, Sumlevel 160, State all

 Table: B07003 - GEOGRAPHICAL MOBILITY IN THE PAST YEAR
Why: Captures population churn and mobility, useful for network expansion risk.
Columns: ['B07003_001', 'B07003_004', 'B07003_007', 'B07003_010', 'B07003_013', 'B07003_016']
Notes: Totals only; sex-specific rows intentionally excluded.

 Fetching from: 