In [None]:
import ssl
import certifi
import pandas as pd
from pathlib import Path 
import os
import time
from datetime import datetime

: 

In [8]:
# Point the ssl module's default-HTTPS-context hook at a factory that builds a
# verifying context from certifi's CA bundle.
# The hook must be a callable that RETURNS a context (hence the lambda), rather than the context itself.
# Presumably this is what lets the URL reads further down validate the Census server's
# certificate on machines whose system CA store is incomplete - TODO confirm.
# Alternative kept for reference only (it would switch certificate verification off):
# ssl._create_default_https_context = ssl._create_unverified_context
# TODO: This needs to be validated: failed to run on a couple of occasions
# NOTE(review): `_create_default_https_context` is a private attribute of ssl, so this override may not be stable across Python versions.
ssl._create_default_https_context = lambda: ssl.create_default_context(cafile=certifi.where())

**Formalizing Download Approach**

In [None]:
# TABLE DOCUMENTATION
# For reference, check documentation: https://www2.census.gov/programs-surveys/acs/summary_file/2024/table-based-SF/documentation/ACS20241YR_Table_Shells.txt
#
# Registry of the ACS tables this notebook downloads, keyed by table ID.
# Every entry carries the same four keys:
#   - "title": human-readable table name
#   - "topic": coarse subject grouping
#   - "uids":  table-shell line IDs ("<table>_<NNN>") to keep from the table
#   - "years": ACS years the entry is meant to be pulled for
ACS_TABLE_DOC = {
    "B01003": {
        "title": "Total Population",
        "topic": "Population",
        "uids": ["B01003_001"],
        "years": [2022, 2023, 2024],
    },

    "B07003": {
        "title": "Geographic Mobility (1 Year)",
        "topic": "Migration",
        "uids": [
            "B07003_001", # Total population
            "B07003_004", # Same house 1 year ago
            "B07003_007", # Moved within the same county
            "B07003_010", # Moved from different county within the same state
            "B07003_013", # Moved from different state
            "B07003_016", # Moved from abroad
        ],
        "years": [2022, 2023, 2024],
    },

    "B11001": {
        "title": "Household Type",
        "topic": "Households",
        "uids": [
            "B11001_001", # Total households - we start with this
            "B11001_002", # Family households
            "B11001_003", # Married-couple households
            "B11001_004", # Other family households
            "B11001_007", # Nonfamily
            "B11001_008", # Householder living alone
            "B11001_009", # Not living alone
        ],
        "years": [2022, 2023, 2024],
    },

    "B19001": {
        "title": "Household Income Distribution",
        "topic": "Income",
        "uids": [f"B19001_{i:03d}" for i in range(1, 18)], # Total Households; Distribution across Income Brackets, last bracket is $200,000 or more
        "years": [2022, 2023, 2024],
    },

    "B19013": {
        "title": "Median Household Income",
        "topic": "Income",
        "uids": ["B19013_001"], # Median household income
        "years": [2022, 2023, 2024],
    },

    "B25001": {
        "title": "Housing Units",
        "topic": "Housing",
        "uids": ["B25001_001"], # Total housing units
        "years": [2022, 2023, 2024],
    },

    "B25002": {
        "title": "Occupancy Status",
        "topic": "Housing",
        "uids": [
            "B25002_001", # Total Housing Units
            "B25002_002", # Occupied
            "B25002_003", # Vacant
        ],
        "years": [2022, 2023, 2024],
    },

    "B25003": {
        "title": "Tenure (Owner / Renter)",
        "topic": "Housing",
        "uids": [
            "B25003_001", # Total Occupied Housing Units
            "B25003_002", # Total Owner-occupied Housing Units
            "B25003_003", # Total Renter-occupied Housing Units
        ],
        "years": [2022, 2023, 2024],
    },

    "B25024": {
        "title": "Units in Structure",
        "topic": "Housing Stock",
        "uids": [f"B25024_{i:03d}" for i in range(1, 12)], # Total Units in Structure, Splits by number of units
        "years": [2022, 2023, 2024],
    },

    "B25077": {
        "title": "Median Home Value",
        "topic": "Home Value",
        "uids": ["B25077_001"], # Median Home Value in USD
        "years": [2022, 2023, 2024],
    },

    "C17002": {
        "title": "Ratio of Income to Poverty Level",
        "topic": "Poverty", # was missing; every other entry declares a topic
        "uids": [f"C17002_{i:03d}" for i in range(1, 9)], # Ratio of Income to Poverty Level; Splits by income-to-poverty ratio
        "years": [2022, 2023, 2024],
    },
}

In [27]:
# Helper function to find root folder
# Downloaded ACS data should be stored in root folder/ data / raw / ACS
def find_project_root(start:Path | None=None)-> Path:
    '''
    Objective is to find a marker such as .toml, .git or a directory that points to root
    '''
    start = start or Path.cwd()
    for p in [start, *start.parents]:
        if (p / ".git").exists() or (p / "pyproject.toml").exists() or (p / "data").exists():
            return p
    raise FileNotFoundError("Project root not found")

In [28]:
# Helper that guarantees the output directory is present
def ensure_outdir(outdir:Path) -> None:
    '''
    Create `outdir` (and any missing parent directories) when it is absent.
    An already-existing directory is left untouched.
    The intended location is <project root> / data / raw / ACS.
    '''
    Path(outdir).mkdir(parents=True, exist_ok=True)

In [29]:
# Helper that assembles the download URL of one ACS table-based summary file
def build_file_url(year:int, dataset:int,table_id:str) -> str:
    '''
    Compose the URL of the .dat file for a single ACS table.

    Args:
        year: ACS data year; used both in the folder path and in the file name
        dataset: ACS dataset identifier (e.g., 1 for 1-year, 5 for 5-year)
        table_id: ACS table identifier (e.g., 'B01001'); lower-cased in the file name
    Returns:
        The complete URL as a string
    '''
    # Folder holding all tables of the requested year / dataset
    folder = f"https://www2.census.gov/programs-surveys/acs/summary_file/{year}/table-based-SF/data/{dataset}YRData"
    # File name pattern: acsdt<dataset>y<year>-<table id, lower case>.dat
    dat_file = f"acsdt{dataset}y{year}-{table_id.lower()}.dat"
    return f"{folder}/{dat_file}"

In [22]:
# Helper that maps data-dictionary line IDs to the column names used in the .dat files
def get_acs_downloadable_columns(table_id:str, uids:list[str]) -> list[str]:
    '''
    Expand each uid into its two .dat column names.

    The part of the uid after the first underscore (e.g. "001" in "B01003_001")
    is combined with the table ID to give "<table_id>_E<NNN>" followed by
    "<table_id>_M<NNN>". The order of `uids` is preserved.

    Args:
        table_id: ACS table identifier used as the column prefix
        uids: line IDs of the form "<table>_<NNN>"
    Returns:
        Flat list with two column names per uid
    '''
    return [
        f"{table_id}_{kind}{uid.split('_')[1]}"
        for uid in uids
        for kind in ("E", "M")
    ]

In [30]:
# Function to download and filter ACS data
def download_and_filter(file_url:str,
                        sumlevel:str,
                        state:str,
                        usecols:list[str]) -> pd.DataFrame:
    '''
    Reads one pipe-delimited ACS table and keeps only the requested geographies.

    Args:
        file_url: Location of the .dat file (anything pd.read_csv can read)
        sumlevel: geographic level; for example "140" for tract and "150" for block group
        state: "all" (any letter case) to keep every state, otherwise a state
            code; for example "06" for California. Codes are left-padded to two digits.
        usecols: Columns to load; must include "GEO_ID", which drives the filtering
    Returns:
        DataFrame (all values as strings) restricted to rows whose GEO_ID starts
        with the summary level and, when a state is given, the state prefix.
    '''
    # Read data from the URL provided; everything stays a string so codes keep leading zeros
    df = pd.read_csv(file_url, sep = "|", dtype=str, usecols=usecols)

    # A state prefix always begins with the summary level, so a single
    # prefix test covers both the summary-level and the state filter.
    if state.lower() == "all":
        prefix = sumlevel
    else:
        # GEO_ID looks like: {sumlevel}0000US{state}{rest of geography code}
        # NOTE(review): assumes the four characters after the summary level are "0000" - confirm for other geographies
        prefix = f"{sumlevel}0000US{state.zfill(2)}"

    # na=False: a row without GEO_ID is dropped instead of breaking the boolean mask
    return df[df["GEO_ID"].str.startswith(prefix, na=False)]

In [31]:
# Helper that checks the downloaded frame carries every requested column
def validate_columns_exist(df: pd.DataFrame, required_cols: list[str]) -> None:
    '''
    Compare `required_cols` against the header of `df`.

    Returns silently when every required column is present.
    Raises ValueError otherwise; the message lists the missing columns and,
    as a debugging aid, the first five available column names in sorted order.
    '''
    header = df.columns.tolist()
    absent = [name for name in required_cols if name not in header]

    # Nothing missing -> nothing to report
    if not absent:
        return

    preview = sorted(header)[:5]
    raise ValueError(
        f"Missing {len(absent)} columns: {absent}.\n"
        f"Sample available columns: {preview}"
    )

In [36]:
# Function to obtain one table from ACS data for a single summary level
# Right now, we obtain for one state (or "all")
# TODO: Extend this function to handle multiple tables and multiple states
def get_acs_for_sumlevel(
        year:int,
        dataset:int,
        table_id:str,
        state:str,
        sumlevel: str,
        outdir:Path,
        ) -> pd.DataFrame:
    '''
    Orchestrates: pick cols -> download -> filter -> validate -> save output

    Parameters:
        - year: Year of the ACS data
        - dataset: ACS dataset identifier (e.g., 1 for 1-year, 5 for 5-year); selects the file to download
        - table_id: ACS table ID to download; must be a key of ACS_TABLE_DOC, which supplies the columns to keep
        - state: This can either be "all" or a two-digit state code (e.g., "06" for California)
        - sumlevel: Geographic summary level; for example "140" for tract and "150" for block group
        - outdir: Directory to save the downloaded data (created if missing)
    Returns:
        The filtered dataframe, i.e. the same rows that are written to
        <outdir>/<table_id>_<geo>_state_<state>_<year>.csv
    Raises:
        ValueError: if table_id is not documented in ACS_TABLE_DOC, or if a requested column is missing
    '''
    start_time = time.time()

    # The columns to extract come from the table documentation, so an
    # undocumented table cannot be processed: fail with a clear message
    # instead of crashing later on a missing metadata entry.
    if table_id not in ACS_TABLE_DOC:
        raise ValueError(
            f"Table {table_id} is not in ACS_TABLE_DOC. Add it (including its 'uids') before downloading."
        )
    meta = ACS_TABLE_DOC[table_id]
    print(f"\n Table: {table_id} - {meta['title']}")
    print(f"Columns: {meta['uids']}")

    # Ensure output directory exists
    ensure_outdir(outdir)

    # Define the file url; honours the requested dataset (1-year / 5-year)
    file_url = build_file_url(year=year, dataset=dataset, table_id=table_id)
    print(f"\n Fetching from: {file_url}")

    # Define the cols that need to be extracted
    keep_cols = ["GEO_ID"] + get_acs_downloadable_columns(table_id, meta['uids'])
    print(f"Keeping {len(keep_cols)} columns including GEO_ID + {table_id} E/M cols")

    # Extract the relevant data
    # Normalize state just in case ("All" -> "all"); also keeps file names consistent
    state = state.lower()
    df = download_and_filter(file_url, sumlevel=sumlevel, state=state, usecols=keep_cols)

    # Validate that all columns exist in the dataframe
    # Raises an exception even if one column is missing
    # NOTE(review): pd.read_csv(usecols=...) presumably already rejects missing columns, making this a second line of defence - confirm
    validate_columns_exist(df, keep_cols)

    # Save the extracted data; the geography label is part of the file name
    geo_labels = {
        "060": "subdivision",
        "160": "place",
        "140": "tract",
        "150": "block_group",
    }
    geo_name = geo_labels.get(sumlevel, "unknown_geo")
    outpath = os.path.join(outdir, f"{table_id}_{geo_name}_state_{state}_{year}.csv")
    df.to_csv(outpath, index=False)
    print(f"Saved: {outpath}")

    # Ops measures
    print(f'File size {os.path.getsize(outpath) / (1024 * 1024):.2f} MB')
    end_time = time.time()
    print(f"Time taken to download and save data: {end_time - start_time:.2f} seconds")
    return df

In [37]:
# Test run for one table and one summary level (STATE = "All" keeps every state)
# ACS DOWNLOAD URL
DATASET = 1
YEAR = 2023
STATE = "All"
TABLE = "B01003"
# Summary level of this run; the output folder and the download are both derived
# from this single value so they cannot drift apart
SUMLEVEL = "160"
# Summary level dictionary for reference
SUM_LEVEL_DICT = {
    '010': 'nation',
    '040': 'state', 
    '050': 'county',
    '060': 'county_subdivision', 
    '140': 'census_tract',
    '150': 'block_group',
    '160': 'place',
    '310': 'cbsa',
    '860': 'zcta', 
    '950': 'el_sch_dist', 
    '960': 'sec_sch_dist',
    '970': 'uni_sch_dist'
}

# Output folder follows the dataset (1yr / 5yr) and the summary level chosen above
OUTDIR = find_project_root() / f"data/raw/ACS/{DATASET}yr/sumlevel_{SUM_LEVEL_DICT[SUMLEVEL]}"

# Get data for the chosen summary level (places for SUMLEVEL = "160")
# NOTE: the variable name is historical; it holds whatever SUMLEVEL selects, not necessarily tracts
df_tracts = get_acs_for_sumlevel(
    dataset=DATASET,
    year=YEAR,
    table_id = TABLE,
    state=STATE,
    sumlevel=SUMLEVEL,
    outdir=OUTDIR
)


 Table: B01003 - Total Population
Columns: ['B01003_001']

 Fetching from: https://www2.census.gov/programs-surveys/acs/summary_file/2023/table-based-SF/data/1YRData/acsdt1y2023-b01003.dat
Keeping 3 columns including GEO_ID + B01003 E/M cols
Saved: c:\Users\eprashar\OneDrive - CoreLogic Solutions, LLC\github\2026\geo similarity teu\data\raw\ACS\1yr\sumlevel_place\B01003_place_state_all_2023.csv
File size 0.02 MB
Time taken to download and save data: 0.65 seconds


In [39]:
# Function to run a batch download
def batch_download_acs(
        dataset:int,
        years:list[int],
        sumlevels: list[str],
        states: list[str],
        tables: list[str],
        outdir:dict[str, Path],
        ) -> None:
    '''
    Orchestrates downloads across:
        - years: multiple ACS years
        - tables (default: all keys in ACS_TABLE_DOC)
        - sumlevels (default: ["140", "150"])
        - states (default: ['all'])
    Writes:
        - CSV files for each table, sumlevel, and state combination in the specified output directory
    '''
    # Adding some basic checks first
    if years is None:
        years = [2022,2023,2024]
    if sumlevels is None:
        sumlevels = ["060", "160"]
    if states is None:
        states = ['all']
    if tables is None:
        tables = list(ACS_TABLE_DOC.keys())
    
    # TODO: In case we want to capture timestamps in a metadata file later
    # start_time = datetime.now()

    total_jobs = len(years) * len(tables) * len(sumlevels) * len(states)
    job_num = 0
    for year in years:
        for table_id in tables:
            for sumlevel in sumlevels:
                if sumlevel == "060":
                    final_path = outdir["060"]
                elif sumlevel == "160":
                    final_path = outdir["160"]
                for state in states:
                    job_num += 1
                    print(f"Starting job {job_num}/{total_jobs}: Year {year}, Table {table_id}, Sumlevel {sumlevel}, State {state}")

                    # Call the function to download the ACS data for the given table, sumlevel, and state
                    try:
                        get_acs_for_sumlevel(
                        dataset=dataset,
                        year = year,
                        table_id=table_id,
                        state=state,
                        sumlevel=sumlevel,
                        outdir=final_path)
                    except Exception as e:
                        print(f"Error in job {job_num}/{total_jobs}: Table {table_id}, Sumlevel {sumlevel}, State {state}")
                

In [41]:
# Batch run: every documented table, all states, for county subdivisions (060) and places (160)
# ACS DOWNLOAD URL
DATASET = 1
YEARS = [2022, 2023, 2024]
TABLE = "B01003"  # handy single table for a quick test run (pass it as tables=[TABLE])

# One output folder per summary level - adjust when the summary levels change
OUTDIR_PLACE = find_project_root() / "data" / "raw" / "ACS" / "1yr" / f"sumlevel_{SUM_LEVEL_DICT['160']}"
OUTDIR_SD = find_project_root() / "data" / "raw" / "ACS" / "1yr" / f"sumlevel_{SUM_LEVEL_DICT['060']}"

# Kick off the batch download
batch_download_acs(
    dataset=DATASET,
    years=YEARS,
    tables=list(ACS_TABLE_DOC.keys()),  # whole registry; swap in [TABLE] while testing
    sumlevels=["060", "160"],
    states=['all'],
    outdir={
        "060": OUTDIR_SD,
        "160": OUTDIR_PLACE,
    },
)

Starting job 1/66: Year 2022, Table B01003, Sumlevel 060, State all

 Table: B01003 - Total Population
Columns: ['B01003_001']

 Fetching from: https://www2.census.gov/programs-surveys/acs/summary_file/2022/table-based-SF/data/1YRData/acsdt1y2022-b01003.dat
Keeping 3 columns including GEO_ID + B01003 E/M cols
Saved: c:\Users\eprashar\OneDrive - CoreLogic Solutions, LLC\github\2026\geo similarity teu\data\raw\ACS\1yr\sumlevel_county_subdivision\B01003_subdivision_state_all_2022.csv
File size 0.01 MB
Time taken to download and save data: 0.98 seconds
Starting job 2/66: Year 2022, Table B01003, Sumlevel 160, State all

 Table: B01003 - Total Population
Columns: ['B01003_001']

 Fetching from: https://www2.census.gov/programs-surveys/acs/summary_file/2022/table-based-SF/data/1YRData/acsdt1y2022-b01003.dat
Keeping 3 columns including GEO_ID + B01003 E/M cols
Saved: c:\Users\eprashar\OneDrive - CoreLogic Solutions, LLC\github\2026\geo similarity teu\data\raw\ACS\1yr\sumlevel_place\B01003_pla