Census API Data Extraction

In [1]:
import os
import pandas as pd
import arcpy
from datetime import datetime
import re
import requests
import numpy as np
import geopandas

In [2]:
def func_clean_table_name(filename):
    """Turn a file name (or path-like string) into a safe table name.

    The extension is dropped, every character outside ``[a-zA-Z0-9_]`` is
    replaced by an underscore, and an underscore is prepended when the result
    would otherwise start with a digit.

    Args:
        filename: File name or path-like string to clean.

    Returns:
        The cleaned name. An input whose stem is empty yields ``"_"`` instead
        of raising ``IndexError``.
    """
    # Drop the extension, then replace forbidden characters with underscores
    stem = os.path.splitext(filename)[0]
    cleaned_name = re.sub(r'[^a-zA-Z0-9_]', '_', stem)
    # Ensure the name is non-empty and doesn't start with a number
    if not cleaned_name or cleaned_name[0].isdigit():
        cleaned_name = f"_{cleaned_name}"
    return cleaned_name

In [3]:
# Define the Census API URLs and the output table names used by the rest of this notebook
#Example API URL for Tract level:  
#https://api.census.gov/data/2023/acs/acs5/subject?get=GEO_ID,NAME,S2704_C03_006E,S2704_C03_002E&for=tract:*&in=state:26&in=county:163 
#Example API URL for Place level: 
#https://api.census.gov/data/2023/acs/acs5/subject?get=GEO_ID,NAME,S2704_C03_006E,S2704_C03_002E&for=place:*&in=state:26 
#Example API URL for Block Group level: 
#https://api.census.gov/data/2023/acs/acs5/subject?get=GEO_ID,NAME,S2704_C03_006E,S2704_C03_002E&for=block%20group:*&in=state:26&in=county:163&in=tract:000100 

census_base_url = "https://api.census.gov/" #census base URL
census_dataset_url = "data/2023/acs/acs5/subject" #the URL for a specific data set, in this case 2023 ACS 5 year subject data
census_combined_url = f"{census_base_url}{census_dataset_url}"
census_variables = "GEO_ID,NAME,S2704_C03_001E,S2704_C03_006E,S2704_C03_002E,S2704_C03_010E" #list all variables separated by commas. GEO_ID is the long ID format with all geographic levels combined. GEOID is a short ID just for the smallest geographic unit

# State FIPS code used for the block group query; "26" matches the state used in the example URLs above
census_state_fips = "26"

census_fields_url = f"{census_combined_url}/variables.json" #URL to get the variables for the base URL dataset from above
state_census_url = f"{census_combined_url}?get={census_variables}&for=state:*" # Construct URL to pull state level data for the desired fields from Census
county_url = f"{census_combined_url}?get={census_variables}&for=county:*&in=state:*" # Define another Census URL for county level data

# Define another Census URL for Census Block Group data.
# This previously duplicated county_url (for=county:*), so it never requested block groups.
# The geography clause follows the block group example URL above, scoped to one state.
# NOTE(review): confirm that this dataset is published at block group level and that the API
# accepts the county/tract wildcards for this vintage before relying on cbg_url.
cbg_url = (
    f"{census_combined_url}?get={census_variables}"
    f"&for=block%20group:*&in=state:{census_state_fips}&in=county:*&in=tract:*"
)

# Table names derived from the dataset path so every output is traceable to its source dataset
general_table_name = func_clean_table_name(f"{census_dataset_url}")
fields_table_name = f"{general_table_name}_Fields"
state_table_name = func_clean_table_name(f"{census_dataset_url}_State")
county_table_name = func_clean_table_name(f"{census_dataset_url}_County")
tract_table_name = func_clean_table_name(f"{census_dataset_url}_Tract")
cbg_table_name = func_clean_table_name(f"{census_dataset_url}_CBG")


print(fields_table_name)
print(state_table_name)
print(county_table_name)
print(cbg_table_name)
print(county_url)
print(census_fields_url)

data_2023_acs_acs5_subject_Fields
data_2023_acs_acs5_subject_State
data_2023_acs_acs5_subject_County
data_2023_acs_acs5_subject_CBG
https://api.census.gov/data/2023/acs/acs5/subject?get=GEO_ID,NAME,S2704_C03_001E,S2704_C03_006E,S2704_C03_002E,S2704_C03_010E&for=county:*&in=state:*
https://api.census.gov/data/2023/acs/acs5/subject/variables.json


In [4]:
# Capture the working directory (used later as the parent of the output folder)
# and echo it together with the county URL as a quick sanity check.
current_dir = os.getcwd()
print(current_dir, county_url, sep="\n")

C:\GITHUB\CCSVI\Scripts\Pull_Census_BG_Data
https://api.census.gov/data/2023/acs/acs5/subject?get=GEO_ID,NAME,S2704_C03_001E,S2704_C03_006E,S2704_C03_002E,S2704_C03_010E&for=county:*&in=state:*


In [5]:
# Build a timestamped output folder under "<current_dir>/output" so that each run
# writes its files to a fresh location.
current_time = f"{datetime.now():%Y%m%d_%H%M%S}"
output_folder_name = "output_" + current_time
output_folder_path = os.path.join(current_dir, "output", output_folder_name)

# exist_ok=True keeps a re-run within the same second from failing
os.makedirs(output_folder_path, exist_ok=True)

# Separate name for the destination so it can be redirected without touching the code above
destination_dir = output_folder_path

print(f"Location of this notebook: {current_dir}")
print(f"Destination folder: {destination_dir}")

Location of this notebook: C:\GITHUB\CCSVI\Scripts\Pull_Census_BG_Data
Destination folder: C:\GITHUB\CCSVI\Scripts\Pull_Census_BG_Data\output_20250406_134036


In [6]:
# Echo the three request URLs (field listing, state data, county data), one per line
print(census_fields_url, state_census_url, county_url, sep="\n")

https://api.census.gov/data/2023/acs/acs5/subject/variables.json
https://api.census.gov/data/2023/acs/acs5/subject?get=GEO_ID,NAME,S2704_C03_001E,S2704_C03_006E,S2704_C03_002E,S2704_C03_010E&for=state:*
https://api.census.gov/data/2023/acs/acs5/subject?get=GEO_ID,NAME,S2704_C03_001E,S2704_C03_006E,S2704_C03_002E,S2704_C03_010E&for=county:*&in=state:*


In [7]:
# Get STATE level data with selected fields

# A timeout keeps the cell from hanging indefinitely if the API does not answer
state_census_response = requests.get(state_census_url, timeout=60)
# Fail with a clear HTTP error instead of an opaque JSON decoding error on a bad response
state_census_response.raise_for_status()

state_census_data = state_census_response.json()

# The first row of the payload holds the column names; the remaining rows are the records
state_df = pd.DataFrame(state_census_data[1:], columns=state_census_data[0])

print(state_df)

         GEO_ID                  NAME S2704_C03_001E S2704_C03_006E  \
0   0400000US01               Alabama           37.5           20.0   
1   0400000US02                Alaska           37.1           23.1   
2   0400000US04               Arizona           38.1           20.5   
3   0400000US05              Arkansas           43.9           26.6   
4   0400000US06            California           38.9           26.3   
5   0400000US08              Colorado           32.8           18.2   
6   0400000US09           Connecticut           37.2           21.9   
7   0400000US10              Delaware           40.0           20.8   
8   0400000US11  District of Columbia           34.2           24.8   
9   0400000US12               Florida           37.0           17.5   
10  0400000US13               Georgia           32.4           18.1   
11  0400000US15                Hawaii           38.5           19.4   
12  0400000US16                 Idaho           35.0           18.5   
13  04

In [8]:
# Get COUNTY level data with selected fields

# A timeout keeps the cell from hanging indefinitely if the API does not answer
county_response = requests.get(county_url, timeout=60)
# Fail with a clear HTTP error instead of an opaque JSON decoding error on a bad response
county_response.raise_for_status()

county_data = county_response.json()

# The first row of the payload holds the column names; the remaining rows are the records
county_df = pd.DataFrame(county_data[1:], columns=county_data[0])

county_df.head()

Unnamed: 0,GEO_ID,NAME,S2704_C03_001E,S2704_C03_006E,S2704_C03_002E,S2704_C03_010E,state,county
0,0500000US01001,"Autauga County, Alabama",34.7,16.4,20.3,4.0,1,1
1,0500000US01003,"Baldwin County, Alabama",36.3,14.7,23.4,2.9,1,3
2,0500000US01005,"Barbour County, Alabama",49.7,29.6,26.4,3.5,1,5
3,0500000US01007,"Bibb County, Alabama",41.4,22.7,22.1,2.7,1,7
4,0500000US01009,"Blount County, Alabama",38.7,20.8,22.5,2.1,1,9


In [9]:
# Print the county DataFrame as plain text (pandas elides the middle rows of a long frame)
print(county_df)

              GEO_ID                              NAME S2704_C03_001E  \
0     0500000US01001           Autauga County, Alabama           34.7   
1     0500000US01003           Baldwin County, Alabama           36.3   
2     0500000US01005           Barbour County, Alabama           49.7   
3     0500000US01007              Bibb County, Alabama           41.4   
4     0500000US01009            Blount County, Alabama           38.7   
...              ...                               ...            ...   
3217  0500000US72145  Vega Baja Municipio, Puerto Rico           60.6   
3218  0500000US72147    Vieques Municipio, Puerto Rico           72.5   
3219  0500000US72149   Villalba Municipio, Puerto Rico           71.4   
3220  0500000US72151    Yabucoa Municipio, Puerto Rico           76.4   
3221  0500000US72153      Yauco Municipio, Puerto Rico           65.9   

     S2704_C03_006E S2704_C03_002E S2704_C03_010E state county  
0              16.4           20.3            4.0    01   

In [10]:
# Show the variables.json URL that the next cell downloads the field listing from
print(census_fields_url)


https://api.census.gov/data/2023/acs/acs5/subject/variables.json


In [11]:
# Get field listing from the Census variables.json URL created previously and put it into a data frame so that aliases can be assigned etc.
# The payload is a dictionary keyed by variable name (under "variables") rather than the row-list JSON returned for the data requests,
# so it is loaded with from_dict and the variable names end up in the "index" column.

# A timeout keeps the cell from hanging indefinitely if the API does not answer
fields_response = requests.get(census_fields_url, timeout=60)
# Fail with a clear HTTP error instead of an opaque JSON decoding error on a bad response
fields_response.raise_for_status()

fields_dict = fields_response.json()

fields_df = pd.DataFrame.from_dict(fields_dict["variables"], orient="index").reset_index()

fields_df.head()

Unnamed: 0,index,label,concept,predicateType,group,limit,predicateOnly,hasGeoCollectionSupport,attributes,required
0,for,Census API FIPS 'for' clause,Census API Geography Specification,fips-for,,0,True,,,
1,in,Census API FIPS 'in' clause,Census API Geography Specification,fips-in,,0,True,,,
2,ucgid,Uniform Census Geography Identifier clause,Census API Geography Specification,ucgid,,0,True,True,,
3,S0804_C04_068E,Estimate!!Public transportation (excluding tax...,Means of Transportation to Work by Selected Ch...,float,S0804,0,,,"S0804_C04_068EA,S0804_C04_068M,S0804_C04_068MA",
4,S0503_C02_078E,Estimate!!Foreign-born; Born in Europe!!Civili...,Selected Characteristics of the Foreign-Born P...,float,S0503,0,,,"S0503_C02_078EA,S0503_C02_078M,S0503_C02_078MA",


In [12]:
# Save the field listing as a UTF-8 CSV (without the DataFrame index) in the destination folder
fields_csv_path = os.path.join(destination_dir, fields_table_name + ".csv")
fields_df.to_csv(fields_csv_path, encoding="utf-8", index=False)
print(f"CSV file created successfully: {fields_csv_path}")

CSV file created successfully: C:\GITHUB\CCSVI\Scripts\Pull_Census_BG_Data\output_20250406_134036\data_2023_acs_acs5_subject_Fields.csv


In [13]:
# Save the state-level data as a UTF-8 CSV (without the DataFrame index) in the destination folder
state_csv_path = os.path.join(destination_dir, state_table_name + ".csv")
state_df.to_csv(state_csv_path, encoding="utf-8", index=False)
print(f"CSV file created successfully: {state_csv_path}")

CSV file created successfully: C:\GITHUB\CCSVI\Scripts\Pull_Census_BG_Data\output_20250406_134036\data_2023_acs_acs5_subject_State.csv


In [14]:
# Save the county-level data as a UTF-8 CSV (without the DataFrame index) in the destination folder
county_csv_path = os.path.join(destination_dir, county_table_name + ".csv")
county_df.to_csv(county_csv_path, encoding="utf-8", index=False)
print(f"CSV file created successfully: {county_csv_path}")

CSV file created successfully: C:\GITHUB\CCSVI\Scripts\Pull_Census_BG_Data\output_20250406_134036\data_2023_acs_acs5_subject_County.csv


In [15]:
# Create a Mobile Geodatabase, named after the dataset, inside the destination folder.
# gdb_dir is a separate name so the geodatabase location can be changed independently.
gdb_dir = destination_dir

# To make the file name unique per run, append current_time before the extension:
#gdb_name = f"{general_table_name}{current_time}.geodatabase"
gdb_name = general_table_name + ".geodatabase"

# Full path to the geodatabase file; later cells use it as the import target and workspace
gdb_path = os.path.join(gdb_dir, gdb_name)

arcpy.management.CreateMobileGDB(gdb_dir, gdb_name)

print(f"Mobile Geodatabase created at: {gdb_path}")

Mobile Geodatabase created at: C:\GITHUB\CCSVI\Scripts\Pull_Census_BG_Data\output_20250406_134036\data_2023_acs_acs5_subject.geodatabase


In [16]:
# Walk the destination directory and load every CSV found there into the
# geodatabase as a table, deriving each table name from the cleaned file name.

for folder, _subfolders, filenames in os.walk(destination_dir):
    for csv_name in filenames:
        # Anything that is not a CSV is ignored
        if not csv_name.endswith('.csv'):
            continue
        try:
            source_csv = os.path.join(folder, csv_name)
            gdb_table = func_clean_table_name(csv_name)

            # Positional arguments are in_rows, out_path, out_name
            arcpy.TableToTable_conversion(source_csv, gdb_path, gdb_table)
            print(f"Successfully imported {csv_name} as {gdb_table}.")
        except Exception as e:
            # Report the failure and carry on with the remaining files
            print(f"Error importing {csv_name}: {e}")

print("All CSV files have been processed.")

Successfully imported data_2023_acs_acs5_subject_County.csv as data_2023_acs_acs5_subject_County.
Successfully imported data_2023_acs_acs5_subject_Fields.csv as data_2023_acs_acs5_subject_Fields.
Successfully imported data_2023_acs_acs5_subject_State.csv as data_2023_acs_acs5_subject_State.
All CSV files have been processed.


In [17]:
# Assign field aliases to field names in all the data tables that were just put into the geodatabase.
# This cell finds the tables; the next cell applies the aliases.

# Set workspace to the geodatabase path
arcpy.env.workspace = gdb_path

# List all tables in the geodatabase
tables = arcpy.ListTables()

# Suffixes that mark a data table. "_CBG" is the suffix this notebook gives the block group
# table name (cbg_table_name); "_Block_Group" is kept so tables named that way are still found.
data_table_suffixes = ("_State", "_County", "_Tract", "_CBG", "_Block_Group")

# Separate data and metadata tables
data_tables = [t for t in tables if t.endswith(data_table_suffixes)]
metadata_tables = [t for t in tables if t.endswith("_Fields")]

print("Data and field name listing tables found")
print(data_tables)
print(metadata_tables)

Data and field name listing tables found
['main.data_2023_acs_acs5_subject_County', 'main.data_2023_acs_acs5_subject_State']
['main.data_2023_acs_acs5_subject_Fields']


In [18]:
# Loop through all data tables and set each field's alias to the human-readable label
# stored in the matching "_Fields" metadata table.
for data_table in data_tables:
    try:
        # Find the corresponding metadata table by swapping the geography suffix for "_Fields".
        # Every data-table suffix is covered so Tract / Block Group tables are matched too,
        # not only State / County.
        metadata_table = re.sub(r'(_State|_County|_Tract|_CBG|_Block_Group)$', '_Fields', data_table)

        if metadata_table not in metadata_tables:
            print(f"Metadata table not found for {data_table}. Skipping.")
            continue

        print(f"Processing {data_table} with metadata {metadata_table}")

        # Get the actual field names from the metadata table
        metadata_fields = [field.name for field in arcpy.ListFields(metadata_table)]

        # The column holding the variable names may appear as "index_" instead of "index"
        # in the geodatabase table, so use whichever is present
        index_field = "index_" if "index_" in metadata_fields else "index"

        # Read the mapping of short names to human-readable aliases
        field_name_mapping = {}
        with arcpy.da.SearchCursor(metadata_table, [index_field, "label"]) as cursor:
            for row in cursor:
                field_name_mapping[row[0]] = row[1]

        # Update field aliases in the data table
        fields = arcpy.ListFields(data_table)
        for field in fields:
            if field.name not in field_name_mapping:
                print(f"Field {field.name} not found in metadata mapping. Skipping alias update.\n")
                continue

            label = field_name_mapping[field.name]
            if not label:
                # A missing label would otherwise raise and abort the rest of this table
                print(f"Field {field.name} has no label in metadata. Skipping alias update.\n")
                continue

            # Labels use "!!" as a level separator; replace it with a space for readability
            alias = label.replace("!!", " ")
            arcpy.AlterField_management(data_table, field.name, new_field_alias=alias)

    except Exception as e:
        print(f"Error processing {data_table}: {e}")

print("Finished processing Mobile Geodatabase.")

Processing main.data_2023_acs_acs5_subject_County with metadata main.data_2023_acs_acs5_subject_Fields
Field OBJECTID not found in metadata mapping. Skipping alias update.

Field NAME not found in metadata mapping. Skipping alias update.

Field state not found in metadata mapping. Skipping alias update.

Field county not found in metadata mapping. Skipping alias update.

Processing main.data_2023_acs_acs5_subject_State with metadata main.data_2023_acs_acs5_subject_Fields
Field OBJECTID not found in metadata mapping. Skipping alias update.

Field NAME not found in metadata mapping. Skipping alias update.

Field state not found in metadata mapping. Skipping alias update.

Finished processing Mobile Geodatabase.


In [19]:
# Diagnostic check: compare the field names of the last data table handled by the
# alias loop with the keys of the metadata mapping, then list the metadata table's fields.
# Relies on data_table, field_name_mapping and metadata_table left over from that loop.
data_field_names = []
for data_field in arcpy.ListFields(data_table):
    data_field_names.append(data_field.name)

print("Data table fields:", data_field_names)
print("Metadata keys:", list(field_name_mapping))

for metadata_field in arcpy.ListFields(metadata_table):
    print(metadata_field.name)

Data table fields: ['OBJECTID', 'GEO_ID', 'NAME', 'S2704_C03_001E', 'S2704_C03_006E', 'S2704_C03_002E', 'S2704_C03_010E', 'state']
Metadata keys: ['for', 'in', 'ucgid', 'S0804_C04_068E', 'S0503_C02_078E', 'S2603_C07_076E', 'S0701PR_C01_028E', 'S0804_C04_067E', 'S0503_C02_077E', 'S2603_C07_075E', 'S0701PR_C01_029E', 'S2603_C07_078E', 'S0503_C02_076E', 'S0506_C01_120E', 'S0804_C04_069E', 'S2603_C07_077E', 'S0503_C02_075E', 'S0506_C01_121E', 'S0804_C04_064E', 'S0503_C02_074E', 'S2101_C06_004E', 'S0506_C01_122E', 'S0506_C01_123E', 'S2603_C07_079E', 'S0503_C02_073E', 'S2101_C06_003E', 'S0804_C04_063E', 'S0506_C01_124E', 'S0804_C04_066E', 'S2101_C06_002E', 'S0503_C02_072E', 'S0804_C04_065E', 'S0506_C01_125E', 'S1902_C02_028E', 'S2101_C06_001E', 'S0503_C02_071E', 'S0102_C02_046E', 'S2101_C06_008E', 'S0503_C02_070E', 'S0102_C02_047E', 'S2602_C04_079E', 'S2101_C06_007E', 'S0102_C02_044E', 'S2603_C07_070E', 'S2101_C06_006E', 'S1811_C02_018E', 'S0102_C02_045E', 'S2101_C06_005E', 'S1811_C02_019E',

In [20]:
# List all tables in the geodatabase with field names and aliases and store as a CSV to check

arcpy.env.workspace = gdb_path
tables = arcpy.ListTables()

# Collect one record per (table, field) pair
field_info = []

for table in tables:
    print(f"Processing table: {table}")
    fields = arcpy.ListFields(table)
    for field in fields:
        field_info.append({
            "Table": table,
            "Field Name": field.name,
            "Field Alias": field.aliasName
        })

# Convert to a DataFrame for better visualization (pandas is already imported at the top
# of the notebook). Explicit columns keep the CSV header even when no fields were collected.
field_info_df = pd.DataFrame(field_info, columns=["Table", "Field Name", "Field Alias"])

# Save to a CSV for reference; os.path.join is used like the other cells
# instead of a hard-coded backslash separator
output_csv = os.path.join(destination_dir, "FieldInfo.csv")
field_info_df.to_csv(output_csv, index=False)
print(f"Field information saved to {output_csv}")

Processing table: main.data_2023_acs_acs5_subject_County
Processing table: main.data_2023_acs_acs5_subject_Fields
Processing table: main.data_2023_acs_acs5_subject_State
Field information saved to C:\GITHUB\CCSVI\Scripts\Pull_Census_BG_Data\output_20250406_134036\FieldInfo.csv
