In [1]:
import os
import pandas as pd
import re
import requests

#### Define function to create clean table names from "dirty" text. This removes special characters etc:

In [2]:
def func_clean_table_name(filename):
    # Replace forbidden characters with underscores
    cleaned_name = re.sub(r'[^a-zA-Z0-9_]', '_', os.path.splitext(filename)[0])
    # Ensure the name doesn't start with a number
    if cleaned_name[0].isdigit():
        cleaned_name = f"_{cleaned_name}"
    return cleaned_name

#### Create variables for each part of the Census API URL. This makes it easy to alter the values later:

In [3]:
# Default values
census_base_url = "https://api.census.gov/"
census_dataset_url = "data/2022/acs/acs5"
census_variables = "GEO_ID,NAME,B19059_001E,B01001_001E,B25034_006E,B25034_007E,B25034_008E,B25034_009E,B25034_010E,B25034_011E,B25046_001E,C27001G_004E,B27010_033E,B27010_050E,C27001G_010E,B28009E_006E,B28002_013E,C16002_004E,C16002_007E,C16002_010E,C16002_013E,B09019_005E,B09019_008E,B19101A_001E,C17002_002E,C17002_003E,C17002_004E,C17002_005E,C17002_006E,C17002_007E,B20005_002E,B01001G_003E,B01001G_004E,B01001G_005E,B01001G_006E,B01001_020E,B01001_021E,B05013_017E,B05013_018E,B05013_019E,B01001B_016E,B01001B_017E,B01001B_018E,B01001B_019E,B01001_044E,B01001_045E,B05013_036E,B05013_037E,B05013_038E,B01001B_031E"
census_variables2 = "GEO_ID,NAME,B26209_017E,B26203D_004E,B26204_031E,B26203E_005E,B02001_002E,B02001_003E,B02001_004E,B02001_005E,B02001_006E,B02001_007E,B02001_008E,B25008D_003E"
# Geography is very important to structure correctly. Sometimes if it's incorrect, the data pull still works but
# will generate an error for higher counts of variables (like we're pulling here)
in_geography = r"&for=block%20group:*&in=state:15%20county:*%20tract:*"

In [4]:
census_combined_url = f"{census_base_url}{census_dataset_url}"
cbg_url = f"{census_combined_url}?get={census_variables}{in_geography}"
cbg_url2 = f"{census_combined_url}?get={census_variables2}{in_geography}"
census_fields_url = f"{census_combined_url}/variables.json"

print("Click link to test URL for first set of variables: " + cbg_url + "\r")
print("Click link to test URL for second set of variables: " + cbg_url2 + "\r")
print("Click link to test URL for the field/variable list with descriptions: " + census_fields_url + "\r")

Click link to test URL for first set of variables: https://api.census.gov/data/2022/acs/acs5?get=GEO_ID,NAME,B19059_001E,B01001_001E,B25034_006E,B25034_007E,B25034_008E,B25034_009E,B25034_010E,B25034_011E,B25046_001E,C27001G_004E,B27010_033E,B27010_050E,C27001G_010E,B28009E_006E,B28002_013E,C16002_004E,C16002_007E,C16002_010E,C16002_013E,B09019_005E,B09019_008E,B19101A_001E,C17002_002E,C17002_003E,C17002_004E,C17002_005E,C17002_006E,C17002_007E,B20005_002E,B01001G_003E,B01001G_004E,B01001G_005E,B01001G_006E,B01001_020E,B01001_021E,B05013_017E,B05013_018E,B05013_019E,B01001B_016E,B01001B_017E,B01001B_018E,B01001B_019E,B01001_044E,B01001_045E,B05013_036E,B05013_037E,B05013_038E,B01001B_031E&for=block%20group:*&in=state:15%20county:*%20tract:*
Click link to test URL for second set of variables: https://api.census.gov/data/2022/acs/acs5?get=GEO_ID,NAME,B26209_017E,B26203D_004E,B26204_031E,B26203E_005E,B02001_002E,B02001_003E,B02001_004E,B02001_005E,B02001_006E,B02001_007E,B02001_008E,B2500

#### Pull variable listing from Census API and store in dataframe:

In [5]:
fields_dict = requests.get(census_fields_url).json()
fields_df = pd.DataFrame.from_dict(fields_dict["variables"], orient="index").reset_index()
fields_df.head()

Unnamed: 0,index,label,concept,predicateType,group,limit,predicateOnly,hasGeoCollectionSupport,attributes,required
0,for,Census API FIPS 'for' clause,Census API Geography Specification,fips-for,,0,True,,,
1,in,Census API FIPS 'in' clause,Census API Geography Specification,fips-in,,0,True,,,
2,ucgid,Uniform Census Geography Identifier clause,Census API Geography Specification,ucgid,,0,True,True,,
3,B24022_060E,Estimate!!Total:!!Female:!!Service occupations...,Sex by Occupation and Median Earnings in the P...,int,B24022,0,,,"B24022_060EA,B24022_060M,B24022_060MA",
4,B19001B_014E,"Estimate!!Total:!!$100,000 to $124,999",Household Income in the Past 12 Months (in 202...,int,B19001B,0,,,"B19001B_014EA,B19001B_014M,B19001B_014MA",


#### Pull CBG level data from Census API and store in dataframe (first set of variables):

In [6]:
cbg_response = requests.get(cbg_url)
cbg_data = cbg_response.json()
cbg_df = pd.DataFrame(cbg_data[1:], columns=cbg_data[0])
cbg_df.head()

Unnamed: 0,GEO_ID,NAME,B19059_001E,B01001_001E,B25034_006E,B25034_007E,B25034_008E,B25034_009E,B25034_010E,B25034_011E,...,B01001_044E,B01001_045E,B05013_036E,B05013_037E,B05013_038E,B01001B_031E,state,county,tract,block group
0,1500000US150010201001,Block Group 1; Census Tract 201; Hawaii County...,510,1462,82,294,87,17,5,9,...,3,0,,,,,15,1,20100,1
1,1500000US150010201002,Block Group 2; Census Tract 201; Hawaii County...,248,602,24,7,57,28,29,76,...,5,9,,,,,15,1,20100,2
2,1500000US150010201003,Block Group 3; Census Tract 201; Hawaii County...,555,1339,148,53,70,141,67,49,...,22,54,,,,,15,1,20100,3
3,1500000US150010201004,Block Group 4; Census Tract 201; Hawaii County...,331,1096,32,48,56,45,24,144,...,19,39,,,,,15,1,20100,4
4,1500000US150010202021,Block Group 1; Census Tract 202.02; Hawaii Cou...,255,1279,43,31,24,35,22,72,...,20,1,,,,,15,1,20202,1


#### Pull CBG level data from Census API and store in dataframe (second set of variables):

In [7]:
cbg_response2 = requests.get(cbg_url2)
cbg_data2 = cbg_response2.json()
cbg_df2 = pd.DataFrame(cbg_data[1:], columns=cbg_data[0])
cbg_df2.head()

Unnamed: 0,GEO_ID,NAME,B19059_001E,B01001_001E,B25034_006E,B25034_007E,B25034_008E,B25034_009E,B25034_010E,B25034_011E,...,B01001_044E,B01001_045E,B05013_036E,B05013_037E,B05013_038E,B01001B_031E,state,county,tract,block group
0,1500000US150010201001,Block Group 1; Census Tract 201; Hawaii County...,510,1462,82,294,87,17,5,9,...,3,0,,,,,15,1,20100,1
1,1500000US150010201002,Block Group 2; Census Tract 201; Hawaii County...,248,602,24,7,57,28,29,76,...,5,9,,,,,15,1,20100,2
2,1500000US150010201003,Block Group 3; Census Tract 201; Hawaii County...,555,1339,148,53,70,141,67,49,...,22,54,,,,,15,1,20100,3
3,1500000US150010201004,Block Group 4; Census Tract 201; Hawaii County...,331,1096,32,48,56,45,24,144,...,19,39,,,,,15,1,20100,4
4,1500000US150010202021,Block Group 1; Census Tract 202.02; Hawaii Cou...,255,1279,43,31,24,35,22,72,...,20,1,,,,,15,1,20202,1


#### Find duplicate fields in the two data frames and remove all except GEO_ID which is needed to join
#### Merge tables on GEO_ID and add suffix to duplicated field (should just be GEO_ID):

In [8]:
duplicate_columns = [col for col in cbg_df2.columns if col in cbg_df.columns and col != 'GEO_ID']

# Drop them from cbg_df2
cbg_df2 = cbg_df2.drop(columns=duplicate_columns)

# Now safe to join but still add a suffix if a field is duplicated
cbg_df = cbg_df.join(cbg_df2, how='left', lsuffix='', rsuffix='_2')

cbg_df.head()

Unnamed: 0,GEO_ID,NAME,B19059_001E,B01001_001E,B25034_006E,B25034_007E,B25034_008E,B25034_009E,B25034_010E,B25034_011E,...,B01001_045E,B05013_036E,B05013_037E,B05013_038E,B01001B_031E,state,county,tract,block group,GEO_ID_2
0,1500000US150010201001,Block Group 1; Census Tract 201; Hawaii County...,510,1462,82,294,87,17,5,9,...,0,,,,,15,1,20100,1,1500000US150010201001
1,1500000US150010201002,Block Group 2; Census Tract 201; Hawaii County...,248,602,24,7,57,28,29,76,...,9,,,,,15,1,20100,2,1500000US150010201002
2,1500000US150010201003,Block Group 3; Census Tract 201; Hawaii County...,555,1339,148,53,70,141,67,49,...,54,,,,,15,1,20100,3,1500000US150010201003
3,1500000US150010201004,Block Group 4; Census Tract 201; Hawaii County...,331,1096,32,48,56,45,24,144,...,39,,,,,15,1,20100,4,1500000US150010201004
4,1500000US150010202021,Block Group 1; Census Tract 202.02; Hawaii Cou...,255,1279,43,31,24,35,22,72,...,1,,,,,15,1,20202,1,1500000US150010202021


#### Create path for output folder and store it as a variable:

In [9]:
output_folder_path = os.path.join(os.getcwd(), "output")
print ("Output folder: " + output_folder_path)

Output folder: C:\GITHUB\CCSVI\Scripts\Pull_Census_Simple\output


#### Generate table names from the URL segments so that it's clear what year/data is included in the output table:

In [10]:
general_table_name = func_clean_table_name(f"{census_dataset_url}")
fields_table_name = f"{general_table_name}_Variables"
cbg_table_name = f"{general_table_name}_CBG"


#### Generate output folder and use path to store CBG data pull as a CSV file

In [11]:
os.makedirs(output_folder_path, exist_ok=True)
cbg_csv_path = os.path.join(output_folder_path, f"{cbg_table_name}.csv")
cbg_df.to_csv(cbg_csv_path, index=False, encoding="utf-8")
print(f"CSV file created successfully: {cbg_csv_path}")

CSV file created successfully: C:\GITHUB\CCSVI\Scripts\Pull_Census_Simple\output\data_2022_acs_acs5_CBG.csv


#### Generate path to store Census variable list as a CSV file:

In [12]:
fields_csv_path = os.path.join(output_folder_path, f"{fields_table_name}.csv")
fields_df.to_csv(fields_csv_path, index=False, encoding="utf-8")
print(f"CSV file created successfully: {fields_csv_path}")

CSV file created successfully: C:\GITHUB\CCSVI\Scripts\Pull_Census_Simple\output\data_2022_acs_acs5_Variables.csv


In [13]:
print("Script complete! Output files are located at: " + output_folder_path)

Script complete! Output files are located at: C:\GITHUB\CCSVI\Scripts\Pull_Census_Simple\output
