In [None]:
import os
import pandas as pd
import re
import requests

#### Define a function that creates clean table names from "dirty" text by replacing special characters with underscores:

In [None]:
def func_clean_table_name(filename: str) -> str:
    """Turn an arbitrary file name or path fragment into a safe table name.

    The extension (as determined by ``os.path.splitext``) is dropped, every
    character other than ASCII letters, digits and underscores is replaced
    with ``_``, and a leading ``_`` is added when the result would otherwise
    be empty or start with a digit.

    Args:
        filename: Text to clean, e.g. ``"data/2022/acs/acs5"`` or ``"2022 report.csv"``.

    Returns:
        A non-empty string made only of ``[a-zA-Z0-9_]`` that does not start
        with a digit.
    """
    stem = os.path.splitext(filename)[0]
    # Replace forbidden characters with underscores
    cleaned_name = re.sub(r'[^a-zA-Z0-9_]', '_', stem)
    # Ensure the name is never empty and doesn't start with a number.
    # Checking for emptiness first avoids an IndexError on cleaned_name[0].
    if not cleaned_name or cleaned_name[0].isdigit():
        cleaned_name = f"_{cleaned_name}"
    return cleaned_name

#### Create variables for each part of the Census API URL. This makes it easy to alter the values later:

In [None]:
# Default values: API host plus the path of each dataset being queried
census_base_url = "https://api.census.gov/"
census_dataset_url = "data/2022/acs/acs5"
decennial_dataset_url = "data/2020/dec/pl"

# First variable list for the ACS dataset, grouped by table prefix.
# The API expects a single comma-separated string, hence the join.
census_variables1 = ",".join((
    "GEO_ID", "NAME",
    "B01001_001E",
    "B25034_006E", "B25034_007E", "B25034_008E", "B25034_009E", "B25034_010E", "B25034_011E",
    "B25046_001E",
    "B27010_017E", "B27010_033E", "B27010_050E", "B27010_066E",
    "B28002_013E",
    "C16002_004E", "C16002_007E", "C16002_010E", "C16002_013E",
    "B09019_005E", "B09019_008E",
    "B19101_001E",
    "C17002_002E", "C17002_003E", "C17002_004E", "C17002_005E", "C17002_006E", "C17002_007E",
    "B20005_002E",
    "B01001_003E", "B01001_004E", "B01001_005E", "B01001_006E", "B01001_020E", "B01001_021E",
))

# Second variable list for the ACS dataset (GEO_ID is repeated so the results can be matched up)
census_variables2 = ",".join((
    "GEO_ID",
    "B05013_017E", "B05013_018E", "B05013_019E",
    "B01001_017E", "B01001_018E", "B01001_019E", "B01001_016E",
    "B01001_044E", "B01001_045E", "B01001_046E", "B01001_047E", "B01001_048E", "B01001_049E",
    "B02001_002E", "B02001_003E", "B02001_004E", "B02001_005E",
    "B02001_006E", "B02001_007E", "B02001_008E",
    "B25008_003E",
))

# Variable list for the decennial dataset
decennial_variables3 = ",".join((
    "GEO_ID",
    "P5_002N", "P5_003N", "P5_004N", "P5_005N", "P5_008N", "P5_009N", "P5_010N",
))

# The geography clause must be structured correctly. A malformed clause can
# still return data for small requests, yet produce an error once many
# variables are requested (as they are here).
in_geography = "&for=block%20group:*&in=state:15%20county:*%20tract:*"

In [None]:
# Dataset endpoints: the API host followed by the dataset path
census_combined_url = census_base_url + census_dataset_url
decennial_combined_url = census_base_url + decennial_dataset_url

# Full query URLs: endpoint + "?get=<variable list>" + geography clause
cbg_url1 = census_combined_url + "?get=" + census_variables1 + in_geography
cbg_url2 = census_combined_url + "?get=" + census_variables2 + in_geography
cbg_url3 = decennial_combined_url + "?get=" + decennial_variables3 + in_geography

#### Create variables for the Census variable lookup API URLs.  
#### Variable lookup links:  
##### https://api.census.gov/data/2020/dec/pl/variables.html  
##### https://api.census.gov/data/2022/acs/acs5/variables.html  
##### https://api.census.gov/data/2022/acs/acs5/subject/variables.html    
#### Right now we only need these three lookups, but others may be needed if variables from other datasets are added:

In [None]:
# Endpoints that list the variables available in each dataset
census_fields_url1 = census_combined_url + "/variables.json"  # detailed variable lookup
census_fields_url2 = census_combined_url + "/subject/variables.json"  # subject variable lookup with stats
census_fields_url3 = decennial_combined_url + "/variables.json"
# Additional lookups, currently not needed:
#census_fields_url7 = f"{census_combined_url}/profile/variables.json" #data profiles lookup for demographic/econ data
#census_fields_url8 = f"{census_combined_url}/cprofile/variables.json" #comparison profiles lookup for 5yr to 5yr comps
#census_fields_url9 = f"{census_combined_url}/spt/variables.json" #supplemental estimates lookup

# Echo every URL with a short label so each one can be opened and checked by hand.
# Each line ends with "\r" followed by the newline, exactly as six separate prints would emit.
url_checks = (
    ("Click link to test URL for first set of variables: ", cbg_url1),
    ("Click link to test URL for second set of variables: ", cbg_url2),
    ("Click link to test URL for third set of variables (Decennial Census 2020): ", cbg_url3),
    ("Click link to test URL for the field/variable list with descriptions: ", census_fields_url1),
    ("Click link to test URL for the field/variable list with descriptions: ", census_fields_url2),
    ("Click link to test URL for the field/variable list with descriptions: ", census_fields_url3),
)
print("\n".join(label + url + "\r" for label, url in url_checks))

#### Pull multiple variable listings from Census API and store each in a dataframe:

In [None]:
# Download the first variable lookup (census_fields_url1).
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors directly instead of letting .json()
# fail later with a less helpful decode error.
fields_response1 = requests.get(census_fields_url1, timeout=60)
fields_response1.raise_for_status()
fields_dict1 = fields_response1.json()
# The "variables" entry maps each variable name to its attributes; one row per variable.
fields_df1 = pd.DataFrame.from_dict(fields_dict1["variables"], orient="index").reset_index()
fields_df1.head()

In [None]:
# Download the second variable lookup (census_fields_url2).
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors directly instead of letting .json()
# fail later with a less helpful decode error.
fields_response2 = requests.get(census_fields_url2, timeout=60)
fields_response2.raise_for_status()
fields_dict2 = fields_response2.json()
# The "variables" entry maps each variable name to its attributes; one row per variable.
fields_df2 = pd.DataFrame.from_dict(fields_dict2["variables"], orient="index").reset_index()
fields_df2.head()

In [None]:
# Download the third variable lookup (census_fields_url3).
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors directly instead of letting .json()
# fail later with a less helpful decode error.
fields_response3 = requests.get(census_fields_url3, timeout=60)
fields_response3.raise_for_status()
fields_dict3 = fields_response3.json()
# The "variables" entry maps each variable name to its attributes; one row per variable.
fields_df3 = pd.DataFrame.from_dict(fields_dict3["variables"], orient="index").reset_index()
fields_df3.head()

In [None]:
# Stack the three variable lookups into a single table with a fresh 0..n-1 index
fields_frames = [fields_df1, fields_df2, fields_df3]
fields_df = pd.concat(fields_frames, ignore_index=True)
print("Fields aka variables data have been concatenated")

#### Pull CBG level data from Census API and store in dataframe (first set of variables):

In [None]:
# Request the first set of variables.
# The timeout prevents an indefinite hang; raise_for_status() stops here with
# the HTTP error if the API rejects the request, rather than failing in .json().
cbg_response = requests.get(cbg_url1, timeout=60)
cbg_response.raise_for_status()
cbg_data = cbg_response.json()
# The payload is a list of rows whose first row holds the column names.
cbg_df = pd.DataFrame(cbg_data[1:], columns=cbg_data[0])
cbg_df.head()

In [None]:
# Show the HTTP status code returned by the first CBG data request
print(cbg_response.status_code)
# Uncomment to inspect the raw response body:
#print(cbg_response.text)

#### Pull CBG level data from Census API and store in dataframe (second set of variables):

In [None]:
# Request the second set of variables.
# The timeout prevents an indefinite hang; raise_for_status() stops here with
# the HTTP error if the API rejects the request, rather than failing in .json().
cbg_response2 = requests.get(cbg_url2, timeout=60)
cbg_response2.raise_for_status()
cbg_data2 = cbg_response2.json()
# The payload is a list of rows whose first row holds the column names.
cbg_df2 = pd.DataFrame(cbg_data2[1:], columns=cbg_data2[0])
cbg_df2.head()

In [None]:
# Request the third set of variables (decennial dataset).
# The timeout prevents an indefinite hang; raise_for_status() stops here with
# the HTTP error if the API rejects the request, rather than failing in .json().
cbg_response3 = requests.get(cbg_url3, timeout=60)
cbg_response3.raise_for_status()

cbg_data3 = cbg_response3.json()
# The payload is a list of rows whose first row holds the column names.
cbg_df3 = pd.DataFrame(cbg_data3[1:], columns=cbg_data3[0])
cbg_df3.head()

#### Find duplicate fields in the second DF and remove all except GEO_ID which is needed to join
#### Merge tables on GEO_ID and add suffix to duplicated field (should just be GEO_ID):

In [None]:
# Columns present in both frames, other than the GEO_ID join key
# (presumably the geography columns the API adds to every response; verify).
duplicate_columns = [col for col in cbg_df2.columns if col in cbg_df.columns and col != 'GEO_ID']

# Drop them from cbg_df2 so only new columns (plus GEO_ID) are brought over
cbg_df2 = cbg_df2.drop(columns=duplicate_columns)

# Match rows on GEO_ID. DataFrame.join() would align on the row index, i.e. by
# row position, which silently mixes up records if the two responses are not
# in the same order. validate= raises if GEO_ID is not unique on either side.
# The suffix is kept as a safety net should any column still be duplicated.
cbg_df = cbg_df.merge(cbg_df2, on='GEO_ID', how='left', suffixes=('', '_2'), validate='one_to_one')

cbg_df.head()

#### Find duplicate fields in third dataframe and remove all except GEO_ID which is needed to join
#### Merge tables on GEO_ID and add suffix to duplicated field (should just be GEO_ID):

In [None]:
# Columns present in both frames, other than the GEO_ID join key
# (presumably the geography columns the API adds to every response; verify).
duplicate_columns = [col for col in cbg_df3.columns if col in cbg_df.columns and col != 'GEO_ID']

# Drop them from cbg_df3 so only new columns (plus GEO_ID) are brought over
cbg_df3 = cbg_df3.drop(columns=duplicate_columns)

# Match rows on GEO_ID. DataFrame.join() would align on the row index, i.e. by
# row position, which silently mixes up records if the two responses are not
# in the same order, and would add another suffixed copy of GEO_ID.
# validate= raises if GEO_ID is not unique on either side.
# The suffix is kept as a safety net should any column still be duplicated.
cbg_df = cbg_df.merge(cbg_df3, on='GEO_ID', how='left', suffixes=('', '_2'), validate='one_to_one')

cbg_df.head()

#### Create path for output folder and store it as a variable:

In [None]:
# The output folder is an "output" directory inside the current working directory
output_folder_path = os.path.join(os.getcwd(), "output")
print(f"Output folder: {output_folder_path}")

#### Generate table names from the URL segments so that it's clear what year/data is included in the output table:

In [None]:
# Alternative: derive the names from the dataset path so the year/dataset shows up in the file name
#general_table_name = func_clean_table_name(f"{census_dataset_url}")
#fields_table_name = f"{general_table_name}_Variables"
#cbg_table_name = f"{general_table_name}_CBG"

# Fixed base names (without extension) for the two output files
fields_table_name = "census_variables"
cbg_table_name = "census_cbg_data"


#### Generate output folder and use path to store CBG data pull as a CSV file

In [None]:
# Create the output folder if needed (no error when it already exists)
os.makedirs(output_folder_path, exist_ok=True)

# Write the merged CBG table as UTF-8 CSV, without the row index
cbg_csv_path = os.path.join(output_folder_path, cbg_table_name + ".csv")
cbg_df.to_csv(cbg_csv_path, index=False, encoding="utf-8")
print("CSV file created successfully: " + cbg_csv_path)

#### Generate path to store Census variable list as a CSV file:

In [None]:
# Write the combined variable lookup as UTF-8 CSV, without the row index
fields_csv_path = os.path.join(output_folder_path, fields_table_name + ".csv")
fields_df.to_csv(fields_csv_path, index=False, encoding="utf-8")
print("CSV file created successfully: " + fields_csv_path)

In [None]:
# Final confirmation pointing at the folder that holds the output files
print(f"Script complete! Output files are located at: {output_folder_path}")