In [11]:
# Import dependencies
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from urllib.request import urlopen
from io import StringIO

# Initiate Variables

In [12]:
##### Initiate Global Variables
### API key for census website

# api key --> somewhere in census website, email & they send an api key
# don't put api keys on github
# webscrapers out there, public thing, expose api key
# code is along the lines of --> MY_API_KEY = os.environ.get("MY_API_KEY")
MY_API_KEY = "xxx"

# go into conda environment, create an environment variable -- myapikey = long string of text
### Timeframes
CURRENT_YEAR = "2020"

### Geographic codes
# States
TEXAS = "48"
# Counties
HARRIS = "201"
FORTBEND = "157"
# Neighborhoods
ALL = "*"


In [13]:
# zipcodes

# Get lists of zip codes
# Harris County
HarrisZips = pd.read_csv('/Users/emmali/Desktop/Community_Bridges/BakerRipley/Census_Data_Mapping/CensusMapping/definitions/HarrisZips.csv')
HarrisZips['zipCode'] = HarrisZips['zipCode'].astype('str')
HarrisZips = ",".join(HarrisZips['zipCode'])

# Ft Bend County
FtBendZips = pd.read_csv('/Users/emmali/Desktop/Community_Bridges/BakerRipley/Census_Data_Mapping/CensusMapping/definitions/FtBendZips.csv')
FtBendZips['zipCode'] = FtBendZips['zipCode'].astype('str')
FtBendZips = ",".join(FtBendZips['zipCode'])

# Define Functions

In [14]:
def buildTableCodes(variables_toquery, entireTable):
    # If you are querying an entire table then mark entireTable=True
    # If you are querying at the variable level then mark entireTable=False
    # Define dictionary of tableCodes and tableTypes 
    tableDict = {
        "D":"/profile",
        "S":"/subject",
        "B":""
    }
    # Find the tableTypes the user is requesting
    tableList = set([variables_toquery[i][0] for i in range(len(variables_toquery))])

    # If the query is for specific variables, generate the dictionary this way:
    if entireTable:
        # Find the variables that fall under each tableCode
        tableCodes = {
            x:{
                # Get the tableType from the tableDict
                "tableType":tableDict[x], 
                # Join the tableVariables together in a string
                "tableVariables":{
                    # Get the list of tableVariables for that tableType
                    ("group(" + variables_toquery[i] + ")") for i in range(len(variables_toquery)) if variables_toquery[i][0]==x
                }
            } for x in tableList
        }

    # If the query is for specific variables, generate the dictionary this way:
    else:
        # Find the variables that fall under each tableCode
        tableCodes = {
            x:{
                # Get the tableType from the tableDict
                "tableType":tableDict[x], 
                # Join the tableVariables together in a string
                "tableVariables":"GEO_ID," + "NAME," + ",".join({
                    # Get the list of tableVariables for that tableType
                    variables_toquery[i] for i in range(len(variables_toquery)) if variables_toquery[i][0]==x
                })
            } for x in tableList
        }

    return tableCodes

In [5]:
def parseCensus(url):
    # Use BeautifulSoup to parse html and find the document text
    page = urlopen(url)
    html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    censusData = soup.get_text()
    
    # Clean data by removing brackets
    censusData = censusData.replace("[", "")
    censusData = censusData.replace("]", "")

    # Create dataframe from string data
    censusData = pd.read_csv(StringIO(censusData), sep=",")
    return censusData

In [6]:
# we're gonna use tracts, zipcodes don't align directly with neighborhood definitions, are larger

def fetchCensusTracts(year, state, county, tractList, variables_toquery, entireTable, apiKey):
    # If you are querying an entire table then mark entireTable=True
    # If you are querying at the variable level then mark entireTable=False
    # Create empty df to hold the data
    censusData = pd.DataFrame(columns=['GEO_ID', 'NAME', 'state', 'county', 'tract'])
    # Generate dictionary of table codes, table types, and census variables
    tableCodes = buildTableCodes(variables_toquery, entireTable)
    # Check if the query is for full tables
    if entireTable:
        for code in tableCodes.values():
            # Find the tableType and tableVariable for that type of table
            tableType = code["tableType"]
            # Loop over the tables because the census API will not let you query multiple tables
            for tableVariables in code["tableVariables"]:
                # Build a url specific to each tableType
                url = f'https://api.census.gov/data/{year}/acs/acs5{tableType}?key={apiKey}&in=state:{state}%20county:{county}&for=tract:{tractList}&get={tableVariables}'
                # Parse the data returned from the census API
                tableData = parseCensus(url)    
                # Drop final column which is empty
                tableData = tableData.drop(tableData.columns[-1], axis=1)
                # Merge each iteration of tableData into the full censusData
                censusData = censusData.merge(tableData, how="outer", on=['GEO_ID', 'NAME', 'state', 'county', 'tract'])
    # If the query is for individual variables
    else:
        # Loop over the types of tables
        for code in tableCodes.values():
            # Find the tableType and tableVariable for that type of table
            tableType = code["tableType"]
            tableVariables = code["tableVariables"]
            # Build a url specific to each tableType
            url = f'https://api.census.gov/data/{year}/acs/acs5{tableType}?key={apiKey}&in=state:{state}%20county:{county}&for=tract:{tractList}&get={tableVariables}'
            # Parse the data returned from the census API
            tableData = parseCensus(url)    
            # Drop final column which is empty
            tableData = tableData.drop(tableData.columns[-1], axis=1)
            # Merge each iteration of tableData into the full censusData
            censusData = censusData.merge(tableData, how="outer", on=['GEO_ID', 'NAME', 'state', 'county', 'tract'])
    # Reorder the columns to make it easier to read
    censusData.insert(0, 'tract', censusData.pop('tract'))
    censusData.insert(0, 'county', censusData.pop('county'))
    censusData.insert(0, 'state', censusData.pop('state'))
    censusData.insert(0, 'NAME', censusData.pop('NAME'))
    censusData.insert(0, 'GEO_ID', censusData.pop('GEO_ID'))

    # Clean the GEO_ID column so it matches census shapefiles
    censusData['GEO_ID'] = censusData['GEO_ID'].astype('string')
    censusData['GEO_ID'] = censusData['GEO_ID'].str.replace("1400000US", "")
    return censusData

In [7]:
def fetchCensusZips(year, zipList, variables_toquery, entireTable, apiKey):
    # If you are querying an entire table then mark entireTable=True
    # If you are querying at the variable level then mark entireTable=False
    # Create empty df to hold the data
    censusData = pd.DataFrame(columns=['GEO_ID', 'NAME', 'zip code tabulation area'])
    # Generate dictionary of table codes, table types, and census variables
    tableCodes = buildTableCodes(variables_toquery, entireTable)
    # Check if the query is for full tables
    if entireTable:
        for code in tableCodes.values():
            # Find the tableType and tableVariable for that type of table
            tableType = code["tableType"]
            # Loop over the tables because the census API will not let you query multiple tables
            for tableVariables in code["tableVariables"]:
                # Build a url specific to each tableType
                url = f'https://api.census.gov/data/{year}/acs/acs5{tableType}?key={apiKey}&for=zip%20code%20tabulation%20area:{zipList}&get={tableVariables}'
                # Parse the data returned from the census API
                tableData = parseCensus(url)    
                # Drop final column which is empty
                tableData = tableData.drop(tableData.columns[-1], axis=1)
                # Merge each iteration of tableData into the full censusData
                censusData = censusData.merge(tableData, how="outer", on=['GEO_ID', 'NAME', 'zip code tabulation area'])
    # If the query is for individual variables
    else:
        # Loop over the types of tables
        for code in tableCodes.values():
            # Find the tableType and tableVariable for that type of table
            tableType = code["tableType"]
            tableVariables = code["tableVariables"]
            # Build a url specific to each tableType
            url = f'https://api.census.gov/data/{year}/acs/acs5{tableType}?key={apiKey}&for=zip%20code%20tabulation%20area:{zipList}&get={tableVariables}'
            # Parse the data returned from the census API
            tableData = parseCensus(url)    
            # Drop final column which is empty
            tableData = tableData.drop(tableData.columns[-1], axis=1)
            # Merge each iteration of tableData into the full censusData
            censusData = censusData.merge(tableData, how="outer", on=['GEO_ID', 'NAME', 'zip code tabulation area'])
    # Reorder the columns to make it easier to read
    censusData.insert(0, 'zip code tabulation area', censusData.pop('zip code tabulation area'))
    censusData.insert(0, 'NAME', censusData.pop('NAME'))
    censusData.insert(0, 'GEO_ID', censusData.pop('GEO_ID'))

    # Clean the GEO_ID column so it matches census shapefiles
    censusData['GEO_ID'] = censusData['GEO_ID'].astype('string')
    censusData['GEO_ID'] = censusData['GEO_ID'].str.replace("8600000US", "")

    # Limit to texas zip codes
    censusData = censusData.loc[(censusData['zip code tabulation area']>=77000) & (censusData['zip code tabulation area']<78000)]
    return censusData

# Test functions

In [8]:
### Select data to query
apiKey = MY_API_KEY
# Select year
year = CURRENT_YEAR
# Select geography
state = TEXAS
county = FORTBEND
zipList = FtBendZips
# Select census variables
entireTable = False
variables_toquery = [
    'DP05_0005E','DP05_0001E','DP03_0062E',
    'S1701_C02_001E','S1701_C01_001E',
    'B14006_004E','B14006_005E','B14006_006E',
    'B17020_001E','B17020_003E', 'B17020_004E'
]

DataRequest_ECE_Zips = fetchCensusZips(year, zipList, variables_toquery, entireTable, apiKey)
DataRequest_ECE_Zips.head()

ParserError: Error tokenizing data. C error: Expected 1 fields in line 9, saw 3


In [None]:
### Select data to query
apiKey = MY_API_KEY
# Select year
year = CURRENT_YEAR
# Select geography
state = TEXAS
county = FORTBEND
tractList = ALL
# Select census variables
entireTable = False
variables_toquery = [
    'DP05_0005E','DP05_0001E','DP03_0062E',
    'S1701_C02_001E','S1701_C01_001E',
    'B14006_004E','B14006_005E','B14006_006E',
    'B17020_001E','B17020_003E', 'B17020_004E'
]

DataRequest_ECE_Tracts = fetchCensusTracts(year, state, county, tractList, variables_toquery, entireTable, apiKey)
DataRequest_ECE_Tracts.head()

In [37]:
### Select data to query
apiKey = MY_API_KEY
# Select year
year = CURRENT_YEAR
# Select geography
state = TEXAS
county = FORTBEND
tractList = ALL
# Select census variables
entireTable = True
variables_toquery = [
    'DP05', 'DP03'
]

fullTables = fetchCensusTracts(year, state, county, tractList, variables_toquery, entireTable, apiKey)
fullTables.head()

ParserError: Error tokenizing data. C error: Expected 1 fields in line 9, saw 3


# Print to csv

In [38]:
DataRequest_ECE.to_csv("data/Output.csv")

NameError: name 'DataRequest_ECE' is not defined