# US Census Data



In [1]:
# Dependencies
import pandas as pd
import requests
from pathlib import Path
from census import Census

In [2]:
census_key = '921e59d6bcaa21630b4f53e74b7a522a3502b8cb'

# Define function to retrieve US census data for a specific year and return a dataframe
def get_census_data(year):
    census_library = Census(census_key, year=year)
    # state_code = states.CA.fips
    variables = [
        'NAME',
        'B01003_001E',   # Population
        'B02001_002E',   # White
        'B02001_003E',   # Black or African American
        'B02001_004E',   # American Indian and Alaska Native
        'B02001_005E',   # Asian
        'B02001_006E',   # Native Hawaiian and Other Pacific Islander
        'B02001_008E'    # Two or more races        
        #'B02001_007E'   # Some other race (this category doesn't exist in FBI hate crime data)
    ]
    
    # Retrieve data for all states
    data = census_library.acs5.state(variables, Census.ALL)

    # Convert to dataframe
    df = pd.DataFrame(data)
    df['year'] = year

    # Return the dataframe
    return df

In [3]:
# Retrieve census data for multiple years and merge into one dataframe
first_year = 2009
last_year = 2021

# Code reference: https://blog.finxter.com/how-to-create-a-python-list-of-size-n/
year_df = [None] * (last_year - first_year + 1)
for year in range(first_year, last_year + 1):
    print(f'Getting data for {year}')
    year_df[year - first_year] = get_census_data(year)

# Code reference: https://www.geeksforgeeks.org/merge-two-dataframes-with-same-column-names/
census_df = pd.concat(year_df, axis=0)

Getting data for 2009
Getting data for 2010
Getting data for 2011
Getting data for 2012
Getting data for 2013
Getting data for 2014
Getting data for 2015
Getting data for 2016
Getting data for 2017
Getting data for 2018
Getting data for 2019
Getting data for 2020
Getting data for 2021


In [4]:
print(census_df.dtypes)
census_df

NAME            object
B01003_001E    float64
B02001_002E    float64
B02001_003E    float64
B02001_004E    float64
B02001_005E    float64
B02001_006E    float64
B02001_008E    float64
state           object
year             int64
dtype: object


Unnamed: 0,NAME,B01003_001E,B02001_002E,B02001_003E,B02001_004E,B02001_005E,B02001_006E,B02001_008E,state,year
0,Alaska,683142.0,467650.0,25161.0,91939.0,31878.0,4269.0,50632.0,02,2009
1,Alabama,4633360.0,3256941.0,1209938.0,22969.0,46655.0,2025.0,56490.0,01,2009
2,Arkansas,2838143.0,2228798.0,439355.0,19233.0,31120.0,2505.0,49997.0,05,2009
3,Arizona,6324865.0,4906936.0,227282.0,284265.0,153301.0,11045.0,164255.0,04,2009
4,California,36308527.0,22258042.0,2249404.0,283031.0,4473292.0,132535.0,1272989.0,06,2009
...,...,...,...,...,...,...,...,...,...,...
47,Washington,7617364.0,5465011.0,293401.0,90789.0,682711.0,50902.0,655594.0,53,2021
48,West Virginia,1801049.0,1658405.0,61143.0,1911.0,13882.0,683.0,55463.0,54,2021
49,Wisconsin,5871661.0,4894019.0,372439.0,45831.0,166670.0,2862.0,259477.0,55,2021
50,Wyoming,576641.0,511179.0,5242.0,12987.0,5046.0,433.0,28437.0,56,2021


In [7]:
# Create empty dataframe
census_new_df = pd.DataFrame(columns=['year', 'state', 'race_id', 'population'])

# loop through rows creating a dataframe with separate rows for each ethnicity
for row in range(0, len(census_df)):
    state = [census_df.iloc[row, 0]] * 7
    year = [census_df.iloc[row, 9]] * 7
    race_id = [-1, 0, 1, 2, 3, 4, 5] # These are the indexes for the race table (see data_engineering notebook)
    population = [census_df.iloc[row, col] for col in range(1, 8)]
    df = pd.DataFrame({'year': year, 'state': state, 'race_id': race_id, 'population': population})
    # Add row dataframe to new dataframe
    census_new_df = pd.concat([census_new_df, df])

census_new_df

Unnamed: 0,year,state,race_id,population
0,2009,Alaska,-1,683142.0
1,2009,Alaska,0,467650.0
2,2009,Alaska,1,25161.0
3,2009,Alaska,2,91939.0
4,2009,Alaska,3,31878.0
...,...,...,...,...
2,2021,Puerto Rico,1,329651.0
3,2021,Puerto Rico,2,5407.0
4,2021,Puerto Rico,3,6263.0
5,2021,Puerto Rico,4,245.0


## Replace state column with state abbreviation

In [34]:
# Path to states csv
path = Path('data/states.csv')

# Load hate crime data
states_df = pd.read_csv(path)
states_df.head()

Unnamed: 0,state_abbr,state,division,region
0,AK,Alaska,Pacific,West
1,AL,Alabama,East South Central,South
2,AR,Arkansas,West South Central,South
3,AZ,Arizona,Mountain,West
4,CA,California,Pacific,West


In [35]:
# Remove unneeded columns before merging with census data
states_df = states_df.drop(columns=['division', 'region'])

# Remove FS (Federal) and GM (Guam) from the table given these are not in census data
# Code Reference: https://www.statology.org/pandas-filter-by-column-value-not-equal/
state_abbr_df = state_abbr_df[~state_abbr_df['state_abbr'].isin(['FS', 'GM'])]
state_abbr_df.head()

Unnamed: 0,state_abbr,state
0,AK,Alaska
1,AL,Alabama
2,AR,Arkansas
3,AZ,Arizona
4,CA,California


In [38]:
# Add state abbreviation to census data using merge
census_merged_df = pd.merge(census_new_df, state_abbr_df, how='right', on='state')

# Remove state column since we only need state_abbr
census_merged_df.drop(columns=['state'], inplace=True)

# Convert population from a float to an integer
# Code Reference: https://sparkbyexamples.com/pandas/pandas-convert-float-to-integer-type/
census_merged_df['population'] = census_merged_df['population'].fillna(0).astype(int)

# Reorder columns and name indixe column
census_final_df = census_merged_df[['year', 'state_abbr', 'race_id', 'population']]
census_final_df.index.name = 'id'

census_final_df

Unnamed: 0_level_0,year,state_abbr,race_id,population
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2009,AK,-1,683142
1,2009,AK,0,467650
2,2009,AK,1,25161
3,2009,AK,2,91939
4,2009,AK,3,31878
...,...,...,...,...
4636,2021,WY,1,5242
4637,2021,WY,2,12987
4638,2021,WY,3,5046
4639,2021,WY,4,433


In [40]:
# Export dataframe to csv 
census_final_df.to_csv('data/census_data.csv')

# Confirm that export completed
print('Dataframe exported to csv')

Dataframe exported to csv
