# Decennial Census 2000 and 2010 data pull & table build for 31CR counties & sub-place-subborough, and U.S.

This notebook pulls population by race and Hispanic origin for residents age 18 and over. It is customized to handle the different variable codes used in the 2000 and 2010 censuses, and to simplify and re-aggregate a select number of variables.

It includes custom functions for pulling data for both years at each geographic level.

Last date updated: 8/31/2021

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
from geo import stco,stco_fips,sub_7,sub_lbl
# Census API key, read from the Census_API environment variable.
# os.environ.get returns None when the variable is unset; the request URLs
# built by the getter functions would then be formatted with "key=None".
myAPI = os.environ.get('Census_API')

### variables to pull and rename

In [3]:
# Census variable codes to request and the output names they are recoded to.
# The table prefix differs by census (P006 for 2000, P011 for 2010) while the
# numeric suffixes and the target names line up one-to-one between the years.
# Suffixes 007, 009 and 010 all map to P_OtherO18 in each rename map.
col_00 = 'NAME,P006001,P006002,P006003,P006005,P006006,P006007,P006008,P006009,P006010,P006011'
col_00_rename = {
    'P006001': 'P_O18',
    'P006002': 'P_HispO18',
    'P006003': 'P_NonHispO18',
    'P006005': 'P_WhiteO18',
    'P006006': 'P_BlackO18',
    'P006008': 'P_AsianO18',
    'P006007': 'P_OtherO18',
    'P006009': 'P_OtherO18',
    'P006010': 'P_OtherO18',
    'P006011': 'P_TwoO18',
}

col_10 = 'NAME,P011001,P011002,P011003,P011005,P011006,P011007,P011008,P011009,P011010,P011011'
col_10_rename = {
    'P011001': 'P_O18',
    'P011002': 'P_HispO18',
    'P011003': 'P_NonHispO18',
    'P011005': 'P_WhiteO18',
    'P011006': 'P_BlackO18',
    'P011008': 'P_AsianO18',
    'P011007': 'P_OtherO18',
    'P011009': 'P_OtherO18',
    'P011010': 'P_OtherO18',
    'P011011': 'P_TwoO18',
}

In [4]:
# Census years to pull, kept as strings because they are used both in the
# API URL and as lookup keys below.
years = ['2000', '2010']
# Per-year lookups: the variable list to request and the rename map to apply.
year_data = dict(zip(years, (col_00, col_10)))
year_recode = dict(zip(years, (col_00_rename, col_10_rename)))

### geo tables for join with NYC and subplace

In [5]:
# Lookup tables for the NYC sub-borough calculations.
# boro holds the county codes (within state 36) that get_nycsub requests
# tract data for.
boro = ['005','047','061','081','085']
# One lookup file per census year; each is tagged with its year as a string
# so the stacked table can be joined on the tract id and year together.
geo_nyc_00 = pd.read_csv('../data/geo/nyc_subbor_00.csv').assign(yr='2000')
geo_nyc_10 = pd.read_csv('../data/geo/nyc_subbor_10.csv').assign(yr='2010')
geo_nyc = pd.concat([geo_nyc_00, geo_nyc_10])

In [6]:
# Lookup tables for the subplace calculations.
# One lookup file per census year; each is tagged with its year as a string
# so the stacked table can be joined on id and year together.
geo_subpl_00 = pd.read_csv('../data/geo/subpl00.csv').assign(yr='2000')
geo_subpl_10 = pd.read_csv('../data/geo/subpl10.csv').assign(yr='2010')
geo_subpl = pd.concat([geo_subpl_00, geo_subpl_10])

## data getters

In [7]:
## census data api pull & table build
def api_pull(url):
    """Read a JSON table and promote its first row to column labels.

    url -- location handed straight to ``pd.read_json``

    Returns the remaining rows as a DataFrame. The header row is removed
    from the data, so the returned index starts at 1.
    """
    raw = pd.read_json(url)
    header = raw.iloc[0]
    records = raw.iloc[1:]
    records.columns = header
    return records

## column cleanup function
def col_clean(df,year,move):
    """Reorder, convert, recode and collapse the columns of one API pull.

    df   -- DataFrame whose columns outside ``move`` hold integer-like values
    year -- key into ``year_recode`` selecting the rename map to apply
    move -- list of label/id column names to place first, in this order

    Returns a new DataFrame with the ``move`` columns first, followed by the
    recoded data columns in sorted name order. Data columns that end up with
    the same name after recoding (i.e., "other race") are summed row by row.
    """
    data_cols = [col for col in df.columns if col not in move]
    # Convert the whole data selection at once; assigning column-by-column
    # into a selection triggers pandas' SettingWithCopyWarning.
    data = df[data_cols].astype(int).rename(columns=year_recode.get(year))
    # groupby(..., axis=1) is deprecated in recent pandas. Grouping the
    # transposed frame by its row labels gives the same per-name sums, and
    # restricting it to the integer columns keeps their integer dtype.
    data = data.T.groupby(level=0).sum().T
    return pd.concat([df[move], data], axis=1)

## county data
def get_co(years):
    """Pull the configured variables for every county, one request per year.

    years -- iterable of census years as strings; each must be a key of
             ``year_data`` and ``year_recode``

    Returns a single DataFrame with all years stacked. Columns are ``stco``
    (state + county code, zero-padded to 5 characters), ``name``, ``yr``,
    the recoded data columns, and ``id`` (a copy of ``stco``).

    Raises ValueError if ``years`` is empty (there is nothing to concatenate).
    """
    # Collect each year's table and concatenate once at the end, instead of
    # repeatedly concatenating onto an empty placeholder frame.
    frames = []
    for year in years:
        cols = year_data.get(year)
        base_url = f'https://api.census.gov/data/{year}/dec/sf1'
        data_url = f'{base_url}?get={cols}&for=county:*&in=state:*&key={myAPI}'
        df = api_pull(data_url)
        df['stco'] = df.state + df.county
        df = df.drop(columns=['state','county'])
        df['yr'] = year
        move = ['stco','NAME','yr'] #move non-variable columns to front
        frames.append(col_clean(df,year,move))
    dff = pd.concat(frames)
    dff['yr'] = dff['yr'].astype(int).astype(str)
    dff = dff.rename(columns={'NAME':'name'})
    # round-trip through int, then restore the leading zeros of the 5-char code
    dff['stco'] = dff.stco.astype(int).astype(str).str.pad(width=5,side='left',fillchar='0')
    dff['id'] = dff['stco']
    return dff

## national data
def get_us(years):
    """Pull the configured variables for the nation, one request per year.

    years -- iterable of census years as strings; each must be a key of
             ``year_data`` and ``year_recode``

    Returns a single DataFrame with all years stacked. Columns are ``name``,
    ``yr``, the recoded data columns, and ``id`` (always the string 'us').

    Raises ValueError if ``years`` is empty (there is nothing to concatenate).
    """
    # Collect each year's table and concatenate once at the end, instead of
    # repeatedly concatenating onto an empty placeholder frame.
    frames = []
    for year in years:
        cols = year_data.get(year)
        base_url = f'https://api.census.gov/data/{year}/dec/sf1'
        data_url = f'{base_url}?get={cols}&for=us:*&key={myAPI}'
        df = api_pull(data_url)
        df['yr'] = year
        df = df.drop(columns='us')
        move = ['NAME','yr']
        frames.append(col_clean(df,year,move))
    dff = pd.concat(frames)
    dff['yr'] = dff['yr'].astype(int).astype(str)
    dff['id'] = 'us'
    dff = dff.rename(columns={'NAME':'name'})
    return dff

## nyc tracts converts to sub-borough geography
def get_nycsub(years):
    """Pull tract data for the ``boro`` counties and roll it up via ``geo_nyc``.

    years -- iterable of census years as strings; each must be a key of
             ``year_data`` and ``year_recode``

    Tract rows are joined to ``geo_nyc`` on ``ct_id`` and ``yr``. ``geo_nyc``
    must supply the ``id`` and ``name`` columns that are grouped on, plus the
    ``nta_id``, ``nta_nm``, ``puma`` and ``boro`` columns that are dropped.
    Data columns are then summed per (stco, id, name, yr).

    Returns a DataFrame with ``stco``, ``id``, ``name``, ``yr``, the summed
    data columns, and ``subreg`` (``stco`` mapped through ``sub_7``).

    Raises ValueError if no tables were pulled (empty ``years`` or ``boro``).
    """
    # Collect each county/year table and concatenate once at the end, instead
    # of repeatedly concatenating onto an empty placeholder frame.
    frames = []
    for year in years:
        cols = year_data.get(year)
        base_url = f'https://api.census.gov/data/{year}/dec/sf1'
        for b in boro:
            data_url = f'{base_url}?get={cols}&for=tract:*&in=state:36%20county:{b}&key={myAPI}'
            df = api_pull(data_url)
            df['yr'] = year
            # right-pad tract codes to 6 characters before building the id
            df['tract'] = df.tract.str.pad(width=6,side='right',fillchar='0')
            df['ct_id'] = df.state+df.county+df.tract
            df = df.drop(columns=['state','county','tract','NAME'])
            move = ['ct_id','yr']
            df = col_clean(df,year,move)
            # the join below uses an integer ct_id
            df['ct_id'] = df['ct_id'].astype(int)
            frames.append(df)
    dff = pd.concat(frames)
    dff['yr'] = dff['yr'].astype(int).astype(str) #clean up year column
    dff = pd.merge(dff,geo_nyc,left_on=['ct_id','yr'],right_on=['ct_id','yr'],how='left')
    # first five digits of the tract id are the state + county code
    dff['stco'] = dff['ct_id'].astype(str).str[:5]
    dff = dff.drop(columns=['nta_id','nta_nm','puma','ct_id','boro'])
    dff = dff.groupby(['stco','id','name','yr']).sum().reset_index()
    dff['subreg'] = dff.stco.map(sub_7)
    return dff

## county subdivisions and places (long island only)
def get_subpl(years):
    """Pull county-subdivision and place data and roll it up via ``geo_subpl``.

    years -- iterable of census years as strings; each must be a key of
             ``year_data`` and ``year_recode``

    For each year, county subdivisions are requested for every state/county
    pair in ``stco`` and places are requested for state 36. All rows are
    joined to ``geo_subpl`` on ``id`` and ``yr``; ``geo_subpl`` must supply
    ``geoid``, ``nm``, ``name`` and ``stco``. Data columns are then summed
    per (stco, geoid, name, yr). Rows without a match in ``geo_subpl`` have
    missing group keys and are excluded by the groupby.

    Returns a DataFrame with ``stco`` and ``id`` (the geoid) zero-padded to
    5 characters, ``name``, ``yr``, the summed data columns, and ``subreg``
    (``stco`` mapped through ``sub_7``).

    Raises ValueError if ``years`` is empty (there is nothing to concatenate).
    """
    # Defined up front so the place pull does not depend on the subdivision
    # loop having run at least once.
    move = ['id','yr']
    # Every pulled table is added exactly once. Re-concatenating a running
    # subdivision table on each year would repeat earlier years' rows and
    # double-count them in the groupby sum below.
    frames = []
    for year in years:
        cols = year_data.get(year)
        base_url = f'https://api.census.gov/data/{year}/dec/sf1'
        #subdivision table
        for st,co in stco.items():
            for c in co:
                data_url = f'{base_url}?get={cols}&for=county%20subdivision:*&in=state:{st}&in=county:{c}&key={myAPI}'
                df = api_pull(data_url)
                df['yr'] = year
                df['id'] = df.state+df.county+df['county subdivision']
                df = df.drop(columns=['state','county','county subdivision','NAME'])
                df = col_clean(df,year,move)
                df['id'] = df['id'].astype(int)
                frames.append(df)
        #place table
        pl_url = f'{base_url}?get={cols}&for=place:*&in=state:36&key={myAPI}'
        nyp = api_pull(pl_url)
        nyp['yr'] = year
        nyp['id'] = (nyp.state+nyp.place).astype(int)
        nyp = nyp.drop(columns=['state','place','NAME'])
        frames.append(col_clean(nyp,year,move))
    subpl = pd.concat(frames)
    subpl['yr'] = subpl['yr'].astype(int).astype(str) #clean up year column
    subpl = pd.merge(subpl,geo_subpl,left_on=['id','yr'],right_on=['id','yr'],how='left')
    # swap the raw census id for the lookup's geoid, which becomes the new id
    subpl = subpl.drop(columns=['id','nm']).rename(columns={'geoid':'id'})
    subpl = subpl.groupby(['stco','id','name','yr']).sum().reset_index()
    subpl['stco'] = subpl['stco'].astype(int).astype(str).str.pad(width=5,side='left',fillchar='0')
    subpl['id'] = subpl['id'].astype(int).astype(str).str.pad(width=5,side='left',fillchar='0')
    subpl['subreg'] = subpl.stco.map(sub_7)
    return subpl

## build master table

In [8]:
# All U.S. counties, saved in full before any regional filtering.
co_us = get_co(years)
co_us.to_csv('output/2000_2010/pop_race_hou_co_us.csv')

# Nation
us = get_us(years).assign(geotype='nation')

# 31CR counties: the subset of counties whose code is listed in stco_fips
reg = co_us.loc[co_us['stco'].isin(stco_fips)].assign(
    geotype='county',
    subreg=lambda frame: frame['stco'].map(sub_7),
)

# Subregions: county rows summed per subregion and year
subreg = (
    reg.drop(columns=['stco','name'])
       .groupby(by=['subreg','yr'],dropna=False)
       .sum()
       .reset_index()
)
subreg['id'] = subreg['subreg']
subreg['name'] = subreg['id'].map(sub_lbl)
subreg['geotype'] = 'subregion'

# Subplaces & NYC sub-boroughs, both labelled as municipalities
subpl = get_subpl(years).assign(geotype='municipality')
nyc = get_nycsub(years).assign(geotype='municipality')

In [9]:
# Stack every geography into one master table, then put the id column first
# and leave the remaining columns in their existing order.
master = pd.concat([reg, subreg, us, subpl, nyc])
lead_cols = ['id']
trailing_cols = [col for col in master.columns if col not in lead_cols]
master = master[lead_cols + trailing_cols]

In [10]:
# Save the combined table for all geographies. to_csv writes the DataFrame
# index as the first column by default.
master.to_csv('output/2000_2010/raceO18_allgeos.csv')

#### pivoted table for the two years

In [11]:
# Value columns are chosen by position: everything after the first four
# columns, excluding the last one.
# NOTE(review): this depends on master's column order; confirm that the slice
# covers exactly the intended columns (it may also pick up a label column).
val_cols = master.columns[4:-1]
# One row per geography, with each value column spread across the years.
# aggfunc is the string 'sum' because passing np.sum is deprecated in recent
# pandas; the string keeps the same aggregation.
# NOTE(review): rows with a missing stco or subreg (e.g. the national row,
# which has neither column) may be dropped by the pivot - confirm.
master_piv = pd.pivot_table(
    master,
    values=val_cols,
    index=['id','stco','subreg','name'],
    columns='yr',
    aggfunc='sum',
    fill_value=0,
    margins=False,
).reset_index()

In [12]:
# Export the pivoted (one row per geography, years side by side) table to Excel.
master_piv.to_excel('output/2000_2010/raceO18_piv_allgeos.xlsx')