### downloading census data for comparison with synth pop outputs

In [1]:
%load_ext autoreload
%autoreload 2
from synthpop.census_helpers import Census
from synthpop import categorizer as cat
import pandas as pd
import numpy as np
import os
pd.set_option('display.max_columns', 500)

This notebook is being used to process ACS data obtained from the ACS API. 

First some notes about the structure of how synthpop works. 
It has a few dependencies: 
- **census**, a python library that is a wrapper for the U.S. Census Bureau API
- **pandas** a python library used to create and work with dataframes (like tables)
- **numpy**
- **os** which allows you to set up a development environment (not actually entirely necessary)



**Step 1:**  
Set API key. (If you don't already have you can get one [here](https://api.census.gov/data/key_signup.html))

In [2]:
# Dare's Census API key
c = Census("d95e144b39e17f929287714b0b8ba9768cecdc9f")

**Step 2:**  
Set up state and county variables to use for the state/county you are working with

In [3]:
# set state to North Carolina
stateFips = "37"
# set county to Mecklenburg
countyFips = "119"

**Step 3:**  
Define household variables of interest

In [4]:
income_columns = ['B19001_0%02dE'%i for i in range(1, 18)]
income_columns_moe = ['B19001_0%02dM'%i for i in range(1, 18)]
vehicle_columns = ['B08201_0%02dE'%i for i in range(1, 7)]
vehicle_columns_moe = ['B08201_0%02dM'%i for i in range(1, 7)]

workers_columns = ['B08202_0%02dE'%i for i in range(1, 6)]
families_columns = ['B11001_001E', 'B11001_002E']
families_columns_moe = ['B11001_001M', 'B11001_002M']
# traveltimesex_columns = ['B08013_001E','B08013_002E','B08013_003E']
census_col = families_columns + families_columns_moe+income_columns + income_columns_moe + vehicle_columns + vehicle_columns_moe
h_acs = c.tract_query(census_col, stateFips, countyFips)


In [6]:
##Note: it is also possible to do this at the block and tract level. See example below. 
## We are not using this
income_columns = ['B19001_0%02dE'%i for i in range(1, 18)]
vehicle_columns = ['B08201_0%02dE'%i for i in range(1, 7)]
workers_columns = ['B08202_0%02dE'%i for i in range(1, 6)]
families_columns = ['B11001_001E', 'B11001_002E']
# traveltimesex_columns = ['B08013_001E','B08013_002E','B08013_003E']
block_group_columns = income_columns + families_columns
tract_columns = vehicle_columns + workers_columns 
# + traveltimesex_columns
#function for getting block and tract level data. merges to the block level
h_acs_block = c.block_group_and_tract_query(block_group_columns,
                tract_columns, stateFips, countyFips, 
                merge_columns=['tract', 'county', 'state'],
                block_group_size_attr="B11001_001E",
                tract_size_attr="B08201_001E")

**Step 4:**  
Define person-level variables of interest

In [7]:
population = ['B01001_001E']
# margin of error = _moe in all columns
population_moe = ['B01001_001M']
sex = ['B01001_002E', 'B01001_026E']
sex_moe = ['B01001_002M', 'B01001_026M']
race = ['B02001_0%02dE'%i for i in range(1,11)]
race_moe = ['B02001_0%02dM'%i for i in range(1,11)]                                     
male_age_columns = ['B01001_0%02dE'%i for i in range(3,26)]
male_age_columns_moe = ['B01001_0%02dM'%i for i in range(3,26)]
female_age_columns = ['B01001_0%02dE'%i for i in range(27,50)]
female_age_columns_moe = ['B01001_0%02dM'%i for i in range(27,50)]
hispanic = ['B03003_002E', 'B03003_003E']
hispanic_moe = ['B03003_002M', 'B03003_003M']

                                             
all_columns = population + sex + race + male_age_columns + hispanic+ hispanic_moe+ female_age_columns+population_moe+sex_moe+race_moe+male_age_columns_moe+female_age_columns_moe
p_acs_tract = c.tract_query(all_columns, stateFips, countyFips)


In [24]:
##calculate standard error for person level columns
margin_error = [population_moe, sex_moe, race_moe, male_age_columns_moe,female_age_columns_moe]


0.21888412017167383

0.21888412017167383

**Step 5:**  
Categorize the ACS household data into categories we can read

This step includes calculating aggregate errors for the ACS variables we are using. 

    This was calculated using 
    SE1 + SE2 + ...+ SEN

    Where SE = Standard Error = Margin of Error / Z
    And Z is defined by ACS as 1.645

    See [this](https://www2.census.gov/programs-surveys/acs/tech_docs/statistical_testing/2015StatisticalTesting5year.pdf) guide from ACS for further info. 

In [20]:
h_acs_cat = cat.categorize(h_acs, {
    ("households", "total"): "B11001_001E",
    ("income", "lt35"): "B19001_002E + B19001_003E + B19001_004E + "
                        "B19001_005E + B19001_006E + B19001_007E",
    ("income", "lt35_me"): "(B19001_002M + B19001_003M + B19001_004M +"
                        "B19001_005M + B19001_006M + B19001_007M)/1.645",
    ("income", "gt35-lt100"): "B19001_008E + B19001_009E + "
                        "B19001_010E + B19001_011E + B19001_012E"
                        "+ B19001_013E",
    ("income", "gt35-lt100_me"): "(B19001_008M + B19001_009M + "
                        "B19001_010M + B19001_011M + B19001_012M"
                        "+ B19001_013M)/1.645",
    ("income", "gt100"): "B19001_014E + B19001_015E + B19001_016E"
                        "+ B19001_017E",
    ("income", "gt100_me"): "(B19001_014M + B19001_015M + B19001_016M"
                        "+ B19001_017M)/1.645",
    ("cars", "none"): "B08201_002E",
    ("cars", "none_me"): "B08201_002M",
    ("cars", "one"): "B08201_003E",
    ("cars", "one_me"): "B08201_003M",
    ("cars", "two or more"): "B08201_004E + B08201_005E + B08201_006E",
    ("cars", "two or more_me"): "(B08201_004M + B08201_005M + B08201_006M)/1.645",
#     ("workers", "none"): "B08202_002E",
#     ("workers", "one"): "B08202_003E",
#     ("workers", "two or more"): "B08202_004E + B08202_005E",
#     ("traveltime","total"):"B08013_001E",
#     ("traveltime","male"):"B08013_002E",
#     ("traveltime","female"):"B08013_003E"
}, index_cols=['tract'])

h_acs_cat.columns = h_acs_cat.columns.droplevel()

In [21]:
h_acs_cat.head()

cat_value,none,none_me,one,one_me,two or more,two or more_me,total,gt100,gt100_me,gt35-lt100,gt35-lt100_me,lt35,lt35_me
tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
100,426,180,1327,229,1072,181.762918,2825,1476,329.483283,867,291.793313,482,263.221884
300,155,40,206,37,120,31.610942,481,152,55.319149,93,48.632219,236,75.987842
400,278,103,1014,161,472,97.87234,1764,650,187.841945,794,230.395137,320,150.759878
500,441,156,1271,214,783,144.680851,2495,944,353.799392,921,274.772036,630,231.00304
600,235,79,832,118,502,98.480243,1569,379,135.56231,644,199.392097,546,193.920973


**Step 8:**  
Categorize the ACS person data into categories we can read


This step includes calculating aggregate errors for the ACS variables we are using. 

    This was calculated using 
    SE1 + SE2 + ...+ SEN

    Where SE = Standard Error = Margin of Error / Z
    And Z is defined by ACS as 1.645

    See [this](https://www2.census.gov/programs-surveys/acs/tech_docs/statistical_testing/2015StatisticalTesting5year.pdf) guide from ACS for further info. 

In [10]:
p_acs_cat = cat.categorize(p_acs_tract, {
    ("population", "total"): "B01001_001E",
    ("population", "total_me"): "B01001_001M",
    ("age", "19 and under"): "B01001_003E + B01001_004E + B01001_005E + "
                             "B01001_006E + B01001_007E + B01001_027E + "
                             "B01001_028E + B01001_029E + B01001_030E + "
                             "B01001_031E",
    ("age", "19 and under_me"): "(B01001_003M + B01001_004M + B01001_005M + "
                             "B01001_006M + B01001_007M + B01001_027M + "
                             "B01001_028M + B01001_029M + B01001_030M + "
                             "B01001_031M)/1.645",
    ("age", "20 to 35"): "B01001_008E + B01001_009E + B01001_010E + "
                         "B01001_011E + B01001_012E + B01001_032E + "
                         "B01001_033E + B01001_034E + B01001_035E + "
                         "B01001_036E",
    ("age", "20 to 35_me"): "(B01001_008M + B01001_009M + B01001_010M + "
                         "B01001_011M + B01001_012M + B01001_032M + "
                         "B01001_033M + B01001_034M + B01001_035M + "
                         "B01001_036M)/1.645",
    ("age", "35 to 60"): "B01001_013E + B01001_014E + B01001_015E + "
                         "B01001_016E + B01001_017E + B01001_037E + "
                         "B01001_038E + B01001_039E + B01001_040E + "
                         "B01001_041E",
    ("age", "35 to 60_me"): "(B01001_013M + B01001_014M + B01001_015M + "
                         "B01001_016M + B01001_017M + B01001_037M + "
                         "B01001_038M + B01001_039M + B01001_040M + "
                         "B01001_041M)/1.645",
    ("age", "above 60"): "B01001_018E + B01001_019E + B01001_020E + "
                         "B01001_021E + B01001_022E + B01001_023E + "
                         "B01001_024E + B01001_025E + B01001_042E + "
                         "B01001_043E + B01001_044E + B01001_045E + "
                         "B01001_046E + B01001_047E + B01001_048E + "
                         "B01001_049E", 
    ("age", "above 60_me"): "(B01001_018M + B01001_019M + B01001_020M + "
                         "B01001_021M + B01001_022M + B01001_023M + "
                         "B01001_024M + B01001_025M + B01001_042M + "
                         "B01001_043M + B01001_044M + B01001_045M + "
                         "B01001_046M + B01001_047M + B01001_048M + "
                         "B01001_049M)/1.645", 
    ("race", "white"):   "B02001_002E",
    ("race", "white_me"):   "B02001_002M",
    ("race", "black"):   "B02001_003E",
    ("race", "black_me"):   "B02001_003M",
    ("race", "asian"):   "B02001_005E",
    ("race", "asian_me"):   "B02001_005M",
    ("race", "other"):   "B02001_004E + B02001_006E + B02001_007E + "
                         "B02001_008E",
    ("race", "other_me"):   "(B02001_004M + B02001_006M + B02001_007M + "
                         "B02001_008M)/1.645",
    ("sex", "male"):     "B01001_002E",
    ("sex", "male_me"):     "B01001_002M",
    ("sex", "female"):   "B01001_026E",
    ("sex", "female_me"):   "B01001_026M",
    ("hispanic", "yes"): "B03003_003E",
    ("hispanic", "yes_me"): "B03003_003M",
    ("hispanic", "no"): "B03003_002E",
    ("hispanic", "no_me"): "B03003_002M",
}, index_cols=['tract'])
p_acs_cat

cat_name,age,age,age,age,age,age,age,age,hispanic,hispanic,hispanic,hispanic,population,population,race,race,race,race,race,race,race,race,sex,sex,sex,sex
cat_value,19 and under,19 and under_me,20 to 35,20 to 35_me,35 to 60,35 to 60_me,above 60,above 60_me,no,no_me,yes,yes_me,total,total_me,asian,asian_me,black,black_me,other,other_me,white,white_me,female,female_me,male,male_me
tract,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2
000100,244,158.054711,2713,733.130699,1544,608.510638,430,268.085106,4532,357,399,187,4931,392,289,139,501,212,329,130.091185,3812,409,1989,355,2942,329
000300,18,56.534954,204,93.617021,340,131.914894,83,112.462006,641,84,4,6,645,85,24,18,150,38,28,28.571429,443,77,304,63,341,64
000400,52,84.498480,1751,417.021277,661,300.303951,176,170.820669,2561,194,79,48,2640,194,113,84,407,153,104,66.261398,2016,217,1160,148,1480,208
000500,834,412.158055,2791,708.814590,981,455.319149,376,293.617021,4598,386,384,292,4982,422,149,109,1278,303,332,181.155015,3223,515,2167,291,2815,332
000600,409,202.431611,1588,416.413374,671,277.203647,204,187.234043,2781,241,91,60,2872,247,9,14,1140,239,119,71.124620,1604,214,1501,185,1371,175
000700,120,88.753799,262,117.325228,344,168.389058,87,145.896657,767,79,46,36,813,83,12,13,406,98,7,21.276596,388,117,526,76,287,66
000800,1008,281.458967,822,225.531915,849,250.455927,213,147.720365,2868,315,24,21,2892,313,16,15,1969,252,522,208.510638,385,118,1551,193,1341,183
000900,322,167.781155,516,209.118541,638,217.021277,292,158.662614,1702,206,66,50,1768,204,33,44,1273,209,25,34.042553,437,100,1035,149,733,113
001000,463,209.726444,901,281.458967,942,288.145897,257,159.878419,2545,169,18,20,2563,168,46,40,207,211,62,65.045593,2248,244,1201,142,1362,155
001100,437,181.155015,685,234.042553,983,275.987842,164,122.796353,2216,204,53,40,2269,201,18,21,168,95,122,88.753799,1961,226,1098,143,1171,133


In [15]:
# dropping top level of column index
p_acs_cat.columns = p_acs_cat.columns.droplevel()

In [18]:
# renaming hispanic columns
p_acs_cat= p_acs_cat.rename(columns={'no_me':'nonhispanic_me',
                          'no':'nonhispanic','yes_me':'hispanic_me',
                         'yes':'hispanic'})

In [22]:
p_acs_cat.to_csv('data_outputs/20190330_census_aggregates/37119_people_meck.csv')
h_acs_cat.to_csv('data_outputs/20190330_census_aggregates/37119_households_meck.csv')