### downloading census data for comparison with synth pop outputs

In [2]:
%load_ext autoreload
%autoreload 2
from synthpop.census_helpers import Census
from synthpop import categorizer as cat
import pandas as pd
import numpy as np
import os
pd.set_option('display.max_columns', 500)

This notebook is being used to process ACS data obtained from the ACS API. 

First, some notes about the structure of how synthpop works.
It has a few dependencies:
- **census**, a Python library that is a wrapper for the U.S. Census Bureau API
- **pandas**, a Python library used to create and work with dataframes (like tables)
- **numpy**, a Python library for numerical arrays
- **os**, the Python standard-library module for interacting with the operating system, e.g. paths and environment variables (not strictly necessary here)



**Step 1:**  
Set the API key. (If you don't already have one, you can get one [here](https://api.census.gov/data/key_signup.html).)

In [3]:
# Create the Census API client.
# The key is read from the CENSUS_API_KEY environment variable so that the
# credential is never stored in (or committed with) the notebook. Set it before
# starting Jupyter, e.g. `export CENSUS_API_KEY=<your key>`.
# NOTE(review): the key that was previously hardcoded in this cell should be
# treated as exposed and replaced with a new one.
census_api_key = os.environ.get("CENSUS_API_KEY")
if not census_api_key:
    raise RuntimeError(
        "Set the CENSUS_API_KEY environment variable to your Census API key "
        "(sign up at https://api.census.gov/data/key_signup.html)."
    )
c = Census(census_api_key)

**Step 2:**  
Set up state and county variables to use for the state/county you are working with

In [4]:
# FIPS codes of the study area:
#   state  -> North Carolina
#   county -> Mecklenburg
stateFips, countyFips = "37", "119"

**Step 3:**  
Define household variables of interest

In [5]:
# Household-level ACS variables. Names ending in "E" are the estimates; each
# "*_moe" list holds the same variables with the trailing "E" swapped for "M".
income_columns = ['B19001_0%02dE' % n for n in range(1, 18)]
income_columns_moe = [name[:-1] + 'M' for name in income_columns]

vehicle_columns = ['B08201_0%02dE' % n for n in range(1, 7)]
vehicle_columns_moe = [name[:-1] + 'M' for name in vehicle_columns]

# Defined for reference; not included in the tract query below.
workers_columns = ['B08202_0%02dE' % n for n in range(1, 6)]

families_columns = ['B11001_001E', 'B11001_002E']
families_columns_moe = [name[:-1] + 'M' for name in families_columns]
# traveltimesex_columns = ['B08013_001E','B08013_002E','B08013_003E']

# Flatten the groups, in query order, into one list of column names.
census_col = [
    name
    for group in (
        families_columns,
        families_columns_moe,
        income_columns,
        income_columns_moe,
        vehicle_columns,
        vehicle_columns_moe,
    )
    for name in group
]

# Tract-level query for the selected state and county.
h_acs = c.tract_query(census_col, stateFips, countyFips)


In [6]:
## Note: the same data can also be pulled at the block-group and tract level.
## This is shown as an example only; the result is not used later in the notebook.
income_columns = ['B19001_0%02dE' % n for n in range(1, 18)]
vehicle_columns = ['B08201_0%02dE' % n for n in range(1, 7)]
workers_columns = ['B08202_0%02dE' % n for n in range(1, 6)]
families_columns = ['B11001_001E', 'B11001_002E']
# traveltimesex_columns = ['B08013_001E','B08013_002E','B08013_003E']

# Income and family counts are requested per block group; vehicles and workers
# per tract (traveltimesex_columns could be appended to the tract list).
block_group_columns = income_columns + families_columns
tract_columns = vehicle_columns + workers_columns

# Helper that fetches block-group and tract level data and merges them to the
# block level.
h_acs_block = c.block_group_and_tract_query(
    block_group_columns,
    tract_columns,
    stateFips,
    countyFips,
    merge_columns=['tract', 'county', 'state'],
    block_group_size_attr="B11001_001E",
    tract_size_attr="B08201_001E",
)

**Step 4:**  
Define person-level variables of interest

In [7]:
# Person-level ACS variables. Names ending in "E" are the estimates.
population = ['B01001_001E']
sex = ['B01001_002E', 'B01001_026E']
race = ['B02001_0%02dE' % n for n in range(1, 11)]
male_age_columns = ['B01001_0%02dE' % n for n in range(3, 26)]
female_age_columns = ['B01001_0%02dE' % n for n in range(27, 50)]
hispanic = ['B03003_002E', 'B03003_003E']

# Margin-of-error counterparts: same variable names with "E" replaced by "M".
population_moe = [name[:-1] + 'M' for name in population]
sex_moe = [name[:-1] + 'M' for name in sex]
race_moe = [name[:-1] + 'M' for name in race]
male_age_columns_moe = [name[:-1] + 'M' for name in male_age_columns]
female_age_columns_moe = [name[:-1] + 'M' for name in female_age_columns]
hispanic_moe = [name[:-1] + 'M' for name in hispanic]

# Concatenate every group, in query order, into a single request.
all_columns = sum(
    [
        population,
        sex,
        race,
        male_age_columns,
        hispanic,
        hispanic_moe,
        female_age_columns,
        population_moe,
        sex_moe,
        race_moe,
        male_age_columns_moe,
        female_age_columns_moe,
    ],
    [],
)

# Tract-level query for the selected state and county.
p_acs_tract = c.tract_query(all_columns, stateFips, countyFips)


**Step 5:**  
Categorize the ACS household data into categories we can read

This step includes calculating aggregate errors for the ACS variables we are using. 

This was calculated using:

$$SE_{total} = SE_1 + SE_2 + \dots + SE_N$$

where $SE$ (standard error) $= \text{margin of error} / Z$,
and $Z$ is defined by ACS as 1.645.

See [this](https://www2.census.gov/programs-surveys/acs/tech_docs/statistical_testing/2015StatisticalTesting5year.pdf) guide from ACS for further info. 
    
Further [info](https://www.census.gov/content/dam/Census/programs-surveys/acs/guidance/training-presentations/20170419_MOE_Transcript.pdf) on ACS margins of error. 

In [8]:
# Collapse the raw household ACS columns in `h_acs` into readable categories,
# indexed by tract.
#
# Each key is a (category name, category value) pair and each value is an
# expression over ACS column names that is handed to `cat.categorize`.
# "E" columns are estimates and "M" columns are margins of error; every "*_me"
# entry adds up the "M" columns of the same variables used for its estimate, so
# all "*_me" columns are on the same (margin of error) scale.
#
# Fixes relative to the earlier version of this cell:
#   * "gt30-lt6_me" was a typo; it is now "gt30-lt60_me" to pair with "gt30-lt60".
#   * "two or more_me" was the only entry divided by 1.645, which put it on a
#     different scale from every other "*_me" column; the division is removed.
#
# NOTE(review): ACS guidance derives the margin of error of a sum as the square
# root of the sum of squared margins of error; the plain sums used here are
# larger than that. Confirm which aggregate is intended.
h_acs_cat = cat.categorize(h_acs, {
    ("households", "total"): "B11001_001E",
    ("households", "total_me"): "B11001_001M",

    # income brackets: estimates
    ("income", "lt30"): "B19001_002E + B19001_003E + B19001_004E + "
                        "B19001_005E + B19001_006E",
    ("income", "gt30-lt60"): "B19001_007E +B19001_008E +B19001_009E +B19001_010E"
                        "+B19001_011E",
    ("income", "gt60-lt100"): "B19001_012E + B19001_013E",
    ("income", "gt100-lt150"): "B19001_014E +  B19001_015E",
    ("income", "gt150"): "B19001_016E +B19001_017E",

    # income brackets: summed margins of error
    ("income", "lt30_me"): "(B19001_002M + B19001_003M + B19001_004M + "
                        "B19001_005M + B19001_006M)",
    ("income", "gt30-lt60_me"): "(B19001_007M +B19001_008M +B19001_009M +B19001_010M"
                        "+B19001_011M)",
    ("income", "gt60-lt100_me"): "(B19001_012M + B19001_013M)",
    ("income", "gt100-lt150_me"): "(B19001_014M +  B19001_015M)",
    ("income", "gt150_me"): "(B19001_016M +B19001_017M)",

    # vehicles available
    ("cars", "none"): "B08201_002E",
    ("cars", "none_me"): "B08201_002M",
    ("cars", "one"): "B08201_003E",
    ("cars", "one_me"): "B08201_003M",
    ("cars", "two or more"): "B08201_004E + B08201_005E + B08201_006E",
    ("cars", "two or more_me"): "(B08201_004M + B08201_005M + B08201_006M)",
#     ("workers", "none"): "B08202_002E",
#     ("workers", "one"): "B08202_003E",
#     ("workers", "two or more"): "B08202_004E + B08202_005E",
#     ("traveltime","total"):"B08013_001E",
#     ("traveltime","male"):"B08013_002E",
#     ("traveltime","female"):"B08013_003E"
}, index_cols=['tract'])

# Drop the top level of the column index so columns are addressed by the
# category value alone.
h_acs_cat.columns = h_acs_cat.columns.droplevel()

In [None]:
# Preview the first five rows of the categorized household table.
h_acs_cat.head(n=5)

**Step 6:**  
Categorize the ACS person data into categories we can read


This step includes calculating aggregate errors for the ACS variables we are using. 

This was calculated using:

$$SE_{total} = SE_1 + SE_2 + \dots + SE_N$$

where $SE$ (standard error) $= \text{margin of error} / Z$,
and $Z$ is defined by ACS as 1.645.

See [this](https://www2.census.gov/programs-surveys/acs/tech_docs/statistical_testing/2015StatisticalTesting5year.pdf) guide from ACS for further info. 

In [18]:
# Collapse the raw person-level ACS columns in `p_acs_tract` into named
# categories, indexed by tract.
#
# Each key is a (category name, category value) pair; each value is an
# expression over ACS column names that is passed to `cat.categorize`.
# Columns ending in "E" are estimates and columns ending in "M" are the
# matching margins of error, so every "*_me" entry adds up the "M" columns of
# the same variables that make up its estimate.
#
# For the age groups, B01001_003..025 are the male age columns and
# B01001_027..049 the female age columns (see `male_age_columns` and
# `female_age_columns`); each group adds a male run and the parallel female run.
p_acs_cat = cat.categorize(p_acs_tract, {
    ("population", "total"): "B01001_001E",
    ("population", "total_me"): "B01001_001M",
    # male 003-007 + female 027-031
    ("age", "19 and under"): "B01001_003E + B01001_004E + B01001_005E + "
                             "B01001_006E + B01001_007E + B01001_027E + "
                             "B01001_028E + B01001_029E + B01001_030E + "
                             "B01001_031E",
    ("age", "19 and under_me"): "(B01001_003M + B01001_004M + B01001_005M + "
                             "B01001_006M + B01001_007M + B01001_027M + "
                             "B01001_028M + B01001_029M + B01001_030M + "
                             "B01001_031M)",
    # male 008-012 + female 032-036
    ("age", "20 to 35"): "B01001_008E + B01001_009E + B01001_010E + "
                         "B01001_011E + B01001_012E + B01001_032E + "
                         "B01001_033E + B01001_034E + B01001_035E + "
                         "B01001_036E",
    ("age", "20 to 35_me"): "(B01001_008M + B01001_009M + B01001_010M + "
                         "B01001_011M + B01001_012M + B01001_032M + "
                         "B01001_033M + B01001_034M + B01001_035M + "
                         "B01001_036M)",
    # male 013-017 + female 037-041
    ("age", "35 to 60"): "B01001_013E + B01001_014E + B01001_015E + "
                         "B01001_016E + B01001_017E + B01001_037E + "
                         "B01001_038E + B01001_039E + B01001_040E + "
                         "B01001_041E",
    ("age", "35 to 60_me"): "(B01001_013M + B01001_014M + B01001_015M + "
                         "B01001_016M + B01001_017M + B01001_037M + "
                         "B01001_038M + B01001_039M + B01001_040M + "
                         "B01001_041M)",
    # male 018-025 + female 042-049
    ("age", "above 60"): "B01001_018E + B01001_019E + B01001_020E + "
                         "B01001_021E + B01001_022E + B01001_023E + "
                         "B01001_024E + B01001_025E + B01001_042E + "
                         "B01001_043E + B01001_044E + B01001_045E + "
                         "B01001_046E + B01001_047E + B01001_048E + "
                         "B01001_049E", 
    # female-only part (042-049) of the "above 60" group
    ("age","women_60"): "(B01001_042E + "
                         "B01001_043E + B01001_044E + B01001_045E + "
                         "B01001_046E + B01001_047E + B01001_048E + "
                         "B01001_049E)",
    ("age","women_60_me"): "(B01001_042M + "
                         "B01001_043M + B01001_044M + B01001_045M + "
                         "B01001_046M + B01001_047M + B01001_048M + "
                         "B01001_049M)",
    # margin-of-error counterpart of "above 60" (defined after the women_60 pair)
    ("age", "above 60_me"): "(B01001_018M + B01001_019M + B01001_020M + "
                         "B01001_021M + B01001_022M + B01001_023M + "
                         "B01001_024M + B01001_025M + B01001_042M + "
                         "B01001_043M + B01001_044M + B01001_045M + "
                         "B01001_046M + B01001_047M + B01001_048M + "
                         "B01001_049M)", 
    # race: three single-column groups; "other" adds columns 004, 006, 007 and 008
    ("race", "white"):   "B02001_002E",
    ("race", "white_me"):   "B02001_002M",
    ("race", "black"):   "B02001_003E",
    ("race", "black_me"):   "B02001_003M",
    ("race", "asian"):   "B02001_005E",
    ("race", "asian_me"):   "B02001_005M",
    ("race", "other"):   "B02001_004E + B02001_006E + B02001_007E + "
                         "B02001_008E",
    ("race", "other_me"):   "(B02001_004M + B02001_006M + B02001_007M + "
                         "B02001_008M)",
    ("sex", "male"):     "B01001_002E",
    ("sex", "male_me"):     "B01001_002M",
    ("sex", "female"):   "B01001_026E",
    ("sex", "female_me"):   "B01001_026M",
    # "yes"/"no" are renamed to hispanic/nonhispanic in a later cell
    ("hispanic", "yes"): "B03003_003E",
    ("hispanic", "yes_me"): "B03003_003M",
    ("hispanic", "no"): "B03003_002E",
    ("hispanic", "no_me"): "B03003_002M",
}, index_cols=['tract'])


In [20]:
# Drop the top level of the column index so columns are addressed by the
# category value alone.
# Guarded on the number of levels so that re-running this cell is a no-op
# instead of raising once the columns are already flat.
if p_acs_cat.columns.nlevels > 1:
    p_acs_cat.columns = p_acs_cat.columns.droplevel()

In [21]:
# Give the hispanic "yes"/"no" columns (and their margin-of-error columns)
# self-describing names.
p_acs_cat = p_acs_cat.rename(
    columns={
        'yes': 'hispanic',
        'yes_me': 'hispanic_me',
        'no': 'nonhispanic',
        'no_me': 'nonhispanic_me',
    }
)

In [22]:
# Write the categorized person and household tables to CSV.
# `to_csv` does not create missing directories, so make sure the output folder
# exists first; otherwise this cell fails on a fresh checkout.
output_dir = 'data_outputs/20190330_census_aggregates'
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

p_acs_cat.to_csv(output_dir + '/37119_people_meck.csv')
h_acs_cat.to_csv(output_dir + '/37119_households_meck.csv')

In [23]:
# Preview the first ten rows rather than displaying the whole table, in line
# with the household preview cell.
p_acs_cat.head(10)

cat_value,19 and under,19 and under_me,20 to 35,20 to 35_me,35 to 60,35 to 60_me,above 60,above 60_me,women_60,women_60_me,nonhispanic,nonhispanic_me,hispanic,hispanic_me,total,total_me,asian,asian_me,black,black_me,other,other_me,white,white_me,female,female_me,male,male_me
tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
000100,244,260,2713,1206,1544,1001,430,441,129,170,4532,357,399,187,4931,392,289,139,501,212,329,214,3812,409,1989,355,2942,329
000300,18,93,204,154,340,217,83,185,27,83,641,84,4,6,645,85,24,18,150,38,28,47,443,77,304,63,341,64
000400,52,139,1751,686,661,494,176,281,50,112,2561,194,79,48,2640,194,113,84,407,153,104,109,2016,217,1160,148,1480,208
000500,834,678,2791,1166,981,749,376,483,163,210,4598,386,384,292,4982,422,149,109,1278,303,332,298,3223,515,2167,291,2815,332
000600,409,333,1588,685,671,456,204,308,149,183,2781,241,91,60,2872,247,9,14,1140,239,119,117,1604,214,1501,185,1371,175
000700,120,146,262,193,344,277,87,240,75,146,767,79,46,36,813,83,12,13,406,98,7,35,388,117,526,76,287,66
000800,1008,463,822,371,849,412,213,243,108,120,2868,315,24,21,2892,313,16,15,1969,252,522,343,385,118,1551,193,1341,183
000900,322,276,516,344,638,357,292,261,170,137,1702,206,66,50,1768,204,33,44,1273,209,25,56,437,100,1035,149,733,113
001000,463,345,901,463,942,474,257,263,167,145,2545,169,18,20,2563,168,46,40,207,211,62,107,2248,244,1201,142,1362,155
001100,437,298,685,385,983,454,164,202,86,104,2216,204,53,40,2269,201,18,21,168,95,122,146,1961,226,1098,143,1171,133
