In [3]:
%load_ext autoreload
%autoreload 2
from synthpop.census_helpers import Census
from synthpop import categorizer as cat
import pandas as pd
import numpy as np
import os
pd.set_option('display.max_columns', 500)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


This notebook is being used to process ACS data obtained from the ACS API. 

First some notes about the structure of how synthpop works. 
It has a few dependencies: 
- **census**, a python library that is a wrapper for the U.S. Census Bureau API
- **pandas** a python library used to create and work with dataframes (like tables)
- **numpy**
- **os** which allows you to set up a development environment (not actually entirely necessary)

SynthPop itself is actually a few separate scripts that each handle different aspects of the synthesizing process: 
The SynthPop library **census_helpers.py** relies on **census** and is a set of functions to assist with downloading and processing census data for a given geography. It allows you to select geography and columns of interest and to download data at the block group or tract level (or both)
**zone_synthesizer.py** is a set of functions that accepts marginals and sample files from a CSV and produces a synthesized population
**synthesizer.py** uses a 'recipe' which is the output of the **starter.py** script
**starter.py** uses **census_helpers.py** to generate and return: 
- household marginals
- person marginals
- household joint distribution
- person joint distribution
- tract to PUMA map (a dictionary showing the relationship between tracts and PUMAs)


**Step 1:**  
Set API key. (If you don't already have one, you can get one [here](https://api.census.gov/data/key_signup.html).)

In [4]:
# Read the Census API key from the environment instead of storing it in the
# notebook: anything committed here is visible to everyone who can read the file.
# (A key that was previously hard-coded in this cell should be treated as exposed
# and replaced.) Set it before starting Jupyter, e.g.
#     export CENSUS_API_KEY=<your key>
census_api_key = os.environ.get("CENSUS_API_KEY")
if not census_api_key:
    # Fail early with an actionable message rather than with an API error later on.
    raise RuntimeError(
        "Set the CENSUS_API_KEY environment variable to your Census API key "
        "(sign up at https://api.census.gov/data/key_signup.html)."
    )
c = Census(census_api_key)

**Step 2:**  
Set up state and county variables to use for the state/county you are working with

In [5]:
# FIPS codes of the study area, kept as strings (the form passed to the Census
# helper calls in the following cells):
#   state  "37"  -> North Carolina
#   county "119" -> Mecklenburg
stateFips, countyFips = "37", "119"

**Step 3:**  
Define household variables of interest

In [7]:
# ACS estimate columns to request for households. Each name is
# "<table>_<3-digit variable number>E"; the ranges below expand to
# B19001_001E..017E, B08201_001E..006E, B08202_001E..005E and B08013_001E..003E.
income_columns = ["B19001_{:03d}E".format(n) for n in range(1, 18)]
vehicle_columns = ["B08201_{:03d}E".format(n) for n in range(1, 7)]
workers_columns = ["B08202_{:03d}E".format(n) for n in range(1, 6)]
families_columns = ["B11001_001E", "B11001_002E"]
traveltimesex_columns = ["B08013_{:03d}E".format(n) for n in range(1, 4)]

# Single flat list of every household column, in the order the groups are listed.
census_col = (
    income_columns
    + vehicle_columns
    + workers_columns
    + families_columns
    + traveltimesex_columns
)
# Query the ACS API for the household columns at tract level for the chosen
# state and county; the preview in the next cell shows one row per census tract.
h_acs_tract = c.tract_query(census_col, stateFips, countyFips)


In [11]:
# Preview the first two tracts to confirm the requested columns came back.
h_acs_tract.head(2)

Unnamed: 0,B08013_001E,B08013_002E,B08013_003E,B08201_001E,B08201_002E,B08201_003E,B08201_004E,B08201_005E,B08201_006E,B08202_001E,B08202_002E,B08202_003E,B08202_004E,B08202_005E,B11001_001E,B11001_002E,B19001_001E,B19001_002E,B19001_003E,B19001_004E,B19001_005E,B19001_006E,B19001_007E,B19001_008E,B19001_009E,B19001_010E,B19001_011E,B19001_012E,B19001_013E,B19001_014E,B19001_015E,B19001_016E,B19001_017E,NAME,county,state,tract
0,57540,35055,22485,2825,426,1327,955,81,36,2825,364,1695,754,12,2825,559,2825,122,87,83,125,13,52,51,103,0,191,212,310,248,212,334,682,"Census Tract 1, Mecklenburg County, North Caro...",119,37,100
1,4700,2355,2340,481,155,206,105,12,3,481,210,180,88,3,481,71,481,135,64,16,6,15,0,10,11,0,18,15,39,56,37,13,46,"Census Tract 3, Mecklenburg County, North Caro...",119,37,300


Note: it is also possible to do this at the block group and tract level. See example below.

In [13]:
# Same household column groups as in Step 3, repeated here so this example cell
# can be read on its own.
income_columns = ["B19001_{:03d}E".format(n) for n in range(1, 18)]
vehicle_columns = ["B08201_{:03d}E".format(n) for n in range(1, 7)]
workers_columns = ["B08202_{:03d}E".format(n) for n in range(1, 6)]
families_columns = ["B11001_001E", "B11001_002E"]
traveltimesex_columns = ["B08013_{:03d}E".format(n) for n in range(1, 4)]

# Columns requested per block group vs. per tract in the combined query.
block_group_columns = income_columns + families_columns
tract_columns = (
    vehicle_columns
    + workers_columns
    + traveltimesex_columns
)
# Fetch the block-group columns and the tract columns in a single call. The
# helper combines both result sets at the block level, joining them on the
# tract/county/state columns.
# NOTE(review): the two *_size_attr columns are presumably used to scale tract
# values down to block groups -- confirm in census_helpers.
h_acs = c.block_group_and_tract_query(
    block_group_columns,
    tract_columns,
    stateFips,
    countyFips,
    merge_columns=['tract', 'county', 'state'],
    block_group_size_attr="B11001_001E",
    tract_size_attr="B08201_001E",
)

**Step 4:**  
Define person-level variables of interest

In [24]:
# ACS estimate columns to request for persons.
population = ["B01001_001E"]              # total population
sex = ["B01001_002E", "B01001_026E"]      # male total, female total
race = ["B02001_{:03d}E".format(n) for n in range(1, 11)]                # B02001_001E..010E
male_age_columns = ["B01001_{:03d}E".format(n) for n in range(3, 26)]    # B01001_003E..025E
female_age_columns = ["B01001_{:03d}E".format(n) for n in range(27, 50)] # B01001_027E..049E

# Single flat list of every person column, in the order the groups are listed.
all_columns = (
    population
    + sex
    + race
    + male_age_columns
    + female_age_columns
)
# Query the ACS API for the person columns at tract level for the chosen state
# and county.
p_acs_tract = c.tract_query(all_columns, stateFips, countyFips)
# Display the result for inspection (one row per census tract).
p_acs_tract

Unnamed: 0,B01001_001E,B01001_002E,B01001_003E,B01001_004E,B01001_005E,B01001_006E,B01001_007E,B01001_008E,B01001_009E,B01001_010E,B01001_011E,B01001_012E,B01001_013E,B01001_014E,B01001_015E,B01001_016E,B01001_017E,B01001_018E,B01001_019E,B01001_020E,B01001_021E,B01001_022E,B01001_023E,B01001_024E,B01001_025E,B01001_026E,B01001_027E,B01001_028E,B01001_029E,B01001_030E,B01001_031E,B01001_032E,B01001_033E,B01001_034E,B01001_035E,B02001_001E,B02001_002E,B02001_003E,B02001_004E,B02001_005E,B02001_006E,B02001_007E,B02001_008E,B02001_009E,B02001_010E,NAME,county,state,tract,B01001_036E,B01001_037E,B01001_038E,B01001_039E,B01001_040E,B01001_041E,B01001_042E,B01001_043E,B01001_044E,B01001_045E,B01001_046E,B01001_047E,B01001_048E,B01001_049E
0,4931,2942,26,11,0,15,43,117,26,512,706,341,223,263,126,67,165,44,58,7,52,29,86,25,0,1989,21,13,12,17,86,57,98,339,304,4931,3812,501,21,289,0,71,237,45,192,"Census Tract 1, Mecklenburg County, North Caro...",119,37,000100,213,229,177,32,163,99,10,0,7,57,49,6,0,0
1,645,341,1,5,2,0,0,0,5,4,14,56,44,11,68,14,61,19,10,9,4,14,0,0,0,304,6,0,4,0,0,0,6,17,51,645,443,150,19,24,0,4,5,0,5,"Census Tract 3, Mecklenburg County, North Caro...",119,37,000300,51,13,22,20,38,49,10,0,4,0,0,8,5,0
2,2640,1480,0,26,0,0,0,0,0,229,460,306,86,70,120,20,37,0,36,8,73,9,0,0,0,1160,0,10,16,0,0,0,0,184,393,2640,2016,407,14,113,0,0,90,0,90,"Census Tract 4, Mecklenburg County, North Caro...",119,37,000400,179,74,104,10,40,100,11,11,0,28,0,0,0,0
3,4982,2815,133,65,43,77,161,62,41,161,570,747,156,57,184,33,112,22,35,12,64,68,12,0,0,2167,59,26,24,16,230,98,66,238,414,4982,3223,1278,37,149,0,14,281,0,281,"Census Tract 5, Mecklenburg County, North Caro...",119,37,000500,394,155,45,144,16,79,16,45,31,0,30,14,11,16
4,2872,1371,49,57,34,0,8,9,8,258,317,236,155,53,59,23,50,9,18,0,0,20,0,0,8,1501,153,35,57,16,0,0,49,234,215,2872,1604,1140,0,9,0,8,111,18,93,"Census Tract 6, Mecklenburg County, North Caro...",119,37,000600,262,95,7,66,73,90,8,47,27,7,37,7,16,0
5,813,287,0,13,26,0,0,0,3,21,32,39,47,22,28,20,24,7,5,0,0,0,0,0,0,526,60,0,14,0,7,4,11,17,102,813,388,406,0,12,0,4,3,0,3,"Census Tract 7, Mecklenburg County, North Caro...",119,37,000700,33,30,65,26,38,44,11,9,0,36,0,19,0,0
6,2892,1341,152,167,134,55,9,22,8,106,121,108,79,43,60,104,68,26,9,13,28,7,17,0,5,1551,164,147,77,51,52,21,14,113,188,2892,385,1969,16,16,400,35,71,0,71,"Census Tract 8, Mecklenburg County, North Caro...",119,37,000800,121,132,61,99,70,133,18,26,10,13,22,14,5,0
7,1768,733,26,34,9,25,0,17,0,44,53,65,137,85,18,65,33,18,42,17,5,19,17,4,0,1035,75,34,52,31,36,16,15,59,170,1768,437,1273,0,33,0,25,0,0,0,"Census Tract 9, Mecklenburg County, North Caro...",119,37,000900,77,52,69,60,52,67,36,24,12,42,8,16,27,5
8,2563,1362,113,56,37,33,0,0,0,13,252,265,89,115,90,102,107,23,0,27,20,6,7,7,0,1201,106,53,41,12,12,26,13,16,170,2563,2248,207,24,46,0,0,38,0,38,"Census Tract 10, Mecklenburg County, North Car...",119,37,001000,146,52,126,74,80,107,8,61,7,0,7,54,19,11
9,2269,1171,80,62,33,31,0,0,6,73,150,113,197,167,59,65,57,7,0,5,16,22,9,11,8,1098,134,73,24,0,0,25,31,44,119,2269,1961,168,15,18,0,17,90,0,90,"Census Tract 11, Mecklenburg County, North Car...",119,37,001100,124,169,62,114,52,41,0,12,0,20,20,11,10,13


**Step 5:**  
Get the PUMA for our test tract - this actually downloads the mapping file from the census website so it might take a few seconds

the `tract_to_puma` function below is doing this:

    def tract_to_puma(self, state, county, tract):
        """Return the PUMA ids recorded for one census tract.

        Looks the tract up in the tract-to-PUMA relationship table and returns
        a ``(puma10_id, puma00_id)`` tuple taken from the first matching row.
        """
        # NOTE(review): presumably converts state/county names to FIPS codes --
        # confirm in census_helpers.
        state, county = self.try_fips_lookup(state, county)

        df = self._get_pums_relationship()
        # All three values are compared as quoted strings, so `tract` has to be
        # the tract code in string form (e.g. "006406").
        q = "statefp == '%s' and countyfp == '%s' and tractce == '%s'" % (state, county, tract)
        r = df.query(q)
        # `.values[0]` takes the first match. NOTE(review): if nothing matches,
        # `r` is empty and this indexing fails -- confirm callers pass known tracts.
        return r["puma10_id"].values[0], r["puma00_id"].values[0]

In [28]:
# Look up the PUMA ids for one tract. The tract code is passed as the string
# "006406" because tract_to_puma compares it to the `tractce` column as a quoted
# string. The result is a (puma10_id, puma00_id) pair.
puma = c.tract_to_puma(stateFips, countyFips, "006406")
puma

('03106', '01000')

In [30]:
# tract_to_puma returned a (puma10_id, puma00_id) pair; give each id its own name.
puma10, puma00 = puma

**Step 6:** 
Download PUMS for people records for a PUMA from SynthPop server (they have pre-processed the large files into smaller ones)

`download_population_pums` (shown together with its household counterpart, `download_household_pums`) is doing this:

     def download_population_pums(self, state, puma10=None, puma00=None, **kargs):
            """Read person-level PUMS records for a state or for one PUMA.

            With neither ``puma10`` nor ``puma00`` given, the state-wide file
            is read and returned. Otherwise the ``puma10`` file is read and,
            when ``puma00`` is also given, the ``puma00`` file is appended to
            it with a fresh index. Extra keyword arguments are forwarded to
            ``self._read_csv``.
            """
            # NOTE(review): presumably normalises a state name to its FIPS code --
            # confirm in census_helpers.
            state = self.try_fips_lookup(state)
            # `&` on two plain booleans acts as a logical "and" here.
            if (puma10 is None) & (puma00 is None):
                return self._read_csv(self.pums_population_state_base_url % (state), **kargs)
            # NOTE(review): this URL is formatted with `puma10` even when only
            # `puma00` was supplied (puma10 is None) -- confirm that case is not used.
            pums = self._read_csv(self.pums10_population_base_url % (state, puma10), **kargs)
            if puma00 is not None:
                pums00 = self._read_csv(self.pums00_population_base_url % (state, puma00), **kargs)
                pums = pd.concat([pums, pums00], ignore_index=True)
            return pums

      def download_household_pums(self, state, puma10=None, puma00=None, **kargs):
            """Read household-level PUMS records for a state or for one PUMA.

            With neither ``puma10`` nor ``puma00`` given, the state-wide file
            is read and returned as-is. Otherwise the ``puma10`` file is read,
            the ``puma00`` file is appended when given, and only rows with
            ``RT == 'H'``, ``NP > 0`` and ``TYPE == 1`` are kept. Extra keyword
            arguments are forwarded to ``self._read_csv``.
            """
            # NOTE(review): presumably normalises a state name to its FIPS code --
            # confirm in census_helpers.
            state = self.try_fips_lookup(state)
            # `&` on two plain booleans acts as a logical "and" here.
            # NOTE(review): this early return skips the household filter below --
            # confirm that is intended for the state-wide file.
            if (puma10 is None) & (puma00 is None):
                return self._read_csv(self.pums_household_state_base_url % (state), **kargs)
            pums = self._read_csv(self.pums10_household_base_url % (state, puma10), **kargs)
            if puma00 is not None:
                pums00 = self._read_csv(self.pums00_household_base_url % (state, puma00), **kargs)
                pums = pd.concat([pums, pums00], ignore_index=True)

            # filter out gq and empty units (non-hh records)
            pums = pums[(pums.RT == 'H') & (pums.NP > 0) & (pums.TYPE == 1)]

            return pums

In [43]:
# Person-level PUMS records for the PUMA found above; passing both ids makes the
# helper read the puma10 file and append the puma00 file to it.
p_pums = c.download_population_pums(
    stateFips,
    puma10=puma10,
    puma00=puma00,
)
# Preview the first five person records (five is the pandas default for head()).
p_pums.head()

Unnamed: 0,serialno,RT,SPORDER,puma00,puma10,ST,ADJINC,PWGTP,AGEP,CIT,CITWP05,CITWP12,COW,DDRS,DEAR,DEYE,DOUT,DPHY,DRAT,DRATX,DREM,ENG,FER,GCL,GCM,GCR,HINS1,HINS2,HINS3,HINS4,HINS5,HINS6,HINS7,INTP,JWMNP,JWRIP,JWTR,LANX,MAR,MARHD,MARHM,MARHT,MARHW,MARHYP05,MARHYP12,MIG,MIL,MLPA,MLPB,MLPCD,MLPE,MLPFG,MLPH,MLPI,MLPJ,MLPK,NWAB,NWAV,NWLA,NWLK,NWRE,OIP,PAP,RELP,RETP,SCH,SCHG,SCHL,SEMP,SEX,SSIP,SSP,WAGP,WKHP,WKL,WKW,WRK,YOEP05,YOEP12,ANC,ANC1P05,ANC1P12,ANC2P05,ANC2P12,DECADE,DIS,DRIVESP,ESP,ESR,FOD1P,FOD2P,HICOV,HISP,INDP,JWAP,JWDP,LANP05,LANP12,MIGPUMA00,MIGPUMA10,MIGSP05,MIGSP12,MSP,NAICSP,NATIVITY,NOP,OC,OCCP02,OCCP10,OCCP12,PAOC,PERNP,PINCP,POBP05,POBP12,POVPIP,POWPUMA00,POWPUMA10,POWSP05,POWSP12,PRIVCOV,PUBCOV,QTRBIR,RAC1P,RAC2P05,RAC2P12,RAC3P05,RAC3P12,RACAIAN,RACASN,RACBLK,RACNHPI,RACNUM,RACSOR,RACWHT,RC,SCIENGP,SCIENGRLP,SFN,SFR,SOCP00,SOCP10,SOCP12,VPS,WAOB,FAGEP,FANCP,FCITP,FCITWP,FCOWP,FDDRSP,FDEARP,FDEYEP,FDOUTP,FDPHYP,FDRATP,FDRATXP,FDREMP,FENGP,FESRP,FFERP,FFODP,FGCLP,FGCMP,FGCRP,FHINS1P,FHINS2P,FHINS3C,FHINS3P,FHINS4C,FHINS4P,FHINS5C,FHINS5P,FHINS6P,FHINS7P,FHISP,FINDP,FINTP,FJWDP,FJWMNP,FJWRIP,FJWTRP,FLANP,FLANXP,FMARHDP,FMARHMP,FMARHTP,FMARHWP,FMARHYP,FMARP,FMIGP,FMIGSP,FMILPP,FMILSP,FOCCP,FOIP,FPAP,FPOBP,FPOWSP,FRACP,FRELP,FRETP,FSCHGP,FSCHLP,FSCHP,FSEMP,FSEXP,FSSIP,FSSP,FWAGP,FWKHP,FWKLP,FWKWP,FWRKP,FYOEP,PWGTP1,PWGTP2,PWGTP3,PWGTP4,PWGTP5,PWGTP6,PWGTP7,PWGTP8,PWGTP9,PWGTP10,PWGTP11,PWGTP12,PWGTP13,PWGTP14,PWGTP15,PWGTP16,PWGTP17,PWGTP18,PWGTP19,PWGTP20,PWGTP21,PWGTP22,PWGTP23,PWGTP24,PWGTP25,PWGTP26,PWGTP27,PWGTP28,PWGTP29,PWGTP30,PWGTP31,PWGTP32,PWGTP33,PWGTP34,PWGTP35,PWGTP36,PWGTP37,PWGTP38,PWGTP39,PWGTP40,PWGTP41,PWGTP42,PWGTP43,PWGTP44,PWGTP45,PWGTP46,PWGTP47,PWGTP48,PWGTP49,PWGTP50,PWGTP51,PWGTP52,PWGTP53,PWGTP54,PWGTP55,PWGTP56,PWGTP57,PWGTP58,PWGTP59,PWGTP60,PWGTP61,PWGTP62,PWGTP63,PWGTP64,PWGTP65,PWGTP66,PWGTP67,PWGTP68,PWGTP69,PWGTP70,PWGTP71,PWGTP72,PWGTP73,PWGTP74,PWGTP75,PWGTP76,PWGTP77,PWGTP78,PWGTP79,PWGTP80
0,2012000000943,P,1,-9,3106,37,1024887,12,55,1,,,1.0,2.0,2,2,2.0,2.0,,,2.0,,,2.0,,,1,2,2,1,2,2,2,0.0,,,,2.0,3,1.0,2.0,2.0,2.0,-9.0,1994.0,1.0,4.0,,,,,,,,,,2.0,1.0,2.0,1.0,3.0,2400.0,60.0,0,0.0,1.0,,21.0,0.0,2,0.0,0.0,7000.0,40.0,1.0,5.0,2.0,,,4,-9,999,-9,999,,2,,,3.0,6206.0,,1,1,7270.0,,,,,,,,,4.0,5411.0,1,,0,N.A.,N.A.,5400.0,2.0,7000.0,9460.0,-9,37,131.0,,,,,1,1,3,1,-9,1,-9,1,0,0,0,0,1,0,1,0,2.0,2.0,,,N.A.//,N.A.//,434171.0,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,1.0,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,11,13,23,3,12,12,19,11,24,16,2,4,12,12,11,12,23,4,12,3,10,10,18,4,11,11,19,11,19,22,4,3,12,11,10,12,22,3,14,23,13,11,4,19,11,10,4,12,3,3,20,16,12,11,13,11,3,21,11,21,12,12,3,22,10,13,3,11,3,3,19,18,11,11,12,11,3,20,10,3
1,2012000000943,P,2,-9,3106,37,1024887,13,16,1,,,,2.0,2,2,2.0,2.0,,,2.0,,,,,,2,1,2,1,2,2,2,0.0,,,,2.0,5,,,,,,,3.0,,,,,,,,,,,2.0,3.0,2.0,2.0,2.0,7200.0,0.0,2,0.0,2.0,11.0,12.0,0.0,1,0.0,0.0,0.0,,3.0,,2.0,,,1,-9,924,-9,999,,2,,7.0,6.0,,,1,1,,,,,,-9.0,3100.0,-9.0,37.0,6.0,,1,7.0,1,,,,,0.0,7200.0,-9,37,131.0,,,,,1,1,3,1,-9,1,-9,1,0,0,0,0,1,0,1,1,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,1.0,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,12,11,21,2,15,14,19,13,24,23,4,4,15,12,11,12,17,3,13,4,12,9,20,4,12,12,23,13,23,22,3,3,13,13,10,11,24,4,14,19,13,11,4,20,11,14,3,15,3,4,21,19,12,15,13,15,4,19,12,16,13,10,4,19,13,11,4,12,3,4,16,18,13,14,12,16,4,21,13,4
2,2012000000943,P,3,-9,3106,37,1024887,13,15,1,,,,2.0,2,2,2.0,2.0,,,2.0,,2.0,,,,2,1,2,1,2,2,2,0.0,,,,2.0,5,,,,,,,3.0,,,,,,,,,,,,,,,,7200.0,0.0,2,0.0,2.0,11.0,12.0,0.0,2,0.0,0.0,0.0,,,,,,,1,-9,924,-9,999,,2,,7.0,,,,1,1,,,,,,-9.0,3100.0,-9.0,37.0,6.0,,1,7.0,1,,,,,,7200.0,-9,37,131.0,,,,,1,1,4,1,-9,1,-9,1,0,0,0,0,1,0,1,1,,,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,1.0,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,14,14,21,4,12,11,16,10,19,21,4,4,12,15,12,13,20,4,16,3,11,12,21,4,16,9,20,12,22,20,4,4,10,14,12,11,31,3,14,19,14,10,3,21,10,13,5,13,4,4,22,18,11,14,13,11,3,24,16,21,13,12,3,19,9,14,3,10,4,3,19,22,14,12,14,14,3,22,14,3
3,2012000002100,P,1,-9,3106,37,1024887,12,84,1,,,,1.0,2,2,1.0,1.0,,,2.0,,,2.0,,,2,2,1,2,2,2,2,4000.0,,,,2.0,3,2.0,2.0,1.0,2.0,-9.0,1954.0,1.0,4.0,,,,,,,,,,2.0,5.0,2.0,2.0,2.0,30000.0,0.0,0,12000.0,1.0,,22.0,0.0,2,0.0,14000.0,0.0,,3.0,,2.0,,,1,-9,939,-9,999,,1,,,6.0,1902.0,3501.0,1,1,,,,,,,,,,4.0,,1,,0,,,,4.0,0.0,60000.0,-9,21,501.0,,,,,2,1,4,1,-9,1,-9,1,0,0,0,0,1,0,1,0,2.0,2.0,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,13,15,13,4,22,10,13,4,12,4,3,23,21,13,14,10,12,4,24,24,12,12,12,3,20,11,15,3,14,4,4,24,24,17,14,14,13,4,23,4,13,14,12,21,3,13,14,18,13,17,23,3,3,14,15,15,12,21,4,4,14,13,14,20,3,13,13,26,12,20,22,3,4,14,12,13,15,20,4,19
4,2012000014584,P,1,-9,3106,37,1024887,14,80,1,,,,2.0,1,2,2.0,1.0,,2.0,2.0,,,2.0,,,1,2,1,2,2,2,2,0.0,,,,2.0,1,2.0,2.0,3.0,2.0,-9.0,1986.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,2.0,2.0,3.0,0.0,0.0,0,27100.0,1.0,,19.0,0.0,1,0.0,17500.0,0.0,,3.0,,2.0,,,1,-9,32,-9,999,,1,,,6.0,,,1,1,,,,,,,,,,1.0,,1,,0,,,,,0.0,44600.0,-9,26,501.0,,,,,1,1,4,1,-9,1,-9,1,0,0,0,0,1,0,1,0,,,,,,,,9.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,5,12,15,17,15,4,12,14,24,29,4,3,24,14,4,20,29,15,14,13,5,13,12,11,13,5,13,18,29,28,4,5,22,16,5,20,24,14,19,12,5,15,19,16,15,4,15,12,25,21,4,3,24,13,5,29,24,14,15,20,4,14,12,15,15,4,17,15,26,27,4,4,23,16,5,23,25,11,14,12


**Step 7:**  
Categorize the ACS household data into categories we can read

In [37]:
# Household categories: each (category, value) key maps to an expression over the
# ACS columns fetched in Step 3. The expression strings are passed to
# cat.categorize exactly as written.
h_category_exprs = {
    ("households", "total"): "B11001_001E",

    # NOTE(review): these come from `families_columns` (table B11001), so
    # "children" looks like a family / non-family split -- confirm the label.
    ("children", "yes"): "B11001_002E",
    ("children", "no"): "B11001_001E - B11001_002E",

    ("income", "lt35"): ("B19001_002E + B19001_003E + "
                         "B19001_004E + B19001_005E + "
                         "B19001_006E + B19001_007E"),
    ("income", "gt35-lt100"): ("B19001_008E + B19001_009E + "
                               "B19001_010E + B19001_011E + "
                               "B19001_012E+ B19001_013E"),
    ("income", "gt100"): ("B19001_014E + B19001_015E + "
                          "B19001_016E+ B19001_017E"),

    ("cars", "none"): "B08201_002E",
    ("cars", "one"): "B08201_003E",
    ("cars", "two or more"): "B08201_004E + B08201_005E + B08201_006E",

    ("workers", "none"): "B08202_002E",
    ("workers", "one"): "B08202_003E",
    ("workers", "two or more"): "B08202_004E + B08202_005E",

    # Aggregate travel time, i.e. not a count of households like the rows above.
    ("traveltime", "total"): "B08013_001E",
    ("traveltime", "male"): "B08013_002E",
    ("traveltime", "female"): "B08013_003E",
}

# One row per tract, indexed by the tract NAME.
h_acs_cat = cat.categorize(h_acs_tract, h_category_exprs, index_cols=['NAME'])
h_acs_cat

cat_name,cars,cars,cars,children,children,households,income,income,income,traveltime,traveltime,traveltime,workers,workers,workers
cat_value,none,one,two or more,no,yes,total,gt100,gt35-lt100,lt35,female,male,total,none,one,two or more
NAME,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
"Census Tract 1, Mecklenburg County, North Carolina",426,1327,1072,2266,559,2825,1476,867,482,22485,35055,57540,364,1695,766
"Census Tract 3, Mecklenburg County, North Carolina",155,206,120,410,71,481,152,93,236,2340,2355,4700,210,180,91
"Census Tract 4, Mecklenburg County, North Carolina",278,1014,472,1511,253,1764,650,794,320,18845,20525,39365,189,1087,488
"Census Tract 5, Mecklenburg County, North Carolina",441,1271,783,1891,604,2495,944,921,630,22745,36210,58950,422,1275,798
"Census Tract 6, Mecklenburg County, North Carolina",235,832,502,1081,488,1569,379,644,546,17065,19865,36930,241,872,456
"Census Tract 7, Mecklenburg County, North Carolina",16,137,206,182,177,359,38,244,77,7125,2590,9710,23,211,125
"Census Tract 8, Mecklenburg County, North Carolina",207,412,312,380,551,931,49,306,576,15880,10560,26440,222,481,228
"Census Tract 9, Mecklenburg County, North Carolina",116,292,307,424,291,715,79,269,367,12580,11865,24445,196,256,263
"Census Tract 10, Mecklenburg County, North Carolina",61,365,766,613,579,1192,586,369,237,12840,15130,27965,174,489,529
"Census Tract 11, Mecklenburg County, North Carolina",25,330,577,468,464,932,414,389,129,13625,17710,31340,58,385,489


In [36]:
# Sanity check on the household categories: for every tract, each category's
# values should add up to the same total (e.g. cars, children, income and
# workers all sum to the household total in the table above).
# The "traveltime" columns hold aggregate travel time rather than household
# counts, so including them made this assertion fail. They are left out of the
# check here; h_acs_cat itself is not modified.
assert np.all(
    cat.sum_accross_category(h_acs_cat.drop("traveltime", axis=1, level=0)) < 2
)

In [None]:
**Step 8:** 
Categorize the ACS person data into categories we can read

In [39]:
# Person categories: each (category, value) key maps to an expression over the
# ACS columns fetched in Step 4. In every age expression the first line lists the
# male columns (B01001_003E..025E) and the second the female columns
# (B01001_027E..049E) for the same age band.
p_category_exprs = {
    ("population", "total"): "B01001_001E",

    ("age", "19 and under"): (
        "B01001_003E + B01001_004E + B01001_005E + B01001_006E + B01001_007E + "
        "B01001_027E + B01001_028E + B01001_029E + B01001_030E + B01001_031E"
    ),
    ("age", "20 to 35"): (
        "B01001_008E + B01001_009E + B01001_010E + B01001_011E + B01001_012E + "
        "B01001_032E + B01001_033E + B01001_034E + B01001_035E + B01001_036E"
    ),
    ("age", "35 to 60"): (
        "B01001_013E + B01001_014E + B01001_015E + B01001_016E + B01001_017E + "
        "B01001_037E + B01001_038E + B01001_039E + B01001_040E + B01001_041E"
    ),
    ("age", "above 60"): (
        "B01001_018E + B01001_019E + B01001_020E + B01001_021E + "
        "B01001_022E + B01001_023E + B01001_024E + B01001_025E + "
        "B01001_042E + B01001_043E + B01001_044E + B01001_045E + "
        "B01001_046E + B01001_047E + B01001_048E + B01001_049E"
    ),

    ("race", "white"): "B02001_002E",
    ("race", "black"): "B02001_003E",
    ("race", "asian"): "B02001_005E",
    ("race", "other"): "B02001_004E + B02001_006E + B02001_007E + B02001_008E",

    ("sex", "male"): "B01001_002E",
    ("sex", "female"): "B01001_026E",
}

# One row per tract, indexed by the tract NAME.
p_acs_cat = cat.categorize(p_acs_tract, p_category_exprs, index_cols=['NAME'])
p_acs_cat

cat_name,age,age,age,age,population,race,race,race,race,sex,sex
cat_value,19 and under,20 to 35,35 to 60,above 60,total,asian,black,other,white,female,male
NAME,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
"Census Tract 1, Mecklenburg County, North Carolina",244,2713,1544,430,4931,289,501,329,3812,1989,2942
"Census Tract 3, Mecklenburg County, North Carolina",18,204,340,83,645,24,150,28,443,304,341
"Census Tract 4, Mecklenburg County, North Carolina",52,1751,661,176,2640,113,407,104,2016,1160,1480
"Census Tract 5, Mecklenburg County, North Carolina",834,2791,981,376,4982,149,1278,332,3223,2167,2815
"Census Tract 6, Mecklenburg County, North Carolina",409,1588,671,204,2872,9,1140,119,1604,1501,1371
"Census Tract 7, Mecklenburg County, North Carolina",120,262,344,87,813,12,406,7,388,526,287
"Census Tract 8, Mecklenburg County, North Carolina",1008,822,849,213,2892,16,1969,522,385,1551,1341
"Census Tract 9, Mecklenburg County, North Carolina",322,516,638,292,1768,33,1273,25,437,1035,733
"Census Tract 10, Mecklenburg County, North Carolina",463,901,942,257,2563,46,207,62,2248,1201,1362
"Census Tract 11, Mecklenburg County, North Carolina",437,685,983,164,2269,18,168,122,1961,1098,1171


In [41]:
# Sanity check on the person categories, mirroring the household check above.
# In the table above, age, race and sex each add up to the population total per tract.
# NOTE(review): the exact quantity compared against 2 is defined by
# cat.sum_accross_category -- confirm in synthpop.categorizer.
assert np.all(cat.sum_accross_category(p_acs_cat) < 2)

In [42]:
def age_cat(r):
    """Return the age category label for one PUMS person record.

    The bins have to match the ACS marginals defined in ``p_acs_cat``. Those
    are built from ACS table B01001 columns whose age bands split at
    30-34 | 35-39 and 55-59 | 60-61, so age 35 belongs to "35 to 60" and
    age 60 to "above 60". The earlier ``<= 35`` / ``<= 60`` tests put those
    two ages one bin too low, making the PUMS sample disagree with the marginals.

    Parameters
    ----------
    r : record with an ``AGEP`` attribute (age in years), e.g. a row of ``p_pums``.

    Returns
    -------
    str
        One of "19 and under", "20 to 35", "35 to 60", "above 60".
    """
    age = r.AGEP
    if age <= 19:
        return "19 and under"
    if age < 35:    # ages 20-34
        return "20 to 35"
    if age < 60:    # ages 35-59
        return "35 to 60"
    return "above 60"   # ages 60 and over

def race_cat(r):
    """Return the race category label for one PUMS person record.

    ``r.RAC1P`` codes 1, 2 and 6 map to "white", "black" and "asian";
    every other value is reported as "other".
    """
    for code, label in ((1, "white"), (2, "black"), (6, "asian")):
        if r.RAC1P == code:
            return label
    return "other"

def sex_cat(r):
    """Return "male" when ``r.SEX`` equals 1 and "female" for any other value."""
    return "male" if r.SEX == 1 else "female"

# Person-level joint distribution from the PUMS sample. Each record is labelled
# by the categoriser registered for "age", "race" and "sex"; the displayed result
# lists a cat_id and a frequency for every age/race/sex combination.
person_categorizers = {"age": age_cat, "race": race_cat, "sex": sex_cat}
person_category_combos = cat.category_combinations(p_acs_cat.columns)

# joint_distribution returns a pair; only the second element is kept.
_, jd_persons = cat.joint_distribution(
    p_pums,
    person_category_combos,
    person_categorizers,
)
jd_persons

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,cat_id,frequency
age,race,sex,Unnamed: 3_level_1,Unnamed: 4_level_1
19 and under,asian,female,0,67
19 and under,asian,male,1,55
19 and under,black,female,2,289
19 and under,black,male,3,273
19 and under,other,female,4,115
19 and under,other,male,5,102
19 and under,white,female,6,952
19 and under,white,male,7,984
20 to 35,asian,female,8,61
20 to 35,asian,male,9,68
