# Collect econ attributes for MA, NH, FL, and IL in 2016 and 2018 - Census Tracts

- Two time points: ACS 5-year estimates for 2016 and for 2018.
- Output six files: Boston metro (MA + NH), Florida, and Illinois, each for 2016 and 2018.
- Spatial unit: census tracts.


In [1]:
import pandas as pd
import censusdata # to download ACS data
import copy
import numpy as np
import matplotlib.pyplot as plt
import pickle


Bad key text.latex.unicode in file /home/jtl/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 112 ('text.latex.unicode : False # use "ucs" and "inputenc" LaTeX packages for handling')
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.5.1/matplotlibrc.template
or from the matplotlib source distribution

Bad key text.latex.preview in file /home/jtl/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 125 ('text.latex.preview : False')
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.5.1/matplotlibrc.template
or from the matplotlib source distribution

Bad key mathtext.fallback_to_cm in file /home/jtl/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 157 ('mathtext.fallback_to_cm : True  # When True, use symbols from the Computer Modern')


## Check Information & Setup

In [2]:
# Look up the 2016 ACS 5-year table B01002 (median age by sex) and
# pretty-print its variable codes, labels, and types.
median_age_table = censusdata.censustable('acs5', 2016, 'B01002')
censusdata.printtable(median_age_table)

Variable     | Table                          | Label                                                    | Type 
-------------------------------------------------------------------------------------------------------------------
B01002_001E  | MEDIAN AGE BY SEX              | !! !! Estimate Median age Total                          | float
B01002_002E  | MEDIAN AGE BY SEX              | !! !! Estimate Median age Male                           | float
B01002_003E  | MEDIAN AGE BY SEX              | !! !! Estimate Median age Female                         | float
-------------------------------------------------------------------------------------------------------------------


In [3]:
# Define the ACS variables to download and the column name given to each.
# Covers population, sex, age, households, race, income, travel, education,
# employment, and properties.
# In 2009, the rows of B15003, B23035, B25031, B25111 did not exist. These
# rows exist in 2016 and 2018.
# NOTE(review): "B23035" is kept as written in the original note; the
# employment table actually requested below is B23025 -- confirm which was meant.
#
# Each entry pairs an ACS variable code with its output column name, so the
# two lists derived at the bottom cannot drift out of step.
_ACS_VARIABLES = (
    # population
    ('B01003_001E', 'pop_total'),
    ('B01001_001E', 'sex_total'),
    ('B01001_002E', 'sex_male'),
    ('B01001_026E', 'sex_female'),
    ('B01002_001E', 'age_median'),
    # households
    ('B11001_001E', 'households'),
    # race
    ('B02001_001E', 'race_total'),
    ('B02001_002E', 'race_white'),
    ('B02001_003E', 'race_black'),
    ('B02001_004E', 'race_native'),
    ('B02001_005E', 'race_asian'),
    # income info (a lot of NAs)
    # NOTE(review): 'inc_pop_1k_15k' looks like a typo for a 10k-15k bracket
    # (the neighbouring names run 10k, 15k_25k, 25k_35k, ...). The name is
    # kept because it ends up as a column of the saved tables -- confirm the
    # bracket bounds against the B06010 table shell before renaming.
    ('B06010_001E', 'inc_total_pop'),
    ('B06010_002E', 'inc_no_pop'),
    ('B06010_003E', 'inc_with_pop'),
    ('B06010_004E', 'inc_pop_10k'),
    ('B06010_005E', 'inc_pop_1k_15k'),
    ('B06010_006E', 'inc_pop_15k_25k'),
    ('B06010_007E', 'inc_pop_25k_35k'),
    ('B06010_008E', 'inc_pop_35k_50k'),
    ('B06010_009E', 'inc_pop_50k_65k'),
    ('B06010_010E', 'inc_pop_65k_75k'),
    ('B06010_011E', 'inc_pop_75k'),
    ('B06011_001E', 'inc_median_ind'),
    # travel
    # (An earlier draft used B08101_001E/009E/017E/025E/033E/041E/049E and
    # B08015_001E 'vehicle_total'; those are deliberately not downloaded.)
    ('B08301_001E', 'travel_total_to_work'),
    ('B08301_002E', 'travel_driving_to_work'),
    ('B08301_010E', 'travel_pt_to_work'),
    ('B08301_016E', 'travel_taxi_to_work'),
    ('B08301_018E', 'travel_cycle_to_work'),
    ('B08301_019E', 'travel_walk_to_work'),
    ('B08301_021E', 'travel_work_from_home'),
    # education
    ('B15001_001E', 'edu_total_pop'),
    ('B15001_017E', 'bachelor_male_25_34'),
    ('B15001_018E', 'master_phd_male_25_34'),
    ('B15001_025E', 'bachelor_male_35_44'),
    ('B15001_026E', 'master_phd_male_35_44'),
    ('B15001_033E', 'bachelor_male_45_64'),
    ('B15001_034E', 'master_phd_male_45_64'),
    ('B15001_041E', 'bachelor_male_65_over'),
    ('B15001_042E', 'master_phd_male_65_over'),
    ('B15001_058E', 'bachelor_female_25_34'),
    ('B15001_059E', 'master_phd_female_25_34'),
    ('B15001_066E', 'bachelor_female_35_44'),
    ('B15001_067E', 'master_phd_female_35_44'),
    ('B15001_074E', 'bachelor_female_45_64'),
    ('B15001_075E', 'master_phd_female_45_64'),
    ('B15001_082E', 'bachelor_female_65_over'),
    ('B15001_083E', 'master_phd_female_65_over'),
    ('B15003_001E', 'edu_total'),
    ('B15003_022E', 'edu_bachelor'),
    ('B15003_023E', 'edu_master'),
    ('B15003_025E', 'edu_phd'),
    # income info (more complete)
    ('B19013_001E', 'inc_median_household'),
    ('B19301_001E', 'inc_per_capita'),
    # employment
    # NOTE(review): in the B23025 table shell, _002E appears to be "In labor
    # force" and _007E "Not in labor force", not employed / unemployed counts.
    # Names are kept as-is for the saved tables -- confirm before relying on them.
    ('B23025_001E', 'employment_total_labor'),
    ('B23025_002E', 'employment_employed'),
    ('B23025_007E', 'employment_unemployed'),
    # properties
    # (An earlier draft also listed B25031_002E-007E, rent by 0-5 bedrooms,
    # and B25111_002E-011E, rent by year built 2014 back to 1930; those are
    # deliberately not downloaded.)
    ('B25002_001E', 'housing_units_total'),
    ('B25002_002E', 'housing_units_occupied'),
    ('B25002_003E', 'housing_units_vacant'),
    ('B25064_001E', 'rent_median'),
    ('B25075_001E', 'property_value_total'),
    ('B25077_001E', 'property_value_median'),
    # imputation
    ('B99082_001E', 'vehicle_total_imputed'),
)

# ACS variable codes, in download order.
var_list = [acs_code for acs_code, _ in _ACS_VARIABLES]

# Readable column names, positionally aligned with var_list.
var_names = [column_name for _, column_name in _ACS_VARIABLES]


## Download data (MA, NH, FL, IL; 2016 and 2018)

In [4]:
# Three-digit county code strings, one list per state (names suggest MA and NH).
# Neither list is referenced by the code visible in this notebook export.
ma_county_list = '003 001 023 021 019 025 007 017 005 013 009 011 027 015'.split()
nh_county_list = '013 001 017 005 009 015 019 011 007 003'.split()

In [4]:
# List every census-tract geography in state 33 (New Hampshire) for the 2016
# ACS 5-year release; the resulting mapping is the cell's displayed value.
nh_all_tracts = censusdata.censusgeo([('state', '33'), ('tract', '*')])
censusdata.geographies(nh_all_tracts, 'acs5', 2016)


{'Census Tract 1.02, Hillsborough County, New Hampshire': censusgeo((('state', '33'), ('county', '011'), ('tract', '000102'))),
 'Census Tract 14, Hillsborough County, New Hampshire': censusgeo((('state', '33'), ('county', '011'), ('tract', '001400'))),
 'Census Tract 106, Hillsborough County, New Hampshire': censusgeo((('state', '33'), ('county', '011'), ('tract', '010600'))),
 'Census Tract 114.02, Hillsborough County, New Hampshire': censusgeo((('state', '33'), ('county', '011'), ('tract', '011402'))),
 'Census Tract 121, Hillsborough County, New Hampshire': censusgeo((('state', '33'), ('county', '011'), ('tract', '012100'))),
 'Census Tract 131, Hillsborough County, New Hampshire': censusgeo((('state', '33'), ('county', '011'), ('tract', '013100'))),
 'Census Tract 142.01, Hillsborough County, New Hampshire': censusgeo((('state', '33'), ('county', '011'), ('tract', '014201'))),
 'Census Tract 161, Hillsborough County, New Hampshire': censusgeo((('state', '33'), ('county', '011'), (

In [5]:
# Test variable availability on every tract in state 25 (Massachusetts) for
# 2016 before downloading all the tables. No county filter is applied.
test_df = censusdata.download(
    'acs5', 2016,
    censusdata.censusgeo([('state', '25'), ('tract', '*')]),
    var_list,
)
test_df.columns = var_names

# Print the NA count of every column, with the row limit raised so that no
# column is elided from the output.
# DataFrame.isna().sum() is used rather than np.sum(DataFrame): NumPy forwards
# axis=None to DataFrame.sum, which newer pandas deprecates in favour of a
# single scalar over the whole frame, so the per-column counts would be lost.
pd.set_option('display.max_rows', 500)
print(test_df.isna().sum())
# Back to a short display so later frame previews stay compact.
pd.set_option('display.max_rows', 10)

pop_total                    0
sex_total                    0
sex_male                     0
sex_female                   0
age_median                   0
households                   0
race_total                   0
race_white                   0
race_black                   0
race_native                  0
race_asian                   0
inc_total_pop                0
inc_no_pop                   0
inc_with_pop                 0
inc_pop_10k                  0
inc_pop_1k_15k               0
inc_pop_15k_25k              0
inc_pop_25k_35k              0
inc_pop_35k_50k              0
inc_pop_50k_65k              0
inc_pop_65k_75k              0
inc_pop_75k                  0
inc_median_ind               0
travel_total_to_work         0
travel_driving_to_work       0
travel_pt_to_work            0
travel_taxi_to_work          0
travel_cycle_to_work         0
travel_walk_to_work          0
travel_work_from_home        0
edu_total_pop                0
bachelor_male_25_34          0
master_p

In [6]:
# Download the tract-level tables for Massachusetts (state 25) and
# New Hampshire (state 33) from the 2016 and 2018 ACS 5-year releases.
ma_tracts = censusdata.censusgeo([('state', '25'), ('tract', '*')])
nh_tracts = censusdata.censusgeo([('state', '33'), ('tract', '*')])

ma_ct_2016 = censusdata.download('acs5', 2016, ma_tracts, var_list)
ma_ct_2018 = censusdata.download('acs5', 2018, ma_tracts, var_list)
nh_ct_2016 = censusdata.download('acs5', 2016, nh_tracts, var_list)
nh_ct_2018 = censusdata.download('acs5', 2018, nh_tracts, var_list)


In [7]:
# Download the tract-level tables for Florida (state 12) and Illinois
# (state 17) from the 2016 and 2018 ACS 5-year releases.
fl_tracts = censusdata.censusgeo([('state', '12'), ('tract', '*')])
il_tracts = censusdata.censusgeo([('state', '17'), ('tract', '*')])

fl_ct_2016 = censusdata.download('acs5', 2016, fl_tracts, var_list)
fl_ct_2018 = censusdata.download('acs5', 2018, fl_tracts, var_list)
il_ct_2016 = censusdata.download('acs5', 2016, il_tracts, var_list)
il_ct_2018 = censusdata.download('acs5', 2018, il_tracts, var_list)


In [9]:
# Label every downloaded table in place: swap the ACS codes for the readable
# column names, then tag each row with its ACS year and state abbreviation.
labelled_downloads = [
    (ma_ct_2016, 2016, 'MA'),
    (ma_ct_2018, 2018, 'MA'),
    (nh_ct_2016, 2016, 'NH'),
    (nh_ct_2018, 2018, 'NH'),
    (fl_ct_2016, 2016, 'FL'),
    (fl_ct_2018, 2018, 'FL'),
    (il_ct_2016, 2016, 'IL'),
    (il_ct_2018, 2018, 'IL'),
]

for tract_table, acs_year, state_abbr in labelled_downloads:
    # Column names must be set first so 'year' and 'state' are appended after them.
    tract_table.columns = var_names
    tract_table['year'] = acs_year
    tract_table['state'] = state_abbr


In [11]:
# Stack the MA and NH tract tables row-wise into one Boston-metro table per
# year. The original note gives coverage of the mobility data's area as the
# reason (presumably neither state spans it alone).
# The FL and IL tables are used as-is, so nothing is concatenated for them.
boston_metro_ct_2016_df = pd.concat([ma_ct_2016, nh_ct_2016])
boston_metro_ct_2018_df = pd.concat([ma_ct_2018, nh_ct_2018])


In [12]:
# Add the state / county / tract FIPS columns, then replace the geography index with a plain integer index.
def add_fips(df_state):
    """Add FIPS code columns read from the geography index, then reset the index.

    Every index entry must provide a ``params()`` method returning a sequence
    of ``(level, code)`` pairs whose first three entries are, in order, the
    state, county, and tract, e.g.
    ``(('state', '33'), ('county', '011'), ('tract', '000102'))``.

    Parameters
    ----------
    df_state : pandas.DataFrame
        Table indexed by geography objects. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df_state``, with the columns ``state_fips``,
        ``county_fips``, ``tract_fips`` and ``full_ct_fips`` (the three codes
        concatenated in that order) appended, and the geography index dropped
        in favour of a default integer index.
    """
    state_fips = []
    county_fips = []
    tract_fips = []
    full_ct_fips = []

    for geo in df_state.index:
        # Call params() once per row and reuse the result for all four columns.
        levels = geo.params()
        state_code = levels[0][1]
        county_code = levels[1][1]
        tract_code = levels[2][1]

        state_fips.append(state_code)
        county_fips.append(county_code)
        tract_fips.append(tract_code)
        full_ct_fips.append(state_code + county_code + tract_code)

    df_state['state_fips'] = state_fips
    df_state['county_fips'] = county_fips
    df_state['tract_fips'] = tract_fips
    df_state['full_ct_fips'] = full_ct_fips

    # The geography objects are no longer needed once the codes are columns.
    df_state.reset_index(drop=True, inplace=True)
    return df_state

# Apply add_fips to the 2016 and 2018 table of each region. add_fips edits its
# argument in place and returns it, so each "_v1" name refers to the same
# object as the table passed in.
boston_metro_ct_2016_df_v1, boston_metro_ct_2018_df_v1 = map(
    add_fips, (boston_metro_ct_2016_df, boston_metro_ct_2018_df)
)
fl_ct_2016_df_v1, fl_ct_2018_df_v1 = map(add_fips, (fl_ct_2016, fl_ct_2018))
il_ct_2016_df_v1, il_ct_2018_df_v1 = map(add_fips, (il_ct_2016, il_ct_2018))


In [13]:
# Display the Boston-metro 2016 table (MA + NH tracts) to spot-check the added FIPS columns.
boston_metro_ct_2016_df_v1

Unnamed: 0,pop_total,sex_total,sex_male,sex_female,age_median,households,race_total,race_white,race_black,race_native,...,rent_median,property_value_total,property_value_median,vehicle_total_imputed,year,state,state_fips,county_fips,tract_fips,full_ct_fips
0,6264,6264,3009,3255,32.0,1972,6264,2827,2542,8,...,1244,1257,273100,2786,2016,MA,25,023,510700,25023510700
1,4905,4905,2394,2511,39.2,1781,4905,2021,2264,0,...,505,948,188000,2180,2016,MA,25,023,511200,25023511200
2,4504,4504,2333,2171,47.2,1563,4504,4181,35,0,...,1563,1402,597600,2292,2016,MA,25,009,253203,25009253203
3,5287,5287,2689,2598,41.6,2157,5287,4058,38,0,...,958,1581,500200,2963,2016,MA,25,017,388100,25017388100
4,5125,5125,2555,2570,49.6,1762,5125,4390,33,9,...,2085,1694,757900,2447,2016,MA,25,017,362100,25017362100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1768,3276,3276,1757,1519,31.7,1716,3276,2095,116,12,...,1239,257,153200,1860,2016,NH,33,011,011102,33011011102
1769,3549,3549,1588,1961,31.8,1749,3549,2593,107,3,...,1216,330,122900,2368,2016,NH,33,011,011101,33011011101
1770,2078,2078,1027,1051,44.3,852,2078,2054,0,0,...,729,600,121700,1026,2016,NH,33,011,018501,33011018501
1771,1420,1420,706,714,46.2,559,1420,1365,0,0,...,1861,513,283800,785,2016,NH,33,011,018502,33011018502


In [14]:
# Display the Florida 2016 table to spot-check the added FIPS columns.
fl_ct_2016_df_v1

Unnamed: 0,pop_total,sex_total,sex_male,sex_female,age_median,households,race_total,race_white,race_black,race_native,...,rent_median,property_value_total,property_value_median,vehicle_total_imputed,year,state,state_fips,county_fips,tract_fips,full_ct_fips
0,6082,6082,3284,2798,49.3,2106,6082,5586,332,28,...,1108,1503,158600,2726,2016,FL,12,071,010303,12071010303
1,7379,7379,3212,4167,46.3,2365,7379,5841,825,7,...,914,1400,132400,2694,2016,FL,12,071,010307,12071010307
2,9873,9873,4798,5075,35.6,3035,9873,8954,422,15,...,1185,1851,165100,4013,2016,FL,12,071,010103,12071010103
3,2583,2583,1168,1415,58.1,1504,2583,2540,4,0,...,916,846,114400,1202,2016,FL,12,071,001701,12071001701
4,4566,4566,2209,2357,41.8,1921,4566,4263,9,0,...,1117,1082,237200,2320,2016,FL,12,071,001801,12071001801
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4240,2441,2441,1205,1236,58.3,1305,2441,2272,26,0,...,1148,912,405600,1131,2016,FL,12,099,000511,12099000511
4241,3965,3965,1902,2063,50.3,1704,3965,3620,169,0,...,1453,1260,435500,2001,2016,FL,12,099,000703,12099000703
4242,3391,3391,1455,1936,59.4,1859,3391,3118,53,0,...,1638,1123,240500,1395,2016,FL,12,099,000803,12099000803
4243,3015,3015,1359,1656,47.5,1347,3015,2779,70,0,...,1337,980,199500,1855,2016,FL,12,099,000804,12099000804


In [15]:
# Check the NA count of every column in the Boston-metro 2018 table (the only
# table checked in this cell). The row limit is raised so no column is elided.
# DataFrame.isna().sum() is used rather than np.sum(DataFrame): NumPy forwards
# axis=None to DataFrame.sum, which newer pandas deprecates in favour of a
# single scalar over the whole frame, so the per-column counts would be lost.
pd.set_option('display.max_rows', 500)
print(boston_metro_ct_2018_df_v1.isna().sum())
# Back to a short display so later frame previews stay compact.
pd.set_option('display.max_rows', 10)

pop_total                    0
sex_total                    0
sex_male                     0
sex_female                   0
age_median                   0
households                   0
race_total                   0
race_white                   0
race_black                   0
race_native                  0
race_asian                   0
inc_total_pop                0
inc_no_pop                   0
inc_with_pop                 0
inc_pop_10k                  0
inc_pop_1k_15k               0
inc_pop_15k_25k              0
inc_pop_25k_35k              0
inc_pop_35k_50k              0
inc_pop_50k_65k              0
inc_pop_65k_75k              0
inc_pop_75k                  0
inc_median_ind               0
travel_total_to_work         0
travel_driving_to_work       0
travel_pt_to_work            0
travel_taxi_to_work          0
travel_cycle_to_work         0
travel_walk_to_work          0
travel_work_from_home        0
edu_total_pop                0
bachelor_male_25_34          0
master_p

# Save

In [13]:
# Pickle the two Boston-metro tract tables into the raw-data folder.
boston_metro_outputs = (
    ('../../data/01_raw/boston_metro_ct_2016_df.pickle', boston_metro_ct_2016_df_v1),
    ('../../data/01_raw/boston_metro_ct_2018_df.pickle', boston_metro_ct_2018_df_v1),
)
for pickle_path, tract_table in boston_metro_outputs:
    with open(pickle_path, 'wb') as f:
        pickle.dump(tract_table, f)

In [16]:
# Pickle the two Florida tract tables into the raw-data folder.
florida_outputs = (
    ('../../data/01_raw/florida_ct_2016_df.pickle', fl_ct_2016_df_v1),
    ('../../data/01_raw/florida_ct_2018_df.pickle', fl_ct_2018_df_v1),
)
for pickle_path, tract_table in florida_outputs:
    with open(pickle_path, 'wb') as f:
        pickle.dump(tract_table, f)

In [17]:
# Pickle the two Illinois tract tables into the raw-data folder.
illinois_outputs = (
    ('../../data/01_raw/illinois_ct_2016_df.pickle', il_ct_2016_df_v1),
    ('../../data/01_raw/illinois_ct_2018_df.pickle', il_ct_2018_df_v1),
)
for pickle_path, tract_table in illinois_outputs:
    with open(pickle_path, 'wb') as f:
        pickle.dump(tract_table, f)