# Collect econ attributes at two years (2016 and 2018)


- Download the socio-demographics of MA and NH census block groups.
- Two time points: ACS 2016 5y and ACS 2018 5y.
- Save two raw data files in the 01_raw data folder.
- Hard to get earlier data: variables and spatial indicators do not align.

In [3]:
import pandas as pd
import censusdata # to download ACS data
import copy
import numpy as np
import matplotlib.pyplot as plt
import pickle

## Check Information & Setup

In [4]:
# Check the ACS lookup table here:
# https://www.census.gov/programs-surveys/acs/technical-documentation/summary-file-documentation.2015.html
# https://jtleider.github.io/censusdata/

In [5]:
# 
censusdata.search('acs1', 2015, 'label', 'rent') # check acs1, acs3, acs5, 2010-2020

[('B05001PR_004E',
  'NATIVITY AND CITIZENSHIP STATUS IN PUERTO RICO',
  'Estimate!!Total!!U.S. citizen, born abroad of American parent(s)'),
 ('B05001_004E',
  'NATIVITY AND CITIZENSHIP STATUS IN THE UNITED STATES',
  'Estimate!!Total!!U.S. citizen, born abroad of American parent(s)'),
 ('B05002_012E',
  'PLACE OF BIRTH BY NATIVITY AND CITIZENSHIP STATUS',
  'Estimate!!Total!!Native!!Born outside the United States!!Born abroad of American parent(s)'),
 ('B05009_003E',
  'AGE AND NATIVITY OF OWN CHILDREN UNDER 18 YEARS IN FAMILIES AND SUBFAMILIES BY NUMBER AND NATIVITY OF PARENTS',
  'Estimate!!Total!!Under 6 years!!Living with two parents'),
 ('B05009_004E',
  'AGE AND NATIVITY OF OWN CHILDREN UNDER 18 YEARS IN FAMILIES AND SUBFAMILIES BY NUMBER AND NATIVITY OF PARENTS',
  'Estimate!!Total!!Under 6 years!!Living with two parents!!Child is native'),
 ('B05009_005E',
  'AGE AND NATIVITY OF OWN CHILDREN UNDER 18 YEARS IN FAMILIES AND SUBFAMILIES BY NUMBER AND NATIVITY OF PARENTS',
  'Est

In [6]:
# 
censusdata.search('acs1', 2019, 'label', 'rent') # check acs1, acs3, acs5, 2010-2020

[('B05001PR_004E',
  'NATIVITY AND CITIZENSHIP STATUS IN PUERTO RICO',
  'Estimate!!Total:!!U.S. citizen, born abroad of American parent(s)'),
 ('B05001_004E',
  'NATIVITY AND CITIZENSHIP STATUS IN THE UNITED STATES',
  'Estimate!!Total:!!U.S. citizen, born abroad of American parent(s)'),
 ('B05002_012E',
  'PLACE OF BIRTH BY NATIVITY AND CITIZENSHIP STATUS',
  'Estimate!!Total:!!Native:!!Born outside the United States:!!Born abroad of American parent(s)'),
 ('B05009_003E',
  'AGE AND NATIVITY OF OWN CHILDREN UNDER 18 YEARS IN FAMILIES AND SUBFAMILIES BY NUMBER AND NATIVITY OF PARENTS',
  'Estimate!!Total:!!Under 6 years:!!Living with two parents:'),
 ('B05009_004E',
  'AGE AND NATIVITY OF OWN CHILDREN UNDER 18 YEARS IN FAMILIES AND SUBFAMILIES BY NUMBER AND NATIVITY OF PARENTS',
  'Estimate!!Total:!!Under 6 years:!!Living with two parents:!!Child is native'),
 ('B05009_005E',
  'AGE AND NATIVITY OF OWN CHILDREN UNDER 18 YEARS IN FAMILIES AND SUBFAMILIES BY NUMBER AND NATIVITY OF PAREN

In [7]:
censusdata.printtable(censusdata.censustable('acs5', 2016, 'B01002'))

Variable     | Table                          | Label                                                    | Type 
-------------------------------------------------------------------------------------------------------------------
B01002_001E  | MEDIAN AGE BY SEX              | !! !! Estimate Median age Total                          | float
B01002_002E  | MEDIAN AGE BY SEX              | !! !! Estimate Median age Male                           | float
B01002_003E  | MEDIAN AGE BY SEX              | !! !! Estimate Median age Female                         | float
-------------------------------------------------------------------------------------------------------------------


In [8]:
# define variable list
# Include population, sex, age, race, income, travel, education, and properties.
# in 2009, the rows of B15003, B23035, B25031, B25111 did not exist. These rows exist in 2016 and 2018.

var_list = [
# population        
            'B01003_001E', 
            'B01001_001E', 'B01001_002E', 'B01001_026E', 
            'B01002_001E',
# households
            'B11001_001E',
# race
            'B02001_001E', 'B02001_002E', 'B02001_003E', 'B02001_004E', 'B02001_005E', 
# income info (a lot of NAs)    
            'B06010_001E', 'B06010_002E', 'B06010_003E', 'B06010_004E', 'B06010_005E', 'B06010_006E', 'B06010_007E', 'B06010_008E', 'B06010_009E', 'B06010_010E', 'B06010_011E',
            'B06011_001E',      
# travel
#             'B08101_001E', 'B08101_009E', 'B08101_017E', 'B08101_025E', 'B08101_033E', 'B08101_041E', 'B08101_049E', 
#             'B08015_001E', 
            'B08301_001E', 'B08301_002E', 'B08301_010E', 'B08301_016E', 'B08301_018E', 'B08301_019E', 'B08301_021E', 
# education                         
            'B15001_001E',
            'B15001_017E', 'B15001_018E', 'B15001_025E', 'B15001_026E', 'B15001_033E', 'B15001_034E', 'B15001_041E', 'B15001_042E', 
            'B15001_058E', 'B15001_059E', 'B15001_066E', 'B15001_067E', 'B15001_074E', 'B15001_075E', 'B15001_082E', 'B15001_083E',
            'B15003_001E', 'B15003_022E', 'B15003_023E', 'B15003_025E', 
# income info (more complete)                       
            'B19013_001E', 'B19301_001E', 
# employement
            'B23025_001E', 'B23025_002E', 'B23025_007E', 
# properties            
            'B25002_001E', 'B25002_002E', 'B25002_003E',             
            'B25064_001E', 
#             'B25031_002E', 'B25031_003E', 'B25031_004E', 'B25031_005E', 'B25031_006E', 'B25031_007E', 
#             'B25111_002E', 'B25111_003E', 'B25111_004E', 'B25111_005E', 'B25111_006E', 'B25111_007E', 'B25111_008E', 'B25111_009E', 'B25111_010E', 'B25111_011E',
            'B25075_001E', 'B25077_001E',
# imputation
            'B99082_001E'
           ]

var_names = [
# population
             'pop_total', 
             'sex_total', 'sex_male', 'sex_female',
             'age_median',
# hosueholds
             'households',
# race             
             'race_total', 'race_white', 'race_black', 'race_native', 'race_asian',
# income info (a lot of NAs)
             'inc_total_pop', 'inc_no_pop', 'inc_with_pop', 'inc_pop_10k', 'inc_pop_1k_15k', 'inc_pop_15k_25k', 'inc_pop_25k_35k', 'inc_pop_35k_50k', 'inc_pop_50k_65k', 'inc_pop_65k_75k', 'inc_pop_75k',
             'inc_median_ind', 
# travel
#              'travel_total_to_work', 'travel_single_driving_to_work', 'travel_carpool_to_work', 'travel_public_transit_to_work', 'travel_walking_to_work', 'travel_cycling_to_work', 'travel_work_from_home',  
#              'vehicle_total', 
             'travel_total_to_work', 'travel_driving_to_work', 'travel_pt_to_work', 'travel_taxi_to_work', 'travel_cycle_to_work', 'travel_walk_to_work', 'travel_work_from_home', 
# education             
             'edu_total_pop', 
             'bachelor_male_25_34', 'master_phd_male_25_34', 'bachelor_male_35_44', 'master_phd_male_35_44', 'bachelor_male_45_64', 'master_phd_male_45_64',  'bachelor_male_65_over', 'master_phd_male_65_over',
             'bachelor_female_25_34', 'master_phd_female_25_34', 'bachelor_female_35_44', 'master_phd_female_35_44', 'bachelor_female_45_64', 'master_phd_female_45_64',  'bachelor_female_65_over', 'master_phd_female_65_over',
             'edu_total', 'edu_bachelor', 'edu_master', 'edu_phd',  
# income info (more complete)                       
             'inc_median_household', 'inc_per_capita', 
# employement
            'employment_total_labor', 'employment_employed', 'employment_unemployed', 
# properties                                       
             'housing_units_total', 'housing_units_occupied', 'housing_units_vacant',
             'rent_median', 
#              'rent_0_bedroom', 'rent_1_bedroom', 'rent_2_bedroom', 'rent_3_bedroom', 'rent_4_bedroom', 'rent_5_bedroom', 
#              'rent_built_2014', 'rent_built_2010', 'rent_built_2000', 'rent_built_1990', 'rent_built_1980', 'rent_built_1970', 'rent_built_1960', 'rent_built_1950', 'rent_built_1940', 'rent_built_1930',
             'property_value_total', 'property_value_median', 
# imputation
            'vehicle_total_imputed'
            ]


## Download data (MA and NH; 2016 and 2018)

In [12]:
ma_county_list = ['003', '001', '023', '021', '019', '025', '007', '017', '005', '013', '009', '011', '027', '015']
nh_county_list = ['013', '001', '017', '005', '009', '015', '019', '011', '007', '003']

In [13]:
censusdata.geographies(censusdata.censusgeo([('state', '33'), ('county', '*')]), 'acs5', 2016)

{'Merrimack County, New Hampshire': censusgeo((('state', '33'), ('county', '013'))),
 'Belknap County, New Hampshire': censusgeo((('state', '33'), ('county', '001'))),
 'Strafford County, New Hampshire': censusgeo((('state', '33'), ('county', '017'))),
 'Cheshire County, New Hampshire': censusgeo((('state', '33'), ('county', '005'))),
 'Grafton County, New Hampshire': censusgeo((('state', '33'), ('county', '009'))),
 'Rockingham County, New Hampshire': censusgeo((('state', '33'), ('county', '015'))),
 'Sullivan County, New Hampshire': censusgeo((('state', '33'), ('county', '019'))),
 'Hillsborough County, New Hampshire': censusgeo((('state', '33'), ('county', '011'))),
 'Coos County, New Hampshire': censusgeo((('state', '33'), ('county', '007'))),
 'Carroll County, New Hampshire': censusgeo((('state', '33'), ('county', '003')))}

In [14]:
# test the variable availability using county = 001, before downloading all the variables.
test_df = censusdata.download('acs5', 2016, censusdata.censusgeo([('state', '25'), ('county', '001'), ('block group', '*')]),
                                            var_list)
test_df.columns = var_names
pd.set_option('display.max_rows', 500)
print(np.sum(test_df.isna()))
pd.set_option('display.max_rows', 10)


pop_total                      0
sex_total                      0
sex_male                       0
sex_female                     0
age_median                     0
households                     0
race_total                     0
race_white                     0
race_black                     0
race_native                    0
race_asian                     0
inc_total_pop                196
inc_no_pop                   196
inc_with_pop                 196
inc_pop_10k                  196
inc_pop_1k_15k               196
inc_pop_15k_25k              196
inc_pop_25k_35k              196
inc_pop_35k_50k              196
inc_pop_50k_65k              196
inc_pop_65k_75k              196
inc_pop_75k                  196
inc_median_ind               196
travel_total_to_work           0
travel_driving_to_work         0
travel_pt_to_work              0
travel_taxi_to_work            0
travel_cycle_to_work           0
travel_walk_to_work            0
travel_work_from_home          0
edu_total_

In [15]:
# download ma_bg_2016, ma_bg_2018, nh_bg_2016, nh_bg_2018.
ma_bg_2016_dic = {}
ma_bg_2018_dic = {}
nh_bg_2016_dic = {}
nh_bg_2018_dic = {}

for ma_county_idx in ma_county_list:
    # 25: MA 
    ma_bg_2016_dic[ma_county_idx] = censusdata.download('acs5', 2016,
                                                        censusdata.censusgeo([('state', '25'), ('county', ma_county_idx), ('block group', '*')]),
                                                        var_list) 
    ma_bg_2018_dic[ma_county_idx] = censusdata.download('acs5', 2018,
                                                        censusdata.censusgeo([('state', '25'), ('county', ma_county_idx), ('block group', '*')]),
                                                        var_list)

for nh_county_idx in nh_county_list:
    # 33: NH 
    nh_bg_2016_dic[nh_county_idx] = censusdata.download('acs5', 2016,
                                                        censusdata.censusgeo([('state', '33'), ('county', nh_county_idx), ('block group', '*')]),
                                                        var_list)
    nh_bg_2018_dic[nh_county_idx] = censusdata.download('acs5', 2018,
                                                        censusdata.censusgeo([('state', '33'), ('county', nh_county_idx), ('block group', '*')]),
                                                        var_list)


In [16]:
# process 2016 and 2018
# concat the files. 
ma_bg_2016_df = pd.concat(ma_bg_2016_dic.values())
ma_bg_2018_df = pd.concat(ma_bg_2018_dic.values())
nh_bg_2016_df = pd.concat(nh_bg_2016_dic.values())
nh_bg_2018_df = pd.concat(nh_bg_2018_dic.values())

# replace the column names 
ma_bg_2016_df.columns = var_names
ma_bg_2018_df.columns = var_names
nh_bg_2016_df.columns = var_names
nh_bg_2018_df.columns = var_names

# add year
ma_bg_2016_df['year'] = 2016
ma_bg_2018_df['year'] = 2018
nh_bg_2016_df['year'] = 2016
nh_bg_2018_df['year'] = 2018

# add state
ma_bg_2016_df['state'] = 'MA'
ma_bg_2018_df['state'] = 'MA'
nh_bg_2016_df['state'] = 'NH'
nh_bg_2018_df['state'] = 'NH'

# concat the four files into two files.
boston_metro_bg_2016_df = pd.concat([ma_bg_2016_df, nh_bg_2016_df], axis = 0)
boston_metro_bg_2018_df = pd.concat([ma_bg_2018_df, nh_bg_2018_df], axis = 0)


In [17]:
# add the FIPS info. Change the idx.
# boston_metro_bg_year_df = boston_metro_bg_2016_df
# boston_metro_bg_year_df = boston_metro_bg_2018_df

def add_fips(boston_metro_bg_year_df):
    state_fips=[]
    county_fips=[]
    tract_fips=[]
    bg_fips=[]
    full_bg_fips=[]

    for i in range(boston_metro_bg_year_df.shape[0]):
        state_fips.append(boston_metro_bg_year_df.index[i].params()[0][1])
        county_fips.append(boston_metro_bg_year_df.index[i].params()[1][1])
        tract_fips.append(boston_metro_bg_year_df.index[i].params()[2][1])
        bg_fips.append(boston_metro_bg_year_df.index[i].params()[3][1])
        full_bg_fips.append(boston_metro_bg_year_df.index[i].params()[0][1]+boston_metro_bg_year_df.index[i].params()[1][1]
                               +boston_metro_bg_year_df.index[i].params()[2][1]+boston_metro_bg_year_df.index[i].params()[3][1])

    boston_metro_bg_year_df['state_fips']=state_fips
    boston_metro_bg_year_df['county_fips']=county_fips
    boston_metro_bg_year_df['tract_fips']=tract_fips
    boston_metro_bg_year_df['bg_fips']=bg_fips
    boston_metro_bg_year_df['full_bg_fips']=full_bg_fips

    # reset index
    boston_metro_bg_year_df.reset_index(drop = True, inplace = True)
    return boston_metro_bg_year_df

# Apply to 2016 and 2018 data
boston_metro_bg_2016_df_v1 = add_fips(boston_metro_bg_2016_df)
boston_metro_bg_2018_df_v1 = add_fips(boston_metro_bg_2018_df)


In [18]:
boston_metro_bg_2016_df_v1.shape

(5907, 70)

In [19]:
boston_metro_bg_2016_df_v1

Unnamed: 0,pop_total,sex_total,sex_male,sex_female,age_median,households,race_total,race_white,race_black,race_native,...,property_value_total,property_value_median,vehicle_total_imputed,year,state,state_fips,county_fips,tract_fips,bg_fips,full_bg_fips
0,1831,1831,862,969,39.7,799,1831,1609,21,28,...,346,168100,960,2016,MA,25,003,921300,2,250039213002
1,819,819,234,585,20.0,64,819,711,83,0,...,56,625000,342,2016,MA,25,003,921300,4,250039213004
2,769,769,340,429,33.0,308,769,567,117,5,...,126,134100,358,2016,MA,25,003,900200,5,250039002005
3,1025,1025,580,445,40.7,341,1025,886,50,0,...,283,283200,368,2016,MA,25,003,925100,7,250039251007
4,933,933,446,487,51.0,423,933,734,31,0,...,231,265900,561,2016,MA,25,003,925100,3,250039251003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5902,767,767,396,371,63.1,390,767,755,0,0,...,348,248500,293,2016,NH,33,003,956400,2,330039564002
5903,641,641,304,337,55.3,288,641,641,0,0,...,250,240000,353,2016,NH,33,003,956100,2,330039561002
5904,2276,2276,1173,1103,44.9,773,2276,2201,1,0,...,604,354100,958,2016,NH,33,003,956100,5,330039561005
5905,1462,1462,717,745,53.9,680,1462,1462,0,0,...,477,181500,831,2016,NH,33,003,955600,2,330039556002


In [20]:
boston_metro_bg_2018_df_v1

Unnamed: 0,pop_total,sex_total,sex_male,sex_female,age_median,households,race_total,race_white,race_black,race_native,...,property_value_total,property_value_median,vehicle_total_imputed,year,state,state_fips,county_fips,tract_fips,bg_fips,full_bg_fips
0,421,421,212,209,62.6,191,421,406,0,0,...,184,611100,195,2018,MA,25,003,934300,1,250039343001
1,736,736,415,321,49.7,337,736,736,0,0,...,275,201000,455,2018,MA,25,003,922200,2,250039222002
2,923,923,527,396,44.6,385,923,910,0,0,...,271,181600,514,2018,MA,25,003,923100,1,250039231001
3,769,769,379,390,49.7,348,769,718,31,0,...,289,123500,455,2018,MA,25,003,900300,1,250039003001
4,459,459,208,251,55.1,195,459,417,12,0,...,178,113600,212,2018,MA,25,003,900400,3,250039004003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5902,705,705,353,352,29.4,293,705,705,0,0,...,178,156300,429,2018,NH,33,003,955400,2,330039554002
5903,807,807,349,458,58.1,407,807,688,0,0,...,205,188200,295,2018,NH,33,003,955400,1,330039554001
5904,642,642,325,317,57.1,385,642,642,0,0,...,307,132300,334,2018,NH,33,003,955400,4,330039554004
5905,985,985,479,506,41.5,410,985,955,0,0,...,356,246400,411,2018,NH,33,003,955400,5,330039554005


In [22]:
# check the na.
pd.set_option('display.max_rows', 500)
# print(np.sum(boston_metro_bg_2016_df_v1.isna()))
print(np.sum(boston_metro_bg_2018_df_v1.isna()))
pd.set_option('display.max_rows', 10)


pop_total                       0
sex_total                       0
sex_male                        0
sex_female                      0
age_median                      0
households                      0
race_total                      0
race_white                      0
race_black                      0
race_native                     0
race_asian                      0
inc_total_pop                5907
inc_no_pop                   5907
inc_with_pop                 5907
inc_pop_10k                  5907
inc_pop_1k_15k               5907
inc_pop_15k_25k              5907
inc_pop_25k_35k              5907
inc_pop_35k_50k              5907
inc_pop_50k_65k              5907
inc_pop_65k_75k              5907
inc_pop_75k                  5907
inc_median_ind               5907
travel_total_to_work            0
travel_driving_to_work          0
travel_pt_to_work               0
travel_taxi_to_work             0
travel_cycle_to_work            0
travel_walk_to_work             0
travel_work_fr

# Save

In [42]:
with open('../../data/01_raw/boston_metro_bg_2016_df.pickle', 'wb') as f:
    pickle.dump(boston_metro_bg_2016_df_v1, f)
    
with open('../../data/01_raw/boston_metro_bg_2018_df.pickle', 'wb') as f:
    pickle.dump(boston_metro_bg_2018_df_v1, f)    
    

## Light processing: Nan, new vars, and minor imputation.

In [23]:
# drop nans with a threshold 1500
boston_metro_bg_2016_df_v2 = boston_metro_bg_2016_df_v1.dropna(axis = 1, thresh = boston_metro_bg_2016_df_v1.shape[0] - 1500)
boston_metro_bg_2018_df_v2 = boston_metro_bg_2018_df_v1.dropna(axis = 1, thresh = boston_metro_bg_2018_df_v1.shape[0] - 1500)

In [24]:
boston_metro_bg_2016_df_v2.shape

(5907, 41)

In [26]:
# double check the na's - not much missing.
pd.set_option('display.max_rows', 500)
print(np.sum(boston_metro_bg_2016_df_v2.isna()))
# print(np.sum(boston_metro_bg_2018_df_v2.isna()))
pd.set_option('display.max_rows', 10)

pop_total                 0
sex_total                 0
sex_male                  0
sex_female                0
age_median                0
households                0
race_total                0
race_white                0
race_black                0
race_native               0
race_asian                0
travel_total_to_work      0
travel_driving_to_work    0
travel_pt_to_work         0
travel_taxi_to_work       0
travel_cycle_to_work      0
travel_walk_to_work       0
travel_work_from_home     0
edu_total                 0
edu_bachelor              0
edu_master                0
edu_phd                   0
inc_median_household      0
inc_per_capita            2
employment_total_labor    0
employment_employed       0
employment_unemployed     0
housing_units_total       0
housing_units_occupied    0
housing_units_vacant      0
rent_median               0
property_value_total      0
property_value_median     0
vehicle_total_imputed     0
year                      0
state               

In [27]:
boston_metro_bg_2016_df_v2.describe()
# plt.hist(boston_metro_bg_2016_df_v2.property_value_median)

Unnamed: 0,pop_total,sex_total,sex_male,sex_female,age_median,households,race_total,race_white,race_black,race_native,...,employment_employed,employment_unemployed,housing_units_total,housing_units_occupied,housing_units_vacant,rent_median,property_value_total,property_value_median,vehicle_total_imputed,year
count,5907.0,5907.0,5907.0,5907.0,5907.0,5907.0,5907.0,5907.0,5907.0,5907.0,...,5907.0,5907.0,5907.0,5907.0,5907.0,5907.0,5907.0,5907.0,5907.0,5907.0
mean,1366.115795,1366.115795,664.614525,701.50127,-2595749.0,521.459624,1366.115795,1115.161503,85.782292,2.711359,...,757.074826,363.124598,585.303369,521.459624,63.843745,-162856700.0,331.351955,-32955830.0,692.363467,2016.0
std,664.404782,664.404782,337.775906,349.236642,41522050.0,244.342306,664.404782,627.940892,189.079721,12.417277,...,383.294329,232.181664,282.677928,244.342306,133.58091,286467000.0,217.690087,145305000.0,354.211034,0.0
min,0.0,0.0,0.0,0.0,-666666700.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-666666700.0,0.0,-666666700.0,0.0,2016.0
25%,884.0,884.0,423.0,455.0,34.8,342.0,884.0,678.0,0.0,0.0,...,482.0,218.0,382.0,342.0,0.0,330.0,173.5,213750.0,437.0,2016.0
50%,1226.0,1226.0,595.0,630.0,41.5,473.0,1226.0,986.0,16.0,0.0,...,679.0,317.0,532.0,473.0,32.0,974.0,290.0,299100.0,623.0,2016.0
75%,1714.5,1714.5,844.5,881.5,47.0,653.0,1714.5,1435.0,82.0,0.0,...,960.0,451.0,729.0,653.0,68.0,1301.5,450.0,411950.0,889.5,2016.0
max,6816.0,6816.0,3389.0,3427.0,84.8,2317.0,6816.0,5033.0,2068.0,238.0,...,3399.0,3766.0,3397.0,2317.0,3028.0,3501.0,1549.0,2000001.0,2701.0,2016.0


In [29]:
# second type of NA: -99999999
# third type of issue: zeros in population, income, etc.
# print the counts of values = -inf
epsilon = -0.00001
print("Count of zero population", np.sum(boston_metro_bg_2016_df_v2.pop_total < epsilon))
print("Count of zero median household income", np.sum(boston_metro_bg_2016_df_v2.inc_median_household < epsilon))
print("Count of zero travels to work", np.sum(boston_metro_bg_2016_df_v2.travel_total_to_work < epsilon))
print("Count of zero income per capita", np.sum(boston_metro_bg_2016_df_v2.inc_per_capita < epsilon))
print("Count of zero (and NA) median rent", np.sum(boston_metro_bg_2016_df_v2.rent_median < epsilon))
print("Count of zero (and NA) total property values", np.sum(boston_metro_bg_2016_df_v2.property_value_total < epsilon))
print("Count of zero (and NA) median property values", np.sum(boston_metro_bg_2016_df_v2.property_value_median < epsilon))

# np.sum(boston_metro_bg_2016_df_v2.year == 0.0)

Count of zero population 0
Count of zero median household income 213
Count of zero travels to work 0
Count of zero income per capita 22
Count of zero (and NA) median rent 1443
Count of zero (and NA) total property values 0
Count of zero (and NA) median property values 295


In [30]:
# second type of NA: -99999999
# third type of issue: zeros in population, income, etc. 
# print the counts of values = 0.0 & -inf. 
# It looks like 22 CBGs do not have population. 
epsilon = 0.00001
print("Count of zero population", np.sum(boston_metro_bg_2016_df_v2.pop_total < epsilon))
print("Count of zero median household income", np.sum(boston_metro_bg_2016_df_v2.inc_median_household < epsilon))
print("Count of zero travels to work", np.sum(boston_metro_bg_2016_df_v2.travel_total_to_work < epsilon))
print("Count of zero income per capita", np.sum(boston_metro_bg_2016_df_v2.inc_per_capita < epsilon))
print("Count of zero (and NA) median rent", np.sum(boston_metro_bg_2016_df_v2.rent_median < epsilon))
print("Count of zero (and NA) total property values", np.sum(boston_metro_bg_2016_df_v2.property_value_total < epsilon))
print("Count of zero (and NA) median property values", np.sum(boston_metro_bg_2016_df_v2.property_value_median < epsilon))

# np.sum(boston_metro_bg_2016_df_v2.year == 0.0)

Count of zero population 22
Count of zero median household income 213
Count of zero travels to work 26
Count of zero income per capita 22
Count of zero (and NA) median rent 1443
Count of zero (and NA) total property values 92
Count of zero (and NA) median property values 295


In [32]:
# check the variable types.
pd.set_option('display.max_rows', 500)
boston_metro_bg_2016_df_v2.dtypes

pop_total                   int64
sex_total                   int64
sex_male                    int64
sex_female                  int64
age_median                float64
households                  int64
race_total                  int64
race_white                  int64
race_black                  int64
race_native                 int64
race_asian                  int64
travel_total_to_work        int64
travel_driving_to_work      int64
travel_pt_to_work           int64
travel_taxi_to_work         int64
travel_cycle_to_work        int64
travel_walk_to_work         int64
travel_work_from_home       int64
edu_total                   int64
edu_bachelor                int64
edu_master                  int64
edu_phd                     int64
inc_median_household        int64
inc_per_capita            float64
employment_total_labor      int64
employment_employed         int64
employment_unemployed       int64
housing_units_total         int64
housing_units_occupied      int64
housing_units_

In [33]:
# Processing...
# Drop bgs with zero population.
boston_metro_bg_2016_df_v2 = boston_metro_bg_2016_df_v2.loc[boston_metro_bg_2016_df_v2.pop_total > 0.0001, :].reset_index(drop = True)
boston_metro_bg_2018_df_v2 = boston_metro_bg_2018_df_v2.loc[boston_metro_bg_2018_df_v2.pop_total > 0.0001, :].reset_index(drop = True)

# replace the super large negative values (NaN notations) by NA.
var_list_to_replace_negative_values = ['age_median', 'inc_median_household', 'rent_median', 'property_value_median']
for var in var_list_to_replace_negative_values:
    boston_metro_bg_2016_df_v2.loc[boston_metro_bg_2016_df_v2[var] < -100, var] = np.nan # 2016
    boston_metro_bg_2018_df_v2.loc[boston_metro_bg_2018_df_v2[var] < -100, var] = np.nan # 2018

# impute the NAs with KNN. 
from sklearn.impute import KNNImputer
imp = KNNImputer(missing_values=np.nan, n_neighbors=5)

# only impute the numeric values
# It seems that 2016 and 2018 imputing vars are similar. 
imputing_vars = list(boston_metro_bg_2016_df_v2.dtypes[boston_metro_bg_2016_df_v2.dtypes != 'object'].index)

# imputing 2016 and 2018 data
imp.fit(boston_metro_bg_2016_df_v2[imputing_vars])
boston_metro_bg_2016_df_v2[imputing_vars] = imp.transform(boston_metro_bg_2016_df_v2[imputing_vars])

imp.fit(boston_metro_bg_2018_df_v2[imputing_vars])
boston_metro_bg_2018_df_v2[imputing_vars] = imp.transform(boston_metro_bg_2018_df_v2[imputing_vars])


In [34]:
boston_metro_bg_2016_df_v2.head(5)

Unnamed: 0,pop_total,sex_total,sex_male,sex_female,age_median,households,race_total,race_white,race_black,race_native,...,property_value_total,property_value_median,vehicle_total_imputed,year,state,state_fips,county_fips,tract_fips,bg_fips,full_bg_fips
0,1831.0,1831.0,862.0,969.0,39.7,799.0,1831.0,1609.0,21.0,28.0,...,346.0,168100.0,960.0,2016.0,MA,25,3,921300,2,250039213002
1,819.0,819.0,234.0,585.0,20.0,64.0,819.0,711.0,83.0,0.0,...,56.0,625000.0,342.0,2016.0,MA,25,3,921300,4,250039213004
2,769.0,769.0,340.0,429.0,33.0,308.0,769.0,567.0,117.0,5.0,...,126.0,134100.0,358.0,2016.0,MA,25,3,900200,5,250039002005
3,1025.0,1025.0,580.0,445.0,40.7,341.0,1025.0,886.0,50.0,0.0,...,283.0,283200.0,368.0,2016.0,MA,25,3,925100,7,250039251007
4,933.0,933.0,446.0,487.0,51.0,423.0,933.0,734.0,31.0,0.0,...,231.0,265900.0,561.0,2016.0,MA,25,3,925100,3,250039251003


In [36]:
# We need to compute the per capita and per household variables.
# Then I lift the nominator variables by one unit to avoid weird inf and nan in division.
var_list_to_be_lifted_by_one = ['pop_total', 'sex_total', 'households', 'race_total',
                                'travel_total_to_work', 'edu_total', 'housing_units_total', 'property_value_total', 
                                'vehicle_total_imputed']

for var in var_list_to_be_lifted_by_one:
    for df in [boston_metro_bg_2016_df_v2, boston_metro_bg_2018_df_v2]: # boston_metro_bg_2009_df_v2, boston_metro_bg_2016_df_v2, boston_metro_bg_2018_df_v2
        try:
            df.loc[df[var] == 0.0, var] += 1.0
        except KeyError:
            pass


In [37]:
# Create the per capita and per household vars.
# household_size_avg, 
# sex_male_ratio, race_white_ratio, race_black_ratio, race_native_ratio, race_asian_ratio, 
# travel_driving_ratio, travel_pt_ratio, travel_taxi_ratio, travel_cycle_ratio, travel_walk_ratio, travel_work_home_ratio
# edu_bachelor_ratio, edu_master_ratio, edu_phd_ratio
# employment_unemployed_ratio
# vehicle_per_capita, vehicle_per_household. 

for df in [boston_metro_bg_2016_df_v2, boston_metro_bg_2018_df_v2]:
    df['household_size_avg'] = df['pop_total']/df['households']
    df['sex_male_ratio'] = df['sex_male']/df['sex_total']
    df['race_white_ratio'] = df['race_white']/df['race_total']
    df['race_black_ratio'] = df['race_black']/df['race_total']
    df['race_native_ratio'] = df['race_native']/df['race_total']
    df['race_asian_ratio'] = df['race_asian']/df['race_total']
    df['travel_driving_ratio'] = df['travel_driving_to_work']/df['travel_total_to_work']
    df['travel_pt_ratio'] = df['travel_pt_to_work']/df['travel_total_to_work']
    df['travel_taxi_ratio'] = df['travel_taxi_to_work']/df['travel_total_to_work']
    df['travel_cycle_ratio'] = df['travel_cycle_to_work']/df['travel_total_to_work']
    df['travel_walk_ratio'] = df['travel_walk_to_work']/df['travel_total_to_work']
    df['travel_work_home_ratio'] = df['travel_work_from_home']/df['travel_total_to_work']
    df['edu_bachelor_ratio'] = df['edu_bachelor']/df['edu_total']
    df['edu_master_ratio'] = df['edu_master']/df['edu_total']
    df['edu_phd_ratio'] = df['edu_phd']/df['edu_total']
    df['employment_unemployed_ratio'] = df['employment_unemployed']/df['employment_total_labor']
    df['vehicle_per_capita'] = df['vehicle_total_imputed']/df['pop_total']
    df['vehicle_per_household'] = df['vehicle_total_imputed']/df['households']


In [38]:
print(np.sum(boston_metro_bg_2018_df_v2.isna()))

pop_total                      0
sex_total                      0
sex_male                       0
sex_female                     0
age_median                     0
households                     0
race_total                     0
race_white                     0
race_black                     0
race_native                    0
race_asian                     0
travel_total_to_work           0
travel_driving_to_work         0
travel_pt_to_work              0
travel_taxi_to_work            0
travel_cycle_to_work           0
travel_walk_to_work            0
travel_work_from_home          0
edu_total                      0
edu_bachelor                   0
edu_master                     0
edu_phd                        0
inc_median_household           0
inc_per_capita                 0
employment_total_labor         0
employment_employed            0
employment_unemployed          0
housing_units_total            0
housing_units_occupied         0
housing_units_vacant           0
rent_media

In [39]:
# check whether the indices of 2016 and 2018 files are the same.
# same. We are cool.
print(boston_metro_bg_2016_df_v2.shape[0])
print(len(set(boston_metro_bg_2016_df_v2.index).intersection(set(boston_metro_bg_2018_df_v2.index))))

5885
5885


## Save as pickle

In [41]:
with open('../../data/01_raw/boston_metro_bg_2016_df.pickle', 'wb') as f:
    pickle.dump(boston_metro_bg_2016_df_v2, f)
    
with open('../../data/01_raw/boston_metro_bg_2018_df.pickle', 'wb') as f:
    pickle.dump(boston_metro_bg_2018_df_v2, f)


In [None]:
# export a meta dictionary (TBD)
# variable names are self-explainatary.