# Processing econ attribute (MA, NH; 16 and 18)

- Outputs:
    - Intermediate/boston_socioecon_2016
    - Intermediate/boston_socioecon_2018
    - Intermediate/boston_annual_growth_2016_2018


In [1]:
import pandas as pd
import copy
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [2]:
with open('../../data/01_raw/boston_metro_bg_2016_df.pickle', 'rb') as f:
    boston_metro_bg_2016_df_v1 = pickle.load(f)
    
with open('../../data/01_raw/boston_metro_bg_2018_df.pickle', 'rb') as f:
    boston_metro_bg_2018_df_v1 = pickle.load(f)
    

## Light processing: Nan, new vars, and minor imputation.

In [3]:
# drop nans with a threshold 1500
boston_metro_bg_2016_df_v2 = boston_metro_bg_2016_df_v1.dropna(axis = 1, thresh = boston_metro_bg_2016_df_v1.shape[0] - 1500)
boston_metro_bg_2018_df_v2 = boston_metro_bg_2018_df_v1.dropna(axis = 1, thresh = boston_metro_bg_2018_df_v1.shape[0] - 1500)

In [4]:
boston_metro_bg_2016_df_v2.shape

(5907, 41)

In [5]:
# double check the na's - not much missing.
pd.set_option('display.max_rows', 500)
print(np.sum(boston_metro_bg_2016_df_v2.isna()))
# print(np.sum(boston_metro_bg_2018_df_v2.isna()))
pd.set_option('display.max_rows', 10)

pop_total                 0
sex_total                 0
sex_male                  0
sex_female                0
age_median                0
households                0
race_total                0
race_white                0
race_black                0
race_native               0
race_asian                0
travel_total_to_work      0
travel_driving_to_work    0
travel_pt_to_work         0
travel_taxi_to_work       0
travel_cycle_to_work      0
travel_walk_to_work       0
travel_work_from_home     0
edu_total                 0
edu_bachelor              0
edu_master                0
edu_phd                   0
inc_median_household      0
inc_per_capita            2
employment_total_labor    0
employment_employed       0
employment_unemployed     0
housing_units_total       0
housing_units_occupied    0
housing_units_vacant      0
rent_median               0
property_value_total      0
property_value_median     0
vehicle_total_imputed     0
year                      0
state               

In [6]:
boston_metro_bg_2016_df_v2.describe()
# plt.hist(boston_metro_bg_2016_df_v2.property_value_median)

Unnamed: 0,pop_total,sex_total,sex_male,sex_female,age_median,households,race_total,race_white,race_black,race_native,...,employment_employed,employment_unemployed,housing_units_total,housing_units_occupied,housing_units_vacant,rent_median,property_value_total,property_value_median,vehicle_total_imputed,year
count,5907.0,5907.0,5907.0,5907.0,5907.0,5907.0,5907.0,5907.0,5907.0,5907.0,...,5907.0,5907.0,5907.0,5907.0,5907.0,5907.0,5907.0,5907.0,5907.0,5907.0
mean,1366.115795,1366.115795,664.614525,701.50127,-2595749.0,521.459624,1366.115795,1115.161503,85.782292,2.711359,...,757.074826,363.124598,585.303369,521.459624,63.843745,-162856700.0,331.351955,-32955830.0,692.363467,2016.0
std,664.404782,664.404782,337.775906,349.236642,41522050.0,244.342306,664.404782,627.940892,189.079721,12.417277,...,383.294329,232.181664,282.677928,244.342306,133.58091,286467000.0,217.690087,145305000.0,354.211034,0.0
min,0.0,0.0,0.0,0.0,-666666700.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-666666700.0,0.0,-666666700.0,0.0,2016.0
25%,884.0,884.0,423.0,455.0,34.8,342.0,884.0,678.0,0.0,0.0,...,482.0,218.0,382.0,342.0,0.0,330.0,173.5,213750.0,437.0,2016.0
50%,1226.0,1226.0,595.0,630.0,41.5,473.0,1226.0,986.0,16.0,0.0,...,679.0,317.0,532.0,473.0,32.0,974.0,290.0,299100.0,623.0,2016.0
75%,1714.5,1714.5,844.5,881.5,47.0,653.0,1714.5,1435.0,82.0,0.0,...,960.0,451.0,729.0,653.0,68.0,1301.5,450.0,411950.0,889.5,2016.0
max,6816.0,6816.0,3389.0,3427.0,84.8,2317.0,6816.0,5033.0,2068.0,238.0,...,3399.0,3766.0,3397.0,2317.0,3028.0,3501.0,1549.0,2000001.0,2701.0,2016.0


In [7]:
# second type of NA: -99999999
# third type of issue: zeros in population, income, etc.
# print the counts of values = -inf
epsilon = -0.00001
print("Count of zero population", np.sum(boston_metro_bg_2016_df_v2.pop_total < epsilon))
print("Count of zero median household income", np.sum(boston_metro_bg_2016_df_v2.inc_median_household < epsilon))
print("Count of zero travels to work", np.sum(boston_metro_bg_2016_df_v2.travel_total_to_work < epsilon))
print("Count of zero income per capita", np.sum(boston_metro_bg_2016_df_v2.inc_per_capita < epsilon))
print("Count of zero (and NA) median rent", np.sum(boston_metro_bg_2016_df_v2.rent_median < epsilon))
print("Count of zero (and NA) total property values", np.sum(boston_metro_bg_2016_df_v2.property_value_total < epsilon))
print("Count of zero (and NA) median property values", np.sum(boston_metro_bg_2016_df_v2.property_value_median < epsilon))

# np.sum(boston_metro_bg_2016_df_v2.year == 0.0)

Count of zero population 0
Count of zero median household income 213
Count of zero travels to work 0
Count of zero income per capita 22
Count of zero (and NA) median rent 1443
Count of zero (and NA) total property values 0
Count of zero (and NA) median property values 295


In [8]:
# second type of NA: -99999999
# third type of issue: zeros in population, income, etc. 
# print the counts of values = 0.0 & -inf. 
# It looks like 22 CBGs do not have population. 
epsilon = 0.00001
print("Count of zero population", np.sum(boston_metro_bg_2016_df_v2.pop_total < epsilon))
print("Count of zero median household income", np.sum(boston_metro_bg_2016_df_v2.inc_median_household < epsilon))
print("Count of zero travels to work", np.sum(boston_metro_bg_2016_df_v2.travel_total_to_work < epsilon))
print("Count of zero income per capita", np.sum(boston_metro_bg_2016_df_v2.inc_per_capita < epsilon))
print("Count of zero (and NA) median rent", np.sum(boston_metro_bg_2016_df_v2.rent_median < epsilon))
print("Count of zero (and NA) total property values", np.sum(boston_metro_bg_2016_df_v2.property_value_total < epsilon))
print("Count of zero (and NA) median property values", np.sum(boston_metro_bg_2016_df_v2.property_value_median < epsilon))

# np.sum(boston_metro_bg_2016_df_v2.year == 0.0)

Count of zero population 22
Count of zero median household income 213
Count of zero travels to work 26
Count of zero income per capita 22
Count of zero (and NA) median rent 1443
Count of zero (and NA) total property values 92
Count of zero (and NA) median property values 295


In [9]:
# check the variable types.
pd.set_option('display.max_rows', 500)
boston_metro_bg_2016_df_v2.dtypes

pop_total                   int64
sex_total                   int64
sex_male                    int64
sex_female                  int64
age_median                float64
households                  int64
race_total                  int64
race_white                  int64
race_black                  int64
race_native                 int64
race_asian                  int64
travel_total_to_work        int64
travel_driving_to_work      int64
travel_pt_to_work           int64
travel_taxi_to_work         int64
travel_cycle_to_work        int64
travel_walk_to_work         int64
travel_work_from_home       int64
edu_total                   int64
edu_bachelor                int64
edu_master                  int64
edu_phd                     int64
inc_median_household        int64
inc_per_capita            float64
employment_total_labor      int64
employment_employed         int64
employment_unemployed       int64
housing_units_total         int64
housing_units_occupied      int64
housing_units_

In [10]:
# Processing...
# Drop bgs with zero population.
boston_metro_bg_2016_df_v2 = boston_metro_bg_2016_df_v2.loc[boston_metro_bg_2016_df_v2.pop_total > 0.0001, :].reset_index(drop = True)
boston_metro_bg_2018_df_v2 = boston_metro_bg_2018_df_v2.loc[boston_metro_bg_2018_df_v2.pop_total > 0.0001, :].reset_index(drop = True)

# replace the super large negative values (NaN notations) by NA.
var_list_to_replace_negative_values = ['age_median', 'inc_median_household', 'rent_median', 'property_value_median']
for var in var_list_to_replace_negative_values:
    boston_metro_bg_2016_df_v2.loc[boston_metro_bg_2016_df_v2[var] < -100, var] = np.nan # 2016
    boston_metro_bg_2018_df_v2.loc[boston_metro_bg_2018_df_v2[var] < -100, var] = np.nan # 2018

# impute the NAs with KNN. 
from sklearn.impute import KNNImputer
imp = KNNImputer(missing_values=np.nan, n_neighbors=5)

# only impute the numeric values
# It seems that 2016 and 2018 imputing vars are similar. 
imputing_vars = list(boston_metro_bg_2016_df_v2.dtypes[boston_metro_bg_2016_df_v2.dtypes != 'object'].index)

# imputing 2016 and 2018 data
imp.fit(boston_metro_bg_2016_df_v2[imputing_vars])
boston_metro_bg_2016_df_v2[imputing_vars] = imp.transform(boston_metro_bg_2016_df_v2[imputing_vars])

imp.fit(boston_metro_bg_2018_df_v2[imputing_vars])
boston_metro_bg_2018_df_v2[imputing_vars] = imp.transform(boston_metro_bg_2018_df_v2[imputing_vars])


In [11]:
boston_metro_bg_2016_df_v2.head(5)

Unnamed: 0,pop_total,sex_total,sex_male,sex_female,age_median,households,race_total,race_white,race_black,race_native,...,property_value_total,property_value_median,vehicle_total_imputed,year,state,state_fips,county_fips,tract_fips,bg_fips,full_bg_fips
0,1831.0,1831.0,862.0,969.0,39.7,799.0,1831.0,1609.0,21.0,28.0,...,346.0,168100.0,960.0,2016.0,MA,25,3,921300,2,250039213002
1,819.0,819.0,234.0,585.0,20.0,64.0,819.0,711.0,83.0,0.0,...,56.0,625000.0,342.0,2016.0,MA,25,3,921300,4,250039213004
2,769.0,769.0,340.0,429.0,33.0,308.0,769.0,567.0,117.0,5.0,...,126.0,134100.0,358.0,2016.0,MA,25,3,900200,5,250039002005
3,1025.0,1025.0,580.0,445.0,40.7,341.0,1025.0,886.0,50.0,0.0,...,283.0,283200.0,368.0,2016.0,MA,25,3,925100,7,250039251007
4,933.0,933.0,446.0,487.0,51.0,423.0,933.0,734.0,31.0,0.0,...,231.0,265900.0,561.0,2016.0,MA,25,3,925100,3,250039251003


In [12]:
# We need to compute the per capita and per household variables.
# Then I lift the nominator variables by one unit to avoid weird inf and nan in division.
var_list_to_be_lifted_by_one = ['pop_total', 'sex_total', 'households', 'race_total',
                                'travel_total_to_work', 'edu_total', 'housing_units_total', 'property_value_total', 
                                'vehicle_total_imputed']

for var in var_list_to_be_lifted_by_one:
    for df in [boston_metro_bg_2016_df_v2, boston_metro_bg_2018_df_v2]: # boston_metro_bg_2009_df_v2, boston_metro_bg_2016_df_v2, boston_metro_bg_2018_df_v2
        try:
            df.loc[df[var] == 0.0, var] += 1.0
        except KeyError:
            pass

In [13]:
# Create the per capita and per household vars.
# household_size_avg, 
# sex_male_ratio, race_white_ratio, race_black_ratio, race_native_ratio, race_asian_ratio, 
# travel_driving_ratio, travel_pt_ratio, travel_taxi_ratio, travel_cycle_ratio, travel_walk_ratio, travel_work_home_ratio
# edu_bachelor_ratio, edu_master_ratio, edu_phd_ratio
# employment_unemployed_ratio
# vehicle_per_capita, vehicle_per_household. 

for df in [boston_metro_bg_2016_df_v2, boston_metro_bg_2018_df_v2]:
    df['household_size_avg'] = df['pop_total']/df['households']
    df['sex_male_ratio'] = df['sex_male']/df['sex_total']
    df['race_white_ratio'] = df['race_white']/df['race_total']
    df['race_black_ratio'] = df['race_black']/df['race_total']
    df['race_native_ratio'] = df['race_native']/df['race_total']
    df['race_asian_ratio'] = df['race_asian']/df['race_total']
    df['travel_driving_ratio'] = df['travel_driving_to_work']/df['travel_total_to_work']
    df['travel_pt_ratio'] = df['travel_pt_to_work']/df['travel_total_to_work']
    df['travel_taxi_ratio'] = df['travel_taxi_to_work']/df['travel_total_to_work']
    df['travel_cycle_ratio'] = df['travel_cycle_to_work']/df['travel_total_to_work']
    df['travel_walk_ratio'] = df['travel_walk_to_work']/df['travel_total_to_work']
    df['travel_work_home_ratio'] = df['travel_work_from_home']/df['travel_total_to_work']
    df['edu_bachelor_ratio'] = df['edu_bachelor']/df['edu_total']
    df['edu_master_ratio'] = df['edu_master']/df['edu_total']
    df['edu_phd_ratio'] = df['edu_phd']/df['edu_total']
    df['employment_unemployed_ratio'] = df['employment_unemployed']/df['employment_total_labor']
    df['vehicle_per_capita'] = df['vehicle_total_imputed']/df['pop_total']
    df['vehicle_per_household'] = df['vehicle_total_imputed']/df['households']
    df['vacancy_ratio'] = df['housing_units_vacant']/df['housing_units_total']


In [14]:
print(np.sum(boston_metro_bg_2018_df_v2.isna()))

pop_total                      0
sex_total                      0
sex_male                       0
sex_female                     0
age_median                     0
households                     0
race_total                     0
race_white                     0
race_black                     0
race_native                    0
race_asian                     0
travel_total_to_work           0
travel_driving_to_work         0
travel_pt_to_work              0
travel_taxi_to_work            0
travel_cycle_to_work           0
travel_walk_to_work            0
travel_work_from_home          0
edu_total                      0
edu_bachelor                   0
edu_master                     0
edu_phd                        0
inc_median_household           0
inc_per_capita                 0
employment_total_labor         0
employment_employed            0
employment_unemployed          0
housing_units_total            0
housing_units_occupied         0
housing_units_vacant           0
rent_media

In [15]:
# check whether the indices of 2016 and 2018 files are the same.
# same. We are cool.
print(boston_metro_bg_2016_df_v2.shape[0])
print(len(set(boston_metro_bg_2016_df_v2.index).intersection(set(boston_metro_bg_2018_df_v2.index))))

5885
5885


In [16]:
geoid_overlap = set(boston_metro_bg_2016_df_v2.full_bg_fips).intersection(set(boston_metro_bg_2018_df_v2.full_bg_fips))
print(len(geoid_overlap))
# Wow~ Only 5883 CBGs overlap...NOT 5885. 

5883


In [17]:
sorted_geoid = sorted(list(geoid_overlap))

In [18]:
# choose the overlapping geoids from the two df. 
# Also change the two df names. 
greater_boston_socioecon_2016_df = boston_metro_bg_2016_df_v2.loc[boston_metro_bg_2016_df_v2.full_bg_fips.isin(sorted_geoid), :]
greater_boston_socioecon_2018_df = boston_metro_bg_2018_df_v2.loc[boston_metro_bg_2018_df_v2.full_bg_fips.isin(sorted_geoid), :]

# sort according to geoid.
greater_boston_socioecon_2016_df.sort_values('full_bg_fips', inplace = True)
greater_boston_socioecon_2018_df.sort_values('full_bg_fips', inplace = True)

# replace index
greater_boston_socioecon_2016_df.set_index('full_bg_fips', inplace = True)
greater_boston_socioecon_2018_df.set_index('full_bg_fips', inplace = True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [19]:
greater_boston_socioecon_2016_df

Unnamed: 0_level_0,pop_total,sex_total,sex_male,sex_female,age_median,households,race_total,race_white,race_black,race_native,...,travel_cycle_ratio,travel_walk_ratio,travel_work_home_ratio,edu_bachelor_ratio,edu_master_ratio,edu_phd_ratio,employment_unemployed_ratio,vehicle_per_capita,vehicle_per_household,vacancy_ratio
full_bg_fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
250010101001,938.0,938.0,534.0,404.0,53.9,552.0,938.0,770.0,0.0,10.0,...,0.067358,0.202073,0.044905,0.326340,0.159674,0.046620,0.277904,0.617271,1.048913,0.528205
250010101002,289.0,289.0,162.0,127.0,63.3,202.0,289.0,237.0,12.0,0.0,...,0.000000,0.213793,0.241379,0.236162,0.118081,0.107011,0.470588,0.501730,0.717822,0.792608
250010101003,755.0,755.0,416.0,339.0,54.9,412.0,755.0,694.0,29.0,0.0,...,0.159420,0.123188,0.130435,0.265276,0.171386,0.032787,0.329912,0.548344,1.004854,0.381381
250010101004,499.0,499.0,225.0,274.0,59.6,300.0,499.0,438.0,9.0,0.0,...,0.000000,0.446541,0.213836,0.281659,0.109170,0.030568,0.550976,0.318637,0.530000,0.652375
250010101005,481.0,481.0,258.0,223.0,57.4,336.0,481.0,378.0,38.0,0.0,...,0.149425,0.260536,0.187739,0.211712,0.058559,0.020270,0.347458,0.542620,0.776786,0.597122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330199759013,1265.0,1265.0,590.0,675.0,49.2,649.0,1265.0,1265.0,0.0,0.0,...,0.000000,0.018692,0.030374,0.064963,0.000000,0.000000,0.555764,0.338340,0.659476,0.248843
330199759021,724.0,724.0,315.0,409.0,33.0,312.0,724.0,627.0,71.0,0.0,...,0.000000,0.274566,0.046243,0.150812,0.023202,0.000000,0.340866,0.477901,1.108974,0.187500
330199759022,987.0,987.0,490.0,497.0,41.8,440.0,987.0,976.0,0.0,0.0,...,0.000000,0.117302,0.032258,0.127695,0.041459,0.000000,0.428120,0.345491,0.775000,0.150579
330199759023,802.0,802.0,464.0,338.0,43.3,319.0,802.0,800.0,0.0,0.0,...,0.000000,0.007937,0.000000,0.102273,0.014610,0.011364,0.500759,0.314214,0.789969,0.381783


# Create the growth dataframe (16 - 18)

In [20]:
# annual growth_rate_16_18
var_of_interests = ['inc_per_capita', 'pop_total', 'property_value_median']
var_of_interests_ratio = ['race_black_ratio', 'race_white_ratio', 'vacancy_ratio']
growth_rate_16_18 = (greater_boston_socioecon_2018_df[var_of_interests] - greater_boston_socioecon_2016_df[var_of_interests])/greater_boston_socioecon_2016_df[var_of_interests]

for var_ratio in var_of_interests_ratio:
    growth_rate_16_18[var_ratio] = (greater_boston_socioecon_2018_df[var_ratio] - greater_boston_socioecon_2016_df[var_ratio])/2

growth_rate_16_18 = np.round(growth_rate_16_18/(18-16), decimals = 3)

# replace col names
growth_rate_16_18 = growth_rate_16_18.rename(columns = {'inc_per_capita': 'inc_per_capita_annual_growth',
                                                        'pop_total':'pop_total_annual_growth',
                                                        'property_value_median': 'property_value_median_annual_growth',
                                                        'race_black_ratio': 'race_black_ratio_annual_growth', 
                                                        'race_white_ratio': 'race_white_ratio_annual_growth', 
                                                        'vacancy_ratio': 'vacancy_ratio_annual_growth'})
growth_rate_16_18

Unnamed: 0_level_0,inc_per_capita_annual_growth,pop_total_annual_growth,property_value_median_annual_growth,race_black_ratio_annual_growth,race_white_ratio_annual_growth,vacancy_ratio_annual_growth
full_bg_fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
250010101001,0.037,-0.015,0.067,0.000,0.035,0.007
250010101002,0.007,0.161,0.031,-0.010,0.016,-0.015
250010101003,-0.034,-0.004,0.147,0.000,0.000,0.036
250010101004,0.037,-0.069,0.102,0.005,-0.017,0.008
250010101005,0.161,0.009,0.203,-0.020,0.040,0.013
...,...,...,...,...,...,...
330199759013,0.063,0.080,-0.137,0.000,-0.006,-0.008
330199759021,0.028,0.182,0.111,-0.007,0.008,-0.018
330199759022,0.144,0.075,-0.000,0.000,0.003,0.001
330199759023,0.049,0.026,0.026,0.000,-0.003,-0.031


In [22]:
# cap the growth rate to 100%
# otherwise the outliers dominate the performance and the visualization does not show any info.
growth_rate_16_18.values[growth_rate_16_18.values > 1.0] = 1.0

In [23]:
growth_rate_16_18.describe()

Unnamed: 0,inc_per_capita_annual_growth,pop_total_annual_growth,property_value_median_annual_growth,race_black_ratio_annual_growth,race_white_ratio_annual_growth,vacancy_ratio_annual_growth
count,5883.0,5883.0,5883.0,5883.0,5883.0,5883.0
mean,0.057026,0.012387,0.046183,0.000391,-0.001409,-8e-05
std,0.099078,0.090016,0.101432,0.012675,0.018853,0.013989
min,-0.311,-0.308,-0.471,-0.096,-0.098,-0.074
25%,-0.0,-0.0405,0.003,-0.001,-0.009,-0.006
50%,0.047,0.005,0.033,0.0,-0.001,0.0
75%,0.1,0.056,0.071,0.002,0.006,0.005
max,1.0,1.0,1.0,0.092,0.119,0.146


# Save

In [24]:
with open('../../data/02_intermediate/boston_socioecon_2016.pickle', 'wb') as f:
    pickle.dump(greater_boston_socioecon_2016_df, f)
    
with open('../../data/02_intermediate/boston_socioecon_2018.pickle', 'wb') as f:
    pickle.dump(greater_boston_socioecon_2018_df, f)

with open('../../data/02_intermediate/boston_annual_growth_2016_2018.pickle', 'wb') as f:
    pickle.dump(growth_rate_16_18, f)
