# Processing econ attributes for census tracts (MA, NH, FL, and IL, 16 and 18)

- Inputs
    - Six pickled census-tract data frames: three regions (Boston metro, Florida, Illinois) × two years (2016, 2018).

- Outputs
    - Two dictionaries: the processed socioeconomic data frames (one per region and year) and the annual growth rates (one per region).
    - Both are saved as pickles in the data/02_intermediate folder.
   

In [4]:
import pandas as pd
import copy
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [5]:
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`.

    NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Raw census-tract frames: one per region and year.
boston_metro_ct_2016_df_v1 = _read_pickle('../../data/01_raw/boston_metro_ct_2016_df.pickle')
boston_metro_ct_2018_df_v1 = _read_pickle('../../data/01_raw/boston_metro_ct_2018_df.pickle')

florida_ct_2016_df_v1 = _read_pickle('../../data/01_raw/florida_ct_2016_df.pickle')
florida_ct_2018_df_v1 = _read_pickle('../../data/01_raw/florida_ct_2018_df.pickle')

illinois_ct_2016_df_v1 = _read_pickle('../../data/01_raw/illinois_ct_2016_df.pickle')
illinois_ct_2018_df_v1 = _read_pickle('../../data/01_raw/illinois_ct_2018_df.pickle')
   

In [6]:
# Gather the six raw frames in a single dictionary keyed by '<region>_<year>'.
dic = {
    'boston_2016': boston_metro_ct_2016_df_v1,
    'boston_2018': boston_metro_ct_2018_df_v1,
    'florida_2016': florida_ct_2016_df_v1,
    'florida_2018': florida_ct_2018_df_v1,
    'illinois_2016': illinois_ct_2016_df_v1,
    'illinois_2018': illinois_ct_2018_df_v1,
}


## Processing 16 and 18 data sets

In [7]:
# Drop sparse columns: a column is kept only when it has at least
# (number of rows - 1500) non-NA values, i.e. at most 1500 missing entries.
for name, frame in list(dic.items()):
    min_non_na = frame.shape[0] - 1500
    dic[name] = frame.dropna(axis = 1, thresh = min_non_na)

# boston_metro_ct_2016_df_v2 = boston_metro_ct_2016_df_v1.dropna(axis = 1, thresh = boston_metro_ct_2016_df_v1.shape[0] - 1500)
# boston_metro_ct_2018_df_v2 = boston_metro_ct_2018_df_v1.dropna(axis = 1, thresh = boston_metro_ct_2018_df_v1.shape[0] - 1500)


In [8]:
# Summary statistics for the 2018 Boston frame. Watch the minima of the median
# columns: very large negative values there are missing-value sentinels, not data.
dic['boston_2018'].describe()

Unnamed: 0,pop_total,sex_total,sex_male,sex_female,age_median,households,race_total,race_white,race_black,race_native,...,employment_employed,employment_unemployed,housing_units_total,housing_units_occupied,housing_units_vacant,rent_median,property_value_total,property_value_median,vehicle_total_imputed,year
count,1773.0,1773.0,1773.0,1773.0,1773.0,1773.0,1773.0,1773.0,1773.0,1773.0,...,1773.0,1773.0,1773.0,1773.0,1773.0,1773.0,1773.0,1773.0,1773.0,1773.0
mean,4610.160745,4610.160745,2244.197406,2365.963339,-6392139.0,1765.364918,4610.160745,3728.155668,299.534123,9.353074,...,2557.744501,1240.79357,1981.778906,1765.364918,216.413988,-27823550.0,1125.668923,-15422790.0,2372.961083,2018.0
std,1791.453362,1791.453362,891.431599,931.865658,64984420.0,676.849937,1791.453362,1755.061217,594.42066,28.41995,...,1048.571737,559.975546,785.549317,676.849937,388.844131,133363200.0,645.591302,101471300.0,987.676383,0.0
min,0.0,0.0,0.0,0.0,-666666700.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-666666700.0,0.0,-666666700.0,0.0,2018.0
25%,3309.0,3309.0,1594.0,1694.0,35.4,1307.0,3309.0,2498.0,29.0,0.0,...,1802.0,871.0,1446.0,1307.0,61.0,953.0,628.0,233600.0,1668.0,2018.0
50%,4495.0,4495.0,2188.0,2299.0,41.2,1728.0,4495.0,3652.0,110.0,0.0,...,2503.0,1174.0,1923.0,1728.0,113.0,1175.0,1084.0,327600.0,2298.0,2018.0
75%,5798.0,5798.0,2830.0,2980.0,45.3,2228.0,5798.0,4835.0,289.0,7.0,...,3242.0,1547.0,2494.0,2228.0,201.0,1488.0,1560.0,444800.0,3022.0,2018.0
max,13056.0,13056.0,6574.0,6482.0,67.8,4435.0,13056.0,9734.0,6666.0,450.0,...,5997.0,7226.0,6439.0,4435.0,4766.0,3257.0,3015.0,2000001.0,5717.0,2018.0


In [9]:
# Data issues: missing values are coded as very large negative sentinels, and some
# tracts report zeros in population, income, etc.
# This cell counts strictly negative entries per column in the 2016 Boston frame.
epsilon = -0.00001
boston_16 = dic['boston_2016']
negative_checks = [
    ("Count of negative population", 'pop_total'),
    ("Count of negative median household income", 'inc_median_household'),
    ("Count of negative travels to work", 'travel_total_to_work'),
    ("Count of negative income per capita", 'inc_per_capita'),
    ("Count of negative (and NA) median rent", 'rent_median'),
    ("Count of negative (and NA) total property values", 'property_value_total'),
    ("Count of negative (and NA) median property values", 'property_value_median'),
]
for label, column in negative_checks:
    print(label, np.sum(boston_16[column] < epsilon))


Count of negative population 0
Count of negative median household income 26
Count of negative travels to work 0
Count of negative income per capita 16
Count of negative (and NA) median rent 69
Count of negative (and NA) total property values 0
Count of negative (and NA) median property values 40


In [10]:
# Same check with a small positive threshold, so zeros are counted together with
# the negative (sentinel) values.
epsilon = 0.00001
boston_16 = dic['boston_2016']
nonpositive_checks = [
    ("Count of negative and zero population", 'pop_total'),
    ("Count of negative and zero median household income", 'inc_median_household'),
    ("Count of negative and zero travels to work", 'travel_total_to_work'),
    ("Count of negative and zero income per capita", 'inc_per_capita'),
    ("Count of negative and zero median rent", 'rent_median'),
    ("Count of negative and zero total property values", 'property_value_total'),
    ("Count of negative and zero median property values", 'property_value_median'),
]
for label, column in nonpositive_checks:
    print(label, np.sum(boston_16[column] < epsilon))


Count of negative and zero population 16
Count of negative and zero median household income 26
Count of negative and zero travels to work 18
Count of negative and zero income per capita 16
Count of negative and zero median rent 69
Count of negative and zero total property values 27
Count of negative and zero median property values 40


In [11]:
from sklearn.impute import KNNImputer

# Columns in which large negative values are missing-value sentinels rather than data.
# NOTE(review): the earlier check also reports negative `inc_per_capita` values in
# boston_2016; presumably they all sit in the zero-population tracts removed below —
# confirm, otherwise add that column to this list.
var_list_to_replace_negative_values = ['age_median', 'inc_median_household', 'rent_median', 'property_value_median']

for key_ in dic.keys():
    # Drop tracts without population and renumber the rows 0..n-1.
    dic[key_] = dic[key_].loc[dic[key_].pop_total > 0.0001, :].reset_index(drop = True)

    # Replace the super large negative values (NaN notations) by NA.
    # `mask` sets the flagged entries to NaN and upcasts integer columns cleanly.
    for var in var_list_to_replace_negative_values:
        dic[key_][var] = dic[key_][var].mask(dic[key_][var] < -100)

    # Only the non-object (numeric) columns take part in the imputation.
    imputing_vars = list(dic[key_].dtypes[dic[key_].dtypes != 'object'].index)

    # Impute the NAs with KNN (5 nearest neighbours). One fit/transform pass is enough:
    # after it no NaN is left in these columns, so a second pass would change nothing.
    imp = KNNImputer(missing_values=np.nan, n_neighbors=5)
    dic[key_][imputing_vars] = imp.fit_transform(dic[key_][imputing_vars])



In [12]:
# Peek at the first rows of the cleaned and imputed 2016 Boston frame.
dic['boston_2016'].head(5)

Unnamed: 0,pop_total,sex_total,sex_male,sex_female,age_median,households,race_total,race_white,race_black,race_native,...,rent_median,property_value_total,property_value_median,vehicle_total_imputed,year,state,state_fips,county_fips,tract_fips,full_ct_fips
0,6264.0,6264.0,3009.0,3255.0,32.0,1972.0,6264.0,2827.0,2542.0,8.0,...,1244.0,1257.0,273100.0,2786.0,2016.0,MA,25,23,510700,25023510700
1,4905.0,4905.0,2394.0,2511.0,39.2,1781.0,4905.0,2021.0,2264.0,0.0,...,505.0,948.0,188000.0,2180.0,2016.0,MA,25,23,511200,25023511200
2,4504.0,4504.0,2333.0,2171.0,47.2,1563.0,4504.0,4181.0,35.0,0.0,...,1563.0,1402.0,597600.0,2292.0,2016.0,MA,25,9,253203,25009253203
3,5287.0,5287.0,2689.0,2598.0,41.6,2157.0,5287.0,4058.0,38.0,0.0,...,958.0,1581.0,500200.0,2963.0,2016.0,MA,25,17,388100,25017388100
4,5125.0,5125.0,2555.0,2570.0,49.6,1762.0,5125.0,4390.0,33.0,9.0,...,2085.0,1694.0,757900.0,2447.0,2016.0,MA,25,17,362100,25017362100


In [13]:
# We need to compute the per capita and per household variables.
# Every column used as a denominator is lifted from 0 to 1 so the divisions
# never produce inf or NaN.
# 'employment_total_labor' is included because it is the denominator of
# 'employment_unemployed_ratio'.
var_list_to_be_lifted_by_one = ['pop_total', 'sex_total', 'households', 'race_total',
                                'travel_total_to_work', 'edu_total', 'employment_total_labor',
                                'housing_units_total', 'property_value_total',
                                'vehicle_total_imputed']

for key_ in dic.keys():
    for var in var_list_to_be_lifted_by_one:
        # A data set may lack a column (e.g. dropped earlier as too sparse); skip it
        # explicitly instead of swallowing the KeyError.
        if var not in dic[key_].columns:
            continue
        dic[key_].loc[dic[key_][var] == 0.0, var] += 1.0
        

In [14]:
# Derive the per-capita, per-household and share features for every data set.
# Each spec reads (new column, numerator column, denominator column). The specs are
# split in two lists because 'edu_higher_edu_ratio' is a sum of three shares and must
# be created between them to keep the column order.
ratio_specs_before_higher_edu = [
    ('household_size_avg', 'pop_total', 'households'),
    ('sex_male_ratio', 'sex_male', 'sex_total'),
    ('race_white_ratio', 'race_white', 'race_total'),
    ('race_black_ratio', 'race_black', 'race_total'),
    ('race_native_ratio', 'race_native', 'race_total'),
    ('race_asian_ratio', 'race_asian', 'race_total'),
    ('travel_driving_ratio', 'travel_driving_to_work', 'travel_total_to_work'),
    ('travel_pt_ratio', 'travel_pt_to_work', 'travel_total_to_work'),
    ('travel_taxi_ratio', 'travel_taxi_to_work', 'travel_total_to_work'),
    ('travel_cycle_ratio', 'travel_cycle_to_work', 'travel_total_to_work'),
    ('travel_walk_ratio', 'travel_walk_to_work', 'travel_total_to_work'),
    ('travel_work_home_ratio', 'travel_work_from_home', 'travel_total_to_work'),
    ('edu_bachelor_ratio', 'edu_bachelor', 'edu_total'),
    ('edu_master_ratio', 'edu_master', 'edu_total'),
    ('edu_phd_ratio', 'edu_phd', 'edu_total'),
]
ratio_specs_after_higher_edu = [
    ('employment_unemployed_ratio', 'employment_unemployed', 'employment_total_labor'),
    ('vehicle_per_capita', 'vehicle_total_imputed', 'pop_total'),
    ('vehicle_per_household', 'vehicle_total_imputed', 'households'),
    ('vacancy_ratio', 'housing_units_vacant', 'housing_units_total'),
]

for frame in dic.values():
    for new_col, numerator, denominator in ratio_specs_before_higher_edu:
        frame[new_col] = frame[numerator] / frame[denominator]

    # Share of people holding a bachelor, master or PhD degree.
    frame['edu_higher_edu_ratio'] = frame['edu_bachelor_ratio'] + frame['edu_master_ratio'] + frame['edu_phd_ratio']

    for new_col, numerator, denominator in ratio_specs_after_higher_edu:
        frame[new_col] = frame[numerator] / frame[denominator]


In [15]:
# Count the remaining NaNs per column in the 2018 Boston frame (expected: all zeros
# after imputation). Note that isna() does not flag inf values.
print(np.sum(dic['boston_2018'].isna()))

pop_total                      0
sex_total                      0
sex_male                       0
sex_female                     0
age_median                     0
                              ..
edu_higher_edu_ratio           0
employment_unemployed_ratio    0
vehicle_per_capita             0
vehicle_per_household          0
vacancy_ratio                  0
Length: 89, dtype: int64


In [16]:
# Compare the row labels of the 2016 and 2018 Boston frames.
# Both frames were re-indexed with reset_index(drop=True), so these labels are plain
# row positions: a full match here does NOT mean the census tracts line up.
# The tract-level comparison is done on full_ct_fips.
n_rows_2018 = dic['boston_2018'].shape[0]
shared_row_labels = set(dic['boston_2016'].index) & set(dic['boston_2018'].index)
print(n_rows_2018)
print(len(shared_row_labels))


1756
1756


In [17]:
# Keep only the census tracts present in both years and align the two Boston frames.
geoid_overlap_boston = set(dic['boston_2016'].full_ct_fips).intersection(set(dic['boston_2018'].full_ct_fips))
# Number of tracts shared by 2016 and 2018 (can be smaller than either frame).
print(len(geoid_overlap_boston))

# sort the overlapping geoid
sorted_geoid_overlap_boston = sorted(list(geoid_overlap_boston))

# Filter to the shared tracts, sort by geoid and use the geoid as index.
# The steps are chained (no inplace=True) so nothing is mutated on a .loc slice,
# which would raise SettingWithCopyWarning.
for key_ in ['boston_2016', 'boston_2018']:
    dic[key_] = (
        dic[key_]
        .loc[dic[key_].full_ct_fips.isin(sorted_geoid_overlap_boston), :]
        .sort_values('full_ct_fips')
        .set_index('full_ct_fips')
    )

# The growth computation relies on both years having identical, aligned indices.
assert dic['boston_2016'].index.equals(dic['boston_2018'].index), \
    "boston 2016/2018 frames are not aligned on full_ct_fips (duplicate tract ids?)"


1755


In [18]:
# Keep only the census tracts present in both years and align the two Illinois frames.
geoid_overlap_illinois = set(dic['illinois_2016'].full_ct_fips).intersection(set(dic['illinois_2018'].full_ct_fips))
# Number of tracts shared by 2016 and 2018 (can be smaller than either frame).
print(len(geoid_overlap_illinois))

# sort the overlapping geoid
sorted_geoid_overlap_illinois = sorted(list(geoid_overlap_illinois))

# Filter to the shared tracts, sort by geoid and use the geoid as index.
# The steps are chained (no inplace=True) so nothing is mutated on a .loc slice,
# which would raise SettingWithCopyWarning.
for key_ in ['illinois_2016', 'illinois_2018']:
    dic[key_] = (
        dic[key_]
        .loc[dic[key_].full_ct_fips.isin(sorted_geoid_overlap_illinois), :]
        .sort_values('full_ct_fips')
        .set_index('full_ct_fips')
    )

# The growth computation relies on both years having identical, aligned indices.
assert dic['illinois_2016'].index.equals(dic['illinois_2018'].index), \
    "illinois 2016/2018 frames are not aligned on full_ct_fips (duplicate tract ids?)"


3116


In [19]:
# Keep only the census tracts present in both years and align the two Florida frames.
geoid_overlap_florida = set(dic['florida_2016'].full_ct_fips).intersection(set(dic['florida_2018'].full_ct_fips))
# Number of tracts shared by 2016 and 2018 (can be smaller than either frame).
print(len(geoid_overlap_florida))

# sort the overlapping geoid
sorted_geoid_overlap_florida = sorted(list(geoid_overlap_florida))

# Filter to the shared tracts, sort by geoid and use the geoid as index.
# The steps are chained (no inplace=True) so nothing is mutated on a .loc slice,
# which would raise SettingWithCopyWarning.
for key_ in ['florida_2016', 'florida_2018']:
    dic[key_] = (
        dic[key_]
        .loc[dic[key_].full_ct_fips.isin(sorted_geoid_overlap_florida), :]
        .sort_values('full_ct_fips')
        .set_index('full_ct_fips')
    )

# The growth computation relies on both years having identical, aligned indices.
assert dic['florida_2016'].index.equals(dic['florida_2018'].index), \
    "florida 2016/2018 frames are not aligned on full_ct_fips (duplicate tract ids?)"


4167


In [20]:
# Inspect the aligned 2016 Boston frame, now indexed by full_ct_fips.
dic['boston_2016']

Unnamed: 0_level_0,pop_total,sex_total,sex_male,sex_female,age_median,households,race_total,race_white,race_black,race_native,...,travel_walk_ratio,travel_work_home_ratio,edu_bachelor_ratio,edu_master_ratio,edu_phd_ratio,edu_higher_edu_ratio,employment_unemployed_ratio,vehicle_per_capita,vehicle_per_household,vacancy_ratio
full_ct_fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
25001010100,2962.0,2962.0,1595.0,1367.0,57.0,1802.0,2962.0,2517.0,88.0,10.0,...,0.216945,0.127086,0.275722,0.133235,0.042191,0.451147,0.367721,0.525996,0.864595,0.600178
25001010206,3168.0,3168.0,1417.0,1751.0,58.3,1562.0,3168.0,2940.0,78.0,0.0,...,0.045455,0.137720,0.257198,0.218042,0.024568,0.499808,0.396359,0.465278,0.943662,0.652657
25001010208,1589.0,1589.0,827.0,762.0,57.9,820.0,1589.0,1563.0,11.0,9.0,...,0.007585,0.290771,0.276758,0.211774,0.017584,0.506116,0.394024,0.497797,0.964634,0.749924
25001010304,2107.0,2107.0,946.0,1161.0,61.0,1085.0,2107.0,2072.0,0.0,0.0,...,0.000000,0.087574,0.302654,0.164879,0.014116,0.481649,0.506941,0.401044,0.778802,0.600515
25001010306,2817.0,2817.0,1342.0,1475.0,56.4,1326.0,2817.0,2636.0,71.0,0.0,...,0.042130,0.069952,0.280639,0.156306,0.028419,0.465364,0.444972,0.446574,0.948718,0.599154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33019975600,3400.0,3400.0,1646.0,1754.0,48.9,1190.0,3400.0,3280.0,13.0,4.0,...,0.009919,0.064476,0.145180,0.054975,0.003871,0.204026,0.394001,0.474412,1.355462,0.228775
33019975700,5001.0,5001.0,2406.0,2595.0,45.7,2079.0,5001.0,4794.0,86.0,12.0,...,0.027409,0.023126,0.110240,0.016212,0.000000,0.126452,0.383535,0.466907,1.123136,0.088957
33019975800,5219.0,5219.0,2455.0,2764.0,38.6,2030.0,5219.0,5108.0,14.0,0.0,...,0.023659,0.029151,0.155026,0.040034,0.000000,0.195060,0.382737,0.453535,1.166010,0.079782
33019975901,4494.0,4494.0,2261.0,2233.0,45.9,1829.0,4494.0,4464.0,10.0,0.0,...,0.016529,0.060026,0.120805,0.006406,0.000000,0.127212,0.337924,0.511571,1.256971,0.165602


In [21]:
# Inspect the aligned 2016 Illinois frame, now indexed by full_ct_fips.
dic['illinois_2016']

Unnamed: 0_level_0,pop_total,sex_total,sex_male,sex_female,age_median,households,race_total,race_white,race_black,race_native,...,travel_walk_ratio,travel_work_home_ratio,edu_bachelor_ratio,edu_master_ratio,edu_phd_ratio,edu_higher_edu_ratio,employment_unemployed_ratio,vehicle_per_capita,vehicle_per_household,vacancy_ratio
full_ct_fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17001000100,4569.0,4569.0,2018.0,2551.0,49.7,2185.0,4569.0,4142.0,190.0,0.0,...,0.014939,0.030783,0.181128,0.047619,0.003798,0.232545,0.362019,0.483476,1.010984,0.037869
17001000201,1937.0,1937.0,965.0,972.0,43.1,858.0,1937.0,1767.0,63.0,9.0,...,0.021403,0.030916,0.147123,0.048798,0.008012,0.203933,0.441989,0.434177,0.980186,0.129817
17001000202,2563.0,2563.0,1188.0,1375.0,33.2,991.0,2563.0,2261.0,120.0,18.0,...,0.021214,0.038771,0.087252,0.044554,0.003094,0.134901,0.257215,0.533359,1.379415,0.142734
17001000400,3403.0,3403.0,1494.0,1909.0,41.9,1390.0,3403.0,2818.0,471.0,0.0,...,0.037376,0.025172,0.060504,0.015546,0.003782,0.079832,0.462751,0.385248,0.943165,0.240852
17001000500,2298.0,2298.0,1189.0,1109.0,39.1,861.0,2298.0,1864.0,400.0,0.0,...,0.099537,0.046296,0.062914,0.009272,0.000000,0.072185,0.462570,0.375979,1.003484,0.091772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17203030501,7732.0,7732.0,4044.0,3688.0,41.6,2743.0,7732.0,7398.0,75.0,0.0,...,0.003944,0.046542,0.305992,0.107039,0.018422,0.431452,0.326463,0.491852,1.386438,0.052831
17203030502,2692.0,2692.0,1415.0,1277.0,46.3,1159.0,2692.0,2611.0,6.0,4.0,...,0.007059,0.020392,0.148374,0.015752,0.005081,0.169207,0.376595,0.473626,1.100086,0.074281
17203030601,6673.0,6673.0,3081.0,3592.0,38.3,2490.0,6673.0,6465.0,46.0,0.0,...,0.062071,0.019027,0.251320,0.093642,0.023181,0.368143,0.390293,0.480444,1.287550,0.028861
17203030602,3494.0,3494.0,1780.0,1714.0,38.3,1231.0,3494.0,3430.0,0.0,0.0,...,0.003386,0.017494,0.195719,0.074705,0.016601,0.287025,0.328762,0.507155,1.439480,0.042768


In [22]:
# Inspect the aligned 2016 Florida frame, now indexed by full_ct_fips.
dic['florida_2016']

Unnamed: 0_level_0,pop_total,sex_total,sex_male,sex_female,age_median,households,race_total,race_white,race_black,race_native,...,travel_walk_ratio,travel_work_home_ratio,edu_bachelor_ratio,edu_master_ratio,edu_phd_ratio,edu_higher_edu_ratio,employment_unemployed_ratio,vehicle_per_capita,vehicle_per_household,vacancy_ratio
full_ct_fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12001000200,7295.0,7295.0,3456.0,3839.0,21.7,2311.0,7295.0,5666.0,1057.0,0.0,...,0.164133,0.051831,0.182698,0.169038,0.021059,0.372795,0.520614,0.396710,1.252272,0.272126
12001000301,3801.0,3801.0,1889.0,1912.0,27.8,1736.0,3801.0,1933.0,1455.0,13.0,...,0.020822,0.010678,0.196581,0.120558,0.026991,0.344130,0.345348,0.492765,1.078917,0.138462
12001000302,2426.0,2426.0,1092.0,1334.0,41.6,1156.0,2426.0,1498.0,512.0,17.0,...,0.012036,0.090271,0.136955,0.113186,0.019808,0.269949,0.387422,0.410965,0.862457,0.229333
12001000400,5658.0,5658.0,2623.0,3035.0,33.8,1983.0,5658.0,1929.0,3178.0,0.0,...,0.011979,0.018208,0.173762,0.064292,0.008978,0.247032,0.395999,0.368858,1.052446,0.192918
12001000500,5026.0,5026.0,2435.0,2591.0,33.0,2588.0,5026.0,3887.0,892.0,0.0,...,0.060218,0.063714,0.252083,0.161012,0.063988,0.477083,0.308218,0.512137,0.994590,0.057881
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12133970104,2718.0,2718.0,1355.0,1363.0,42.9,1003.0,2718.0,1750.0,934.0,2.0,...,0.000000,0.039116,0.070227,0.031652,0.007913,0.109792,0.395126,0.432671,1.172483,0.181224
12133970200,3476.0,3476.0,1787.0,1689.0,43.6,1339.0,3476.0,2983.0,231.0,91.0,...,0.014641,0.010249,0.034943,0.036514,0.002748,0.074205,0.469787,0.392980,1.020164,0.219697
12133970301,2565.0,2565.0,1132.0,1433.0,43.9,924.0,2565.0,2302.0,231.0,17.0,...,0.010078,0.004479,0.083473,0.044818,0.007283,0.135574,0.522738,0.348148,0.966450,0.256637
12133970302,6814.0,6814.0,4635.0,2179.0,41.1,1613.0,6814.0,4561.0,1852.0,129.0,...,0.007126,0.017221,0.051478,0.021354,0.000000,0.072831,0.683897,0.247138,1.044017,0.298391


# Create the growth dataframe (16 - 18)

In [23]:
# Annual growth rate between 2016 and 2018 for each region:
# (value_2018 - value_2016) / value_2016, divided by the 2 elapsed years
# (a simple, non-compounded rate) and rounded to 5 decimals.
var_of_interests = ['inc_per_capita', 'pop_total', 'property_value_median']
# var_of_interests_ratio = ['race_black_ratio', 'race_white_ratio', 'vacancy_ratio']

growth_column_names = {'inc_per_capita': 'inc_per_capita_annual_growth',
                       'pop_total':'pop_total_annual_growth',
                       'property_value_median': 'property_value_median_annual_growth'}

growth_rate_dic = {}
for region in ('boston', 'illinois', 'florida'):
    base_year = dic[region + '_2016'][var_of_interests]
    end_year = dic[region + '_2018'][var_of_interests]
    total_growth = (end_year - base_year)/base_year
    annual_growth = np.round(total_growth/(18-16), decimals = 5)
    growth_rate_dic[region] = annual_growth.rename(columns = growth_column_names)

growth_rate_dic

{'boston':               inc_per_capita_annual_growth  pop_total_annual_growth  \
 full_ct_fips                                                          
 25001010100                        0.03916                 -0.00034   
 25001010206                        0.04476                  0.04940   
 25001010208                       -0.00791                 -0.10541   
 25001010304                        0.12505                  0.07689   
 25001010306                        0.05647                 -0.06301   
 ...                                    ...                      ...   
 33019975600                        0.03934                 -0.03250   
 33019975700                       -0.00130                  0.00180   
 33019975800                        0.07481                 -0.01399   
 33019975901                        0.02663                 -0.01057   
 33019975902                        0.04907                  0.03551   
 
               property_value_median_annual_growth  

In [24]:
# # annual growth_rate_16_18
# var_of_interests = ['inc_per_capita', 'pop_total', 'property_value_median']
# var_of_interests_ratio = ['race_black_ratio', 'race_white_ratio', 'vacancy_ratio']
# growth_rate_16_18 = (greater_boston_socioecon_2018_df[var_of_interests] - greater_boston_socioecon_2016_df[var_of_interests])/greater_boston_socioecon_2016_df[var_of_interests]

# for var_ratio in var_of_interests_ratio:
#     growth_rate_16_18[var_ratio] = (greater_boston_socioecon_2018_df[var_ratio] - greater_boston_socioecon_2016_df[var_ratio])/2

# growth_rate_16_18 = np.round(growth_rate_16_18/(18-16), decimals = 5)

# # replace col names
# growth_rate_16_18 = growth_rate_16_18.rename(columns = {'inc_per_capita': 'inc_per_capita_annual_growth',
#                                                         'pop_total':'pop_total_annual_growth',
#                                                         'property_value_median': 'property_value_median_annual_growth',
#                                                         'race_black_ratio': 'race_black_ratio_annual_growth', 
#                                                         'race_white_ratio': 'race_white_ratio_annual_growth', 
#                                                         'vacancy_ratio': 'vacancy_ratio_annual_growth'})

# growth_rate_16_18

In [25]:
# Sanity check on the distribution of Boston's annual growth rates.
# growth rates seem quite reasonable. 
growth_rate_dic['boston'].describe()

Unnamed: 0,inc_per_capita_annual_growth,pop_total_annual_growth,property_value_median_annual_growth
count,1755.0,1755.0,1755.0
mean,0.05114,0.007059,0.041253
std,0.049441,0.029748,0.060258
min,-0.29893,-0.25472,-0.20832
25%,0.020685,-0.0088,0.01519
50%,0.04882,0.00563,0.03443
75%,0.078285,0.021425,0.059015
max,0.38499,0.33274,1.47153


#### combine three cities into total

# Save

In [26]:
# Persist both result dictionaries as pickles in the intermediate data folder.
outputs_to_save = [
    ('../../data/02_intermediate/socioecon_boston_florida_illinois_ct_dic.pickle', dic),
    ('../../data/02_intermediate/growth_boston_florida_illinois_ct_dic.pickle', growth_rate_dic),
]
for out_path, payload in outputs_to_save:
    with open(out_path, 'wb') as out_file:
        pickle.dump(payload, out_file)
    

In [27]:
# with open('../../data/02_intermediate/boston_socioecon_ct_2016.pickle', 'wb') as f:
#     pickle.dump(greater_boston_socioecon_2016_df, f)
    
# with open('../../data/02_intermediate/boston_socioecon_ct_2018.pickle', 'wb') as f:
#     pickle.dump(greater_boston_socioecon_2018_df, f)

# with open('../../data/02_intermediate/boston_annual_growth_ct_2016_2018.pickle', 'wb') as f:
#     pickle.dump(growth_rate_16_18, f)
    