In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.stats.api as sms
from scipy.stats import ttest_ind
from scipy.stats.mstats import zscore
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import scale
from statsmodels.tools.tools import add_constant

%matplotlib inline
tracts_data_path = 'data/features.csv'

  from pandas.core import datetools


In [2]:
# read these identifier columns as strings instead of letting pandas infer a dtype
dtypes = dict.fromkeys(['GEOID', 'place_geoid', 'state', 'county'], str)

# load the tract-level features, then index the rows by tract id
df = (pd.read_csv(tracts_data_path, encoding='utf-8', dtype=dtypes)
        .rename(columns={'GEOID' : 'tract'})
        .set_index('tract'))

# each tract must appear exactly once
assert df.index.is_unique

In [3]:
# number of tracts (rows), then a peek at the first few of them
print(df.shape[0])
df.head(5)

12328


Unnamed: 0_level_0,land_area,place_geoid,place_name,total_pop,median_age,prop_hispanic,prop_white,prop_black,prop_asian,prop_single_fam_detached,...,"Salt Lake City, UT","San Antonio, TX","San Diego, CA","San Francisco, CA","San Jose, CA","Seattle, WA","St. Louis, MO","Tampa, FL","Virginia Beach, VA","Washington, DC"
tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1073000100,7549578,107000,"Birmingham, AL",2970.0,32.2,0.046,0.171,0.785,0.0,0.705,...,0,0,0,0,0,0,0,0,0,0
1073000300,2093104,107000,"Birmingham, AL",2494.0,36.5,0.18,0.046,0.672,0.084,0.326,...,0,0,0,0,0,0,0,0,0,0
1073000400,8001582,107000,"Birmingham, AL",3437.0,30.6,0.007,0.079,0.908,0.0,0.897,...,0,0,0,0,0,0,0,0,0,0
1073000500,4819145,107000,"Birmingham, AL",3735.0,35.8,0.014,0.05,0.929,0.0,0.546,...,0,0,0,0,0,0,0,0,0,0
1073000700,3520564,107000,"Birmingham, AL",2562.0,25.4,0.025,0.0,0.977,0.0,0.569,...,0,0,0,0,0,0,0,0,0,0


## Organize our variables

In [4]:
# get the names of all candidate predictor variables: every column of df
# except the ones listed in not_predictors. the result is a pandas column
# Index (not a python set). the excluded list contains the response variables
# used later in this notebook ('bias_log' for OLS, 'is_over' for logit) and
# other columns that are not used as predictors.
# NOTE(review): the remaining excluded columns look like identifiers, geometry
# and alternative bias measures, judging by their names - confirm against the
# data dictionary.
not_predictors = ['bias_diff', 'bias_ratio', 'bias_log', 'bias_bc', 'centroid', 
                  'clist_count', 'county', 'geometry', 'is_over', 'land_area',
                  'lat_city_center', 'lng_city_center', 'place_geoid', 'place_name', 
                  'proportionate_count', 'state', 'prop_white_change_2012_2015']

# drop() raises a KeyError if any of these columns is missing from df
predictors_all = df.drop(columns=not_predictors).columns

In [5]:
# get all the predictors that are not dummy variables or log-transformed:
# skip any name containing 'dummy', a comma (the city dummy columns are named
# like 'Atlanta, GA'), or '_log', then sort what is left alphabetically
predictors_no_dummies = sorted(
    p for p in predictors_all
    if not any(marker in p for marker in ('dummy', ',', '_log'))
)

In [6]:
# the independent variables to include in our model (plus the city dummies)
# 13 column names in total; the regression cells below select them from df
# by name, so each entry must match a column of df exactly.
# 'prop_white*income_log' is a literal column name, not a formula.
# NOTE(review): presumably it holds the precomputed prop_white x log-income
# interaction term discussed under the regression results - confirm upstream.
predictors_reduced = ['distance_to_center_km_log', 'mean_travel_time_work_log', 'med_income_k_log', 
                      'med_rooms_in_house', 'median_gross_rent_k', 'prop_20_34', 'prop_built_before_1940', 
                      'prop_college_grad_student', 'prop_english_only', 'prop_same_residence_year_ago', 
                      'prop_white', 'prop_white*income_log', 'renter_household_size_log']

In [7]:
# get a smaller subset of key variables of interest
# these are used below for the per-city effect sizes (one column per variable
# in the city_effect_sizes table); none of them is a '_log' column
predictors_key = ['prop_white', 'renter_household_size', 'prop_below_poverty', 'prop_20_34',
                  'prop_bachelors_or_higher', 'med_income_k', 'median_gross_rent_k']

In [8]:
# the city dummies to control for regional differences
# 49 city indicator columns, named '<city>, <state>' exactly as in df.
# the data also has a 'Washington, DC' column that is not listed here.
# NOTE(review): presumably DC is left out on purpose as the reference category,
# since the models below also add a constant - confirm.
city_dummies = ['Atlanta, GA', 'Austin, TX', 'Baltimore, MD', 'Birmingham, AL', 'Boston, MA', 
                'Buffalo, NY', 'Charlotte, NC', 'Chicago, IL', 'Cincinnati, OH', 'Cleveland, OH', 
                'Columbus, OH', 'Dallas, TX', 'Denver, CO', 'Detroit, MI', 'Hartford, CT', 'Houston, TX', 
                'Indianapolis, IN', 'Jacksonville, FL', 'Kansas City, MO', 'Las Vegas, NV', 
                'Los Angeles, CA', 'Louisville, KY', 'Memphis, TN', 'Miami, FL', 'Milwaukee, WI', 
                'Minneapolis, MN', 'Nashville, TN', 'New Orleans, LA', 'New York, NY', 
                'Oklahoma City, OK', 'Orlando, FL', 'Philadelphia, PA', 'Phoenix, AZ', 
                'Pittsburgh, PA', 'Portland, OR', 'Providence, RI', 'Raleigh, NC', 'Richmond, VA', 
                'Riverside, CA', 'Sacramento, CA', 'Salt Lake City, UT', 'San Antonio, TX', 
                'San Diego, CA', 'San Francisco, CA', 'San Jose, CA', 'Seattle, WA', 'St. Louis, MO', 
                'Tampa, FL', 'Virginia Beach, VA']

## Race

In [9]:
# in how many tracts is each race the majority?
# one count per dummy column (white, black, hispanic, asian), then the total number of tracts
for race in ['white', 'black', 'hispanic', 'asian']:
    print(df['dummy_{}'.format(race)].sum())
len(df)

4868
2767
2266
231


12328

In [10]:
# what proportion of tracts with each of these races as the majority is over-represented?
# note: despite the *_odds names, each value is a proportion: the sum of
# is_over divided by the number of tracts in that group

# the tracts in which each race's dummy equals 1
white_tracts, asian_tracts, black_tracts, hisp_tracts = (
    df[df[dummy]==1]
    for dummy in ['dummy_white', 'dummy_asian', 'dummy_black', 'dummy_hispanic']
)

# share of each group's tracts that are over-represented, rounded to 4 places
white_odds, asian_odds, black_odds, hisp_odds = (
    round(tracts['is_over'].sum() / len(tracts), 4)
    for tracts in [white_tracts, asian_tracts, black_tracts, hisp_tracts]
)

print(white_odds, asian_odds, black_odds, hisp_odds)

# ratio of the majority-white proportion to each other group's proportion
for other_odds in [asian_odds, black_odds, hisp_odds]:
    print(round(white_odds / other_odds, 4))

0.3667 0.2165 0.1684 0.1112
1.6938
2.1776
3.2977


Majority-white tracts are overrepresented on Craigslist about 2x as often as majority-black tracts and about 3x as often as majority-Hispanic tracts.

## Gini

In [11]:
# gini coefficient measures how evenly some value is distributed among a set of buckets
# we can measure how evenly listings are distributed among tracts
def gini(list_of_values):
    """
    Compute the Gini coefficient of a collection of values.

    The values are sorted ascending and accumulated into a running total; the
    area under that cumulative curve is compared with the area under the
    straight line a perfectly even distribution would produce.

    Parameters
    ----------
    list_of_values : iterable of numbers
        the per-bucket values (e.g. listing counts per tract); any iterable
        works, including a pandas Series or a generator

    Returns
    -------
    float
        the coefficient rounded to 4 decimal places: 0 when every bucket holds
        the same value, approaching 1 as the total concentrates in one bucket.
        NaN is returned when the coefficient is undefined because the input is
        empty or its values sum to zero.
    """
    sorted_list = sorted(list_of_values)
    height, area = 0, 0
    for value in sorted_list:
        height += value
        # add the trapezoid under the cumulative curve for this bucket
        area += height - value / 2.

    # area under the line of perfect equality; measure the length of the sorted
    # copy so that iterables without len() (e.g. generators) also work
    fair_area = height * len(sorted_list) / 2.

    # nothing to distribute: avoid a ZeroDivisionError and report "undefined"
    if fair_area == 0:
        return float('nan')
    return round((fair_area - area) / fair_area, 4)

In [12]:
# nationwide: gini of actual listings, of the proportionate listing counts,
# and of renter-occupied units across all tracts
for column in ['clist_count', 'proportionate_count', 'count_renter_occupied_units']:
    print(gini(df[column]))

0.7958
0.5397
0.3838


The proportionate_count is a function of per-city count_renter_occupied_units, but their gini coefficients don't match nationwide because proportionate_count is assigned as per-city proportions, not nationwide.

In [13]:
# now examine gini coefficients for each city
data = {}
for name, group in df.groupby('place_name'):
    # within this city: concentration of the actual listings vs the proportionate counts
    clist_gini = gini(group['clist_count'])
    prop_gini = gini(group['proportionate_count'])
    data[name] = {'clist_gini' : clist_gini, 'prop_gini' : prop_gini}

# one row per city; ratio > 1 means the actual listings are more concentrated
# than the proportionate counts
ginis = pd.DataFrame(data).T
ginis['ratio'] = ginis['clist_gini'].div(ginis['prop_gini'])
ginis.sort_values(by='ratio', ascending=False).round(3)

Unnamed: 0,clist_gini,prop_gini,ratio
"Hartford, CT",0.826,0.262,3.155
"Miami, FL",0.789,0.284,2.783
"Philadelphia, PA",0.756,0.274,2.762
"Boston, MA",0.721,0.266,2.709
"Milwaukee, WI",0.769,0.29,2.654
"Buffalo, NY",0.616,0.236,2.607
"Providence, RI",0.591,0.229,2.579
"Detroit, MI",0.781,0.31,2.516
"Sacramento, CA",0.679,0.27,2.511
"Cleveland, OH",0.727,0.29,2.508


A higher Gini coefficient for the actual Craigslist listings suggests they are more concentrated into fewer tracts than a proportional distribution would be.

## *t*-tests and effect sizes for significant differences in variables

Divide the data into two subsets: overrepresented and underrepresented, then test if variables' means differ significantly between them.

In [14]:
# effect size: as cohen's d
def cohen_d(x, y):
    """
    Compute Cohen's d, the difference between two group means expressed in
    units of their pooled standard deviation.

    Missing values (NaN) are dropped from each group before anything is
    computed, so that the group sizes used to weight the pooled variance match
    the observations that the means and variances are based on. (Previously,
    for pandas Series input, the sizes counted NaNs while the means and
    standard deviations skipped them.)

    Parameters
    ----------
    x, y : array-like of numbers
        the observations in each group (lists, numpy arrays, pandas Series)

    Returns
    -------
    float
        (mean(x) - mean(y)) / pooled sample standard deviation; positive when
        x has the larger mean. NaN if either group has fewer than two
        non-missing observations.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]

    nx = len(x)
    ny = len(y)

    # a sample variance needs at least two observations per group
    if nx < 2 or ny < 2:
        return np.nan

    dof = nx + ny - 2
    # pooled variance: the two sample variances weighted by their degrees of freedom
    pooled_var = ((nx-1)*np.var(x, ddof=1) + (ny-1)*np.var(y, ddof=1)) / dof
    d = (np.mean(x) - np.mean(y)) / np.sqrt(pooled_var)
    return d

def test_differences(subset1, subset2, variables):
    """
    Compare each variable between two subsets of the data.

    For every name in `variables`, runs a two-sample t-test that does not
    assume equal variances (ignoring NaNs), and computes the difference in
    means and Cohen's d between subset1[var] and subset2[var].

    Parameters
    ----------
    subset1, subset2 : pandas.DataFrame
        the two groups to compare; both must contain every column in `variables`
    variables : iterable of str
        the column names to test

    Returns
    -------
    dict
        {variable: {'diff_mean', 't_stat', 'p_val', 'cohen_d'}} with the values
        rounded to 3, 2, 3 and 2 decimal places respectively; differences are
        subset1 minus subset2
    """
    def compare(var):
        # the same column taken from each subset
        values1, values2 = subset1[var], subset2[var]
        t_statistic, p_value = ttest_ind(a=values1, b=values2, equal_var=False, nan_policy='omit')
        mean_gap = values1.mean() - values2.mean()
        effect_size = cohen_d(x=values1, y=values2)
        return {'diff_mean' : round(mean_gap, 3),
                't_stat' : round(t_statistic, 2),
                'p_val' : round(p_value, 3),
                'cohen_d' : round(effect_size, 2)}

    return {var : compare(var) for var in variables}

In [15]:
# divide dataset into overrepresented tracts (is_over equals 1) and
# not overrepresented tracts (is_over equals 0)
over = df.loc[df['is_over'].eq(1)]
under = df.loc[df['is_over'].eq(0)]

In [16]:
# variables' effect sizes between over and underrepresented tracts
results = test_differences(over, under, predictors_no_dummies)

# one row per variable, largest positive effect size first
effect_sizes = pd.DataFrame(results).T
effect_sizes = effect_sizes.sort_values(by='cohen_d', ascending=False)

# show effect size, raw difference in means, and p-value (t-statistic omitted)
effect_sizes.reindex(columns=['cohen_d', 'diff_mean', 'p_val'])

Unnamed: 0,cohen_d,diff_mean,p_val
prop_bachelors_or_higher,0.8,0.171,0.0
med_income_k,0.59,17.104,0.0
prop_white,0.58,0.17,0.0
prop_college_grad_student,0.53,0.098,0.0
median_gross_rent_k,0.49,0.179,0.0
prop_20_34,0.39,0.039,0.0
prop_english_only,0.37,0.093,0.0
prop_nonrelatives_household,0.36,0.022,0.0
med_home_value_k,0.33,74.523,0.0
count_renter_occupied_units,0.13,80.796,0.0


"Cohen suggested that d=0.2 be considered a 'small' effect size, 0.5 represents a 'medium' effect size and 0.8 a 'large' effect size. This means that if two groups' means don't differ by 0.2 standard deviations or more, the difference is trivial, even if it is statistically signficant."

Perhaps we can interpret small-medium effect size as absolute value 0.3 <= x < 0.5?

d is not affected by units/sizes. So income and income_k will have same d.

In [17]:
# look at some smaller subset of key variables of interest, per city
def significance(p):
    """
    Return the marker that flags a p-value as statistically significant.

    Parameters
    ----------
    p : float
        the p-value to classify

    Returns
    -------
    str
        '*' when p <= 0.05, otherwise an empty string (also for NaN)
    """
    # NOTE(review): the earlier version tested p <= 0.01 and p <= 0.05
    # separately but returned '*' for both, so a single threshold is
    # equivalent; if '**' was intended for p <= 0.01, restore that branch
    return '*' if p <= 0.05 else ''

# for each city, compare over- vs under-represented tracts on the key variables
# and keep each effect size as a string like '0.58*' (marker from significance())
city_results = {}
for city, group in df.groupby('place_name'):
    group_over = group[group['is_over']==1]
    group_under = group[group['is_over']==0]
    group_results = test_differences(group_over, group_under, predictors_key)

    var_d = {}
    for variable, stats in group_results.items():
        marker = significance(stats['p_val'])
        var_d[variable] = '{:.2f}{}'.format(stats['cohen_d'], marker)
    city_results[city] = var_d

In [18]:
# one row per city, one column per key variable
city_effect_sizes = pd.DataFrame(city_results).T

# label the rows by city name only: keep the text before the first ', '
city_effect_sizes.index = [place.split(', ')[0] for place in city_effect_sizes.index]
city_effect_sizes.head()
#city_effect_sizes.sort_values(by='prop_white', ascending=False)

Unnamed: 0,med_income_k,median_gross_rent_k,prop_20_34,prop_bachelors_or_higher,prop_below_poverty,prop_white,renter_household_size
Atlanta,0.37,0.55*,0.31,0.58*,-0.37*,0.48*,-0.38*
Austin,0.44*,0.45*,-0.01,0.69*,-0.23,0.69*,-0.38*
Baltimore,0.80*,0.71*,1.07*,1.32*,-0.59*,1.15*,-0.71*
Birmingham,0.79*,0.89*,0.01,0.71*,-0.87*,0.50*,-0.10
Boston,0.41*,1.14*,1.26*,1.36*,0.03,0.98*,-1.24*


## Estimate regression models to predict Craigslist over- or under-representation

In [19]:
# define the response variable and the predictor variables:
# the reduced predictor set followed by the city dummies
response = 'bias_log'
predictors = [*predictors_reduced, *city_dummies]

In [20]:
# design matrix: keep only the tracts with no missing predictor values,
# printing the row count before and after
X = df[predictors]
print(len(X))
X = X.dropna()

# response values for exactly the tracts that remain in X
y = df.loc[X.index, response]
print(len(X))

12328
12224


In [21]:
# estimate a model across the full data set (all cities)
# add_constant adds the intercept column to the predictors
Xc = add_constant(X)
model = sm.OLS(endog=y, exog=Xc)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:               bias_log   R-squared:                       0.296
Model:                            OLS   Adj. R-squared:                  0.292
Method:                 Least Squares   F-statistic:                     82.30
Date:                Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                        17:17:49   Log-Likelihood:                -15804.
No. Observations:               12224   AIC:                         3.173e+04
Df Residuals:                   12161   BIC:                         3.220e+04
Df Model:                          62                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const           

^^ if we get warnings about multicollinearity, but have good VIF scores and significant variables, then check a standardized regression (below) to see if it's just scaling or the intercept/constant causing it (intercept shouldn't cause high condition number if we center/standardize our predictors). A high condition number indicates multicollinearity. Rule of thumb, you want this to be below ~20.

Durbin-Watson tests for autocorrelation. A value around 1.5 to 2.5 is considered fine.

Omnibus tests for normality of residuals; if prob < 0.05, we reject the null hypothesis that they are normally distributed. Skew and kurtosis describe their distribution.

Jarque-Bera tests for normality of residuals; if prob < 0.05, we reject the null hypothesis that they are normally distributed.

The interaction term shows that the positive effect of income matters less as a tract gets whiter, and that the positive effect of being white matters less as a tract gets richer.

In [22]:
# estimate a standardized model across the full data set (all cities)
# z-score the response and every predictor column (city dummies included),
# keeping the original index and names
y_stdrd = pd.Series(zscore(y), index=y.index, name=y.name)
X_stdrd = pd.DataFrame(zscore(X), index=X.index, columns=X.columns)

# add the intercept column and fit by ordinary least squares
Xc_stdrd = add_constant(X_stdrd)
model_stdrd = sm.OLS(endog=y_stdrd, exog=Xc_stdrd)
result_stdrd = model_stdrd.fit()
print(result_stdrd.summary())

                            OLS Regression Results                            
Dep. Variable:               bias_log   R-squared:                       0.296
Model:                            OLS   Adj. R-squared:                  0.292
Method:                 Least Squares   F-statistic:                     82.30
Date:                Tue, 13 Mar 2018   Prob (F-statistic):               0.00
Time:                        17:17:49   Log-Likelihood:                -15204.
No. Observations:               12224   AIC:                         3.053e+04
Df Residuals:                   12161   BIC:                         3.100e+04
Df Model:                          62                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const           

## Regression Diagnostics

In [23]:
# condition number to test for multicollinearity
# rule of thumb, you want this below 20
# computed on the design matrix of the standardized model (z-scored predictors
# plus the constant column), so differences in the variables' units do not
# inflate it
np.linalg.cond(model_stdrd.exog)

28.582493738292488

In [24]:
# plot observed (y-axis) vs fitted (x-axis)
observed = model.endog #actual response var
fitted = result.fittedvalues #predicted response var

fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(x=fitted, y=observed, s=0.2)
ax.set(xlabel='fitted', ylabel='observed', title='actual vs predicted')

# give both axes the same range, spanning every observed and fitted value,
# so that the y=x reference line runs corner to corner
all_values = np.append(observed, fitted)
axis_limits = (min(all_values), max(all_values))
ax.set_xlim(axis_limits)
ax.set_ylim(axis_limits)

# draw a 45° y=x line
ax.plot(ax.get_xlim(), ax.get_ylim(), ls='--', c='k', alpha=0.5)

fig.savefig('images/diagnostic_actual_vs_predicted.png', dpi=300, bbox_inches='tight')
plt.close()

In [25]:
# standardized residuals: the internally studentized residuals,
# taken from the influence measures of the full (all cities) OLS fit
influence = result.get_influence()
resids_stud = influence.resid_studentized_internal

In [26]:
# residuals plot for heteroskedasticity
# want this to look like a random point pattern with no discernible trend
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(x=result.fittedvalues, y=resids_stud, s=0.2)

# dashed reference line at zero residual
ax.axhline(y=0, ls='--', c='k', alpha=0.5)
ax.set(title='residuals vs fitted plot',
       xlabel='fitted values',
       ylabel='standardized residuals')

fig.savefig('images/diagnostic_residuals_vs_fitted.png', dpi=300, bbox_inches='tight')
plt.close()

In [27]:
# scale-location plot (aka spread-location plot)
# want this to look like a random point pattern with no discernible trend
# y-axis: square root of the absolute standardized residuals
resids_stud_abs_sqrt = np.sqrt(np.abs(resids_stud))

fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(x=result.fittedvalues, y=resids_stud_abs_sqrt, s=0.2)
ax.set(title='scale-location plot',
       xlabel='fitted values',
       ylabel='square-root absolute standardized residuals ')

fig.savefig('images/diagnostic_scale_location.png', dpi=300, bbox_inches='tight')
plt.close()

In [28]:
# are residuals approximately normally distributed?
# null hypothesis is normal dist, p-value < 0.05 means reject null
# typically want skew and kurtosis to be within -2 to 2
# but with sufficiently large sample size, we'll always reject the null
# NOTE(review): statsmodels' jarque_bera appears to report kurtosis on the scale
# where a normal distribution scores 3 (not excess kurtosis) - confirm in the
# docs before applying the -2 to 2 rule of thumb to the value printed here
jb_results = sms.jarque_bera(resids_stud)
jb, jb_p, skew, kurtosis = jb_results
print([round(value, 3) for value in (jb, jb_p, skew, kurtosis)])

[1078.97, 0.0, 0.558, 3.934]


In [29]:
# are residuals approximately normally distributed?
# visuals can be more useful than test-statistics
# histogram of the standardized residuals in 30 bins
fig, ax = plt.subplots(figsize=(6, 6))
resids_series = pd.Series(resids_stud)
ax = resids_series.hist(ax=ax, bins=30)
ax.set_title('standardized residuals histogram')
fig.savefig('images/diagnostic_residuals_histogram.png', dpi=300, bbox_inches='tight')
plt.close()

In [30]:
# are residuals approximately normally distributed?
# you want the points to tightly follow the line
# the hist above and qq plot below are ok, not terrible
fig, ax = plt.subplots(figsize=(6, 6))

# qq plot of the standardized residuals, drawn onto ax with a 45-degree reference line
fig = sm.qqplot(data=resids_stud, line='45', ax=ax)
ax.set_title('normal probability plot of the standardized residuals')
fig.savefig('images/diagnostic_residuals_qq_plot.png', dpi=300, bbox_inches='tight')
plt.close()

In [31]:
# create figure and axes: a near-square grid with one panel per predictor
n = len(predictors_reduced)
ncols = int(np.ceil(np.sqrt(n)))
nrows = int(np.ceil(n / ncols))

# squeeze=False always returns a 2-D array of axes, even when the grid has a
# single row or column, so flattening it works for any number of predictors
fig, axes = plt.subplots(nrows, ncols, figsize=(ncols*5, nrows*5), squeeze=False)
axes = axes.ravel().tolist()

# internally studentized residuals of the full (all cities) OLS fit
resids_stud = result.get_influence().resid_studentized_internal

# for each axis and variable, scatterplot the resids
# X must still be the design matrix the model was fitted on, so that its rows
# line up with the residuals
for ax, var in zip(axes, sorted(predictors_reduced)):
    ax.scatter(x=X[var], y=resids_stud, s=0.2)
    ax.set_xlabel(var)
    ax.set_ylabel('standardized residuals')

# the grid can have more panels than predictors: hide the unused ones
# instead of drawing empty axes
for unused_ax in axes[n:]:
    unused_ax.set_visible(False)

# save to disk, then close the figure (it is not displayed inline)
fig.savefig('images/scatter_resids_vs_predictors.jpg', bbox_inches='tight', dpi=150)
plt.close()

## Regression model for just one city

In [32]:
# subset data for a single city
place_name = 'New York, NY'
df_city = df[df['place_name']==place_name]

# how many of the city's tracts have bias_ratio above 1, and how many at or below 1
bias_ratio = df_city['bias_ratio']
print((bias_ratio>1).sum(), (bias_ratio<=1).sum())

449 1663


In [33]:
# single-city design matrix: the reduced predictors only (no city dummies),
# keeping only tracts with no missing values; print the row count before and after
X_city = df_city[predictors_reduced]
print(len(X_city))
X_city = X_city.dropna()

# response values for exactly the tracts that remain, then add the intercept column
y_city = df_city.loc[X_city.index, response]
print(len(X_city))
Xc_city = add_constant(X_city)

2112
2097


In [34]:
# estimate a model for this single city
model_city = sm.OLS(endog=y_city, exog=Xc_city)
result_city = model_city.fit()
print(result_city.summary())

                            OLS Regression Results                            
Dep. Variable:               bias_log   R-squared:                       0.384
Model:                            OLS   Adj. R-squared:                  0.380
Method:                 Least Squares   F-statistic:                     99.77
Date:                Tue, 13 Mar 2018   Prob (F-statistic):          8.10e-208
Time:                        17:17:54   Log-Likelihood:                -2161.9
No. Observations:                2097   AIC:                             4352.
Df Residuals:                    2083   BIC:                             4431.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const           

## Logit models

In [35]:
# logit design matrix: the reduced predictors only (no city dummies),
# keeping only tracts with no missing values; print the row count before and after
X_logit = df[predictors_reduced]
print(len(X_logit))
X_logit = X_logit.dropna()

# response is the is_over indicator for the tracts that remain; add the intercept column
y_logit = df.loc[X_logit.index, 'is_over']
print(len(X_logit))
Xc_logit = add_constant(X_logit)

12328
12224


In [36]:
%%time
# predict whether or not tract is overrepresented on craigslist (yes/no)
model_logit = sm.Logit(y_logit, Xc_logit)
result_logit = model_logit.fit()
print(result_logit.summary())

Optimization terminated successfully.
         Current function value: 0.491901
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                is_over   No. Observations:                12224
Model:                          Logit   Df Residuals:                    12210
Method:                           MLE   Df Model:                           13
Date:                Tue, 13 Mar 2018   Pseudo R-squ.:                  0.1234
Time:                        17:17:54   Log-Likelihood:                -6013.0
converged:                       True   LL-Null:                       -6859.7
                                        LLR p-value:                     0.000
                                   coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const                           -0.1203      0.549     -0.219      0.827

## Dimensionality reduction

### PCA with all the predictors

In [37]:
# predictors for dimensionality reduction: the reduced set, complete rows only
# note: this rebinds X, which held the regression design matrix above
X = df[predictors_reduced].dropna()

# standardize each column with sklearn's scale; keep the tract index
# (not just the column names) so every scaled row can still be matched to its tract
X = pd.DataFrame(scale(X.values), columns=X.columns, index=X.index)

In [38]:
# n dimensions: the number of principal components to keep
n = 5

# fit the PCA on the standardized predictors
pca = PCA(n_components=n)
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [39]:
# amount of variance that each component explains
# one value per fitted component (PC1 first), as a share of the total variance
pca.explained_variance_ratio_

array([0.28991701, 0.23720171, 0.13056765, 0.09509767, 0.06001126])

In [40]:
# cumulative variance explained
# sum the exact ratios first and round only the result, so that per-component
# rounding errors do not accumulate in the running total
np.round(np.cumsum(pca.explained_variance_ratio_), decimals=3)

array([0.29 , 0.527, 0.658, 0.753, 0.813])

In [41]:
# component loadings: one row per variable, one column per component (PC1..PCn),
# sorted by the loading on the first component
labels = ['PC{}'.format(number) for number in range(1, n+1)]
(pd.DataFrame(pca.components_, columns=X.columns, index=labels)
   .T
   .sort_values('PC1', ascending=False)
   .round(3))

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
prop_white,0.41,-0.262,-0.037,-0.056,-0.095
prop_white*income_log,0.4,-0.309,-0.004,-0.066,-0.061
prop_college_grad_student,0.372,0.195,0.228,0.084,-0.067
prop_20_34,0.286,0.326,0.235,0.187,0.114
med_income_k_log,0.231,-0.415,0.275,0.042,0.117
prop_english_only,0.23,-0.085,-0.464,-0.276,-0.089
median_gross_rent_k,0.152,-0.295,0.494,0.092,0.244
prop_built_before_1940,0.03,0.164,0.211,-0.722,0.29
med_rooms_in_house,-0.057,-0.427,-0.308,-0.085,0.351
distance_to_center_km_log,-0.194,-0.34,0.044,0.289,-0.438


### Factor analysis with all the predictors

In [42]:
# n factors: the number of latent factors to extract
n = 5

# fit the factor analysis on the standardized predictors,
# allowing up to 5000 iterations
fa = FactorAnalysis(n_components=n, max_iter=5000)
fa.fit(X)

FactorAnalysis(copy=True, iterated_power=3, max_iter=5000, n_components=5,
        noise_variance_init=None, random_state=0, svd_method='randomized',
        tol=0.01)

In [43]:
# factor loadings: one row per variable, one column per factor (Fac1..Facn),
# sorted by the loading on the first factor
labels = ['Fac{}'.format(number) for number in range(1, n+1)]
(pd.DataFrame(fa.components_, columns=X.columns, index=labels)
   .T
   .sort_values('Fac1', ascending=False)
   .round(3))

Unnamed: 0,Fac1,Fac2,Fac3,Fac4,Fac5
prop_white*income_log,0.998,-0.056,-0.002,0.001,-0.002
prop_white,0.987,-0.015,0.113,-0.044,-0.002
med_income_k_log,0.672,-0.274,-0.496,0.393,0.053
median_gross_rent_k,0.402,-0.302,-0.332,0.582,-0.025
prop_english_only,0.367,0.26,-0.133,-0.412,0.425
prop_college_grad_student,0.326,0.433,0.269,0.464,0.321
med_rooms_in_house,0.235,-0.519,-0.484,-0.458,0.328
prop_20_34,0.115,0.334,0.518,0.528,0.343
distance_to_center_km_log,-0.01,-0.418,-0.314,-0.103,-0.152
prop_built_before_1940,-0.039,0.176,0.084,0.071,-0.05
