In [1]:
# Load modules

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import geopandas as gpd
import mapclassify
import matplotlib.pyplot as plt
%matplotlib inline
from regressors import stats


# ML packages
import sklearn.metrics as metrics 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [2]:
ih = pd.read_csv("../data/final project/CLEAN_DATA/joined_data/ih_all.csv")
ih = ih.iloc[:, 1:]

nyc = pd.read_csv("../data/final project/CLEAN_DATA/joined_data/nyc.csv")
chicago = pd.read_csv("../data/final project/CLEAN_DATA/joined_data/chicago.csv")
sf = pd.read_csv("../data/final project/CLEAN_DATA/joined_data/sf.csv")
boston = pd.read_csv("../data/final project/CLEAN_DATA/joined_data/boston.csv")
dc = pd.read_csv("../data/final project/CLEAN_DATA/joined_data/dc.csv")

In [3]:
ih.head()

Unnamed: 0,zip,aff_units,aff_dev_rate,ih_prop,rental_all,rental_2014plus,med_hh_inc,pop,med_age,youth_pop,...,simpson,citizen_pop,housing_total,tot_dev_rate,dev_residual,resid_bin,aff_dev_bin,aff_unit_bin,ih_prop_bin,city
0,10001.0,1362.0,0.056475,0.007883,9712.0,1229.0,122776.0,24117.0,36.0,2101.0,...,0.623194,18676.0,14746.0,0.05096,0.005515,0.0,1.0,1.0,1.0,nyc
1,10002.0,876.0,0.011762,0.003167,28933.0,92.0,40045.0,74479.0,44.0,9477.0,...,0.763648,53646.0,35921.0,0.001235,0.010526,0.0,0.0,1.0,0.0,nyc
2,10003.0,0.0,0.0,0.0,17598.0,98.0,183787.0,53977.0,32.0,3374.0,...,0.4883,44096.0,30931.0,0.001816,-0.001816,0.0,0.0,0.0,0.0,nyc
3,10004.0,0.0,0.0,0.0,1104.0,0.0,205202.0,3335.0,37.0,382.0,...,0.564544,2537.0,2264.0,0.0,0.0,0.0,0.0,0.0,0.0,nyc
4,10005.0,0.0,0.0,0.0,3444.0,4.0,241094.0,8701.0,30.0,1037.0,...,0.46853,6542.0,5889.0,0.00046,-0.00046,0.0,0.0,0.0,0.0,nyc


### Affordable Development Rate

In [27]:
X = ih.iloc[:,6:-7]
y = ih["aff_dev_rate"]

X.head()

Unnamed: 0,med_hh_inc,pop,med_age,youth_pop,elderly_pop,white_pop,black_pop,asian_pop,pacisl_pop,other_race_pop,multiracial_pop,latinx_pop,simpson,citizen_pop,housing_total
0,122776.0,24117.0,36.0,2101.0,3490.0,15677.0,1510.0,5377.0,21.0,799.0,717.0,3741.0,0.623194,18676.0,14746.0
1,40045.0,74479.0,44.0,9477.0,16931.0,23849.0,6635.0,31198.0,22.0,8924.0,3127.0,19554.0,0.763648,53646.0,35921.0
2,183787.0,53977.0,32.0,3374.0,7215.0,40681.0,3007.0,7812.0,73.0,1155.0,1168.0,4559.0,0.4883,44096.0,30931.0
3,205202.0,3335.0,37.0,382.0,201.0,2033.0,142.0,992.0,17.0,35.0,116.0,108.0,0.564544,2537.0,2264.0
4,241094.0,8701.0,30.0,1037.0,93.0,6532.0,94.0,1543.0,0.0,217.0,315.0,552.0,0.46853,6542.0,5889.0


In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state=0)
print ("Train sample (X):", len(X_train))
print ("Train sample (y):", len(y_train))
print ("Test sample (X):", len(X_test))
print ("Test sample (y):", len(y_test))

lr = LinearRegression().fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
print ("Training set score: %.2f"% lr.score(X_train, y_train))
print('Mean squared error (training set): %.2f'% mean_squared_error(y_train, y_pred_train))
y_pred_test = lr.predict(X_test)
print ("Test set score: %.2f"% lr.score(X_test, y_test))
print('Mean squared error (test set): %.2f'% mean_squared_error(y_test, y_pred_test))

Train sample (X): 284
Train sample (y): 284
Test sample (X): 95
Test sample (y): 95
Training set score: 0.10
Mean squared error (training set): 0.00
Test set score: 0.02
Mean squared error (test set): 0.00


In [29]:
print ("List of predictors:", X.columns.tolist())
print ("List of regression coefficients:", lr.coef_)
print ("p-values of predictors:", stats.coef_pval(lr, X_train, y_train))
print ("Constant (intercept):",lr.intercept_)

result_ols = pd.DataFrame(columns=['Features', 'Coef', 'p-value'])
result_ols['Features'] = X.columns.tolist()
result_ols['Coef'] = lr.coef_
result_ols['p-value'] = stats.coef_pval(lr, X_train, y_train)[1:]
result_ols.round(20)

List of predictors: ['med_hh_inc', 'pop', 'med_age', 'youth_pop', 'elderly_pop', 'white_pop', 'black_pop', 'asian_pop', 'pacisl_pop', 'other_race_pop', 'multiracial_pop', 'latinx_pop', 'simpson', 'citizen_pop', 'housing_total']
List of regression coefficients: [ 2.79417493e-08  5.69981916e-06 -2.02824977e-04 -3.65184850e-09
 -2.40698088e-06 -5.50573262e-06 -5.24950390e-06 -5.54750852e-06
 -1.11807792e-05 -5.17141811e-06 -6.29162410e-06 -5.05846278e-07
  2.97092562e-02 -2.99463783e-07  5.94957497e-07]
p-values of predictors: [0.54782    0.06028491 0.27381612 0.11340151 0.99704445 0.00856617
 0.27077722 0.29357236 0.27674377 0.67474543 0.34422216 0.33843017
 0.0855498  0.00552137 0.58763985 0.22386657]
Constant (intercept): 0.00291759197587331


Unnamed: 0,Features,Coef,p-value
0,med_hh_inc,2.794175e-08,0.060285
1,pop,5.699819e-06,0.273816
2,med_age,-0.000202825,0.113402
3,youth_pop,-3.651849e-09,0.997044
4,elderly_pop,-2.406981e-06,0.008566
5,white_pop,-5.505733e-06,0.270777
6,black_pop,-5.249504e-06,0.293572
7,asian_pop,-5.547509e-06,0.276744
8,pacisl_pop,-1.118078e-05,0.674745
9,other_race_pop,-5.171418e-06,0.344222


In [25]:
X = nyc.iloc[:,6:-7]
y = nyc["aff_dev_rate"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state=0)
print ("Train sample (X):", len(X_train))
print ("Train sample (y):", len(y_train))
print ("Test sample (X):", len(X_test))
print ("Test sample (y):", len(y_test))

lr = LinearRegression().fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
print ("Training set score: %.2f"% lr.score(X_train, y_train))
print('Mean squared error (training set): %.2f'% mean_squared_error(y_train, y_pred_train))
y_pred_test = lr.predict(X_test)
print ("Test set score: %.2f"% lr.score(X_test, y_test))
print('Mean squared error (test set): %.2f'% mean_squared_error(y_test, y_pred_test))

print ("List of predictors:", X.columns.tolist())
print ("List of regression coefficients:", lr.coef_)
print ("p-values of predictors:", stats.coef_pval(lr, X_train, y_train))
print ("Constant (intercept):",lr.intercept_)

result_ols = pd.DataFrame(columns=['Features', 'Coef', 'p-value'])
result_ols['Features'] = X.columns.tolist()
result_ols['Coef'] = lr.coef_
result_ols['p-value'] = stats.coef_pval(lr, X_train, y_train)[1:]
result_ols.round(20)

Train sample (X): 132
Train sample (y): 132
Test sample (X): 45
Test sample (y): 45
Training set score: 0.19
Mean squared error (training set): 0.00
Test set score: 0.03
Mean squared error (test set): 0.00
List of predictors: ['rental_2014plus', 'med_hh_inc', 'pop', 'med_age', 'youth_pop', 'elderly_pop', 'white_pop', 'black_pop', 'asian_pop', 'pacisl_pop', 'other_race_pop', 'multiracial_pop', 'latinx_pop', 'simpson', 'citizen_pop', 'housing_total']
List of regression coefficients: [ 1.46999770e-05  3.51084125e-08 -3.85268507e-06 -1.81161977e-03
  7.49796648e-07  1.85537194e-06  2.92092706e-06  3.16899693e-06
  3.35835662e-06 -1.59220162e-05  2.98575861e-06  1.51443956e-06
  2.75776693e-07 -4.58318021e-03 -2.16955868e-09  5.44183884e-07]
p-values of predictors: [0.00723389 0.02251643 0.04466902 0.46521379 0.         0.47095584
 0.08422886 0.56651432 0.53348753 0.51764892 0.66250054 0.58628056
 0.81396612 0.32509789 0.78126596 0.99713358 0.30620962]
Constant (intercept): 0.07960839936315

Unnamed: 0,Features,Coef,p-value
0,rental_2014plus,1.469998e-05,0.022516
1,med_hh_inc,3.510841e-08,0.044669
2,pop,-3.852685e-06,0.465214
3,med_age,-0.00181162,0.0
4,youth_pop,7.497966e-07,0.470956
5,elderly_pop,1.855372e-06,0.084229
6,white_pop,2.920927e-06,0.566514
7,black_pop,3.168997e-06,0.533488
8,asian_pop,3.358357e-06,0.517649
9,pacisl_pop,-1.592202e-05,0.662501


In [31]:
X = chicago.iloc[:,6:-7]
y = chicago["aff_dev_rate"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state=0)
print ("Train sample (X):", len(X_train))
print ("Train sample (y):", len(y_train))
print ("Test sample (X):", len(X_test))
print ("Test sample (y):", len(y_test))

lr = LinearRegression().fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
print ("Training set score: %.2f"% lr.score(X_train, y_train))
print('Mean squared error (training set): %.2f'% mean_squared_error(y_train, y_pred_train))
y_pred_test = lr.predict(X_test)
print ("Test set score: %.2f"% lr.score(X_test, y_test))
print('Mean squared error (test set): %.2f'% mean_squared_error(y_test, y_pred_test))

print ("List of predictors:", X.columns.tolist())
print ("List of regression coefficients:", lr.coef_)
print ("p-values of predictors:", stats.coef_pval(lr, X_train, y_train))
print ("Constant (intercept):",lr.intercept_)

result_ols = pd.DataFrame(columns=['Features', 'Coef', 'p-value'])
result_ols['Features'] = X.columns.tolist()
result_ols['Coef'] = lr.coef_
result_ols['p-value'] = stats.coef_pval(lr, X_train, y_train)[1:]
result_ols.round(10)

Train sample (X): 45
Train sample (y): 45
Test sample (X): 15
Test sample (y): 15
Training set score: 0.49
Mean squared error (training set): 0.00
Test set score: 0.14
Mean squared error (test set): 0.00
List of predictors: ['rental_2014plus', 'med_hh_inc', 'pop', 'med_age', 'youth_pop', 'elderly_pop', 'white_pop', 'black_pop', 'asian_pop', 'pacisl_pop', 'other_race_pop', 'multiracial_pop', 'latinx_pop', 'simpson', 'citizen_pop', 'housing_total']
List of regression coefficients: [ 1.46854295e-06  1.06336225e-08 -3.06958834e-06 -1.18708992e-03
 -2.54390039e-06  1.01583676e-06  3.98711001e-06  4.11775110e-06
  3.89254458e-06 -2.03761640e-05  3.53459055e-06  3.97210010e-06
  3.28857326e-07 -6.21841732e-03 -1.49843128e-07 -1.26283815e-06]
p-values of predictors: [1.15974870e-02 6.99223789e-01 1.51631968e-01 3.14567503e-01
 1.11022302e-15 6.26889408e-03 2.84242339e-02 1.53929715e-01
 1.37910745e-01 1.96853179e-01 5.88061205e-01 2.23792016e-01
 2.70666143e-01 2.41991262e-02 3.31050287e-01 7.

Unnamed: 0,Features,Coef,p-value
0,rental_2014plus,1.4685e-06,0.699224
1,med_hh_inc,1.06e-08,0.151632
2,pop,-3.0696e-06,0.314568
3,med_age,-0.00118709,0.0
4,youth_pop,-2.5439e-06,0.006269
5,elderly_pop,1.0158e-06,0.028424
6,white_pop,3.9871e-06,0.15393
7,black_pop,4.1178e-06,0.137911
8,asian_pop,3.8925e-06,0.196853
9,pacisl_pop,-2.03762e-05,0.588061


In [32]:
X = dc.iloc[:,6:-7]
y = dc["aff_dev_rate"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state=0)
print ("Train sample (X):", len(X_train))
print ("Train sample (y):", len(y_train))
print ("Test sample (X):", len(X_test))
print ("Test sample (y):", len(y_test))

lr = LinearRegression().fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
print ("Training set score: %.2f"% lr.score(X_train, y_train))
print('Mean squared error (training set): %.2f'% mean_squared_error(y_train, y_pred_train))
y_pred_test = lr.predict(X_test)
print ("Test set score: %.2f"% lr.score(X_test, y_test))
print('Mean squared error (test set): %.2f'% mean_squared_error(y_test, y_pred_test))

print ("List of predictors:", X.columns.tolist())
print ("List of regression coefficients:", lr.coef_)
print ("p-values of predictors:", stats.coef_pval(lr, X_train, y_train))
print ("Constant (intercept):",lr.intercept_)

result_ols = pd.DataFrame(columns=['Features', 'Coef', 'p-value'])
result_ols['Features'] = X.columns.tolist()
result_ols['Coef'] = lr.coef_
result_ols['p-value'] = stats.coef_pval(lr, X_train, y_train)[1:]
result_ols.round(10)

Train sample (X): 39
Train sample (y): 39
Test sample (X): 14
Test sample (y): 14
Training set score: 0.84
Mean squared error (training set): 0.00
Test set score: -0.52
Mean squared error (test set): 0.00
List of predictors: ['rental_2014plus', 'med_hh_inc', 'pop', 'med_age', 'youth_pop', 'elderly_pop', 'white_pop', 'black_pop', 'asian_pop', 'pacisl_pop', 'other_race_pop', 'multiracial_pop', 'latinx_pop', 'simpson', 'citizen_pop', 'housing_total']
List of regression coefficients: [ 2.09770784e-05  2.49717994e-08  4.28610737e-07 -8.45287071e-05
 -1.48458602e-05  7.05353400e-06  9.80346770e-06  1.18290070e-05
  9.42875942e-06  1.40752401e-04  3.85754892e-05  5.01116415e-05
 -2.68274743e-05  1.14872120e-02 -1.47056041e-05  4.30696154e-06]
p-values of predictors: [8.95456449e-01 6.44508461e-02 1.22093154e-02 9.69537999e-01
 1.39263461e-01 6.60830295e-04 2.52871276e-04 2.69679701e-01
 1.84583302e-01 4.95773485e-01 1.29346927e-01 7.82946531e-03
 9.96132072e-03 2.45087313e-06 4.87116234e-02 1

Unnamed: 0,Features,Coef,p-value
0,rental_2014plus,2.09771e-05,0.064451
1,med_hh_inc,2.5e-08,0.012209
2,pop,4.286e-07,0.969538
3,med_age,-8.45287e-05,0.139263
4,youth_pop,-1.48459e-05,0.000661
5,elderly_pop,7.0535e-06,0.000253
6,white_pop,9.8035e-06,0.26968
7,black_pop,1.1829e-05,0.184583
8,asian_pop,9.4288e-06,0.495773
9,pacisl_pop,0.0001407524,0.129347


In [36]:
X = boston.iloc[:,6:-7]
y = boston["aff_dev_rate"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state=0)
print ("Train sample (X):", len(X_train))
print ("Train sample (y):", len(y_train))
print ("Test sample (X):", len(X_test))
print ("Test sample (y):", len(y_test))

lr = LinearRegression().fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
print ("Training set score: %.2f"% lr.score(X_train, y_train))
print('Mean squared error (training set): %.2f'% mean_squared_error(y_train, y_pred_train))
y_pred_test = lr.predict(X_test)
print ("Test set score: %.2f"% lr.score(X_test, y_test))
print('Mean squared error (test set): %.2f'% mean_squared_error(y_test, y_pred_test))

print ("List of predictors:", X.columns.tolist())
print ("List of regression coefficients:", lr.coef_)
print ("p-values of predictors:", stats.coef_pval(lr, X_train, y_train))
print ("Constant (intercept):",lr.intercept_)

result_ols = pd.DataFrame(columns=['Features', 'Coef', 'p-value'])
result_ols['Features'] = X.columns.tolist()
result_ols['Coef'] = lr.coef_
result_ols['p-value'] = stats.coef_pval(lr, X_train, y_train)[1:]
result_ols.round(5)

Train sample (X): 32
Train sample (y): 32
Test sample (X): 11
Test sample (y): 11
Training set score: 0.87
Mean squared error (training set): 0.00
Test set score: -5.32
Mean squared error (test set): 0.02
List of predictors: ['rental_2014plus', 'med_hh_inc', 'pop', 'med_age', 'youth_pop', 'elderly_pop', 'white_pop', 'black_pop', 'asian_pop', 'pacisl_pop', 'other_race_pop', 'multiracial_pop', 'latinx_pop', 'simpson', 'citizen_pop', 'housing_total']
List of regression coefficients: [-1.42966373e-05  2.05616713e-08 -2.91096817e-04  6.67786238e-04
  5.98841618e-05 -3.97321368e-05  2.55901138e-04  2.53552379e-04
  2.82792137e-04 -1.69031515e-04  2.46870381e-04  2.95294111e-04
  5.33351726e-06  7.36661756e-02  1.66337144e-05  3.80352644e-05]
p-values of predictors: [3.98803591e-01 5.06365060e-01 3.32712700e-01 7.83128170e-04
 1.68652359e-01 2.90958735e-07 1.54840327e-04 1.67000480e-03
 1.73493146e-03 1.13554478e-03 6.60446463e-01 2.53929887e-03
 2.96142214e-04 5.48052668e-02 9.66085246e-02 2

Unnamed: 0,Features,Coef,p-value
0,rental_2014plus,-1e-05,0.50637
1,med_hh_inc,0.0,0.33271
2,pop,-0.00029,0.00078
3,med_age,0.00067,0.16865
4,youth_pop,6e-05,0.0
5,elderly_pop,-4e-05,0.00015
6,white_pop,0.00026,0.00167
7,black_pop,0.00025,0.00173
8,asian_pop,0.00028,0.00114
9,pacisl_pop,-0.00017,0.66045


In [11]:
X = sf.iloc[:,6:-7]
y = sf["aff_dev_rate"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state=0)
print ("Train sample (X):", len(X_train))
print ("Train sample (y):", len(y_train))
print ("Test sample (X):", len(X_test))
print ("Test sample (y):", len(y_test))

lr = LinearRegression().fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
print ("Training set score: %.2f"% lr.score(X_train, y_train))
print('Mean squared error (training set): %.2f'% mean_squared_error(y_train, y_pred_train))
y_pred_test = lr.predict(X_test)
print ("Test set score: %.2f"% lr.score(X_test, y_test))
print('Mean squared error (test set): %.2f'% mean_squared_error(y_test, y_pred_test))

print ("List of predictors:", X.columns.tolist())
print ("List of regression coefficients:", lr.coef_)
print ("p-values of predictors:", stats.coef_pval(lr, X_train, y_train))
print ("Constant (intercept):",lr.intercept_)

result_ols = pd.DataFrame(columns=['Features', 'Coef', 'p-value'])
result_ols['Features'] = X.columns.tolist()
result_ols['Coef'] = lr.coef_
result_ols['p-value'] = stats.coef_pval(lr, X_train, y_train)[1:]
result_ols.round(3)

Train sample (X): 34
Train sample (y): 34
Test sample (X): 12
Test sample (y): 12
Training set score: 0.50
Mean squared error (training set): 0.00
Test set score: 0.08
Mean squared error (test set): 0.00
List of predictors: ['rental_2014plus', 'med_hh_inc', 'pop', 'med_age', 'youth_pop', 'elderly_pop', 'white_pop', 'black_pop', 'asian_pop', 'pacisl_pop', 'other_race_pop', 'multiracial_pop', 'latinx_pop', 'simpson', 'citizen_pop', 'housing_total']
List of regression coefficients: [ 1.78875650e-05  5.11290353e-09  9.84158121e-06  2.30719116e-04
  1.94627958e-06 -3.81276594e-06 -1.04219603e-05 -1.15447278e-05
 -9.95710429e-06 -1.99459008e-05 -8.35719431e-06 -1.48812864e-05
 -1.64255017e-06  3.18965830e-02  1.24972937e-06  2.93420110e-07]
p-values of predictors: [0.585434   0.11811174 0.61500024 0.26816847 0.104321   0.48582715
 0.17125322 0.19062078 0.1837284  0.21122016 0.48279578 0.56500888
 0.19078618 0.62279554 0.4640692  0.52493781 0.63248824]
Constant (intercept): -0.026227712108246

Unnamed: 0,Features,Coef,p-value
0,rental_2014plus,0.0,0.118
1,med_hh_inc,0.0,0.615
2,pop,0.0,0.268
3,med_age,0.0,0.104
4,youth_pop,0.0,0.486
5,elderly_pop,-0.0,0.171
6,white_pop,-0.0,0.191
7,black_pop,-0.0,0.184
8,asian_pop,-0.0,0.211
9,pacisl_pop,-0.0,0.483


### Residual Development

In [37]:
X = ih.iloc[:,6:-7]
y = ih["dev_residual"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state=0)
print ("Train sample (X):", len(X_train))
print ("Train sample (y):", len(y_train))
print ("Test sample (X):", len(X_test))
print ("Test sample (y):", len(y_test))

lr = LinearRegression().fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
print ("Training set score: %.2f"% lr.score(X_train, y_train))
print('Mean squared error (training set): %.2f'% mean_squared_error(y_train, y_pred_train))
y_pred_test = lr.predict(X_test)
print ("Test set score: %.2f"% lr.score(X_test, y_test))
print('Mean squared error (test set): %.2f'% mean_squared_error(y_test, y_pred_test))

print ("List of predictors:", X.columns.tolist())
print ("List of regression coefficients:", lr.coef_)
print ("p-values of predictors:", stats.coef_pval(lr, X_train, y_train))
print ("Constant (intercept):",lr.intercept_)

result_ols = pd.DataFrame(columns=['Features', 'Coef', 'p-value'])
result_ols['Features'] = X.columns.tolist()
result_ols['Coef'] = lr.coef_
result_ols['p-value'] = stats.coef_pval(lr, X_train, y_train)[1:]
result_ols.round(10)

Train sample (X): 284
Train sample (y): 284
Test sample (X): 95
Test sample (y): 95
Training set score: 0.04
Mean squared error (training set): 0.00
Test set score: 0.03
Mean squared error (test set): 0.00
List of predictors: ['med_hh_inc', 'pop', 'med_age', 'youth_pop', 'elderly_pop', 'white_pop', 'black_pop', 'asian_pop', 'pacisl_pop', 'other_race_pop', 'multiracial_pop', 'latinx_pop', 'simpson', 'citizen_pop', 'housing_total']
List of regression coefficients: [-5.50748017e-08 -1.91850930e-07 -6.25655772e-05 -1.19660076e-07
 -8.27163707e-07  4.27416041e-07  4.83286532e-07  1.31033413e-07
 -5.08442157e-06  9.96914169e-07  5.81062416e-07 -5.97818550e-07
  1.55389843e-02  8.19416519e-08 -1.25894662e-07]
p-values of predictors: [4.66477157e-01 9.21878674e-04 9.73505925e-01 6.59395131e-01
 9.12945880e-01 4.13221857e-01 9.38554781e-01 9.30529409e-01
 9.81519356e-01 8.63511812e-01 8.69449779e-01 9.36480316e-01
 6.73145375e-02 1.88773357e-01 8.93658058e-01 8.16451388e-01]
Constant (intercept

Unnamed: 0,Features,Coef,p-value
0,med_hh_inc,-5.51e-08,0.000922
1,pop,-1.919e-07,0.973506
2,med_age,-6.25656e-05,0.659395
3,youth_pop,-1.197e-07,0.912946
4,elderly_pop,-8.272e-07,0.413222
5,white_pop,4.274e-07,0.938555
6,black_pop,4.833e-07,0.930529
7,asian_pop,1.31e-07,0.981519
8,pacisl_pop,-5.0844e-06,0.863512
9,other_race_pop,9.969e-07,0.86945


#### Individual Cities

In [13]:
X = nyc.iloc[:,6:-7]
y = nyc["dev_residual"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state=0)
print ("Train sample (X):", len(X_train))
print ("Train sample (y):", len(y_train))
print ("Test sample (X):", len(X_test))
print ("Test sample (y):", len(y_test))

lr = LinearRegression().fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
print ("Training set score: %.2f"% lr.score(X_train, y_train))
print('Mean squared error (training set): %.2f'% mean_squared_error(y_train, y_pred_train))
y_pred_test = lr.predict(X_test)
print ("Test set score: %.2f"% lr.score(X_test, y_test))
print('Mean squared error (test set): %.2f'% mean_squared_error(y_test, y_pred_test))

print ("List of predictors:", X.columns.tolist())
print ("List of regression coefficients:", lr.coef_)
print ("p-values of predictors:", stats.coef_pval(lr, X_train, y_train))
print ("Constant (intercept):",lr.intercept_)

result_ols = pd.DataFrame(columns=['Features', 'Coef', 'p-value'])
result_ols['Features'] = X.columns.tolist()
result_ols['Coef'] = lr.coef_
result_ols['p-value'] = stats.coef_pval(lr, X_train, y_train)[1:]
result_ols.round(3)

Train sample (X): 132
Train sample (y): 132
Test sample (X): 45
Test sample (y): 45
Training set score: 0.12
Mean squared error (training set): 0.00
Test set score: -0.19
Mean squared error (test set): 0.00
List of predictors: ['rental_2014plus', 'med_hh_inc', 'pop', 'med_age', 'youth_pop', 'elderly_pop', 'white_pop', 'black_pop', 'asian_pop', 'pacisl_pop', 'other_race_pop', 'multiracial_pop', 'latinx_pop', 'simpson', 'citizen_pop', 'housing_total']
List of regression coefficients: [-1.48287201e-05  2.51379131e-08 -3.25777623e-06 -1.38685966e-03
  8.47767108e-07  7.67820373e-07  2.28348021e-06  2.48840225e-06
  2.71177010e-06 -9.48972719e-06  2.49987878e-06  9.92010981e-07
  1.98378913e-07 -2.11157645e-03  3.49796186e-07  6.97126725e-07]
p-values of predictors: [4.24686383e-02 1.85319109e-02 1.39658743e-01 5.27128118e-01
 2.70894418e-14 4.04133620e-01 4.62331180e-01 6.46318838e-01
 6.16554924e-01 5.92654166e-01 7.89933720e-01 6.40788974e-01
 8.74598951e-01 4.68242287e-01 8.95781811e-01

Unnamed: 0,Features,Coef,p-value
0,rental_2014plus,-0.0,0.019
1,med_hh_inc,0.0,0.14
2,pop,-0.0,0.527
3,med_age,-0.001,0.0
4,youth_pop,0.0,0.404
5,elderly_pop,0.0,0.462
6,white_pop,0.0,0.646
7,black_pop,0.0,0.617
8,asian_pop,0.0,0.593
9,pacisl_pop,-0.0,0.79


In [14]:
X = chicago.iloc[:,6:-7]
y = chicago["dev_residual"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state=0)
print ("Train sample (X):", len(X_train))
print ("Train sample (y):", len(y_train))
print ("Test sample (X):", len(X_test))
print ("Test sample (y):", len(y_test))

lr = LinearRegression().fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
print ("Training set score: %.2f"% lr.score(X_train, y_train))
print('Mean squared error (training set): %.2f'% mean_squared_error(y_train, y_pred_train))
y_pred_test = lr.predict(X_test)
print ("Test set score: %.2f"% lr.score(X_test, y_test))
print('Mean squared error (test set): %.2f'% mean_squared_error(y_test, y_pred_test))

print ("List of predictors:", X.columns.tolist())
print ("List of regression coefficients:", lr.coef_)
print ("p-values of predictors:", stats.coef_pval(lr, X_train, y_train))
print ("Constant (intercept):",lr.intercept_)

result_ols = pd.DataFrame(columns=['Features', 'Coef', 'p-value'])
result_ols['Features'] = X.columns.tolist()
result_ols['Coef'] = lr.coef_
result_ols['p-value'] = stats.coef_pval(lr, X_train, y_train)[1:]
result_ols.round(3)

Train sample (X): 45
Train sample (y): 45
Test sample (X): 15
Test sample (y): 15
Training set score: 0.62
Mean squared error (training set): 0.00
Test set score: 0.04
Mean squared error (test set): 0.00
List of predictors: ['rental_2014plus', 'med_hh_inc', 'pop', 'med_age', 'youth_pop', 'elderly_pop', 'white_pop', 'black_pop', 'asian_pop', 'pacisl_pop', 'other_race_pop', 'multiracial_pop', 'latinx_pop', 'simpson', 'citizen_pop', 'housing_total']
List of regression coefficients: [-3.55196562e-05 -5.42667906e-08 -5.20164355e-06 -1.43109050e-03
 -1.75043581e-06  2.18052691e-06  4.69499140e-06  4.68118951e-06
  4.79641744e-06  2.32741141e-05  4.65807142e-06  5.57679911e-06
  4.56068650e-07 -6.08730386e-03  1.16994270e-06 -6.89922336e-07]
p-values of predictors: [6.13764517e-02 4.31272732e-07 2.64401963e-05 2.83249049e-01
 7.65676411e-12 2.19757539e-01 3.71049531e-03 2.87553480e-01
 2.84915412e-01 3.14469936e-01 6.96401900e-01 3.11109238e-01
 3.29051958e-01 4.73668326e-02 5.47433906e-01 1.

Unnamed: 0,Features,Coef,p-value
0,rental_2014plus,-0.0,0.0
1,med_hh_inc,-0.0,0.0
2,pop,-0.0,0.283
3,med_age,-0.001,0.0
4,youth_pop,-0.0,0.22
5,elderly_pop,0.0,0.004
6,white_pop,0.0,0.288
7,black_pop,0.0,0.285
8,asian_pop,0.0,0.314
9,pacisl_pop,0.0,0.696


In [15]:
X = dc.iloc[:,6:-7]
y = dc["dev_residual"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state=0)
print ("Train sample (X):", len(X_train))
print ("Train sample (y):", len(y_train))
print ("Test sample (X):", len(X_test))
print ("Test sample (y):", len(y_test))

lr = LinearRegression().fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
print ("Training set score: %.2f"% lr.score(X_train, y_train))
print('Mean squared error (training set): %.2f'% mean_squared_error(y_train, y_pred_train))
y_pred_test = lr.predict(X_test)
print ("Test set score: %.2f"% lr.score(X_test, y_test))
print('Mean squared error (test set): %.2f'% mean_squared_error(y_test, y_pred_test))

print ("List of predictors:", X.columns.tolist())
print ("List of regression coefficients:", lr.coef_)
print ("p-values of predictors:", stats.coef_pval(lr, X_train, y_train))
print ("Constant (intercept):",lr.intercept_)

result_ols = pd.DataFrame(columns=['Features', 'Coef', 'p-value'])
result_ols['Features'] = X.columns.tolist()
result_ols['Coef'] = lr.coef_
result_ols['p-value'] = stats.coef_pval(lr, X_train, y_train)[1:]
result_ols.round(3)

Train sample (X): 39
Train sample (y): 39
Test sample (X): 14
Test sample (y): 14
Training set score: 0.89
Mean squared error (training set): 0.00
Test set score: 0.22
Mean squared error (test set): 0.00
List of predictors: ['rental_2014plus', 'med_hh_inc', 'pop', 'med_age', 'youth_pop', 'elderly_pop', 'white_pop', 'black_pop', 'asian_pop', 'pacisl_pop', 'other_race_pop', 'multiracial_pop', 'latinx_pop', 'simpson', 'citizen_pop', 'housing_total']
List of regression coefficients: [ 1.10391437e-05  3.29388164e-09  1.22654705e-05 -4.07344673e-05
 -1.41727732e-05  3.74344913e-06  1.43526144e-06  4.52381318e-06
  8.02301892e-07 -1.33010365e-04 -5.24543442e-06  2.26431955e-05
 -3.51774420e-07  8.42736206e-03 -1.98130722e-05  3.42887017e-06]
p-values of predictors: [8.75719557e-01 1.84039022e-01 6.41990236e-01 1.45753349e-01
 3.32041325e-01 2.61962356e-05 6.28922058e-03 8.25960879e-01
 4.89601835e-01 9.37437883e-01 5.52146171e-02 6.09205190e-01
 1.06138251e-01 9.22419254e-01 5.07902646e-02 1.

Unnamed: 0,Features,Coef,p-value
0,rental_2014plus,0.0,0.184
1,med_hh_inc,0.0,0.642
2,pop,0.0,0.146
3,med_age,-0.0,0.332
4,youth_pop,-0.0,0.0
5,elderly_pop,0.0,0.006
6,white_pop,0.0,0.826
7,black_pop,0.0,0.49
8,asian_pop,0.0,0.937
9,pacisl_pop,-0.0,0.055


In [16]:
X = boston.iloc[:,6:-7]
y = boston["dev_residual"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state=0)
print ("Train sample (X):", len(X_train))
print ("Train sample (y):", len(y_train))
print ("Test sample (X):", len(X_test))
print ("Test sample (y):", len(y_test))

lr = LinearRegression().fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
print ("Training set score: %.2f"% lr.score(X_train, y_train))
print('Mean squared error (training set): %.2f'% mean_squared_error(y_train, y_pred_train))
y_pred_test = lr.predict(X_test)
print ("Test set score: %.2f"% lr.score(X_test, y_test))
print('Mean squared error (test set): %.2f'% mean_squared_error(y_test, y_pred_test))

print ("List of predictors:", X.columns.tolist())
print ("List of regression coefficients:", lr.coef_)
print ("p-values of predictors:", stats.coef_pval(lr, X_train, y_train))
print ("Constant (intercept):",lr.intercept_)

result_ols = pd.DataFrame(columns=['Features', 'Coef', 'p-value'])
result_ols['Features'] = X.columns.tolist()
result_ols['Coef'] = lr.coef_
result_ols['p-value'] = stats.coef_pval(lr, X_train, y_train)[1:]
result_ols.round(3)

Train sample (X): 32
Train sample (y): 32
Test sample (X): 11
Test sample (y): 11
Training set score: 0.68
Mean squared error (training set): 0.00
Test set score: -7.52
Mean squared error (test set): 0.03
List of predictors: ['rental_2014plus', 'med_hh_inc', 'pop', 'med_age', 'youth_pop', 'elderly_pop', 'white_pop', 'black_pop', 'asian_pop', 'pacisl_pop', 'other_race_pop', 'multiracial_pop', 'latinx_pop', 'simpson', 'citizen_pop', 'housing_total']
List of regression coefficients: [-1.95252900e-04 -3.90749487e-08 -2.23785243e-04 -8.47324703e-04
  8.06085974e-05 -3.74446383e-05  1.73618921e-04  1.68257926e-04
  2.11522813e-04 -5.40841013e-04  1.65901645e-04  2.11038177e-04
  1.27411169e-05  5.77969073e-02  2.44671335e-05  5.01650452e-05]
p-values of predictors: [8.87108372e-01 2.80718174e-05 3.25389155e-01 1.36271029e-01
 3.46636235e-01 5.33591029e-05 3.78950637e-02 2.21317848e-01
 2.33258322e-01 1.61724911e-01 4.53951642e-01 2.47204134e-01
 1.29773603e-01 1.60203726e-02 4.77901398e-01 3

Unnamed: 0,Features,Coef,p-value
0,rental_2014plus,-0.0,0.0
1,med_hh_inc,-0.0,0.325
2,pop,-0.0,0.136
3,med_age,-0.001,0.347
4,youth_pop,0.0,0.0
5,elderly_pop,-0.0,0.038
6,white_pop,0.0,0.221
7,black_pop,0.0,0.233
8,asian_pop,0.0,0.162
9,pacisl_pop,-0.001,0.454


In [17]:
X = sf.iloc[:,6:-7]
y = sf["dev_residual"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state=0)
print ("Train sample (X):", len(X_train))
print ("Train sample (y):", len(y_train))
print ("Test sample (X):", len(X_test))
print ("Test sample (y):", len(y_test))

lr = LinearRegression().fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
print ("Training set score: %.2f"% lr.score(X_train, y_train))
print('Mean squared error (training set): %.2f'% mean_squared_error(y_train, y_pred_train))
y_pred_test = lr.predict(X_test)
print ("Test set score: %.2f"% lr.score(X_test, y_test))
print('Mean squared error (test set): %.2f'% mean_squared_error(y_test, y_pred_test))

print ("List of predictors:", X.columns.tolist())
print ("List of regression coefficients:", lr.coef_)
print ("p-values of predictors:", stats.coef_pval(lr, X_train, y_train))
print ("Constant (intercept):",lr.intercept_)

result_ols = pd.DataFrame(columns=['Features', 'Coef', 'p-value'])
result_ols['Features'] = X.columns.tolist()
result_ols['Coef'] = lr.coef_
result_ols['p-value'] = stats.coef_pval(lr, X_train, y_train)[1:]
result_ols.round(3)

Train sample (X): 34
Train sample (y): 34
Test sample (X): 12
Test sample (y): 12
Training set score: 0.57
Mean squared error (training set): 0.00
Test set score: -9.78
Mean squared error (test set): 0.00
List of predictors: ['rental_2014plus', 'med_hh_inc', 'pop', 'med_age', 'youth_pop', 'elderly_pop', 'white_pop', 'black_pop', 'asian_pop', 'pacisl_pop', 'other_race_pop', 'multiracial_pop', 'latinx_pop', 'simpson', 'citizen_pop', 'housing_total']
List of regression coefficients: [-1.93737211e-05 -5.40352838e-08  5.23316480e-05 -2.89888308e-03
  1.94709208e-06  1.56853585e-05 -5.46496535e-05 -5.62186242e-05
 -5.74947486e-05 -3.32081854e-06 -7.19481716e-05 -6.16975823e-05
  1.12829017e-05  1.43561012e-02  1.89321751e-06 -1.93090232e-06]
p-values of predictors: [2.26512097e-01 3.07145845e-01 3.01037558e-03 1.10533073e-03
 4.35207426e-14 6.76563753e-01 1.62098103e-03 2.00142805e-04
 3.91528944e-04 1.08576731e-04 9.44192281e-01 5.28327052e-03
 2.29332839e-03 4.98625168e-02 8.43485874e-01 5

Unnamed: 0,Features,Coef,p-value
0,rental_2014plus,-0.0,0.307
1,med_hh_inc,-0.0,0.003
2,pop,0.0,0.001
3,med_age,-0.003,0.0
4,youth_pop,0.0,0.677
5,elderly_pop,0.0,0.002
6,white_pop,-0.0,0.0
7,black_pop,-0.0,0.0
8,asian_pop,-0.0,0.0
9,pacisl_pop,-0.0,0.944


### IH Allocation

In [38]:
X = ih.iloc[:,6:-7]
y = ih["ih_prop"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state=0)
print ("Train sample (X):", len(X_train))
print ("Train sample (y):", len(y_train))
print ("Test sample (X):", len(X_test))
print ("Test sample (y):", len(y_test))

lr = LinearRegression().fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
print ("Training set score: %.2f"% lr.score(X_train, y_train))
print('Mean squared error (training set): %.2f'% mean_squared_error(y_train, y_pred_train))
y_pred_test = lr.predict(X_test)
print ("Test set score: %.2f"% lr.score(X_test, y_test))
print('Mean squared error (test set): %.2f'% mean_squared_error(y_test, y_pred_test))

print ("List of predictors:", X.columns.tolist())
print ("List of regression coefficients:", lr.coef_)
print ("p-values of predictors:", stats.coef_pval(lr, X_train, y_train))
print ("Constant (intercept):",lr.intercept_)

result_ols = pd.DataFrame(columns=['Features', 'Coef', 'p-value'])
result_ols['Features'] = X.columns.tolist()
result_ols['Coef'] = lr.coef_
result_ols['p-value'] = stats.coef_pval(lr, X_train, y_train)[1:]
result_ols.round(10)

Train sample (X): 284
Train sample (y): 284
Test sample (X): 95
Test sample (y): 95
Training set score: 0.14
Mean squared error (training set): 0.00
Test set score: 0.09
Mean squared error (test set): 0.00
List of predictors: ['med_hh_inc', 'pop', 'med_age', 'youth_pop', 'elderly_pop', 'white_pop', 'black_pop', 'asian_pop', 'pacisl_pop', 'other_race_pop', 'multiracial_pop', 'latinx_pop', 'simpson', 'citizen_pop', 'housing_total']
List of regression coefficients: [ 2.72724840e-09  1.24290001e-05 -1.05890782e-04  8.73923001e-07
 -5.58062972e-06 -1.31847148e-05 -1.27376710e-05 -1.25529774e-05
 -1.01050193e-05 -1.27797956e-05 -1.46607039e-05 -5.43932451e-07
  1.74869948e-02  1.33677716e-06  1.04031198e-06]
p-values of predictors: [6.53341166e-01 8.87894810e-01 6.79522547e-02 5.25695756e-01
 4.97083250e-01 3.99302170e-06 4.37969949e-02 5.13561240e-02
 5.98016261e-02 7.71291414e-01 7.38230415e-02 8.79178836e-02
 1.56169626e-01 2.08188709e-01 6.43199534e-02 1.03477312e-01]
Constant (intercept

Unnamed: 0,Features,Coef,p-value
0,med_hh_inc,2.7e-09,0.887895
1,pop,1.2429e-05,0.067952
2,med_age,-0.0001058908,0.525696
3,youth_pop,8.739e-07,0.497083
4,elderly_pop,-5.5806e-06,4e-06
5,white_pop,-1.31847e-05,0.043797
6,black_pop,-1.27377e-05,0.051356
7,asian_pop,-1.2553e-05,0.059802
8,pacisl_pop,-1.0105e-05,0.771291
9,other_race_pop,-1.27798e-05,0.073823


#### Individual Cities

In [19]:
X = nyc.iloc[:,6:-7]
y = nyc["ih_prop"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state=0)

print ("Train sample (X):", len(X_train))
print ("Train sample (y):", len(y_train))
print ("Test sample (X):", len(X_test))
print ("Test sample (y):", len(y_test))

lr = LinearRegression().fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
print ("Training set score: %.2f"% lr.score(X_train, y_train))
print('Mean squared error (training set): %.2f'% mean_squared_error(y_train, y_pred_train))
y_pred_test = lr.predict(X_test)
print ("Test set score: %.2f"% lr.score(X_test, y_test))
print('Mean squared error (test set): %.2f'% mean_squared_error(y_test, y_pred_test))

print ("List of predictors:", X.columns.tolist())
print ("List of regression coefficients:", lr.coef_)
print ("p-values of predictors:", stats.coef_pval(lr, X_train, y_train))
print ("Constant (intercept):",lr.intercept_)

result_ols = pd.DataFrame(columns=['Features', 'Coef', 'p-value'])
result_ols['Features'] = X.columns.tolist()
result_ols['Coef'] = lr.coef_
result_ols['p-value'] = stats.coef_pval(lr, X_train, y_train)[1:]
result_ols.round(3)

Train sample (X): 132
Train sample (y): 132
Test sample (X): 45
Test sample (y): 45
Training set score: 0.22
Mean squared error (training set): 0.00
Test set score: 0.18
Mean squared error (test set): 0.00
List of predictors: ['rental_2014plus', 'med_hh_inc', 'pop', 'med_age', 'youth_pop', 'elderly_pop', 'white_pop', 'black_pop', 'asian_pop', 'pacisl_pop', 'other_race_pop', 'multiracial_pop', 'latinx_pop', 'simpson', 'citizen_pop', 'housing_total']
List of regression coefficients: [ 2.77031764e-06 -2.94247667e-09 -6.03158158e-07 -7.69347507e-04
  1.15169313e-06 -1.03802555e-07 -2.85459282e-07 -1.01978426e-07
  5.50285032e-08 -1.32326616e-05 -1.32090448e-07 -1.28583323e-06
  1.12588665e-07  1.86447850e-03  3.08972622e-07  9.60323012e-07]
p-values of predictors: [8.46696356e-02 4.72544209e-01 7.78918448e-01 8.49719507e-01
 3.37307959e-12 6.82162932e-02 8.72210468e-01 9.26058869e-01
 9.73513780e-01 9.85985031e-01 5.48245266e-01 9.68183589e-01
 7.40826364e-01 5.05519205e-01 8.51629810e-01 

Unnamed: 0,Features,Coef,p-value
0,rental_2014plus,0.0,0.473
1,med_hh_inc,-0.0,0.779
2,pop,-0.0,0.85
3,med_age,-0.001,0.0
4,youth_pop,0.0,0.068
5,elderly_pop,-0.0,0.872
6,white_pop,-0.0,0.926
7,black_pop,-0.0,0.974
8,asian_pop,0.0,0.986
9,pacisl_pop,-0.0,0.548


In [20]:
X = chicago.iloc[:,6:-7]
y = chicago["ih_prop"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state=0)
print ("Train sample (X):", len(X_train))
print ("Train sample (y):", len(y_train))
print ("Test sample (X):", len(X_test))
print ("Test sample (y):", len(y_test))

lr = LinearRegression().fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
print ("Training set score: %.2f"% lr.score(X_train, y_train))
print('Mean squared error (training set): %.2f'% mean_squared_error(y_train, y_pred_train))
y_pred_test = lr.predict(X_test)
print ("Test set score: %.2f"% lr.score(X_test, y_test))
print('Mean squared error (test set): %.2f'% mean_squared_error(y_test, y_pred_test))

print ("List of predictors:", X.columns.tolist())
print ("List of regression coefficients:", lr.coef_)
print ("p-values of predictors:", stats.coef_pval(lr, X_train, y_train))
print ("Constant (intercept):",lr.intercept_)

result_ols = pd.DataFrame(columns=['Features', 'Coef', 'p-value'])
result_ols['Features'] = X.columns.tolist()
result_ols['Coef'] = lr.coef_
result_ols['p-value'] = stats.coef_pval(lr, X_train, y_train)[1:]
result_ols.round(3)

Train sample (X): 45
Train sample (y): 45
Test sample (X): 15
Test sample (y): 15
Training set score: 0.60
Mean squared error (training set): 0.00
Test set score: 0.17
Mean squared error (test set): 0.01
List of predictors: ['rental_2014plus', 'med_hh_inc', 'pop', 'med_age', 'youth_pop', 'elderly_pop', 'white_pop', 'black_pop', 'asian_pop', 'pacisl_pop', 'other_race_pop', 'multiracial_pop', 'latinx_pop', 'simpson', 'citizen_pop', 'housing_total']
List of regression coefficients: [ 4.03464087e-05 -6.57804468e-08  2.36546214e-07 -5.52313603e-03
 -1.09879897e-05  8.06810975e-07  5.11115620e-06  4.93653597e-06
  4.07559594e-06 -2.76126599e-04  4.33983862e-06 -3.68664889e-06
 -2.57724814e-07 -2.24599848e-02  1.16140932e-06 -7.53361617e-06]
p-values of predictors: [1.26226501e-02 2.01181919e-02 4.76537338e-02 9.85962118e-01
 2.22044605e-16 7.57927573e-03 6.86545907e-01 6.76683501e-01
 6.84594558e-01 7.58256555e-01 1.02212624e-01 7.34008570e-01
 8.16298111e-01 6.81646392e-01 4.27275797e-01 5.

Unnamed: 0,Features,Coef,p-value
0,rental_2014plus,0.0,0.02
1,med_hh_inc,-0.0,0.048
2,pop,0.0,0.986
3,med_age,-0.006,0.0
4,youth_pop,-0.0,0.008
5,elderly_pop,0.0,0.687
6,white_pop,0.0,0.677
7,black_pop,0.0,0.685
8,asian_pop,0.0,0.758
9,pacisl_pop,-0.0,0.102


In [21]:
X = dc.iloc[:,6:-7]
y = dc["ih_prop"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state=0)
print ("Train sample (X):", len(X_train))
print ("Train sample (y):", len(y_train))
print ("Test sample (X):", len(X_test))
print ("Test sample (y):", len(y_test))

lr = LinearRegression().fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
print ("Training set score: %.2f"% lr.score(X_train, y_train))
print('Mean squared error (training set): %.2f'% mean_squared_error(y_train, y_pred_train))
y_pred_test = lr.predict(X_test)
print ("Test set score: %.2f"% lr.score(X_test, y_test))
print('Mean squared error (test set): %.2f'% mean_squared_error(y_test, y_pred_test))

print ("List of predictors:", X.columns.tolist())
print ("List of regression coefficients:", lr.coef_)
print ("p-values of predictors:", stats.coef_pval(lr, X_train, y_train))
print ("Constant (intercept):",lr.intercept_)

result_ols = pd.DataFrame(columns=['Features', 'Coef', 'p-value'])
result_ols['Features'] = X.columns.tolist()
result_ols['Coef'] = lr.coef_
result_ols['p-value'] = stats.coef_pval(lr, X_train, y_train)[1:]
result_ols.round(3)

Train sample (X): 39
Train sample (y): 39
Test sample (X): 14
Test sample (y): 14
Training set score: 0.97
Mean squared error (training set): 0.00
Test set score: 0.70
Mean squared error (test set): 0.00
List of predictors: ['rental_2014plus', 'med_hh_inc', 'pop', 'med_age', 'youth_pop', 'elderly_pop', 'white_pop', 'black_pop', 'asian_pop', 'pacisl_pop', 'other_race_pop', 'multiracial_pop', 'latinx_pop', 'simpson', 'citizen_pop', 'housing_total']
List of regression coefficients: [-1.43546617e-05  8.53493986e-09 -1.38455440e-05 -6.55350282e-06
 -1.69512515e-05 -3.37126791e-06  3.04765371e-05  3.25173872e-05
  1.00172419e-05  7.14080614e-06  4.38379303e-05  6.25927444e-05
 -2.23638382e-05  8.37466957e-03 -1.67787030e-05  2.94290442e-06]
p-values of predictors: [8.13648905e-01 3.09295434e-01 4.81269208e-01 3.32354845e-01
 9.26719135e-01 1.83702286e-03 1.35336852e-01 8.99145475e-03
 5.59654853e-03 5.66807394e-01 9.50732920e-01 1.59325335e-02
 1.08321005e-02 7.84596937e-04 2.47689250e-01 2.

Unnamed: 0,Features,Coef,p-value
0,rental_2014plus,-0.0,0.309
1,med_hh_inc,0.0,0.481
2,pop,-0.0,0.332
3,med_age,-0.0,0.927
4,youth_pop,-0.0,0.002
5,elderly_pop,-0.0,0.135
6,white_pop,0.0,0.009
7,black_pop,0.0,0.006
8,asian_pop,0.0,0.567
9,pacisl_pop,0.0,0.951


In [22]:
X = boston.iloc[:,6:-7]
y = boston["ih_prop"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state=0)
print ("Train sample (X):", len(X_train))
print ("Train sample (y):", len(y_train))
print ("Test sample (X):", len(X_test))
print ("Test sample (y):", len(y_test))

lr = LinearRegression().fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
print ("Training set score: %.2f"% lr.score(X_train, y_train))
print('Mean squared error (training set): %.2f'% mean_squared_error(y_train, y_pred_train))
y_pred_test = lr.predict(X_test)
print ("Test set score: %.2f"% lr.score(X_test, y_test))
print('Mean squared error (test set): %.2f'% mean_squared_error(y_test, y_pred_test))

print ("List of predictors:", X.columns.tolist())
print ("List of regression coefficients:", lr.coef_)
print ("p-values of predictors:", stats.coef_pval(lr, X_train, y_train))
print ("Constant (intercept):",lr.intercept_)

result_ols = pd.DataFrame(columns=['Features', 'Coef', 'p-value'])
result_ols['Features'] = X.columns.tolist()
result_ols['Coef'] = lr.coef_
result_ols['p-value'] = stats.coef_pval(lr, X_train, y_train)[1:]
result_ols.round(3)

Train sample (X): 32
Train sample (y): 32
Test sample (X): 11
Test sample (y): 11
Training set score: 0.96
Mean squared error (training set): 0.00
Test set score: -4.74
Mean squared error (test set): 0.01
List of predictors: ['rental_2014plus', 'med_hh_inc', 'pop', 'med_age', 'youth_pop', 'elderly_pop', 'white_pop', 'black_pop', 'asian_pop', 'pacisl_pop', 'other_race_pop', 'multiracial_pop', 'latinx_pop', 'simpson', 'citizen_pop', 'housing_total']
List of regression coefficients: [-1.28174802e-05 -5.86181565e-08 -7.61210569e-05  9.02119452e-04
  2.74285208e-05 -1.93211634e-05  5.45723044e-05  5.44918364e-05
  6.86336966e-05 -1.53899076e-04  5.42653745e-05  9.57509332e-05
  1.97468414e-06  1.82724952e-02  1.42238979e-05  1.79287351e-05]
p-values of predictors: [3.46995455e-02 1.15844227e-01 1.74061867e-08 1.37146100e-02
 1.56503302e-05 4.92073071e-09 3.62468478e-06 5.77648675e-02
 5.69241219e-02 2.61206587e-02 2.86767069e-01 6.18210045e-02
 1.26396762e-03 5.62578834e-02 2.62715359e-01 1

Unnamed: 0,Features,Coef,p-value
0,rental_2014plus,-0.0,0.116
1,med_hh_inc,-0.0,0.0
2,pop,-0.0,0.014
3,med_age,0.001,0.0
4,youth_pop,0.0,0.0
5,elderly_pop,-0.0,0.0
6,white_pop,0.0,0.058
7,black_pop,0.0,0.057
8,asian_pop,0.0,0.026
9,pacisl_pop,-0.0,0.287


In [23]:
X = sf.iloc[:,6:-7]
y = sf["ih_prop"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state=0)
print ("Train sample (X):", len(X_train))
print ("Train sample (y):", len(y_train))
print ("Test sample (X):", len(X_test))
print ("Test sample (y):", len(y_test))

lr = LinearRegression().fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
print ("Training set score: %.2f"% lr.score(X_train, y_train))
print('Mean squared error (training set): %.2f'% mean_squared_error(y_train, y_pred_train))
y_pred_test = lr.predict(X_test)
print ("Test set score: %.2f"% lr.score(X_test, y_test))
print('Mean squared error (test set): %.2f'% mean_squared_error(y_test, y_pred_test))

print ("List of predictors:", X.columns.tolist())
print ("List of regression coefficients:", lr.coef_)
print ("p-values of predictors:", stats.coef_pval(lr, X_train, y_train))
print ("Constant (intercept):",lr.intercept_)

result_ols = pd.DataFrame(columns=['Features', 'Coef', 'p-value'])
result_ols['Features'] = X.columns.tolist()
result_ols['Coef'] = lr.coef_
result_ols['p-value'] = stats.coef_pval(lr, X_train, y_train)[1:]
result_ols.round(3)

Train sample (X): 34
Train sample (y): 34
Test sample (X): 12
Test sample (y): 12
Training set score: 0.50
Mean squared error (training set): 0.00
Test set score: -0.42
Mean squared error (test set): 0.00
List of predictors: ['rental_2014plus', 'med_hh_inc', 'pop', 'med_age', 'youth_pop', 'elderly_pop', 'white_pop', 'black_pop', 'asian_pop', 'pacisl_pop', 'other_race_pop', 'multiracial_pop', 'latinx_pop', 'simpson', 'citizen_pop', 'housing_total']
List of regression coefficients: [ 9.47041201e-05 -1.13004972e-07  1.20712087e-04 -1.95023307e-03
  1.69483481e-05 -3.47669887e-06 -1.33669429e-04 -1.43361373e-04
 -1.34305264e-04 -1.07507765e-04 -1.43040084e-04 -1.65592599e-04
  7.90925772e-06  2.12046016e-01  1.64715761e-05  1.91222300e-07]
p-values of predictors: [0.91796448 0.16365752 0.06869158 0.02687851 0.02391865 0.31081022
 0.83196708 0.00704107 0.00791131 0.00686597 0.52557415 0.10470819
 0.01788324 0.6910365  0.41483646 0.16492    0.95824655]
Constant (intercept): -0.02946810746844

Unnamed: 0,Features,Coef,p-value
0,rental_2014plus,0.0,0.164
1,med_hh_inc,-0.0,0.069
2,pop,0.0,0.027
3,med_age,-0.002,0.024
4,youth_pop,0.0,0.311
5,elderly_pop,-0.0,0.832
6,white_pop,-0.0,0.007
7,black_pop,-0.0,0.008
8,asian_pop,-0.0,0.007
9,pacisl_pop,-0.0,0.526
