# Regression for Econ Growth with Mobility Flow

Inputs and outputs
- $X$: features of CBGs. e.g. age, gender, etc.
- $A$: adjacency matrix of CBGs, created by using mobility flow data. (weighted or unweighted)
- $Y$: output variables we want to predict (e.g. growth of income, population, and property values).

Main question (comparison):
- $Y \sim f([X])$
- $Y \sim f([X, A])$


In [12]:
import numpy as np
import pandas as pd
import geopandas as gpd
import networkx as nx
import matplotlib.pyplot as plt
import pickle
import copy
import scipy.sparse as sp
from scipy.sparse import csr_matrix

# regression tools
import statsmodels.api as sm
from sklearn.preprocessing import normalize

In [2]:
# read files
def _load_pickle(path):
    """Open `path` in binary mode and return the object unpickled from it."""
    # NOTE: pickle can execute arbitrary code while loading; only use it on
    # files produced by this project's own processing steps.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


X_place = _load_pickle("../../data/03_processed/place_graph_X.pickle")  # data frame
A_place = _load_pickle("../../data/03_processed/place_graph_A.pickle")  # sparse matrix
A_weighted_place = _load_pickle("../../data/03_processed/place_graph_weighted_A.pickle")  # sparse matrix
Y_place = _load_pickle("../../data/03_processed/place_graph_Y.pickle")  # data frame
    

In [3]:
# Show the attribute table; the rendered output elides the middle rows.
X_place

Unnamed: 0_level_0,inc_per_capita,property_value_median,pop_total,households,race_white_ratio,race_black_ratio,age_median,travel_driving_ratio,edu_bachelor_ratio
full_bg_fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
250092011001,46400.0,521300.0,544.0,259.0,1.000000,0.000000,52.8,0.728395,0.239669
250092021011,54513.0,464100.0,721.0,248.0,0.970874,0.000000,47.4,0.737931,0.334669
250092021012,48486.0,461900.0,518.0,202.0,0.967181,0.000000,39.9,0.836538,0.413408
250092021013,43408.0,391000.0,805.0,288.0,0.822360,0.045963,35.4,0.761261,0.250000
250092021021,35731.0,403800.0,1181.0,402.0,0.957663,0.029636,33.8,0.902357,0.204301
...,...,...,...,...,...,...,...,...,...
330170870001,25345.0,218500.0,1479.0,549.0,1.000000,0.000000,33.2,0.926868,0.100338
330170870002,24643.0,158700.0,1612.0,630.0,0.984491,0.000000,38.5,0.869505,0.127907
330170870003,28067.0,169300.0,1657.0,597.0,1.000000,0.000000,35.9,0.896261,0.098936
330170870004,20110.0,93200.0,1087.0,561.0,1.000000,0.000000,54.6,1.000000,0.179310


In [4]:
# (rows, columns) of the attribute table.
X_place.shape

(3102, 9)

In [5]:
# NOTE(review): same expression as the previous cell; possibly meant to inspect
# a different object (e.g. one of the adjacency matrices) — confirm or remove.
X_place.shape

(3102, 9)

In [6]:
# (rows, columns) of the target table; the regressions below pair its rows with
# the rows of X_place, so the row counts need to agree.
Y_place.shape

(3102, 3)

## Compute the correlation between variables

In [9]:
# Put the attributes and the growth targets side by side, one column per variable.
var_m = pd.concat([X_place, Y_place], axis="columns")

In [10]:
# Pairwise correlations between all attributes and targets, to two decimals.
var_m.corr().round(decimals=2)

Unnamed: 0,inc_per_capita,property_value_median,pop_total,households,race_white_ratio,race_black_ratio,age_median,travel_driving_ratio,edu_bachelor_ratio,inc_per_capita_annual_growth,pop_total_annual_growth,property_value_median_annual_growth
inc_per_capita,1.0,0.75,-0.07,0.06,0.43,-0.35,0.32,-0.05,0.53,-0.2,0.03,-0.05
property_value_median,0.75,1.0,-0.05,-0.05,0.21,-0.19,0.11,-0.26,0.42,-0.05,-0.02,-0.08
pop_total,-0.07,-0.05,1.0,0.83,0.05,-0.08,-0.09,0.12,0.01,-0.01,-0.16,-0.06
households,0.06,-0.05,0.83,1.0,0.12,-0.13,0.09,0.12,0.07,-0.05,-0.09,-0.03
race_white_ratio,0.43,0.21,0.05,0.12,1.0,-0.79,0.42,0.36,0.44,-0.05,-0.05,-0.15
race_black_ratio,-0.35,-0.19,-0.08,-0.13,-0.79,1.0,-0.23,-0.21,-0.36,0.05,0.05,0.09
age_median,0.32,0.11,-0.09,0.09,0.42,-0.23,1.0,0.43,0.05,-0.12,0.04,-0.11
travel_driving_ratio,-0.05,-0.26,0.12,0.12,0.36,-0.21,0.43,1.0,-0.13,-0.08,-0.05,-0.19
edu_bachelor_ratio,0.53,0.42,0.01,0.07,0.44,-0.36,0.05,-0.13,1.0,-0.04,-0.01,-0.05
inc_per_capita_annual_growth,-0.2,-0.05,-0.01,-0.05,-0.05,0.05,-0.12,-0.08,-0.04,1.0,-0.18,0.18


In [11]:
# Names of the target columns available in Y_place.
Y_place.columns

Index(['inc_per_capita_annual_growth', 'pop_total_annual_growth',
       'property_value_median_annual_growth'],
      dtype='object')

# Three Groups of Regressions

1. Growth vs. Static. 
2. $[X]$ vs. $[X, A]$. 
3. Weighted vs. Unweighted adjacency matrices.

## 1. Prediction of growth vs. static values (income, population, and property values)

Predicting the growth is SO MUCH HARDER than predicting the static values!

In [13]:
# 1 - delta income (7.7%)
# OLS of annual per-capita-income growth on the nine place attributes.
# The percentage in the header is the adjusted R-squared of the recorded run.
input_vars = ['inc_per_capita', 'property_value_median', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

# specify X and y
output_var = 'inc_per_capita_annual_growth'
y = Y_place[output_var]

# normalize(..., axis=0) rescales each column to unit L2 norm but hands back a
# bare ndarray. Re-wrapping it in a DataFrame keeps the attribute names, so the
# summary lists them instead of x1..x9 (as the static-value regressions do).
# Rows are labelled with y's index, i.e. paired with y by position as before.
X_scaled = pd.DataFrame(normalize(X_place[input_vars], axis = 0),
                        index = y.index, columns = input_vars)
X = sm.add_constant(X_scaled)

# regression on y and X
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())


                                 OLS Regression Results                                 
Dep. Variable:     inc_per_capita_annual_growth   R-squared:                       0.080
Model:                                      OLS   Adj. R-squared:                  0.077
Method:                           Least Squares   F-statistic:                     29.89
Date:                          Sun, 07 Nov 2021   Prob (F-statistic):           1.82e-50
Time:                                  16:53:03   Log-Likelihood:                 2860.9
No. Observations:                          3102   AIC:                            -5702.
Df Residuals:                              3092   BIC:                            -5641.
Df Model:                                     9                                         
Covariance Type:                      nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------

In [14]:
# 1 - income (71%)
# Static counterpart of the income-growth model: the income level itself is
# regressed on the remaining attributes.
output_var = 'inc_per_capita'
input_vars = [
    'property_value_median', 'pop_total', 'households', 'race_white_ratio',
    'race_black_ratio', 'age_median', 'travel_driving_ratio', 'edu_bachelor_ratio',
]

# the target is an attribute column, so it is read from X_place (not Y_place)
y = X_place[output_var]
# un-normalised regressors plus an intercept column
X = sm.add_constant(X_place.loc[:, input_vars])

# ordinary least squares fit, then the full report
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()
print(res.summary())


                            OLS Regression Results                            
Dep. Variable:         inc_per_capita   R-squared:                       0.711
Model:                            OLS   Adj. R-squared:                  0.710
Method:                 Least Squares   F-statistic:                     951.1
Date:                Sun, 07 Nov 2021   Prob (F-statistic):               0.00
Time:                        16:53:12   Log-Likelihood:                -33359.
No. Observations:                3102   AIC:                         6.674e+04
Df Residuals:                    3093   BIC:                         6.679e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.536e+

In [15]:
# 2 - delta population (3.6%)
# OLS of annual population growth on the nine place attributes.
# The percentage in the header is the adjusted R-squared of the recorded run.
input_vars = ['inc_per_capita', 'property_value_median', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

# specify X and y
output_var = 'pop_total_annual_growth'
y = Y_place[output_var]

# normalize(..., axis=0) rescales each column to unit L2 norm but hands back a
# bare ndarray. Re-wrapping it in a DataFrame keeps the attribute names, so the
# summary lists them instead of x1..x9 (as the static-value regressions do).
# Rows are labelled with y's index, i.e. paired with y by position as before.
X_scaled = pd.DataFrame(normalize(X_place[input_vars], axis = 0),
                        index = y.index, columns = input_vars)
X = sm.add_constant(X_scaled)

# regression on y and X
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())


                               OLS Regression Results                              
Dep. Variable:     pop_total_annual_growth   R-squared:                       0.039
Model:                                 OLS   Adj. R-squared:                  0.036
Method:                      Least Squares   F-statistic:                     13.81
Date:                     Sun, 07 Nov 2021   Prob (F-statistic):           5.12e-22
Time:                             16:53:18   Log-Likelihood:                 3111.4
No. Observations:                     3102   AIC:                            -6203.
Df Residuals:                         3092   BIC:                            -6142.
Df Model:                                9                                         
Covariance Type:                 nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
co

In [16]:
# 2 - population (75.6%)
# Static counterpart of the population-growth model: the population level is
# regressed on the remaining attributes.
output_var = 'pop_total'
input_vars = [
    'inc_per_capita', 'property_value_median', 'households', 'race_white_ratio',
    'race_black_ratio', 'age_median', 'travel_driving_ratio', 'edu_bachelor_ratio',
]

# the target is an attribute column, so it is read from X_place (not Y_place)
y = X_place[output_var]
# un-normalised regressors plus an intercept column
X = sm.add_constant(X_place.loc[:, input_vars])

# ordinary least squares fit, then the full report
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()
print(res.summary())


                            OLS Regression Results                            
Dep. Variable:              pop_total   R-squared:                       0.757
Model:                            OLS   Adj. R-squared:                  0.756
Method:                 Least Squares   F-statistic:                     1204.
Date:                Sun, 07 Nov 2021   Prob (F-statistic):               0.00
Time:                        16:53:22   Log-Likelihood:                -22299.
No. Observations:                3102   AIC:                         4.462e+04
Df Residuals:                    3093   BIC:                         4.467e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                   475.80

In [17]:
# 3 - delta property value (6.7%)
# OLS of annual median-property-value growth on the nine place attributes.
# The percentage in the header is the adjusted R-squared of the recorded run.
input_vars = ['inc_per_capita', 'property_value_median', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

# specify X and y
output_var = 'property_value_median_annual_growth'
y = Y_place[output_var]

# normalize(..., axis=0) rescales each column to unit L2 norm but hands back a
# bare ndarray. Re-wrapping it in a DataFrame keeps the attribute names, so the
# summary lists them instead of x1..x9 (as the static-value regressions do).
# Rows are labelled with y's index, i.e. paired with y by position as before.
X_scaled = pd.DataFrame(normalize(X_place[input_vars], axis = 0),
                        index = y.index, columns = input_vars)
X = sm.add_constant(X_scaled)

# regression on y and X
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())


                                     OLS Regression Results                                    
Dep. Variable:     property_value_median_annual_growth   R-squared:                       0.070
Model:                                             OLS   Adj. R-squared:                  0.067
Method:                                  Least Squares   F-statistic:                     25.77
Date:                                 Sun, 07 Nov 2021   Prob (F-statistic):           3.16e-43
Time:                                         16:53:28   Log-Likelihood:                 2620.8
No. Observations:                                 3102   AIC:                            -5222.
Df Residuals:                                     3092   BIC:                            -5161.
Df Model:                                            9                                         
Covariance Type:                             nonrobust                                         
                 coef    std err        

In [18]:
# 3 - property value (64.7%)
# NOTE(review): this header used to read 41.6%, but the recorded run reports an
# adjusted R-squared of 0.647 — confirm which figure is current.
# Static counterpart of the property-value-growth model: the median property
# value is regressed on the remaining attributes.
output_var = 'property_value_median'
input_vars = [
    'inc_per_capita', 'pop_total', 'households', 'race_white_ratio',
    'race_black_ratio', 'age_median', 'travel_driving_ratio', 'edu_bachelor_ratio',
]

# the target is an attribute column, so it is read from X_place (not Y_place)
y = X_place[output_var]
# un-normalised regressors plus an intercept column
X = sm.add_constant(X_place.loc[:, input_vars])

# ordinary least squares fit, then the full report
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()
print(res.summary())


                              OLS Regression Results                             
Dep. Variable:     property_value_median   R-squared:                       0.648
Model:                               OLS   Adj. R-squared:                  0.647
Method:                    Least Squares   F-statistic:                     711.7
Date:                   Sun, 07 Nov 2021   Prob (F-statistic):               0.00
Time:                           16:53:31   Log-Likelihood:                -40869.
No. Observations:                   3102   AIC:                         8.176e+04
Df Residuals:                       3093   BIC:                         8.181e+04
Df Model:                              8                                         
Covariance Type:               nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
co

## 2 - Comparing X and [X, A] (A is the unweighted adjacency matrix)

A naive regularized linear regression (LASSO / Ridge) on $[X, A]$ does not show any improvement from adding $A$.

In [19]:
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn import linear_model

In [20]:
# 1 - delta income
# Regularised linear models for income growth on [X, A], A unweighted.
output_var = 'inc_per_capita_annual_growth'
input_vars = [
    'inc_per_capita', 'property_value_median', 'pop_total', 'households',
    'race_white_ratio', 'race_black_ratio', 'age_median',
    'travel_driving_ratio', 'edu_bachelor_ratio',
]

# Scale every column of both blocks to unit L2 norm, then place the attribute
# block and the adjacency block next to each other in one sparse design matrix.
attr_block = normalize(X_place[input_vars], axis=0)
adj_block = normalize(A_place, axis=0)
X = sp.hstack([attr_block, adj_block])
y = Y_place[output_var]

# Hold out 33% of the rows for testing; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# NOTE(review): the alpha values look hand-picked — TODO confirm how they were chosen.
model_l1 = linear_model.Lasso(alpha=0.00005).fit(X_train, y_train)
model_l2 = linear_model.Ridge(alpha=2.0).fit(X_train, y_train)

# Report R square on both partitions, LASSO first and Ridge second.
# NOTE(review): no X-only model is scored on this split, so these test scores
# are not directly comparable with the in-sample OLS R-squared of section 1.
for train_label, test_label, fitted in (
    ("Training R2 (LASSO): ", "Testing R2 (LASSO): ", model_l1),
    ("Training R2 (Ridge): ", "Testing R2 (Ridge): ", model_l2),
):
    print(train_label, fitted.score(X_train, y_train))
    print(test_label, fitted.score(X_test, y_test))


Training R2 (LASSO):  0.08743786823576116
Testing R2 (LASSO):  0.025191227180781373
Training R2 (Ridge):  0.18461032340195083
Testing R2 (Ridge):  -0.0014985189796172804


In [22]:
# 2 - delta population
# Regularised linear models for population growth on [X, A], A unweighted.
output_var = 'pop_total_annual_growth'
input_vars = [
    'inc_per_capita', 'property_value_median', 'pop_total', 'households',
    'race_white_ratio', 'race_black_ratio', 'age_median',
    'travel_driving_ratio', 'edu_bachelor_ratio',
]

# Scale every column of both blocks to unit L2 norm, then place the attribute
# block and the adjacency block next to each other in one sparse design matrix.
attr_block = normalize(X_place[input_vars], axis=0)
adj_block = normalize(A_place, axis=0)
X = sp.hstack([attr_block, adj_block])
y = Y_place[output_var]

# Hold out 33% of the rows for testing; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# NOTE(review): the alpha values look hand-picked — TODO confirm how they were chosen.
model_l1 = linear_model.Lasso(alpha=0.00003).fit(X_train, y_train)
model_l2 = linear_model.Ridge(alpha=3.0).fit(X_train, y_train)

# Report R square on both partitions, LASSO first and Ridge second.
for train_label, test_label, fitted in (
    ("Training R2 (LASSO): ", "Testing R2 (LASSO): ", model_l1),
    ("Training R2 (Ridge): ", "Testing R2 (Ridge): ", model_l2),
):
    print(train_label, fitted.score(X_train, y_train))
    print(test_label, fitted.score(X_test, y_test))


Training R2 (LASSO):  0.1697912939654449
Testing R2 (LASSO):  -0.007107344860349807
Training R2 (Ridge):  0.13876223588212833
Testing R2 (Ridge):  -0.006110813041268148


In [23]:
# 3 - delta property value growth
# Regularised linear models for property-value growth on [X, A], A unweighted.
output_var = 'property_value_median_annual_growth'
input_vars = [
    'inc_per_capita', 'property_value_median', 'pop_total', 'households',
    'race_white_ratio', 'race_black_ratio', 'age_median',
    'travel_driving_ratio', 'edu_bachelor_ratio',
]

# Scale every column of both blocks to unit L2 norm, then place the attribute
# block and the adjacency block next to each other in one sparse design matrix.
attr_block = normalize(X_place[input_vars], axis=0)
adj_block = normalize(A_place, axis=0)
X = sp.hstack([attr_block, adj_block])
y = Y_place[output_var]

# Hold out 33% of the rows for testing; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# NOTE(review): the alpha values look hand-picked — TODO confirm how they were chosen.
model_l1 = linear_model.Lasso(alpha=0.00005).fit(X_train, y_train)
model_l2 = linear_model.Ridge(alpha=2.0).fit(X_train, y_train)

# Report R square on both partitions, LASSO first and Ridge second.
for train_label, test_label, fitted in (
    ("Training R2 (LASSO): ", "Testing R2 (LASSO): ", model_l1),
    ("Training R2 (Ridge): ", "Testing R2 (Ridge): ", model_l2),
):
    print(train_label, fitted.score(X_train, y_train))
    print(test_label, fitted.score(X_test, y_test))


Training R2 (LASSO):  0.14103641192685923
Testing R2 (LASSO):  0.015766787629139523
Training R2 (Ridge):  0.23006270065779122
Testing R2 (Ridge):  0.025046648020793927


## 3 - Comparing X and [X, A] (A is weighted)

The weighted adjacency matrix is not any better than the unweighted one.

In [27]:
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn import linear_model

In [28]:
# 1 - delta income
# Regularised linear models for income growth on [X, A], A weighted.
output_var = 'inc_per_capita_annual_growth'
input_vars = [
    'inc_per_capita', 'property_value_median', 'pop_total', 'households',
    'race_white_ratio', 'race_black_ratio', 'age_median',
    'travel_driving_ratio', 'edu_bachelor_ratio',
]

# Scale every column of both blocks to unit L2 norm, then place the attribute
# block and the weighted adjacency block next to each other in one sparse
# design matrix.
attr_block = normalize(X_place[input_vars], axis=0)
adj_block = normalize(A_weighted_place, axis=0)
X = sp.hstack([attr_block, adj_block])
y = Y_place[output_var]

# Hold out 33% of the rows for testing; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# NOTE(review): the alpha values look hand-picked — TODO confirm how they were chosen.
model_l1 = linear_model.Lasso(alpha=0.0002).fit(X_train, y_train)
model_l2 = linear_model.Ridge(alpha=3.0).fit(X_train, y_train)

# Report R square on both partitions, LASSO first and Ridge second.
# NOTE(review): no X-only model is scored on this split, so these test scores
# are not directly comparable with the in-sample OLS R-squared of section 1.
for train_label, test_label, fitted in (
    ("Training R2 (LASSO): ", "Testing R2 (LASSO): ", model_l1),
    ("Training R2 (Ridge): ", "Testing R2 (Ridge): ", model_l2),
):
    print(train_label, fitted.score(X_train, y_train))
    print(test_label, fitted.score(X_test, y_test))


Training R2 (LASSO):  0.014663778308202202
Testing R2 (LASSO):  -0.00029377559216059446
Training R2 (Ridge):  0.3691546862915215
Testing R2 (Ridge):  -0.009373868462205337


In [29]:
# 2 - delta population
# Regularised linear models for population growth on [X, A], A weighted.
output_var = 'pop_total_annual_growth'
input_vars = [
    'inc_per_capita', 'property_value_median', 'pop_total', 'households',
    'race_white_ratio', 'race_black_ratio', 'age_median',
    'travel_driving_ratio', 'edu_bachelor_ratio',
]

# Scale every column of both blocks to unit L2 norm, then place the attribute
# block and the weighted adjacency block next to each other in one sparse
# design matrix.
attr_block = normalize(X_place[input_vars], axis=0)
adj_block = normalize(A_weighted_place, axis=0)
X = sp.hstack([attr_block, adj_block])
y = Y_place[output_var]

# Hold out 33% of the rows for testing; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# NOTE(review): the alpha values look hand-picked — TODO confirm how they were chosen.
model_l1 = linear_model.Lasso(alpha=0.0002).fit(X_train, y_train)
model_l2 = linear_model.Ridge(alpha=1.0).fit(X_train, y_train)

# Report R square on both partitions, LASSO first and Ridge second.
for train_label, test_label, fitted in (
    ("Training R2 (LASSO): ", "Testing R2 (LASSO): ", model_l1),
    ("Training R2 (Ridge): ", "Testing R2 (Ridge): ", model_l2),
):
    print(train_label, fitted.score(X_train, y_train))
    print(test_label, fitted.score(X_test, y_test))

Training R2 (LASSO):  0.06215525779492914
Testing R2 (LASSO):  -0.0049611044490582845
Training R2 (Ridge):  0.6098892165352885
Testing R2 (Ridge):  -0.08253025724532281


In [31]:
# 3 - delta property value
# Regularised linear models for property-value growth on [X, A], A weighted.
output_var = 'property_value_median_annual_growth'
input_vars = [
    'inc_per_capita', 'property_value_median', 'pop_total', 'households',
    'race_white_ratio', 'race_black_ratio', 'age_median',
    'travel_driving_ratio', 'edu_bachelor_ratio',
]

# Scale every column of both blocks to unit L2 norm, then place the attribute
# block and the weighted adjacency block next to each other in one sparse
# design matrix.
attr_block = normalize(X_place[input_vars], axis=0)
adj_block = normalize(A_weighted_place, axis=0)
X = sp.hstack([attr_block, adj_block])
y = Y_place[output_var]

# Hold out 33% of the rows for testing; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# NOTE(review): the alpha values look hand-picked — TODO confirm how they were chosen.
model_l1 = linear_model.Lasso(alpha=0.0003).fit(X_train, y_train)
model_l2 = linear_model.Ridge(alpha=4.0).fit(X_train, y_train)

# Report R square on both partitions, LASSO first and Ridge second.
for train_label, test_label, fitted in (
    ("Training R2 (LASSO): ", "Testing R2 (LASSO): ", model_l1),
    ("Training R2 (Ridge): ", "Testing R2 (Ridge): ", model_l2),
):
    print(train_label, fitted.score(X_train, y_train))
    print(test_label, fitted.score(X_test, y_test))

Training R2 (LASSO):  0.08014913736551121
Testing R2 (LASSO):  0.001401582674185331
Training R2 (Ridge):  0.2974587650030327
Testing R2 (Ridge):  0.020150543607634352


## Comparing X and [X, f(A)] (A can be weighted or unweighted)

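- Use a feature transformation $f(A)$ instead of the raw matrix; here $f(A)$ is the row-wise mean of $A$.
- Finding: a small signal EXISTS for the growth of population (adjusted $R^2$ rises from 3.6% to 3.9%).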

In [32]:
# 1 - delta income (7.8%) +0.1% 
# OLS of income growth on the nine attributes plus one summary of the
# unweighted adjacency matrix (its row-wise mean). The header gives the adjusted
# R-squared of the recorded run and its change versus the attributes-only model.
input_vars = ['inc_per_capita', 'property_value_median', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

# specify X and y
output_var = 'inc_per_capita_annual_growth'
y = Y_place[output_var]

# f(A): mean of each row of A, one value per row of the design matrix.
# .mean() on a scipy sparse matrix returns an np.matrix, which recent
# scikit-learn releases reject, so convert to a plain (n, 1) ndarray first.
# NOTE(review): an earlier inline note said this mean "cannot" be normalised,
# yet the code has always normalised it — confirm the intent.
adj_mean = np.asarray(A_place.mean(axis = 1)).reshape(-1, 1)
design = np.hstack([normalize(X_place[input_vars], axis = 0),
                    normalize(adj_mean, axis = 0)])

# Keep regressor names so the summary lists them instead of x1..x10; rows are
# labelled with y's index, i.e. paired with y by position as before.
X = sm.add_constant(pd.DataFrame(design, index = y.index,
                                 columns = input_vars + ['A_row_mean']))

# regression on y and X
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                                 OLS Regression Results                                 
Dep. Variable:     inc_per_capita_annual_growth   R-squared:                       0.081
Model:                                      OLS   Adj. R-squared:                  0.078
Method:                           Least Squares   F-statistic:                     27.11
Date:                          Sun, 07 Nov 2021   Prob (F-statistic):           3.87e-50
Time:                                  17:02:10   Log-Likelihood:                 2861.9
No. Observations:                          3102   AIC:                            -5702.
Df Residuals:                              3091   BIC:                            -5635.
Df Model:                                    10                                         
Covariance Type:                      nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------



In [33]:
# 1 - delta income (7.7%) +0.0% 
# OLS of income growth on the nine attributes plus one summary of the weighted
# adjacency matrix (its row-wise mean). The header gives the adjusted R-squared
# of the recorded run and its change versus the attributes-only model.
input_vars = ['inc_per_capita', 'property_value_median', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

# specify X and y
output_var = 'inc_per_capita_annual_growth'
y = Y_place[output_var]

# f(A): mean of each row of the weighted A, one value per row of the design matrix.
# .mean() on a scipy sparse matrix returns an np.matrix, which recent
# scikit-learn releases reject, so convert to a plain (n, 1) ndarray first.
# NOTE(review): an earlier inline note said this mean "cannot" be normalised,
# yet the code has always normalised it — confirm the intent.
adj_mean = np.asarray(A_weighted_place.mean(axis = 1)).reshape(-1, 1)
design = np.hstack([normalize(X_place[input_vars], axis = 0),
                    normalize(adj_mean, axis = 0)])

# Keep regressor names so the summary lists them instead of x1..x10; rows are
# labelled with y's index, i.e. paired with y by position as before.
X = sm.add_constant(pd.DataFrame(design, index = y.index,
                                 columns = input_vars + ['A_weighted_row_mean']))

# regression on y and X
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                                 OLS Regression Results                                 
Dep. Variable:     inc_per_capita_annual_growth   R-squared:                       0.080
Model:                                      OLS   Adj. R-squared:                  0.077
Method:                           Least Squares   F-statistic:                     27.02
Date:                          Sun, 07 Nov 2021   Prob (F-statistic):           5.76e-50
Time:                                  17:02:21   Log-Likelihood:                 2861.5
No. Observations:                          3102   AIC:                            -5701.
Df Residuals:                              3091   BIC:                            -5634.
Df Model:                                    10                                         
Covariance Type:                      nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------



In [34]:
# 2 - delta population (3.9%) +0.3%
# OLS of population growth on the nine attributes plus one summary of the
# unweighted adjacency matrix (its row-wise mean). The header gives the adjusted
# R-squared of the recorded run and its change versus the attributes-only model.
input_vars = ['inc_per_capita', 'property_value_median', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

# specify X and y
output_var = 'pop_total_annual_growth'
y = Y_place[output_var]

# f(A): mean of each row of A, one value per row of the design matrix.
# .mean() on a scipy sparse matrix returns an np.matrix, which recent
# scikit-learn releases reject, so convert to a plain (n, 1) ndarray first.
# NOTE(review): an earlier inline note said this mean "cannot" be normalised,
# yet the code has always normalised it — confirm the intent.
adj_mean = np.asarray(A_place.mean(axis = 1)).reshape(-1, 1)
design = np.hstack([normalize(X_place[input_vars], axis = 0),
                    normalize(adj_mean, axis = 0)])

# Keep regressor names so the summary lists them instead of x1..x10; rows are
# labelled with y's index, i.e. paired with y by position as before.
X = sm.add_constant(pd.DataFrame(design, index = y.index,
                                 columns = input_vars + ['A_row_mean']))

# regression on y and X
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                               OLS Regression Results                              
Dep. Variable:     pop_total_annual_growth   R-squared:                       0.042
Model:                                 OLS   Adj. R-squared:                  0.039
Method:                      Least Squares   F-statistic:                     13.47
Date:                     Sun, 07 Nov 2021   Prob (F-statistic):           1.83e-23
Time:                             17:02:28   Log-Likelihood:                 3116.4
No. Observations:                     3102   AIC:                            -6211.
Df Residuals:                         3091   BIC:                            -6144.
Df Model:                               10                                         
Covariance Type:                 nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
co



In [35]:
# 3 - delta property value (6.7%) +0.0%
# OLS of property-value growth on the nine attributes plus one summary of the
# weighted adjacency matrix (its row-wise mean). The header gives the adjusted
# R-squared of the recorded run and its change versus the attributes-only model.
input_vars = ['inc_per_capita', 'property_value_median', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

# specify X and y
output_var = 'property_value_median_annual_growth'
y = Y_place[output_var]

# f(A): mean of each row of the weighted A, one value per row of the design matrix.
# .mean() on a scipy sparse matrix returns an np.matrix, which recent
# scikit-learn releases reject, so convert to a plain (n, 1) ndarray first.
# NOTE(review): an earlier inline note said this mean "cannot" be normalised,
# yet the code has always normalised it — confirm the intent.
adj_mean = np.asarray(A_weighted_place.mean(axis = 1)).reshape(-1, 1)
design = np.hstack([normalize(X_place[input_vars], axis = 0),
                    normalize(adj_mean, axis = 0)])

# Keep regressor names so the summary lists them instead of x1..x10; rows are
# labelled with y's index, i.e. paired with y by position as before.
X = sm.add_constant(pd.DataFrame(design, index = y.index,
                                 columns = input_vars + ['A_weighted_row_mean']))

# regression on y and X
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                                     OLS Regression Results                                    
Dep. Variable:     property_value_median_annual_growth   R-squared:                       0.070
Model:                                             OLS   Adj. R-squared:                  0.067
Method:                                  Least Squares   F-statistic:                     23.23
Date:                                 Sun, 07 Nov 2021   Prob (F-statistic):           1.36e-42
Time:                                         17:02:34   Log-Likelihood:                 2621.0
No. Observations:                                 3102   AIC:                            -5220.
Df Residuals:                                     3091   BIC:                            -5153.
Df Model:                                           10                                         
Covariance Type:                             nonrobust                                         
                 coef    std err        

