# Regression for Econ Growth with Mobility Flow Graph

- $X$: features of CBGs. e.g. age, gender, etc.
- $A$: adjacency matrix of CBGs, created by using mobility flow data. (weighted or unweighted)
- $Z$: output variables we want to predict. (e.g. growth of inc, pop, and property values.

Main question (comparison):
- $Z \sim f([X])$
- $Z \sim f([X, A])$


In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import networkx as nx
import matplotlib.pyplot as plt
import pickle
import copy
import scipy.sparse as sp
from scipy.sparse import csr_matrix

In [2]:
# read files
with open("../data/02_intermediate/place_graph_X.pickle", 'rb') as f:
    X_place = pickle.load(f) # data frame

with open("../data/02_intermediate/place_graph_A.pickle", 'rb') as f:
    A_place = pickle.load(f) # sparse matrix

with open("../data/02_intermediate/place_graph_weighted_A.pickle", 'rb') as f:
    A_weighted_place = pickle.load(f) # sparse matrix    
    
with open("../data/02_intermediate/place_graph_Z.pickle", 'rb') as f:
    Z_place = pickle.load(f) # data frame
    

In [3]:
X_place

Unnamed: 0_level_0,inc_per_capita,property_value_median,pop_total,households,race_white_ratio,race_black_ratio,age_median,travel_driving_ratio,edu_bachelor_ratio
full_bg_fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
250092011001,46400.0,521300.0,544.0,259.0,1.000000,0.000000,52.8,0.728395,0.239669
250092021011,54513.0,464100.0,721.0,248.0,0.970874,0.000000,47.4,0.737931,0.334669
250092021012,48486.0,461900.0,518.0,202.0,0.967181,0.000000,39.9,0.836538,0.413408
250092021013,43408.0,391000.0,805.0,288.0,0.822360,0.045963,35.4,0.761261,0.250000
250092021021,35731.0,403800.0,1181.0,402.0,0.957663,0.029636,33.8,0.902357,0.204301
...,...,...,...,...,...,...,...,...,...
330170870001,25345.0,218500.0,1479.0,549.0,1.000000,0.000000,33.2,0.926868,0.100338
330170870002,24643.0,158700.0,1612.0,630.0,0.984491,0.000000,38.5,0.869505,0.127907
330170870003,28067.0,169300.0,1657.0,597.0,1.000000,0.000000,35.9,0.896261,0.098936
330170870004,20110.0,93200.0,1087.0,561.0,1.000000,0.000000,54.6,1.000000,0.179310


In [4]:
X_place.shape

(3101, 9)

In [5]:
X_place.shape

(3101, 9)

In [6]:
Z_place.shape

(3101, 3)

## Correlation across variables

In [7]:
var_m = pd.concat([X_place, Z_place], axis = 1)

In [10]:
np.round(var_m.corr(), decimals = 2)

Unnamed: 0,inc_per_capita,property_value_median,pop_total,households,race_white_ratio,race_black_ratio,age_median,travel_driving_ratio,edu_bachelor_ratio,inc_per_capita_annual_growth,pop_total_annual_growth,property_value_median_annual_growth
inc_per_capita,1.0,0.75,-0.07,0.06,0.43,-0.35,0.32,-0.05,0.53,-0.2,0.03,-0.05
property_value_median,0.75,1.0,-0.05,-0.05,0.21,-0.19,0.11,-0.26,0.42,-0.05,-0.02,-0.08
pop_total,-0.07,-0.05,1.0,0.83,0.05,-0.08,-0.09,0.13,0.01,-0.01,-0.16,-0.06
households,0.06,-0.05,0.83,1.0,0.12,-0.13,0.09,0.12,0.07,-0.05,-0.09,-0.03
race_white_ratio,0.43,0.21,0.05,0.12,1.0,-0.79,0.42,0.36,0.44,-0.05,-0.05,-0.15
race_black_ratio,-0.35,-0.19,-0.08,-0.13,-0.79,1.0,-0.23,-0.21,-0.36,0.05,0.05,0.09
age_median,0.32,0.11,-0.09,0.09,0.42,-0.23,1.0,0.43,0.05,-0.12,0.04,-0.11
travel_driving_ratio,-0.05,-0.26,0.13,0.12,0.36,-0.21,0.43,1.0,-0.13,-0.08,-0.05,-0.19
edu_bachelor_ratio,0.53,0.42,0.01,0.07,0.44,-0.36,0.05,-0.13,1.0,-0.04,-0.01,-0.05
inc_per_capita_annual_growth,-0.2,-0.05,-0.01,-0.05,-0.05,0.05,-0.12,-0.08,-0.04,1.0,-0.18,0.18


In [38]:
Z_place.columns

Index(['inc_per_capita_annual_growth', 'pop_total_annual_growth',
       'property_value_median_annual_growth'],
      dtype='object')

# Three Groups of Regressions

1. Growth vs. Static. 
2. $[X]$ vs. $[X, Z]$. 
3. Weighted vs. Unweighted adjacency matrices.

In [19]:
import statsmodels.api as sm
from sklearn.preprocessing import normalize

## 1. Compare the prediction of growth vs. static values (income, population, and property values)

Predicting the growth is SO MUCH HARDER than predicting the static values!

In [207]:
# 1 - delta income (7.7%)
input_vars = ['inc_per_capita', 'property_value_median', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

# specify X and y
X = sm.add_constant(normalize(X_place[input_vars], axis = 0))
output_var = 'inc_per_capita_annual_growth'
y = Z_place[output_var]

# regression on y and X
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())


                                 OLS Regression Results                                 
Dep. Variable:     inc_per_capita_annual_growth   R-squared:                       0.080
Model:                                      OLS   Adj. R-squared:                  0.077
Method:                           Least Squares   F-statistic:                     29.84
Date:                          Wed, 08 Sep 2021   Prob (F-statistic):           2.26e-50
Time:                                  20:12:49   Log-Likelihood:                 2859.6
No. Observations:                          3101   AIC:                            -5699.
Df Residuals:                              3091   BIC:                            -5639.
Df Model:                                     9                                         
Covariance Type:                      nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------

In [41]:
# 1 - income (71%)
input_vars = ['property_value_median', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

# specify X and y
X = sm.add_constant(X_place[input_vars])
output_var = 'inc_per_capita'
y = X_place[output_var] # here use X_place

# regression on y and X
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())


                            OLS Regression Results                            
Dep. Variable:         inc_per_capita   R-squared:                       0.711
Model:                            OLS   Adj. R-squared:                  0.710
Method:                 Least Squares   F-statistic:                     950.7
Date:                Wed, 08 Sep 2021   Prob (F-statistic):               0.00
Time:                        19:20:01   Log-Likelihood:                -33348.
No. Observations:                3101   AIC:                         6.671e+04
Df Residuals:                    3092   BIC:                         6.677e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.536e+

In [210]:
# 2 - delta population (3.6%)
input_vars = ['inc_per_capita', 'property_value_median', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

# specify X and y
X = sm.add_constant(normalize(X_place[input_vars], axis = 0))
output_var = 'pop_total_annual_growth'
y = Z_place[output_var]

# regression on y and X
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())


                               OLS Regression Results                              
Dep. Variable:     pop_total_annual_growth   R-squared:                       0.038
Model:                                 OLS   Adj. R-squared:                  0.036
Method:                      Least Squares   F-statistic:                     13.75
Date:                     Wed, 08 Sep 2021   Prob (F-statistic):           6.70e-22
Time:                             20:13:30   Log-Likelihood:                 3111.7
No. Observations:                     3101   AIC:                            -6203.
Df Residuals:                         3091   BIC:                            -6143.
Df Model:                                9                                         
Covariance Type:                 nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
co

In [42]:
# 2 - population (75.6%)
input_vars = ['inc_per_capita', 'property_value_median', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

# specify X and y
X = sm.add_constant(X_place[input_vars])
output_var = 'pop_total'
y = X_place[output_var]

# regression on y and X
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())


                            OLS Regression Results                            
Dep. Variable:              pop_total   R-squared:                       0.757
Model:                            OLS   Adj. R-squared:                  0.756
Method:                 Least Squares   F-statistic:                     1203.
Date:                Wed, 08 Sep 2021   Prob (F-statistic):               0.00
Time:                        19:20:48   Log-Likelihood:                -22292.
No. Observations:                3101   AIC:                         4.460e+04
Df Residuals:                    3092   BIC:                         4.466e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                   475.80

In [211]:
# 3 - delta property value (6.7%)
input_vars = ['inc_per_capita', 'property_value_median', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

# specify X and y
X = sm.add_constant(normalize(X_place[input_vars], axis = 0))
output_var = 'property_value_median_annual_growth'
y = Z_place[output_var]

# regression on y and X
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())


                                     OLS Regression Results                                    
Dep. Variable:     property_value_median_annual_growth   R-squared:                       0.070
Model:                                             OLS   Adj. R-squared:                  0.067
Method:                                  Least Squares   F-statistic:                     25.75
Date:                                 Wed, 08 Sep 2021   Prob (F-statistic):           3.42e-43
Time:                                         20:13:39   Log-Likelihood:                 2619.7
No. Observations:                                 3101   AIC:                            -5219.
Df Residuals:                                     3091   BIC:                            -5159.
Df Model:                                            9                                         
Covariance Type:                             nonrobust                                         
                 coef    std err        

In [198]:
# 3 - property value (41.6%)
input_vars = ['inc_per_capita', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

# specify X and y
X = sm.add_constant(X_place[input_vars])
output_var = 'property_value_median'
y = X_place[output_var]

# regression on y and X
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())


                              OLS Regression Results                             
Dep. Variable:     property_value_median   R-squared:                       0.417
Model:                               OLS   Adj. R-squared:                  0.416
Method:                    Least Squares   F-statistic:                     276.7
Date:                   Wed, 08 Sep 2021   Prob (F-statistic):               0.00
Time:                           20:07:17   Log-Likelihood:                -41638.
No. Observations:                   3101   AIC:                         8.329e+04
Df Residuals:                       3092   BIC:                         8.335e+04
Df Model:                              8                                         
Covariance Type:               nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.417e+06 

## 2 - Comparing X and [X, Z] (Z is unweighted)

A naive linear regression does not show any improvement through Z

In [47]:
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn import linear_model

In [212]:
# 1 - delta income
input_vars = ['inc_per_capita', 'property_value_median', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

output_var = 'inc_per_capita_annual_growth'

# augment the X attributes and A. 
X = sp.hstack([normalize(X_place[input_vars], axis = 0), normalize(A_place, axis = 0)])
y = Z_place[output_var]

# separate training and testing

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

model_l1 = linear_model.Lasso(alpha=0.00005)
model_l1.fit(X_train, y_train)

model_l2 = linear_model.Ridge(alpha=2.0)
model_l2.fit(X_train, y_train)

# return R square. 
print("Training R2 (LASSO): ", model_l1.score(X_train, y_train))
print("Testing R2 (LASSO): ", model_l1.score(X_test, y_test))

# return R square.
print("Training R2 (Ridge): ", model_l2.score(X_train, y_train))
print("Testing R2 (Ridge): ", model_l2.score(X_test, y_test))


Training R2 (LASSO):  0.08812475370514128
Testing R2 (LASSO):  0.030361015813589742
Training R2 (Ridge):  0.17916525252215187
Testing R2 (Ridge):  0.012116365196984669


In [213]:
# 2 - delta population
input_vars = ['inc_per_capita', 'property_value_median', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

output_var = 'pop_total_annual_growth' 

# augment the X attributes and A. 
X = sp.hstack([normalize(X_place[input_vars], axis = 0), normalize(A_place, axis = 0)])
y = Z_place[output_var]

# separate training and testing

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

model_l1 = linear_model.Lasso(alpha=0.00003)
model_l1.fit(X_train, y_train)

model_l2 = linear_model.Ridge(alpha=3.0)
model_l2.fit(X_train, y_train)

# return R square. 
print("Training R2 (LASSO): ", model_l1.score(X_train, y_train))
print("Testing R2 (LASSO): ", model_l1.score(X_test, y_test))

# return R square. 
print("Training R2 (Ridge): ", model_l2.score(X_train, y_train))
print("Testing R2 (Ridge): ", model_l2.score(X_test, y_test))


Training R2 (LASSO):  0.16582229258911485
Testing R2 (LASSO):  0.0015574678638632378
Training R2 (Ridge):  0.14006952253106686
Testing R2 (Ridge):  -0.00904146627980107


In [214]:
# 3 - delta property value growth
nput_vars = ['inc_per_capita', 'property_value_median', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

output_var = 'property_value_median_annual_growth'

# augment the X attributes and A. 
X = sp.hstack([normalize(X_place[input_vars], axis = 0), normalize(A_place, axis = 0)])
y = Z_place[output_var]

# separate training and testing

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

model_l1 = linear_model.Lasso(alpha=0.00005)
model_l1.fit(X_train, y_train)

model_l2 = linear_model.Ridge(alpha=2.0)
model_l2.fit(X_train, y_train)

# return R square. 
print("Training R2 (LASSO): ", model_l1.score(X_train, y_train))
print("Testing R2 (LASSO): ", model_l1.score(X_test, y_test))

# return R square. 
print("Training R2 (Ridge): ", model_l2.score(X_train, y_train))
print("Testing R2 (Ridge): ", model_l2.score(X_test, y_test))



Training R2 (LASSO):  0.15171854346444258
Testing R2 (LASSO):  0.03368806223134213
Training R2 (Ridge):  0.226521743705418
Testing R2 (Ridge):  0.04662943359491223


## 3 - Comparing X and [X, Z] (Z is weighted)

The weighted adjacency matrix is not any better than the unweighted one.

In [115]:
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn import linear_model

In [215]:
# 1 - delta income
input_vars = ['inc_per_capita', 'property_value_median', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

output_var = 'inc_per_capita_annual_growth'

# augment the X attributes and A. 
X = sp.hstack([normalize(X_place[input_vars], axis = 0), normalize(A_weighted_place, axis = 0)])
y = Z_place[output_var]

# separate training and testing

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

model_l1 = linear_model.Lasso(alpha=0.0002)
model_l1.fit(X_train, y_train)

model_l2 = linear_model.Ridge(alpha=3.0)
model_l2.fit(X_train, y_train)

# return R square. 
print("Training R2 (LASSO): ", model_l1.score(X_train, y_train))
print("Testing R2 (LASSO): ", model_l1.score(X_test, y_test))

# return R square.
print("Training R2 (Ridge): ", model_l2.score(X_train, y_train))
print("Testing R2 (Ridge): ", model_l2.score(X_test, y_test))


Training R2 (LASSO):  0.05353195293999391
Testing R2 (LASSO):  -0.0023309797869557336
Training R2 (Ridge):  0.382137027584759
Testing R2 (Ridge):  -0.012446518753920222


In [216]:
# 2 - delta population
input_vars = ['inc_per_capita', 'property_value_median', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

output_var = 'pop_total_annual_growth' 

# augment the X attributes and A. 
X = sp.hstack([normalize(X_place[input_vars], axis = 0), normalize(A_weighted_place, axis = 0)])
y = Z_place[output_var]

# separate training and testing

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

model_l1 = linear_model.Lasso(alpha=0.0002)
model_l1.fit(X_train, y_train)

model_l2 = linear_model.Ridge(alpha=1.0)
model_l2.fit(X_train, y_train)

# return R square. 
print("Training R2 (LASSO): ", model_l1.score(X_train, y_train))
print("Testing R2 (LASSO): ", model_l1.score(X_test, y_test))

# return R square. 
print("Training R2 (Ridge): ", model_l2.score(X_train, y_train))
print("Testing R2 (Ridge): ", model_l2.score(X_test, y_test))

Training R2 (LASSO):  0.056857523809702215
Testing R2 (LASSO):  -0.009849419131050752
Training R2 (Ridge):  0.6325753218222661
Testing R2 (Ridge):  -0.08800258316594745


In [217]:
# 3 - delta property value
input_vars = ['inc_per_capita', 'property_value_median', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

output_var = 'property_value_median_annual_growth' 

# augment the X attributes and A. 
X = sp.hstack([normalize(X_place[input_vars], axis = 0), normalize(A_weighted_place, axis = 0)])
y = Z_place[output_var]

# separate training and testing

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

model_l1 = linear_model.Lasso(alpha=0.0003)
model_l1.fit(X_train, y_train)

model_l2 = linear_model.Ridge(alpha=4.0)
model_l2.fit(X_train, y_train)

# return R square. 
print("Training R2 (LASSO): ", model_l1.score(X_train, y_train))
print("Testing R2 (LASSO): ", model_l1.score(X_test, y_test))

# return R square. 
print("Training R2 (Ridge): ", model_l2.score(X_train, y_train))
print("Testing R2 (Ridge): ", model_l2.score(X_test, y_test))

Training R2 (LASSO):  0.13142887487808863
Testing R2 (LASSO):  0.0011230101240650159
Training R2 (Ridge):  0.31636539440822076
Testing R2 (Ridge):  0.016537286762008496


## Comparing X and [X, f(Z)] (Z can be weighted or unweighted)

- Use some feature transformation.
- Finding: signal EXISTS for the growth of population.

In [221]:
# 1 - delta income (7.8%) +0.1% 
input_vars = ['inc_per_capita', 'property_value_median', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

# specify X and y
X = sm.add_constant(np.hstack([normalize(X_place[input_vars], axis = 0), normalize(A_place.mean(axis = 1), axis = 0)])) # you cannot normalize A_place.mean()
output_var = 'inc_per_capita_annual_growth'
y = Z_place[output_var]

# regression on y and X
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                                 OLS Regression Results                                 
Dep. Variable:     inc_per_capita_annual_growth   R-squared:                       0.081
Model:                                      OLS   Adj. R-squared:                  0.078
Method:                           Least Squares   F-statistic:                     27.07
Date:                          Wed, 08 Sep 2021   Prob (F-statistic):           4.68e-50
Time:                                  20:16:24   Log-Likelihood:                 2860.7
No. Observations:                          3101   AIC:                            -5699.
Df Residuals:                              3090   BIC:                            -5633.
Df Model:                                    10                                         
Covariance Type:                      nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------

In [222]:
# 1 - delta income (7.7%) +0.0% 
input_vars = ['inc_per_capita', 'property_value_median', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

# specify X and y
X = sm.add_constant(np.hstack([normalize(X_place[input_vars], axis = 0), normalize(A_weighted_place.mean(axis = 1), axis = 0)])) # you cannot normalize A_place.mean()
output_var = 'inc_per_capita_annual_growth'
y = Z_place[output_var]

# regression on y and X
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                                 OLS Regression Results                                 
Dep. Variable:     inc_per_capita_annual_growth   R-squared:                       0.080
Model:                                      OLS   Adj. R-squared:                  0.077
Method:                           Least Squares   F-statistic:                     26.97
Date:                          Wed, 08 Sep 2021   Prob (F-statistic):           7.20e-50
Time:                                  20:16:50   Log-Likelihood:                 2860.2
No. Observations:                          3101   AIC:                            -5698.
Df Residuals:                              3090   BIC:                            -5632.
Df Model:                                    10                                         
Covariance Type:                      nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------

In [223]:
# 2 - delta population (3.9%) +0.3%
input_vars = ['inc_per_capita', 'property_value_median', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

# specify X and y
X = sm.add_constant(np.hstack([normalize(X_place[input_vars], axis = 0), normalize(A_place.mean(axis = 1), axis = 0)])) # you cannot normalize A_place.mean()
output_var = 'pop_total_annual_growth'
y = Z_place[output_var]

# regression on y and X
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                               OLS Regression Results                              
Dep. Variable:     pop_total_annual_growth   R-squared:                       0.042
Model:                                 OLS   Adj. R-squared:                  0.039
Method:                      Least Squares   F-statistic:                     13.45
Date:                     Wed, 08 Sep 2021   Prob (F-statistic):           1.97e-23
Time:                             20:17:33   Log-Likelihood:                 3116.9
No. Observations:                     3101   AIC:                            -6212.
Df Residuals:                         3090   BIC:                            -6145.
Df Model:                               10                                         
Covariance Type:                 nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
co

In [228]:
# 3 - delta property value (6.7%) +0.0%
input_vars = ['inc_per_capita', 'property_value_median', 'pop_total', 
              'households', 'race_white_ratio', 'race_black_ratio', 'age_median', 
              'travel_driving_ratio', 'edu_bachelor_ratio']

# specify X and y
X = sm.add_constant(np.hstack([normalize(X_place[input_vars], axis = 0), normalize(A_weighted_place.mean(axis = 1), axis = 0)])) # you cannot normalize A_place.mean()
output_var = 'property_value_median_annual_growth'
y = Z_place[output_var]

# regression on y and X
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                                     OLS Regression Results                                    
Dep. Variable:     property_value_median_annual_growth   R-squared:                       0.070
Model:                                             OLS   Adj. R-squared:                  0.067
Method:                                  Least Squares   F-statistic:                     23.21
Date:                                 Wed, 08 Sep 2021   Prob (F-statistic):           1.46e-42
Time:                                         20:20:04   Log-Likelihood:                 2619.9
No. Observations:                                 3101   AIC:                            -5218.
Df Residuals:                                     3090   BIC:                            -5151.
Df Model:                                           10                                         
Covariance Type:                             nonrobust                                         
                 coef    std err        

## 5 - How to boost the prediction accuracy first with A?

### 1. Use sociodemo to predict income

In [119]:
X_place = sm.add_constant(X_place)

In [120]:
mod = sm.OLS(Z_place , X_place)

In [121]:
res = mod.fit()

In [123]:
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                 income   R-squared:                       0.516
Model:                            OLS   Adj. R-squared:                  0.515
Method:                 Least Squares   F-statistic:                     773.4
Date:                Wed, 01 Sep 2021   Prob (F-statistic):               0.00
Time:                        20:18:25   Log-Likelihood:                -33809.
No. Observations:                2905   AIC:                         6.763e+04
Df Residuals:                    2900   BIC:                         6.766e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                -4.099e+04 

### 2. Use unweighted adjacency matrix to predict income

In [124]:
# create row vars for A.
A_mean = A_place.mean(axis = 1)

In [125]:
1 - A_mean

matrix([[0.43063683],
        [0.2313253 ],
        [0.61893287],
        ...,
        [0.76144578],
        [0.85748709],
        [0.74457831]])

In [126]:
# A_var
A_var = np.multiply(A_mean, 1 - A_mean)
A_attributes = np.hstack([A_mean, A_var])
A_attributes

matrix([[0.56936317, 0.24518875],
        [0.7686747 , 0.17781391],
        [0.38106713, 0.23585497],
        ...,
        [0.23855422, 0.1816461 ],
        [0.14251291, 0.12220298],
        [0.25542169, 0.19018145]])

In [127]:
# 
A_attributes = sm.add_constant(A_attributes)

In [128]:
mod = sm.OLS(Z_place , A_attributes)

In [129]:
res = mod.fit()

In [130]:
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                 income   R-squared:                       0.059
Model:                            OLS   Adj. R-squared:                  0.058
Method:                 Least Squares   F-statistic:                     90.41
Date:                Wed, 01 Sep 2021   Prob (F-statistic):           8.13e-39
Time:                        20:18:34   Log-Likelihood:                -34776.
No. Observations:                2905   AIC:                         6.956e+04
Df Residuals:                    2902   BIC:                         6.958e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.213e+05   2882.111     42.071      0.0

### 3. Use weighted adjacency matrix to predict income

Odd. Unweighted adjacency matrix - much worse than the weighted ones.

In [131]:
# 
A_weighted_mean = A_weighted_place.mean(axis = 1)
A_weighted_max = A_weighted_place.max(axis = 1)
# A_weighted_min = A_weighted_place.min(axis = 1)
A_weighted_attributes = np.hstack([A_weighted_mean, A_weighted_max.toarray()])
A_weighted_attributes

matrix([[5.41912909e+02, 6.31019000e+05],
        [9.28446816e+02, 1.25282800e+06],
        [6.22685026e+01, 4.49500000e+04],
        ...,
        [6.92961446e+02, 1.66637000e+06],
        [1.60317040e+02, 2.74405000e+05],
        [7.63872633e+01, 1.32089000e+05]])

In [132]:
# 
A_weighted_attributes = sm.add_constant(A_weighted_attributes)

In [133]:
# 
mod = sm.OLS(Z_place , A_weighted_attributes)

In [134]:
res = mod.fit()

In [135]:
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                 income   R-squared:                       0.013
Model:                            OLS   Adj. R-squared:                  0.013
Method:                 Least Squares   F-statistic:                     19.72
Date:                Wed, 01 Sep 2021   Prob (F-statistic):           3.11e-09
Time:                        20:18:46   Log-Likelihood:                -34844.
No. Observations:                2905   AIC:                         6.969e+04
Df Residuals:                    2902   BIC:                         6.971e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       8.808e+04    875.474    100.613      0.0

### 4. Some preprocessing & regressions

Impressive - Combining sociodemographics and adjacency matrix. We reach 40% R2 in the testing set.

In [136]:
from sklearn.preprocessing import normalize

# augment the X attributes and A. 
X = sp.hstack([normalize(X_place), normalize(A_place)])

# separate training and testing
from sklearn.model_selection import train_test_split

X_train, X_test, Z_place_train, Z_place_test = train_test_split(
    X, Z_place, test_size=0.33, random_state=42)

from sklearn import linear_model
model = linear_model.Lasso(alpha=10.0)
model.fit(X_train, Z_place_train)

# return R square. sw: roughly 55% R2 with strong regularization. 
print("Training R2: ", model.score(X_train, Z_place_train))
print("Testing R2: ", model.score(X_test, Z_place_test))


In [138]:
X

<2905x2910 sparse matrix of type '<class 'numpy.float64'>'
	with 4608427 stored elements in COOrdinate format>

Lasso(alpha=10.0)

Training R2:  0.6408793369558956
Testing R2:  0.5507143251867689


In [152]:
model.coef_

array([-242468.06214344,    7124.8911622 ,  142400.60363479, ...,
            -0.        ,      -0.        ,  -65308.40855613])

# Graph properties

In [159]:
G_place = nx.from_scipy_sparse_matrix(A_place)

# Q: Wait it is not connected? I remember that the place graph IS connected. Something is wrong.

In [160]:
# 
print(nx.number_connected_components(G_place))
nx.connected_components(G_place)
# nx.diameter(G_place)

2


<generator object connected_components at 0x7f5dc42eeed0>

In [161]:
G_place_components = nx.connected_components(G_place)

Need to double check: How can you get two components? I get only one component last time...

In [162]:
component_1, component_2 = sorted(G_place_components, key=len, reverse=True)

In [163]:
component_1

{0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [164]:
component_2

{211}

In [165]:
X_place.index[211]

'250092151011'